# Lasso & Ridge Regression
---

# Load packages and data

In [1]:
# Data manipulation
import pandas as pd
import numpy as np

# Regression
from sklearn.model_selection import train_test_split # split data into train and test sets
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score # calculate model performance
from sklearn.linear_model import Lasso # lasso regression
from sklearn.linear_model import Ridge # ridge regression
from sklearn.model_selection import RepeatedKFold # perform cross-validation for tuning the penalty parameter
from sklearn.model_selection import GridSearchCV # perform cross-validation for tuning the penalty parameter
from sklearn.preprocessing import StandardScaler # standardize the data

df = pd.read_csv('/home/simon/Predict_House_Prices/kc_house_data_cleaned_featured.csv', index_col=0)
df.head(5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,bedrooms_sqft_living,bathrooms_floors,renovated
0,7129300520,13-10-2014,221900.0,3,1.0,1180,5650,1.0,0,0,...,1955,0,98178,47.5112,-122.257,1340,5650,0.002542,1.0,0
1,6414100192,09-12-2014,538000.0,3,2.25,2570,7242,2.0,0,0,...,1951,1991,98125,47.721,-122.319,1690,7639,0.001167,1.125,1
2,5631500400,25-02-2015,180000.0,2,1.0,770,10000,1.0,0,0,...,1933,0,98028,47.7379,-122.233,2720,8062,0.002597,1.0,0
3,2487200875,09-12-2014,604000.0,4,3.0,1960,5000,1.0,0,0,...,1965,0,98136,47.5208,-122.393,1360,5000,0.002041,3.0,0
4,1954400510,18-02-2015,510000.0,3,2.0,1680,8080,1.0,0,0,...,1987,0,98074,47.6168,-122.045,1800,7503,0.001786,2.0,0


# Prepare data for models

In [2]:
# Create dummy features for categorical variables
df_dummy = pd.get_dummies(df, prefix='Category_', columns=['zipcode'])

# Define dependent variables by dropping unneeded features
X_dummy = df_dummy.drop(columns=['id', 'date', 'price', 'yr_renovated', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'bedrooms_sqft_living', 'bathrooms_floors'], axis=1)

# Define independent variable
y = df['price']

# Split data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X_dummy, y, test_size=0.25, random_state=1)

# List of numerical features
numerical = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition',
    'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'renovated']

# Standardize numerical features
scaler = StandardScaler().fit(X_train[numerical]) 
X_train[numerical] = scaler.transform(X_train[numerical])
X_test[numerical] = scaler.transform(X_test[numerical])

# Lasso Regression

## Model tuning

In [None]:
# Define the model
lassoModel = Lasso(random_state=1)

# K-Fold Cross-Validation
cv = RepeatedKFold(n_splits=3, n_repeats=3, random_state=1)

# Tuning grid
grid = dict()
grid['alpha'] = np.arange(0, 1, 0.02)

# Define grid search
search = GridSearchCV(lassoModel, grid, cv=cv, n_jobs=-1)

# Perform search
results = search.fit(X_train, y_train)

In [4]:
# Best shrinkage parameter
bestParams = results.best_params_
print(bestParams)

{'alpha': 0.9}


## Train and test model

In [None]:
# Define tuned model
tunedLassoModel = Lasso(**bestParams, random_state=1)
tunedLassoModel.fit(X_train,y_train)

# Make predictions on the test data
y_pred = tunedLassoModel.predict(X_test)

In [6]:
# Calculate the error scores and R² on the test set
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R²:', r2_score(y_test, y_pred))

Mean Absolute Error: 99669.09760855601
Mean Squared Error: 34879675173.324936
Root Mean Squared Error: 186761.0108489589
R²: 0.7821673025781308


## Comments:
- 75% train split, 25% test split
- numerical values are standardized to not favor features on a larger scale
- 3-fold cross-validation (computational constraints)
- best alpha at 0.9
- Lasso slightly outperforms the OLS linear regression

# Ridge Regression

## Model tuning

In [7]:
# Define the model
ridgeModel = Ridge(random_state=1)

# K-Fold Cross-Validation
cv = RepeatedKFold(n_splits=3, n_repeats=3, random_state=1)

# Tuning grid
grid = dict()
grid['alpha'] = np.arange(0, 1, 0.02)

# Define grid search
search = GridSearchCV(ridgeModel, grid, cv=cv, n_jobs=-1)

# Perform search
results = search.fit(X_train, y_train)

In [8]:
# Best shrinkage parameter
bestParams = results.best_params_
print(bestParams)

{'alpha': 0.68}


## Train and test model

In [None]:
# Define tuned model
tunedRidgeModel = Lasso(**bestParams, random_state=1)
tunedRidgeModel.fit(X_train,y_train)

# Make predictions on the test data
y_pred = tunedRidgeModel.predict(X_test)

In [10]:
# Calculate the error scores and R² on the test set
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R²:', r2_score(y_test, y_pred))

Mean Absolute Error: 99669.5414652452
Mean Squared Error: 34879507229.41604
Root Mean Squared Error: 186760.56122590776
R²: 0.782168351431783


## Comments:
- identical training setup to Lasso
- best alpha at 0.68
- model performs equal to Lasso
- Ridge runs a lot faster than Lasso, therefore preferred