# Ridge and Lasso

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# from sklearn.datasets import fetch_california_housing
url = "https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv"
from sklearn.metrics import r2_score

In [17]:
# Boston Housing data, fetched as CSV from a public GitHub mirror.
url = "https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv"
boston = pd.read_csv(url)

# Rebuild the layout the old sklearn loader exposed:
#   data   -> every column except the response `medv`
#   target -> single-column frame holding `medv`, relabelled as 'Price'
data = boston.drop(columns=['medv'])
target = pd.DataFrame({'Price': boston['medv']})

# Put features and target side by side again so later cells work from one frame.
df = pd.concat((data, target), axis='columns')
df.head()


Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,Price
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [18]:
# Save a local copy of the combined frame.
# to_csv is called with its defaults, so the row index is written as the first
# (unnamed) column; read it back with index_col=0 to avoid a stray extra column.
df.to_csv(path_or_buf='../data_sets/boston.csv')

In [21]:
# Separate the response (y) from the predictors (X).
y = df['Price']
X = df.drop(columns=['Price'])

In [22]:
# Hold out 20% of the rows for evaluation; random_state is fixed so the split
# (and therefore every score below) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Apply Linear Regression

(Not doing any feature engineering or normalization, because this is only a Ridge and Lasso demo.)

Create 3 models: 
1. Standard Linear Regression
2. Ridge Regression
3. Lasso Regression

Then, check each with an R2-score.

### Standard Linear Regression

In [26]:
# Baseline: ordinary least squares with no regularization.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)

# Score the baseline on the held-out rows.
y_pred = model.predict(X_test)
r2_model = r2_score(y_test, y_pred)
print('R2 score of base model is: ', r2_model)

R2 score of base model is:  0.5892223849182514


### Ridge Regression

In [None]:
from sklearn.linear_model import Ridge, Lasso

In [38]:
# Ridge adds an L2 penalty on the coefficients. `alpha` sets the penalty
# strength and is a hyperparameter: the value 1 used here is an arbitrary
# starting point, not a tuned choice.
ridge_model = Ridge(alpha=1).fit(X_train, y_train)

# Score Ridge on the held-out rows.
y_pred_ridge = ridge_model.predict(X_test)
r2_model_ridge = r2_score(y_test, y_pred_ridge)
print('R2 score of Ridge model, alpha 1.0 is: ', r2_model_ridge)

# The best alpha is not known up front. Ways to choose it instead of
# hard-coding a value:
#   - a manual sweep over candidate alphas
#   - GridSearchCV
#   - RandomizedSearchCV
#   - RidgeCV, which cross-validates over a list of alphas directly

R2 score of Ridge model, alpha 1.0 is:  0.5796111714164925


### Lasso Regression

In [39]:
# Lasso adds an L1 penalty, which can shrink coefficients to exactly zero.
# `alpha` sets the penalty strength and is a hyperparameter: the value 1 used
# here is an arbitrary starting point, not a tuned choice.
lasso_model = Lasso(alpha=1).fit(X_train, y_train)

# Score Lasso on the held-out rows.
y_pred_lasso = lasso_model.predict(X_test)
r2_model_lasso = r2_score(y_test, y_pred_lasso)
print('R2 score of Lasso model, alpha 1.0 is: ', r2_model_lasso)

R2 score of Lasso model, alpha 1.0 is:  0.48789271561192615


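Lasso regression appears to be selecting certain features: it shrinks some coefficients to exactly zero. However, with alpha=1 it produces the lowest R2 score of the three models (about 0.49, versus about 0.59 for standard linear regression and 0.58 for Ridge), which suggests alpha should be tuned rather than hard-coded.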

### Cleaning Up

In [None]:
# List the features whose Lasso coefficient was shrunk to exactly zero.
# A zero coef_ means the fitted model ignores that column at this alpha;
# it is a statement about this fit, not proof the feature is useless in general.
zero_coef_mask = lasso_model.coef_ == 0
bad_features = np.flatnonzero(zero_coef_mask)  # positional indices into X.columns
print('bad features: ', list(X.columns[bad_features]))

bad features:  ['indus', 'chas', 'nox']
