In [1]:
from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import pandas as pd

In [2]:
# Load the real-estate evaluation CSV from its remote URL and preview the first rows
DATA_URL = 'https://static.bc-edx.com/ai/ail-v-1-0/m12/lesson_2/datasets/real-estate-evaluation.csv'
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,2012.916667,32.0,84.87882,10,24.98298,121.54024,37.9
1,2012.916667,19.5,306.5947,9,24.98034,121.53951,42.2
2,2013.583333,13.3,561.9845,5,24.98746,121.54391,47.3
3,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,2012.833333,5.0,390.5684,5,24.97937,121.54245,43.1


In [3]:
# Pull the target column out as y; every remaining column becomes a feature in X
target_column = 'Y house price of unit area'
y = df[target_column]
X = df.drop(columns=target_column)

In [4]:
# Inspect the dimensions of the feature matrix as (rows, columns)
X.shape

(414, 6)

### Perform ridge regression

In [5]:
# Hold out a test set; the fixed random_state keeps the split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [6]:
# Learn the scaling parameters from the training split only, then standardize that split
scaler = StandardScaler().fit(X_train)
X_train_transformed = scaler.transform(X_train)

In [7]:
# Create and train a ridge regression model on the standardized training features.
# alpha=1 is the L2 penalty strength passed to Ridge.
model = Ridge(alpha=1)
model.fit(X_train_transformed, y_train)

In [8]:
# Scale the testing data with the scaler already fitted on the training split
# (transform only, no refit) and create predictions with the alpha=1 ridge model
X_test_transformed = scaler.transform(X_test)
y_predicted = model.predict(X_test_transformed)

In [9]:
# Mean squared error of the alpha=1 ridge predictions on the held-out test set
mean_squared_error(y_true=y_test, y_pred=y_predicted)

66.57207822876354

In [10]:
# Let RidgeCV pick the regularization strength by cross-validation on the training split
from sklearn.linear_model import RidgeCV

# Candidate alphas span several orders of magnitude.
# NOTE(review): the selected value reported below is the largest candidate (10),
# so the optimum may lie beyond this grid - consider adding larger alphas.
alpha_candidates = [0.001, 0.01, 0.1, 1, 10]
model_cv = RidgeCV(alphas=alpha_candidates).fit(X_train_transformed, y_train)

In [11]:
# Identify the optimized alpha value selected by RidgeCV from the candidate list
model_cv.alpha_

10.0

In [13]:
# Refit ridge with the alpha chosen by RidgeCV instead of a hard-coded copy of it,
# so this cell stays in sync if the data or the candidate grid changes.
# With the current grid the selected value is 10.0.
model2 = Ridge(alpha=model_cv.alpha_)
model2.fit(X_train_transformed, y_train)

# Predict on the standardized test features and report the test-set MSE
y_predicted2 = model2.predict(X_test_transformed)
mean_squared_error(y_test, y_predicted2)

65.88993532237089

In [19]:
# Coefficients of the ridge model refit with the cross-validated alpha (model2);
# they are on the standardized-feature scale because the model was fit on scaled data
model2.coef_

array([ 1.58698744, -2.91315096, -5.60283009,  3.3772345 ,  2.79500169,
       -0.22635251])

### Compare performance with a linear regression model

In [14]:
# Unregularized baseline: fit ordinary least squares on the same standardized training
# features, predict on the standardized test features, and report the test-set MSE
from sklearn.linear_model import LinearRegression

lr_model = LinearRegression().fit(X_train_transformed, y_train)
y_predicted_lr = lr_model.predict(X_test_transformed)
mean_squared_error(y_true=y_test, y_pred=y_predicted_lr)

66.67127577951871

In [20]:
# Coefficients of the unregularized linear regression, for comparison with the ridge coefficients
lr_model.coef_

array([ 1.66479781, -3.01256969, -6.15522388,  3.36402546,  2.74315411,
       -0.64792098])

### Lasso regression

In [16]:
from sklearn.linear_model import Lasso

In [17]:
# Create and train a lasso regression model on the standardized training features.
# alpha=1 is the L1 penalty strength passed to Lasso.
lasso_model = Lasso(alpha=1)
lasso_model.fit(X_train_transformed, y_train)

In [18]:
# Get the model coefficients; in the recorded output the last one is exactly 0,
# i.e. lasso dropped that feature from the fit
lasso_model.coef_

array([ 0.57862013, -1.95842663, -5.25287755,  2.78975492,  2.2176361 ,
        0.        ])

In [22]:
# Create predictions with the lasso model on the already-standardized test features
y_predicted_lasso = lasso_model.predict(X_test_transformed)

### Assess the lasso regression MSE and compare to ridge regression

In [23]:
# Mean squared error of the lasso predictions on the held-out test set
mean_squared_error(y_true=y_test, y_pred=y_predicted_lasso)

67.35725559008489

In [26]:
# Print the test-set MSE of every fitted model side by side, one line per model
for label, predictions in (
    ('lasso:', y_predicted_lasso),
    ('lr:', y_predicted_lr),
    ('ridge 10:', y_predicted2),
    ('ridge 1:', y_predicted),
):
    print(label, mean_squared_error(y_test, predictions))

lasso: 67.35725559008489
lr: 66.67127577951871
ridge 10: 65.88993532237089
ridge 1: 66.57207822876354
