In [None]:
# 1. In this situation the OLS model doesn't even have a unique solution. A better model would be ridge regression or lasso
# 2. c)
# 3. b)
# 4. In this case, the model underfit on both the training and test datasets. The model suffers from a high bias.

In [4]:
import boto3
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from statistics import mode

## Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'ryan-greiner-bucket'
bucket = s3.Bucket(bucket_name)

## Defining the file to be read from s3 bucket
file_key = 'CarPrice_Assignment.csv'
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading CSV file
price = pd.read_csv(file_content_stream)
price.head()



Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [5]:
X = price[['wheelbase', 'enginesize', 'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg']]
Y = price['price']
coef = list()

for i in range(0, 1000):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)
    lasso_cv = LassoCV(alphas = [0.001, 0.01, 0.1, 1, 1, 100], normalize = True, cv = 5).fit(X_train, Y_train)
    cv_lambda = lasso_cv.alpha_
    lasso_md = Lasso(alpha = cv_lambda, normalize = True).fit(X_train, Y_train)
    coef.append(lasso_md.coef_)
    
coef_data = pd.DataFrame(coef)
coef_data.head()

Unnamed: 0,0,1,2,3,4,5,6
0,183.024435,118.098142,280.63766,52.119731,2.149685,-44.115478,0.0
1,164.047467,115.063867,299.56141,44.521936,2.103817,-150.295941,16.30786
2,198.468642,112.041599,296.408019,51.392318,2.023053,-73.114229,0.0
3,159.842472,115.126306,252.380683,50.187867,1.699377,-60.973681,-14.72074
4,230.670064,83.14028,263.470257,63.895165,1.815626,-127.952317,62.282571


In [6]:
print('wheelbase', sum(coef_data[0] == 0.0))
print('engine size', sum(coef_data[1] == 0.0))
print('compression ratio', sum(coef_data[2] == 0.0))
print('horsepower', sum(coef_data[3] == 0.0))
print('peakrpm', sum(coef_data[4] == 0.0))
print('citympg', sum(coef_data[5] == 0.0))
print('highwaympg', sum(coef_data[6] == 0.0))

wheelbase 0
engine size 0
compression ratio 0
horsepower 0
peakrpm 0
citympg 14
highwaympg 368


In [7]:
X_train = X_train.drop(columns = ['highwaympg'], axis = 1)
X_test = X_test.drop(columns = ['highwaympg'], axis = 1)

def l2_normalization(X):
    x_mean = np.mean(X)
    l2 = np.sqrt(sum(X**2))
    return (X - x_mean) / l2

X_train = X_train.apply(l2_normalization, axis = 1)
X_test = X_test.apply(l2_normalization, axis = 1)

lm_md = LinearRegression().fit(X_train, Y_train)
lm_pred = lm_md.predict(X_test)
mse1 = np.mean(np.power(Y_test - lm_pred, 2))
print('MSE of the linear model is', mse1)

MSE of the linear model is 22320571.558828007


In [8]:
lambda_list = list()
for i in range(100):
    ridge_cv = RidgeCV(alphas = [0.001, 0.01, 0.1, 1, 1, 100], normalize = True, cv = 5).fit(X_train, Y_train)
    lambda_list.append(ridge_cv.alpha_)
    
cv_lambda = mode(lambda_list)

ridge_md = Ridge(alpha = cv_lambda).fit(X_train, Y_train)
ridge_pred = ridge_md.predict(X_test)

mse2 = np.mean(np.power(Y_test - ridge_pred, 2))

print('The ideal value for lambda is', cv_lambda)
print('The MSE of the ridge model is', mse2)

The ideal value for lambda is 0.01
The MSE of the ridge model is 32026691.16724264


In [None]:
# The ridge regression model is best because it gives a much lower MSE than the linear model. 

In [10]:
lambda_list

[0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01,
 0.01]