In [3]:
!pip install -U kaleido
import seaborn as sns
import pandas as pd
sns.set(font_scale=1.5)
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import plotly.graph_objects as go
import plotly.express as px
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.utils import shuffle
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector


Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl.metadata (15 kB)
Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kaleido
Successfully installed kaleido-0.2.1


In [4]:
#np.random.seed(105)
np.random.seed(319)

In [5]:
#load the fuel dataset, and drop any rows that have missing data
vehicle_data = sns.load_dataset('mpg').dropna()
vehicle_data  = vehicle_data.rename(columns = {"horsepower": "hp"})
px.scatter(vehicle_data, x = "hp", y = "mpg")

In [6]:
vehicle_data

Unnamed: 0,mpg,cylinders,displacement,hp,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger


In [7]:
vehicle_data[["hp", "weight"]]

Unnamed: 0,hp,weight
0,130.0,3504
1,165.0,3693
2,150.0,3436
3,150.0,3433
4,140.0,3449
...,...,...
393,86.0,2790
394,52.0,2130
395,84.0,2295
396,79.0,2625


In [8]:
poly_transform = PolynomialFeatures(degree = 2, include_bias = False)
vehicle_data_with_squared_features = \
             pd.DataFrame(poly_transform.fit_transform(vehicle_data[["hp", "weight"]]),
             columns = poly_transform.get_feature_names_out())

In [9]:
vehicle_data_with_squared_features

Unnamed: 0,hp,weight,hp^2,hp weight,weight^2
0,130.0,3504.0,16900.0,455520.0,12278016.0
1,165.0,3693.0,27225.0,609345.0,13638249.0
2,150.0,3436.0,22500.0,515400.0,11806096.0
3,150.0,3433.0,22500.0,514950.0,11785489.0
4,140.0,3449.0,19600.0,482860.0,11895601.0
...,...,...,...,...,...
387,86.0,2790.0,7396.0,239940.0,7784100.0
388,52.0,2130.0,2704.0,110760.0,4536900.0
389,84.0,2295.0,7056.0,192780.0,5267025.0
390,79.0,2625.0,6241.0,207375.0,6890625.0


In [10]:
poly_transform = PolynomialFeatures(degree = 3, include_bias = False)
vehicle_data_with_cubic_features = \
             pd.DataFrame(poly_transform.fit_transform(vehicle_data[["hp", "weight"]]),
             columns = poly_transform.get_feature_names_out())

In [11]:
vehicle_data_with_cubic_features

Unnamed: 0,hp,weight,hp^2,hp weight,weight^2,hp^3,hp^2 weight,hp weight^2,weight^3
0,130.0,3504.0,16900.0,455520.0,12278016.0,2197000.0,59217600.0,1.596142e+09,4.302217e+10
1,165.0,3693.0,27225.0,609345.0,13638249.0,4492125.0,100541925.0,2.250311e+09,5.036605e+10
2,150.0,3436.0,22500.0,515400.0,11806096.0,3375000.0,77310000.0,1.770914e+09,4.056575e+10
3,150.0,3433.0,22500.0,514950.0,11785489.0,3375000.0,77242500.0,1.767823e+09,4.045958e+10
4,140.0,3449.0,19600.0,482860.0,11895601.0,2744000.0,67600400.0,1.665384e+09,4.102793e+10
...,...,...,...,...,...,...,...,...,...
387,86.0,2790.0,7396.0,239940.0,7784100.0,636056.0,20634840.0,6.694326e+08,2.171764e+10
388,52.0,2130.0,2704.0,110760.0,4536900.0,140608.0,5759520.0,2.359188e+08,9.663597e+09
389,84.0,2295.0,7056.0,192780.0,5267025.0,592704.0,16193520.0,4.424301e+08,1.208782e+10
390,79.0,2625.0,6241.0,207375.0,6890625.0,493039.0,16382625.0,5.443594e+08,1.808789e+10


In [12]:
vehicle_data[["hp", "weight", "displacement"]]

Unnamed: 0,hp,weight,displacement
0,130.0,3504,307.0
1,165.0,3693,350.0
2,150.0,3436,318.0
3,150.0,3433,304.0
4,140.0,3449,302.0
...,...,...,...
393,86.0,2790,140.0
394,52.0,2130,97.0
395,84.0,2295,135.0
396,79.0,2625,120.0


In [13]:
poly_transform = PolynomialFeatures(degree = 2, include_bias = False)
vehicle_data_with_squared_features = \
             pd.DataFrame(poly_transform.fit_transform(vehicle_data[["hp", "weight", "displacement"]]),
             columns = poly_transform.get_feature_names_out())

In [14]:
vehicle_data_with_squared_features

Unnamed: 0,hp,weight,displacement,hp^2,hp weight,hp displacement,weight^2,weight displacement,displacement^2
0,130.0,3504.0,307.0,16900.0,455520.0,39910.0,12278016.0,1075728.0,94249.0
1,165.0,3693.0,350.0,27225.0,609345.0,57750.0,13638249.0,1292550.0,122500.0
2,150.0,3436.0,318.0,22500.0,515400.0,47700.0,11806096.0,1092648.0,101124.0
3,150.0,3433.0,304.0,22500.0,514950.0,45600.0,11785489.0,1043632.0,92416.0
4,140.0,3449.0,302.0,19600.0,482860.0,42280.0,11895601.0,1041598.0,91204.0
...,...,...,...,...,...,...,...,...,...
387,86.0,2790.0,140.0,7396.0,239940.0,12040.0,7784100.0,390600.0,19600.0
388,52.0,2130.0,97.0,2704.0,110760.0,5044.0,4536900.0,206610.0,9409.0
389,84.0,2295.0,135.0,7056.0,192780.0,11340.0,5267025.0,309825.0,18225.0
390,79.0,2625.0,120.0,6241.0,207375.0,9480.0,6890625.0,315000.0,14400.0


### Video 2: Forward Selection

In [15]:
# for reasons that will become clear later, we set up the training and dev indices as separate arrays first
all_indices = range(0, len(vehicle_data))
all_indices = shuffle(all_indices)
training_indices, dev_indices = np.split(all_indices, [320])
training_vehicle_data = vehicle_data.iloc[training_indices]
dev_vehicle_data = vehicle_data.iloc[dev_indices]

In [16]:
best_old_model = Pipeline([
    ('josh_transform', PolynomialFeatures(degree = 2, include_bias = False)),
    ('josh_regression', LinearRegression())
])
best_old_model.fit(training_vehicle_data[["hp"]], training_vehicle_data["mpg"])

In [17]:
mean_squared_error(best_old_model.predict(training_vehicle_data[["hp"]]), training_vehicle_data["mpg"])

18.948788936311274

In [18]:
mean_squared_error(best_old_model.predict(dev_vehicle_data[["hp"]]), dev_vehicle_data["mpg"]) #e1

19.453686826337034

In [19]:
numeric_features = ["cylinders", "displacement", "hp", "weight", "acceleration"]
massive_new_model = Pipeline([
    ('josh_transform', PolynomialFeatures(degree = 3, include_bias = False)),
    ('josh_regression', LinearRegression())
])
massive_new_model.fit(training_vehicle_data[numeric_features], training_vehicle_data["mpg"])

In [20]:
mean_squared_error(massive_new_model.predict(training_vehicle_data[numeric_features]), training_vehicle_data["mpg"])

10.151573882857129

In [21]:
mean_squared_error(massive_new_model.predict(dev_vehicle_data[numeric_features]), dev_vehicle_data["mpg"]) #e2

18.193708544421117

### Video 3: SequentialFeatureSelection

In [22]:
poly3 = PolynomialFeatures(degree = 3, include_bias = False)
all_degree_3_combinations = poly3.fit_transform(vehicle_data[["cylinders", "displacement", "hp",
                                                              "weight", "acceleration"]])
all_degree_3_combinations = pd.DataFrame(all_degree_3_combinations, columns = poly3.get_feature_names_out())

In [23]:
all_degree_3_combinations

Unnamed: 0,cylinders,displacement,hp,weight,acceleration,cylinders^2,cylinders displacement,cylinders hp,cylinders weight,cylinders acceleration,...,hp^3,hp^2 weight,hp^2 acceleration,hp weight^2,hp weight acceleration,hp acceleration^2,weight^3,weight^2 acceleration,weight acceleration^2,acceleration^3
0,8.0,307.0,130.0,3504.0,12.0,64.0,2456.0,1040.0,28032.0,96.0,...,2197000.0,59217600.0,202800.0,1.596142e+09,5466240.0,18720.00,4.302217e+10,147336192.0,504576.00,1728.000
1,8.0,350.0,165.0,3693.0,11.5,64.0,2800.0,1320.0,29544.0,92.0,...,4492125.0,100541925.0,313087.5,2.250311e+09,7007467.5,21821.25,5.036605e+10,156839863.5,488399.25,1520.875
2,8.0,318.0,150.0,3436.0,11.0,64.0,2544.0,1200.0,27488.0,88.0,...,3375000.0,77310000.0,247500.0,1.770914e+09,5669400.0,18150.00,4.056575e+10,129867056.0,415756.00,1331.000
3,8.0,304.0,150.0,3433.0,12.0,64.0,2432.0,1200.0,27464.0,96.0,...,3375000.0,77242500.0,270000.0,1.767823e+09,6179400.0,21600.00,4.045958e+10,141425868.0,494352.00,1728.000
4,8.0,302.0,140.0,3449.0,10.5,64.0,2416.0,1120.0,27592.0,84.0,...,2744000.0,67600400.0,205800.0,1.665384e+09,5070030.0,15435.00,4.102793e+10,124903810.5,380252.25,1157.625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
387,4.0,140.0,86.0,2790.0,15.6,16.0,560.0,344.0,11160.0,62.4,...,636056.0,20634840.0,115377.6,6.694326e+08,3743064.0,20928.96,2.171764e+10,121431960.0,678974.40,3796.416
388,4.0,97.0,52.0,2130.0,24.6,16.0,388.0,208.0,8520.0,98.4,...,140608.0,5759520.0,66518.4,2.359188e+08,2724696.0,31468.32,9.663597e+09,111607740.0,1288990.80,14886.936
389,4.0,135.0,84.0,2295.0,11.6,16.0,540.0,336.0,9180.0,46.4,...,592704.0,16193520.0,81849.6,4.424301e+08,2236248.0,11303.04,1.208782e+10,61097490.0,308815.20,1560.896
390,4.0,120.0,79.0,2625.0,18.6,16.0,480.0,316.0,10500.0,74.4,...,493039.0,16382625.0,116082.6,5.443594e+08,3857175.0,27330.84,1.808789e+10,128165625.0,908145.00,6434.856


In [24]:
#all_indices = range(0, len(vehicle_data))
#all_indices = shuffle(all_indices)
#training_indices, dev_indices = np.split(all_indices, [320])

In [25]:
feature_select = SequentialFeatureSelector(LinearRegression(),
                                           scoring='neg_mean_squared_error',
                                           cv=[[training_indices, dev_indices]],
                                           n_features_to_select = 4)
feature_select.fit(all_degree_3_combinations, vehicle_data["mpg"])

In [26]:
feature_select = SequentialFeatureSelector(estimator = LinearRegression(),
                                           scoring='neg_mean_squared_error',
                                           cv=[[training_indices, dev_indices]],
                                           n_features_to_select = 4)

best_four = pd.DataFrame(feature_select.fit_transform(all_degree_3_combinations, vehicle_data["mpg"]),
                         columns = feature_select.get_feature_names_out())

In [27]:
best_four
# the specific four parameters you observe will depend heavily on whatever training sample you happen to have split off from the full dataset

Unnamed: 0,hp,weight,hp weight,weight^2
0,130.0,3504.0,455520.0,12278016.0
1,165.0,3693.0,609345.0,13638249.0
2,150.0,3436.0,515400.0,11806096.0
3,150.0,3433.0,514950.0,11785489.0
4,140.0,3449.0,482860.0,11895601.0
...,...,...,...,...
387,86.0,2790.0,239940.0,7784100.0
388,52.0,2130.0,110760.0,4536900.0
389,84.0,2295.0,192780.0,5267025.0
390,79.0,2625.0,207375.0,6890625.0


In [28]:
len(vehicle_data)

392

In [29]:
best_four_sfs_model = LinearRegression()
best_four_sfs_model.fit(best_four.iloc[training_indices], vehicle_data.iloc[training_indices]["mpg"])

In [30]:
mean_squared_error(best_four_sfs_model.predict(best_four.iloc[training_indices]),
                                               vehicle_data.iloc[training_indices]["mpg"])

15.144620842922468

In [31]:
mean_squared_error(best_four_sfs_model.predict(best_four.iloc[dev_indices]),
                                               vehicle_data.iloc[dev_indices]["mpg"]) #e3

16.319391679212913

### Video 4: Regularization

### Regularization on a Simple Model with 5 Features

In [32]:
lm_model = LinearRegression()
lm_model.fit(training_vehicle_data[numeric_features], training_vehicle_data["mpg"])

In [33]:
param_df = pd.DataFrame([lm_model.coef_], columns = numeric_features)
param_df

Unnamed: 0,cylinders,displacement,hp,weight,acceleration
0,-0.56448,8.7e-05,-0.046525,-0.004874,-0.128084


In [34]:
lm_with_ridge_model = Ridge(alpha = 0.001)
lm_with_ridge_model.fit(training_vehicle_data[numeric_features], training_vehicle_data["mpg"])

In [35]:
param_df = pd.DataFrame([lm_with_ridge_model.coef_], columns = numeric_features)
param_df

Unnamed: 0,cylinders,displacement,hp,weight,acceleration
0,-0.564474,8.7e-05,-0.046525,-0.004874,-0.128084


In [36]:
lm_with_ridge_model = Ridge(alpha = 100)
lm_with_ridge_model.fit(training_vehicle_data[numeric_features], training_vehicle_data["mpg"])

In [37]:
param_df = pd.DataFrame([lm_with_ridge_model.coef_], columns = numeric_features)
param_df

Unnamed: 0,cylinders,displacement,hp,weight,acceleration
0,-0.262533,-0.004226,-0.043977,-0.005005,-0.110041


In [38]:
lm_with_ridge_model.coef_

array([-0.26253268, -0.00422633, -0.04397693, -0.00500508, -0.11004116])

In [39]:
def get_parameters_for_given_alpha(alpha):
    lm_with_ridge_model = Ridge(alpha = alpha)
    lm_with_ridge_model.fit(training_vehicle_data[numeric_features], training_vehicle_data["mpg"])
    return alpha, *lm_with_ridge_model.coef_

In [40]:
param_df = pd.DataFrame([get_parameters_for_given_alpha(alpha) for alpha in [0.01, 0.1, 1, 10, 100, 1000, 10000]],
                        columns = ["alpha", *numeric_features])
param_df["SoS"] = param_df["cylinders"]**2 + param_df["displacement"]**2 + param_df["hp"]**2 + \
                  param_df["weight"]**2 + param_df["acceleration"]**2
param_df

Unnamed: 0,alpha,cylinders,displacement,hp,weight,acceleration,SoS
0,0.01,-0.564416,8.6e-05,-0.046525,-0.004874,-0.128082,0.337158
1,0.1,-0.563834,7.8e-05,-0.046521,-0.004874,-0.128058,0.336496
2,1.0,-0.558085,-5e-06,-0.046481,-0.004876,-0.127822,0.329982
3,10.0,-0.506427,-0.000751,-0.046119,-0.004896,-0.125592,0.274393
4,100.0,-0.262533,-0.004226,-0.043977,-0.005005,-0.110041,0.083009
5,1000.0,-0.044487,-0.007014,-0.038392,-0.005229,-0.054869,0.00654
6,10000.0,-0.004729,-0.007734,-0.031588,-0.005424,-0.00744,0.001165


In [41]:
param_df = pd.DataFrame([get_parameters_for_given_alpha(alpha) for alpha in [0.01, 0.1, 1, 10, 100, 1000, 10000, 1e5, 1e6, 1e7]],
                        columns = ["alpha", *numeric_features])
param_df["SoS"] = param_df["cylinders"]**2 + param_df["displacement"]**2 + param_df["hp"]**2 + \
                  param_df["weight"]**2 + param_df["acceleration"]**2
param_df

Unnamed: 0,alpha,cylinders,displacement,hp,weight,acceleration,SoS
0,0.01,-0.564416,8.6e-05,-0.046525,-0.004874,-0.128082,0.337158
1,0.1,-0.563834,7.8e-05,-0.046521,-0.004874,-0.128058,0.336496
2,1.0,-0.558085,-5e-06,-0.046481,-0.004876,-0.127822,0.329982
3,10.0,-0.506427,-0.000751,-0.046119,-0.004896,-0.125592,0.274393
4,100.0,-0.262533,-0.004226,-0.043977,-0.005005,-0.110041,0.083009
5,1000.0,-0.044487,-0.007014,-0.038392,-0.005229,-0.054869,0.00654
6,10000.0,-0.004729,-0.007734,-0.031588,-0.005424,-0.00744,0.001165
7,100000.0,-0.000554,-0.009375,-0.017656,-0.005784,0.00043,0.000434
8,1000000.0,-0.000112,-0.005178,-0.004158,-0.006778,0.000259,9e-05
9,10000000.0,-2.6e-05,-0.001464,-0.000755,-0.007058,4.4e-05,5.3e-05


In [42]:
def get_parameters_and_training_MSE_for_given_alpha(alpha):
    lm_with_ridge_model = Ridge(alpha = alpha)
    lm_with_ridge_model.fit(training_vehicle_data[numeric_features], training_vehicle_data["mpg"])
    training_mse = mean_squared_error(lm_with_ridge_model.predict(training_vehicle_data[numeric_features]), training_vehicle_data["mpg"])
    return alpha, *lm_with_ridge_model.coef_, training_mse

In [43]:
param_df = pd.DataFrame([get_parameters_and_training_MSE_for_given_alpha(alpha) for alpha in [0.01, 0.1, 1, 10, 100, 1000, 10000, 1e5, 1e6, 1e7, 1e8]],
                        columns = ["alpha", *numeric_features, "Training MSE"])
param_df

fig = px.line(param_df, x = "alpha", y = "Training MSE", log_x = True, markers = True)
fig.write_image("MSE_vs_alpha_most_basic.png", scale = 3)
fig.show()

In [44]:
param_df = pd.DataFrame([get_parameters_and_training_MSE_for_given_alpha(alpha) for alpha in [0.01, 0.1, 1, 10, 100, 1000, 10000, 1e5, 1e6, 1e7, 1e8]],
                        columns = ["alpha", *numeric_features, "Training MSE"])
param_df["1/alpha"] = 1.0/param_df["alpha"]

fig = px.line(param_df, x = "1/alpha", y = "Training MSE", log_x = True, markers = True)
fig.write_image("MSE_vs_1_over_alpha_most_basic.png", scale = 3)
fig.show()

In [45]:
param_df = pd.DataFrame([get_parameters_and_training_MSE_for_given_alpha(alpha) for alpha in 10**np.linspace(-2, 8, 100)],
                        columns = ["alpha", *numeric_features, "Training MSE"])
param_df["1/alpha"] = 1.0/param_df["alpha"]

fig = px.line(param_df, x = "1/alpha", y = "Training MSE", log_x = True, markers = True)
fig.write_image("MSE_vs_1_over_alpha_smooth.png", scale = 3)
fig.show()

## Video 5: How Regularization Works (no code)

## Video 6: Scaling

In [46]:
all_degree_3_combinations

Unnamed: 0,cylinders,displacement,hp,weight,acceleration,cylinders^2,cylinders displacement,cylinders hp,cylinders weight,cylinders acceleration,...,hp^3,hp^2 weight,hp^2 acceleration,hp weight^2,hp weight acceleration,hp acceleration^2,weight^3,weight^2 acceleration,weight acceleration^2,acceleration^3
0,8.0,307.0,130.0,3504.0,12.0,64.0,2456.0,1040.0,28032.0,96.0,...,2197000.0,59217600.0,202800.0,1.596142e+09,5466240.0,18720.00,4.302217e+10,147336192.0,504576.00,1728.000
1,8.0,350.0,165.0,3693.0,11.5,64.0,2800.0,1320.0,29544.0,92.0,...,4492125.0,100541925.0,313087.5,2.250311e+09,7007467.5,21821.25,5.036605e+10,156839863.5,488399.25,1520.875
2,8.0,318.0,150.0,3436.0,11.0,64.0,2544.0,1200.0,27488.0,88.0,...,3375000.0,77310000.0,247500.0,1.770914e+09,5669400.0,18150.00,4.056575e+10,129867056.0,415756.00,1331.000
3,8.0,304.0,150.0,3433.0,12.0,64.0,2432.0,1200.0,27464.0,96.0,...,3375000.0,77242500.0,270000.0,1.767823e+09,6179400.0,21600.00,4.045958e+10,141425868.0,494352.00,1728.000
4,8.0,302.0,140.0,3449.0,10.5,64.0,2416.0,1120.0,27592.0,84.0,...,2744000.0,67600400.0,205800.0,1.665384e+09,5070030.0,15435.00,4.102793e+10,124903810.5,380252.25,1157.625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
387,4.0,140.0,86.0,2790.0,15.6,16.0,560.0,344.0,11160.0,62.4,...,636056.0,20634840.0,115377.6,6.694326e+08,3743064.0,20928.96,2.171764e+10,121431960.0,678974.40,3796.416
388,4.0,97.0,52.0,2130.0,24.6,16.0,388.0,208.0,8520.0,98.4,...,140608.0,5759520.0,66518.4,2.359188e+08,2724696.0,31468.32,9.663597e+09,111607740.0,1288990.80,14886.936
389,4.0,135.0,84.0,2295.0,11.6,16.0,540.0,336.0,9180.0,46.4,...,592704.0,16193520.0,81849.6,4.424301e+08,2236248.0,11303.04,1.208782e+10,61097490.0,308815.20,1560.896
390,4.0,120.0,79.0,2625.0,18.6,16.0,480.0,316.0,10500.0,74.4,...,493039.0,16382625.0,116082.6,5.443594e+08,3857175.0,27330.84,1.808789e+10,128165625.0,908145.00,6434.856


In [47]:
ss = StandardScaler()
rescaled_df = pd.DataFrame(ss.fit_transform(all_degree_3_combinations),
                           columns = ss.get_feature_names_out())

In [48]:
rescaled_df

Unnamed: 0,cylinders,displacement,hp,weight,acceleration,cylinders^2,cylinders displacement,cylinders hp,cylinders weight,cylinders acceleration,...,hp^3,hp^2 weight,hp^2 acceleration,hp weight^2,hp weight acceleration,hp acceleration^2,weight^3,weight^2 acceleration,weight acceleration^2,acceleration^3
0,1.483947,1.077290,0.664133,0.620540,-1.285258,1.543439,1.225979,0.985504,1.023186,0.609812,...,0.253777,0.334709,0.255559,0.375014,0.212022,-0.777902,0.352053,0.065767,-0.781142,-1.063297
1,1.483947,1.488732,1.574594,0.843334,-1.466724,1.543439,1.570844,1.653412,1.171347,0.426793,...,1.348294,1.224887,1.261545,0.970575,0.803569,-0.305269,0.613777,0.196284,-0.841789,-1.155608
2,1.483947,1.182542,1.184397,0.540382,-1.648189,1.543439,1.314200,1.367166,0.969880,0.243773,...,0.815550,0.724442,0.663290,0.534128,0.289998,-0.864771,0.264510,-0.174141,-1.114132,-1.240232
3,1.483947,1.048584,1.184397,0.536845,-1.285258,1.543439,1.201918,1.367166,0.967528,0.609812,...,0.815550,0.722988,0.868523,0.531314,0.485744,-0.338988,0.260727,-0.015401,-0.819472,-1.063297
4,1.483947,1.029447,0.924265,0.555706,-1.829655,1.543439,1.185878,1.176335,0.980071,0.060754,...,0.514634,0.515285,0.282924,0.438053,0.059951,-1.278539,0.280982,-0.242303,-1.247238,-1.317502
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
387,-0.864014,-0.520637,-0.480448,-0.221125,0.021294,-0.834466,-0.674794,-0.674723,-0.630102,-0.927552,...,-0.490619,-0.496412,-0.541863,-0.468670,-0.449359,-0.441255,-0.407205,-0.289983,-0.127314,-0.141444
388,-0.864014,-0.932079,-1.364896,-0.999134,3.287676,-0.834466,-0.847227,-0.999135,-0.888796,0.719624,...,-0.726892,-0.816845,-0.987532,-0.863344,-0.840224,1.164952,-0.836791,-0.424902,2.159666,4.801385
389,-0.864014,-0.568479,-0.532474,-0.804632,-1.430430,-0.834466,-0.694844,-0.693806,-0.824123,-1.659631,...,-0.511293,-0.592083,-0.847689,-0.675335,-1.027698,-1.908253,-0.750396,-1.118573,-1.515059,-1.137772
390,-0.864014,-0.712005,-0.662540,-0.415627,1.110088,-0.834466,-0.754995,-0.741514,-0.694776,-0.378493,...,-0.558822,-0.588010,-0.535433,-0.582537,-0.405562,0.534397,-0.536563,-0.197507,0.731857,1.034458


In [49]:
np.mean(rescaled_df)

2.1092177684900933e-16

In [50]:
np.var(rescaled_df)


The behavior of DataFrame.var with axis=None is deprecated, in a future version this will reduce over both axes and return a scalar. To retain the old behavior, pass axis=0 (or do not pass axis)



Unnamed: 0,0
cylinders,1.0
displacement,1.0
hp,1.0
weight,1.0
acceleration,1.0
cylinders^2,1.0
cylinders displacement,1.0
cylinders hp,1.0
cylinders weight,1.0
cylinders acceleration,1.0


In [51]:
pd.DataFrame(ss.inverse_transform(rescaled_df),
             columns = ss.get_feature_names_out())

Unnamed: 0,cylinders,displacement,hp,weight,acceleration,cylinders^2,cylinders displacement,cylinders hp,cylinders weight,cylinders acceleration,...,hp^3,hp^2 weight,hp^2 acceleration,hp weight^2,hp weight acceleration,hp acceleration^2,weight^3,weight^2 acceleration,weight acceleration^2,acceleration^3
0,8.0,307.0,130.0,3504.0,12.0,64.0,2456.0,1040.0,28032.0,96.0,...,2197000.0,59217600.0,202800.0,1.596142e+09,5466240.0,18720.00,4.302217e+10,147336192.0,504576.00,1728.000
1,8.0,350.0,165.0,3693.0,11.5,64.0,2800.0,1320.0,29544.0,92.0,...,4492125.0,100541925.0,313087.5,2.250311e+09,7007467.5,21821.25,5.036605e+10,156839863.5,488399.25,1520.875
2,8.0,318.0,150.0,3436.0,11.0,64.0,2544.0,1200.0,27488.0,88.0,...,3375000.0,77310000.0,247500.0,1.770914e+09,5669400.0,18150.00,4.056575e+10,129867056.0,415756.00,1331.000
3,8.0,304.0,150.0,3433.0,12.0,64.0,2432.0,1200.0,27464.0,96.0,...,3375000.0,77242500.0,270000.0,1.767823e+09,6179400.0,21600.00,4.045958e+10,141425868.0,494352.00,1728.000
4,8.0,302.0,140.0,3449.0,10.5,64.0,2416.0,1120.0,27592.0,84.0,...,2744000.0,67600400.0,205800.0,1.665384e+09,5070030.0,15435.00,4.102793e+10,124903810.5,380252.25,1157.625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
387,4.0,140.0,86.0,2790.0,15.6,16.0,560.0,344.0,11160.0,62.4,...,636056.0,20634840.0,115377.6,6.694326e+08,3743064.0,20928.96,2.171764e+10,121431960.0,678974.40,3796.416
388,4.0,97.0,52.0,2130.0,24.6,16.0,388.0,208.0,8520.0,98.4,...,140608.0,5759520.0,66518.4,2.359188e+08,2724696.0,31468.32,9.663597e+09,111607740.0,1288990.80,14886.936
389,4.0,135.0,84.0,2295.0,11.6,16.0,540.0,336.0,9180.0,46.4,...,592704.0,16193520.0,81849.6,4.424301e+08,2236248.0,11303.04,1.208782e+10,61097490.0,308815.20,1560.896
390,4.0,120.0,79.0,2625.0,18.6,16.0,480.0,316.0,10500.0,74.4,...,493039.0,16382625.0,116082.6,5.443594e+08,3857175.0,27330.84,1.808789e+10,128165625.0,908145.00,6434.856


In [52]:
scaled_ridge_model = Pipeline([
    ('josh_transform', PolynomialFeatures(degree = 3, include_bias = False)),
    ('scale', StandardScaler()),
    ('josh_regression', Ridge(alpha = 1))
])
scaled_ridge_model.fit(training_vehicle_data[numeric_features], training_vehicle_data["mpg"])

## Video 7: GridSearchCV

In [53]:
scaled_ridge_model = Pipeline([
    ('josh_transform', PolynomialFeatures(degree = 3, include_bias = False)),
    ('scale', StandardScaler()),
    ('josh_regression', Ridge(alpha = 1))
])
scaled_ridge_model.fit(training_vehicle_data[numeric_features], training_vehicle_data["mpg"])

In [54]:
def get_training_and_dev_MSE_for_model_with_alpha(alpha, training, dev):
    scaled_ridge_model = Pipeline([
        ('josh_transform', PolynomialFeatures(degree = 3, include_bias = False)),
        ('scale', StandardScaler()),
        ('josh_regression', Ridge(alpha = alpha))
    ])
    scaled_ridge_model.fit(training[numeric_features], training["mpg"])
    training_error = mean_squared_error(scaled_ridge_model.predict(training[numeric_features]), training["mpg"])
    validation_error = mean_squared_error(scaled_ridge_model.predict(dev[numeric_features]), dev["mpg"])
    return alpha, training_error, validation_error

In [55]:
get_training_and_dev_MSE_for_model_with_alpha(10, training_vehicle_data, dev_vehicle_data)

(10, 14.424024606452516, 15.637901365900639)

In [56]:
results = [get_training_and_dev_MSE_for_model_with_alpha(alpha, training_vehicle_data, dev_vehicle_data) \
             for alpha in 10**np.linspace(-5, 4, 100)]
results = pd.DataFrame(results, columns = ["alpha", "Training MSE", "Dev MSE"])

In [57]:
results["1/alpha"] = 1.0/results["alpha"]

fig = px.line(results, x = "1/alpha", y = ["Training MSE", "Dev MSE"], log_x = True)
fig.write_image("MSE_vs_1_over_alpha_for_regularized_55_parameter_model.png", scale = 3)
fig.show()

In [58]:
results

Unnamed: 0,alpha,Training MSE,Dev MSE,1/alpha
0,0.000010,10.161073,18.088539,100000.000000
1,0.000012,10.164086,18.067380,81113.083079
2,0.000015,10.167841,18.042118,65793.322466
3,0.000019,10.172459,18.012132,53366.992312
4,0.000023,10.178066,17.976840,43287.612811
...,...,...,...,...
95,4328.761281,21.999616,28.890391,0.000231
96,5336.699231,22.866702,30.072831,0.000187
97,6579.332247,23.935122,31.491170,0.000152
98,8111.308308,25.230633,33.172013,0.000123


In [59]:
results.sort_values("Dev MSE").iloc[0, :]

Unnamed: 0,47
alpha,0.187382
Training MSE,12.955621
Dev MSE,14.91936
1/alpha,5.336699


In [60]:
scaled_ridge_model = Pipeline([
    ('josh_transform', PolynomialFeatures(degree = 3, include_bias = False)),
    ('scale', StandardScaler()),
    ('josh_regression', Ridge())
])

parameters_to_try = {'josh_regression__alpha': 10**np.linspace(-5, 4, 100)}

from sklearn.model_selection import GridSearchCV
model_finder = GridSearchCV(estimator = scaled_ridge_model,
                               param_grid = parameters_to_try,
                               scoring = "neg_mean_squared_error",
                               cv=[[training_indices, dev_indices]])

In [61]:
model_finder.fit(vehicle_data[numeric_features], vehicle_data["mpg"])

In [62]:
best_model = model_finder.best_estimator_
best_model

In [63]:
print(f"Dev MSE for our best Ridge Regression model: {-model_finder.best_score_}") #e4

Dev MSE for our best Ridge Regression model: 14.919359682999515


In [64]:
mean_squared_error(best_model.predict(training_vehicle_data[numeric_features]), training_vehicle_data["mpg"])

13.117880075653806

In [65]:
best_model.named_steps["josh_regression"].coef_

array([-0.73530146,  1.75813672, -2.46936302, -2.25615871, -0.43855591,
       -5.8966427 , -8.78224878,  4.71180015, -1.67507671, -3.1155934 ,
       -6.52792997,  5.89534164,  0.09106638, -2.63718664,  0.19428043,
        0.80252557, -5.06081536, -2.96912185,  1.06313392, -0.90177626,
        0.9576364 , -1.31198808, -0.10363051,  1.62934063,  3.82843492,
        0.65387865, -3.94457103, -0.06559292,  1.55403677,  0.77701981,
        2.75919058,  6.64525538,  0.82720006,  0.68679851,  1.69426684,
        3.61354074, -5.78649891,  0.41620389,  6.60086005, -1.23490938,
        2.2620615 , 10.84849258,  1.4014152 ,  1.12620482, -7.35774836,
        0.71444529,  0.36462511, -6.947313  ,  1.25991107, -2.11921906,
       -0.38827504, -1.80387919, -4.41125974,  7.46613999, -0.3811871 ])

In [66]:
pd.DataFrame([best_model.named_steps["josh_regression"].coef_],
             columns = best_model.named_steps["josh_transform"].get_feature_names_out())

Unnamed: 0,cylinders,displacement,hp,weight,acceleration,cylinders^2,cylinders displacement,cylinders hp,cylinders weight,cylinders acceleration,...,hp^3,hp^2 weight,hp^2 acceleration,hp weight^2,hp weight acceleration,hp acceleration^2,weight^3,weight^2 acceleration,weight acceleration^2,acceleration^3
0,-0.735301,1.758137,-2.469363,-2.256159,-0.438556,-5.896643,-8.782249,4.7118,-1.675077,-3.115593,...,0.714445,0.364625,-6.947313,1.259911,-2.119219,-0.388275,-1.803879,-4.41126,7.46614,-0.381187


In [67]:
scaled_ridge_model = Pipeline([
    ('josh_transform', PolynomialFeatures(degree = 3, include_bias = False)),
    ('scale', StandardScaler()),
    ('josh_regression', Ridge())
])

parameters_to_try = {'josh_regression__alpha': [0.001, 1, 1000]}

from sklearn.model_selection import GridSearchCV
model_finder = GridSearchCV(estimator = scaled_ridge_model,
                               param_grid = parameters_to_try,
                               scoring = "neg_mean_squared_error",
                               cv=[[training_indices, dev_indices]])

model_finder.fit(vehicle_data[numeric_features], vehicle_data["mpg"])
model_finder.cv_results_

{'mean_fit_time': array([0.0095613 , 0.00885034, 0.00758672]),
 'std_fit_time': array([0., 0., 0.]),
 'mean_score_time': array([0.00329328, 0.0031383 , 0.00617647]),
 'std_score_time': array([0., 0., 0.]),
 'param_josh_regression__alpha': masked_array(data=[0.001, 1.0, 1000.0],
              mask=[False, False, False],
        fill_value=1e+20),
 'params': [{'josh_regression__alpha': 0.001},
  {'josh_regression__alpha': 1},
  {'josh_regression__alpha': 1000}],
 'split0_test_score': array([-16.37659428, -15.06212874, -24.20924068]),
 'mean_test_score': array([-16.37659428, -15.06212874, -24.20924068]),
 'std_test_score': array([0., 0., 0.]),
 'rank_test_score': array([2, 1, 3], dtype=int32)}

In [68]:
scaled_ridge_model = Pipeline([
    ('josh_transform', PolynomialFeatures(degree = 3, include_bias = False)),
    ('scale', StandardScaler()),
    ('josh_regression', Ridge())
])
vhd_shuffled = shuffle(vehicle_data.copy())

parameters_to_try = {'josh_regression__alpha': [0.001, 1, 1000]}

from sklearn.model_selection import GridSearchCV
model_finder = GridSearchCV(estimator = scaled_ridge_model,
                               param_grid = parameters_to_try,
                               scoring = "neg_mean_squared_error",
                               cv=len(vhd_shuffled))

model_finder.fit(vhd_shuffled[numeric_features], vhd_shuffled["mpg"])
model_finder.cv_results_

{'mean_fit_time': array([0.01614952, 0.02755315, 0.01275449]),
 'std_fit_time': array([0.01546922, 0.01538167, 0.00683765]),
 'mean_score_time': array([0.00481343, 0.00730493, 0.00356616]),
 'std_score_time': array([0.0047313 , 0.00579241, 0.0015338 ]),
 'param_josh_regression__alpha': masked_array(data=[0.001, 1.0, 1000.0],
              mask=[False, False, False],
        fill_value=1e+20),
 'params': [{'josh_regression__alpha': 0.001},
  {'josh_regression__alpha': 1},
  {'josh_regression__alpha': 1000}],
 'split0_test_score': array([-7.30262227, -3.28574471, -6.18145941]),
 'split1_test_score': array([-5.92595161, -0.36275215, -0.02292337]),
 'split2_test_score': array([-0.3818171 , -0.64345717, -0.03742495]),
 'split3_test_score': array([-27.57295791,  -7.25582059, -24.49467345]),
 'split4_test_score': array([-41.83213724, -32.94134559, -31.8728667 ]),
 'split5_test_score': array([-67.01211045,  -1.75862386,  -5.98732606]),
 'split6_test_score': array([-1.69927733, -2.73172956, -6.

## Video 8: Lasso

In [69]:
# with lm, MSE is around 31, but with lasso it's much lower. convergence issues cause lasso to get lucky and quit early before overfitting
lm = Pipeline([
    ('josh_transform', PolynomialFeatures(degree = 3, include_bias = False)),
    ('scale', StandardScaler()),
    ('josh_regression', LinearRegression())
])
lm.fit(training_vehicle_data[numeric_features], training_vehicle_data["mpg"])
mean_squared_error(lm.predict(dev_vehicle_data[numeric_features]), dev_vehicle_data["mpg"])

18.193679879058053

In [70]:
from sklearn.linear_model import Lasso

scaled_lasso_model = Pipeline([
    ('josh_transform', PolynomialFeatures(degree = 3, include_bias = False)),
    ('scale', StandardScaler()),
    ('josh_lasso', Lasso())
])

parameters_to_try = {'josh_lasso__alpha': 10**np.linspace(-4, 4, 100)}

model_finder = GridSearchCV(estimator = scaled_lasso_model,
                               param_grid = parameters_to_try,
                               scoring = "neg_mean_squared_error",
                               cv=[[training_indices, dev_indices]])
model_finder.fit(vehicle_data[numeric_features], vehicle_data["mpg"])


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.063e+03, tolerance: 1.861e+00


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.052e+03, tolerance: 1.861e+00


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.040e+03, tolerance: 1.861e+00


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.025e+03, tolerance: 1.861e+00


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.008e+03, tolerance: 1.861e+00


Obje

In [71]:
model_finder.best_estimator_.named_steps['josh_lasso'].coef_

array([-7.95206806, -2.70765972, -4.2878518 , -2.99754917, -2.84327527,
       -2.42622551, -1.33830876,  8.37520517, -0.        ,  2.39565192,
       -3.23452763,  4.43948954,  0.        ,  0.28356743, -0.57049576,
       -0.        , -4.26223813, -1.05313044,  0.17398078,  0.        ,
       -3.82545684, -1.24879275,  2.61529324,  1.76525906,  3.49628931,
       -0.50881667, -0.90181507,  1.22975842,  1.60737703,  0.        ,
        1.7836755 ,  3.94772021, -2.71025222, -0.03789689, -0.        ,
       -0.        , -1.31634806,  3.38918265,  3.41432587, -1.15342026,
        0.6154467 ,  4.69992345, -1.40914613, -0.68821243, -5.58772732,
       -0.8319492 , -0.        , -3.71128819,  1.43485194, -0.        ,
       -0.64370142, -0.32372119, -1.64003059,  5.19216381,  0.41254236])

In [72]:
best_model = model_finder.best_estimator_

In [73]:
lasso_weights = pd.DataFrame([best_model.named_steps["josh_lasso"].coef_],
             columns = best_model.named_steps["josh_transform"].get_feature_names_out())

### K-Fold Cross Validation

In [75]:
scaled_ridge_model = Pipeline([
    ('josh_transform', PolynomialFeatures(degree = 3, include_bias = False)),
    ('scale', StandardScaler()),
    ('josh_regression', Ridge())
])

parameters_to_try = {'josh_regression__alpha': [0.001, 1, 1000]}

from sklearn.model_selection import GridSearchCV

model_finder = GridSearchCV(estimator = scaled_ridge_model,
                               param_grid = parameters_to_try,
                               scoring = "neg_mean_squared_error",
                               cv = 5)

model_finder.fit(vehicle_data[numeric_features], vehicle_data["mpg"])
model_finder.cv_results_

{'mean_fit_time': array([0.02976642, 0.02654524, 0.02941175]),
 'std_fit_time': array([0.00590685, 0.00546999, 0.0135911 ]),
 'mean_score_time': array([0.00383658, 0.00856967, 0.00432076]),
 'std_score_time': array([0.00024806, 0.0042218 , 0.00171799]),
 'param_josh_regression__alpha': masked_array(data=[0.001, 1.0, 1000.0],
              mask=[False, False, False],
        fill_value=1e+20),
 'params': [{'josh_regression__alpha': 0.001},
  {'josh_regression__alpha': 1},
  {'josh_regression__alpha': 1000}],
 'split0_test_score': array([-18.53896299, -12.92016987, -14.84924206]),
 'split1_test_score': array([-16.08488308, -17.91648827, -18.87742526]),
 'split2_test_score': array([-10.43515779,  -7.8811933 ,  -9.79665953]),
 'split3_test_score': array([-11.79765903, -12.4320557 , -20.2963363 ]),
 'split4_test_score': array([-48.55697894, -50.64783524, -66.99972503]),
 'mean_test_score': array([-21.08272837, -20.35954848, -26.16387764]),
 'std_test_score': array([14.04184401, 15.47415718,

In [76]:
model_finder

In [78]:
x_values = np.linspace(0, 100, 100) + np.random.random(100)

In [79]:
y_values = x_values * 13.9 + np.random.random(len(x_values)) * 30

In [80]:
x_values = x_values / 1000

In [81]:
x2 = (x_values + np.random.random(100) * 3 * np.mean(x_values)) * 1000

In [82]:
x3 = (x_values + np.random.random(100) * 3 * np.mean(x_values)) * 1000

In [83]:
df = pd.DataFrame({"x": x_values, "x2": x2, "x3": x3, "y": y_values})
df

Unnamed: 0,x,x2,x3,y
0,0.000327,19.582711,88.056726,11.853319
1,0.001657,130.548372,41.644841,35.241831
2,0.002100,32.509799,87.470641,46.252581
3,0.003218,139.535202,102.358495,49.120082
4,0.004253,107.077193,6.923975,80.662256
...,...,...,...,...
95,0.096894,177.603673,226.562081,1375.416579
96,0.097420,211.614510,103.492134,1357.601264
97,0.098266,214.862042,102.624505,1374.409900
98,0.099527,121.970362,209.389334,1399.740637


In [84]:
px.scatter(df, x = "x", y = "y")

In [85]:
rm = Ridge(alpha = 10)
rm.fit(df[["x", "x2", "x3"]], df["y"])
df["yhat"] = rm.predict(df[["x", "x2", "x3"]])

In [86]:
rm.coef_

array([57.60748071,  3.60393313,  3.34658706])

In [87]:
mean_squared_error(df["y"], df["yhat"])

79887.52427217203

In [88]:
px.scatter(df, x = "y", y = "yhat")

In [89]:
pm = Pipeline([
    ('scaler', StandardScaler()),
    ('josh_regression', Ridge(alpha = 10))
])
pm.fit(df[["x", "x2", "x3"]], df["y"])
df["yhat"] = pm.predict(df[["x", "x2", "x3"]])
mean_squared_error(df["y"], df["yhat"])

2114.317803105908

In [90]:
df

Unnamed: 0,x,x2,x3,y,yhat
0,0.000327,19.582711,88.056726,11.853319,57.006323
1,0.001657,130.548372,41.644841,35.241831,108.146006
2,0.002100,32.509799,87.470641,46.252581,84.318586
3,0.003218,139.535202,102.358495,49.120082,161.765057
4,0.004253,107.077193,6.923975,80.662256,108.550382
...,...,...,...,...,...
95,0.096894,177.603673,226.562081,1375.416579,1340.619403
96,0.097420,211.614510,103.492134,1357.601264,1302.843519
97,0.098266,214.862042,102.624505,1374.409900,1314.028717
98,0.099527,121.970362,209.389334,1399.740637,1333.216908


In [91]:
pm.named_steps['josh_regression'].coef_

array([341.14444113,  27.14001616,  28.26808806])

In [92]:
px.line(x = x_values, y = y_values)

In [93]:
vehicle_data_weight_in_gigatons = vehicle_data.copy()[["weight", "model_year"]]
vehicle_data_weight_in_gigatons["weight"] = vehicle_data_weight_in_gigatons["weight"] / 2204622621848.8
vehicle_data_weight_in_gigatons

Unnamed: 0,weight,model_year
0,1.589388e-09,70
1,1.675117e-09,70
2,1.558543e-09,70
3,1.557183e-09,70
4,1.564440e-09,70
...,...,...
393,1.265523e-09,82
394,9.661517e-10,82
395,1.040994e-09,82
396,1.190680e-09,82


In [94]:
vehicle_data_weight_in_mg = vehicle_data.copy()
vehicle_data_weight_in_mg["weight"] = vehicle_data_weight_in_mg["weight"] * 453592
vehicle_data_weight_in_mg

Unnamed: 0,mpg,cylinders,displacement,hp,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,1589386368,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,1675115256,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,1558542112,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,1557181336,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,1564438808,10.5,70,usa,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,1265521680,15.6,82,usa,ford mustang gl
394,44.0,4,97.0,52.0,966150960,24.6,82,europe,vw pickup
395,32.0,4,135.0,84.0,1040993640,11.6,82,usa,dodge rampage
396,28.0,4,120.0,79.0,1190679000,18.6,82,usa,ford ranger


In [95]:
training_vehicle_data = vehicle_data_weight_in_mg

In [96]:
param_df = pd.DataFrame([get_parameters_and_training_MSE_for_given_alpha(alpha) for alpha in [0.01, 0.1, 1, 10, 100, 1000, 10000, 1e5, 1e6, 1e7, 1e8]],
                        columns = ["alpha", *numeric_features, "Training MSE"])
param_df["1/alpha"] = 1.0/param_df["alpha"]


Ill-conditioned matrix (rcond=1.78339e-18): result may not be accurate.


Ill-conditioned matrix (rcond=1.7849e-18): result may not be accurate.


Ill-conditioned matrix (rcond=1.79992e-18): result may not be accurate.


Ill-conditioned matrix (rcond=1.9502e-18): result may not be accurate.


Ill-conditioned matrix (rcond=3.45467e-18): result may not be accurate.


Ill-conditioned matrix (rcond=1.85901e-17): result may not be accurate.



In [98]:
param_df

Unnamed: 0,alpha,cylinders,displacement,hp,weight,acceleration,Training MSE,1/alpha
0,0.01,-0.397891,-8.4e-05,-0.045257,-1.143523e-08,-0.029104,17.761396,100.0
1,0.1,-0.397556,-8.9e-05,-0.045255,-1.143542e-08,-0.029096,17.761396,10.0
2,1.0,-0.394239,-0.000138,-0.045239,-1.143732e-08,-0.029021,17.7614,1.0
3,10.0,-0.363871,-0.000588,-0.045088,-1.145485e-08,-0.028314,17.761713,0.1
4,100.0,-0.205469,-0.002931,-0.044218,-1.155228e-08,-0.023815,17.771526,0.01
5,1000.0,-0.038234,-0.005391,-0.042328,-1.171834e-08,-0.010978,17.79723,0.001
6,10000.0,-0.004159,-0.006445,-0.038297,-1.190035e-08,0.000548,17.808517,0.0001
7,100000.0,-0.000489,-0.008949,-0.022813,-1.261423e-08,0.001711,17.907946,1e-05
8,1000000.0,-0.000108,-0.005387,-0.005361,-1.50296e-08,0.000475,18.307474,1e-06
9,10000000.0,-1.5e-05,-0.000858,-0.00067,-1.658408e-08,6.2e-05,18.614347,1e-07


In [99]:
param_df = pd.DataFrame([get_parameters_and_training_MSE_for_given_alpha(alpha) for alpha in [0.01, 0.1, 1, 10, 100, 1000, 10000, 1e5, 1e6, 1e7, 1e8]],
                        columns = ["alpha", *numeric_features, "Training MSE"])
param_df["1/alpha"] = 1.0/param_df["alpha"]


Ill-conditioned matrix (rcond=1.78339e-18): result may not be accurate.


Ill-conditioned matrix (rcond=1.7849e-18): result may not be accurate.


Ill-conditioned matrix (rcond=1.79992e-18): result may not be accurate.


Ill-conditioned matrix (rcond=1.9502e-18): result may not be accurate.


Ill-conditioned matrix (rcond=3.45467e-18): result may not be accurate.


Ill-conditioned matrix (rcond=1.85901e-17): result may not be accurate.



In [100]:
param_df

Unnamed: 0,alpha,cylinders,displacement,hp,weight,acceleration,Training MSE,1/alpha
0,0.01,-0.397891,-8.4e-05,-0.045257,-1.143523e-08,-0.029104,17.761396,100.0
1,0.1,-0.397556,-8.9e-05,-0.045255,-1.143542e-08,-0.029096,17.761396,10.0
2,1.0,-0.394239,-0.000138,-0.045239,-1.143732e-08,-0.029021,17.7614,1.0
3,10.0,-0.363871,-0.000588,-0.045088,-1.145485e-08,-0.028314,17.761713,0.1
4,100.0,-0.205469,-0.002931,-0.044218,-1.155228e-08,-0.023815,17.771526,0.01
5,1000.0,-0.038234,-0.005391,-0.042328,-1.171834e-08,-0.010978,17.79723,0.001
6,10000.0,-0.004159,-0.006445,-0.038297,-1.190035e-08,0.000548,17.808517,0.0001
7,100000.0,-0.000489,-0.008949,-0.022813,-1.261423e-08,0.001711,17.907946,1e-05
8,1000000.0,-0.000108,-0.005387,-0.005361,-1.50296e-08,0.000475,18.307474,1e-06
9,10000000.0,-1.5e-05,-0.000858,-0.00067,-1.658408e-08,6.2e-05,18.614347,1e-07


In [101]:
numeric_features = ["cylinders", "displacement", "hp", "weight", "acceleration"]

all_55_features = Pipeline([
    ('josh_transform', PolynomialFeatures(degree = 3, include_bias = False)),
    ('josh_regression', LinearRegression())
])
all_55_features.fit(training_vehicle_data[numeric_features], training_vehicle_data["mpg"])

In [102]:
mean_squared_error(all_55_features.predict(dev_vehicle_data[numeric_features]), dev_vehicle_data["mpg"])

488.14191030632696

In [103]:
all_55_features_regularized = Pipeline([
    ('josh_transform', PolynomialFeatures(degree = 3, include_bias = False)),
    ('josh_regression', Ridge(alpha = 100))
])
all_55_features_regularized.fit(training_vehicle_data[numeric_features], training_vehicle_data["mpg"])


Ill-conditioned matrix (rcond=3.66449e-56): result may not be accurate.



In [104]:
def get_training_and_dev_MSE(model):
    training_mse = mean_squared_error(model.predict(vehicle_data.iloc[training_indices][numeric_features]),
                                      vehicle_data.iloc[training_indices]["mpg"])
    dev_mse = mean_squared_error(model.predict(vehicle_data.iloc[dev_indices][numeric_features]),
                                      vehicle_data.iloc[dev_indices]["mpg"])
    return training_mse, dev_mse

In [105]:
get_training_and_dev_MSE(all_55_features)

(506.2501042675676, 488.14191030632696)

In [106]:
get_training_and_dev_MSE(all_55_features_regularized)

(35174.55686311289, 31509.18307154614)

In [107]:
def get_training_and_dev_MSEs_for_alpha(alpha):
    all_55_features_regularized = Pipeline([
        ('josh_transform', PolynomialFeatures(degree = 1, include_bias = False)),
        ('scaler', StandardScaler()),
        ('josh_regression', Ridge(alpha = alpha))
    ])
    all_55_features_regularized.fit(vehicle_data.iloc[training_indices][numeric_features], vehicle_data.iloc[training_indices]["mpg"])
    tmse, dmse = get_training_and_dev_MSE(all_55_features_regularized)
    return alpha, tmse, dmse

In [108]:
get_training_and_dev_MSEs_for_alpha(10)

(10, 17.06993727654615, 21.66552005527506)

In [109]:
alphas = 10**np.linspace(4, -4, 100)
mses = pd.DataFrame(np.array([get_training_and_dev_MSEs_for_alpha(alpha) for alpha in alphas]),
                    columns = ["alpha", "train MSE", "dev MSE"])

In [110]:
mses

Unnamed: 0,alpha,train MSE,dev MSE
0,10000.000000,49.726388,62.591636
1,8302.175681,48.340678,60.957822
2,6892.612104,46.792420,59.127643
3,5722.367659,45.084600,57.102642
4,4750.810162,43.227937,54.893119
...,...,...,...
95,0.000210,17.010202,21.460268
96,0.000175,17.010202,21.460268
97,0.000145,17.010202,21.460267
98,0.000120,17.010202,21.460267


In [111]:
px.line(mses, x = "alpha", y = "train MSE", log_x = True)

In [112]:
mean_squared_error(all_55_features_regularized.predict(training_vehicle_data[numeric_features]), training_vehicle_data["mpg"])

11.257905912038932

In [113]:
mean_squared_error(all_55_features_regularized.predict(dev_vehicle_data[numeric_features]), dev_vehicle_data["mpg"])

31509.18307154614

In [114]:
pipelined_model3_scale_first = Pipeline([
    ('scale', StandardScaler()),
    ('josh_transform', PolynomialFeatures(degree = 3, include_bias = False)),
    ('josh_regression', Ridge(alpha = 1))
])
pipelined_model3_scale_first.fit(training_vehicle_data[numeric_features], training_vehicle_data["mpg"])

In [115]:
mean_squared_error(pipelined_model3_scale_first.predict(dev_vehicle_data[numeric_features]), dev_vehicle_data["mpg"])

2717.5535976536185

In [116]:
pipelined_model3_features_first = Pipeline([
    ('josh_transform', PolynomialFeatures(degree = 3, include_bias = False)),
    ('scale', StandardScaler()),
    ('josh_regression', Ridge(alpha = 1))
])
pipelined_model3_features_first.fit(training_vehicle_data[numeric_features], training_vehicle_data["mpg"])

In [117]:
mean_squared_error(pipelined_model3_features_first.predict(dev_vehicle_data[numeric_features]), dev_vehicle_data["mpg"])

23.980539938873033

In [118]:
from sklearn.preprocessing import FunctionTransformer
div100 = FunctionTransformer(lambda x : x, validate=True)

In [119]:
pipelined_model4_features_first = Pipeline([
    ('div100', div100),
    ('josh_transform', PolynomialFeatures(degree = 8, include_bias = False)),
    ('scale', StandardScaler()),
    ('josh_regression', Ridge(alpha = 1))
])
pipelined_model4_features_first.fit(training_vehicle_data[numeric_features], training_vehicle_data["mpg"])

In [120]:
mean_squared_error(pipelined_model4_features_first.predict(dev_vehicle_data[numeric_features]), dev_vehicle_data["mpg"])

117.50227649087907

### IGNORE JUST SANITY CHECKING

In [121]:
best_four

Unnamed: 0,hp,weight,hp weight,weight^2
0,130.0,3504.0,455520.0,12278016.0
1,165.0,3693.0,609345.0,13638249.0
2,150.0,3436.0,515400.0,11806096.0
3,150.0,3433.0,514950.0,11785489.0
4,140.0,3449.0,482860.0,11895601.0
...,...,...,...,...
387,86.0,2790.0,239940.0,7784100.0
388,52.0,2130.0,110760.0,4536900.0
389,84.0,2295.0,192780.0,5267025.0
390,79.0,2625.0,207375.0,6890625.0


In [122]:
vehicle_data_small = vehicle_data.copy().iloc[0:7, :]

In [123]:
vehicle_data_small["mpg"] = vehicle_data["hp"]

In [124]:
vehicle_data_small["cylinders"] = -vehicle_data["hp"]

In [125]:
vehicle_data_small.iloc[5, 3] = -vehicle_data_small.iloc[5, 3]
vehicle_data_small.iloc[6, 3] = -vehicle_data_small.iloc[6, 3]
vehicle_data_small

Unnamed: 0,mpg,cylinders,displacement,hp,weight,acceleration,model_year,origin,name
0,130.0,-130.0,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,165.0,-165.0,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,150.0,-150.0,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,150.0,-150.0,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,140.0,-140.0,302.0,140.0,3449,10.5,70,usa,ford torino
5,198.0,-198.0,429.0,-198.0,4341,10.0,70,usa,ford galaxie 500
6,220.0,-220.0,454.0,-220.0,4354,9.0,70,usa,chevrolet impala


In [126]:
cylmodel = LinearRegression().fit(vehicle_data_small.iloc[0:5][["cylinders"]], vehicle_data_small.iloc[0:5]["mpg"])

In [127]:
cylmodel.coef_

array([-1.])

In [128]:
hpmodel = LinearRegression().fit(vehicle_data_small.iloc[0:5][["hp"]], vehicle_data_small.iloc[0:5]["mpg"])

In [129]:
hpmodel.coef_

array([1.])

In [130]:
mean_squared_error(cylmodel.predict(vehicle_data_small.iloc[5:7][["cylinders"]]), vehicle_data_small.iloc[5:7]["mpg"])

8.077935669463161e-28

In [131]:
cylmodel.predict(vehicle_data_small.iloc[5:7][["cylinders"]])

array([198., 220.])

In [132]:
hpmodel.predict(vehicle_data_small.iloc[5:7][["hp"]])

array([-198., -220.])

In [133]:
mean_squared_error(hpmodel.predict(vehicle_data_small.iloc[5:7][["hp"]]), vehicle_data_small.iloc[5:7]["mpg"])

175207.9999999999

In [134]:
feature_select = SequentialFeatureSelector(LinearRegression(),
                                           scoring='r2',
                                           cv=[[[0, 1, 2, 3, 4], [5, 6]]],
                                           n_features_to_select = 1)
best_four = pd.DataFrame(feature_select.fit_transform(vehicle_data_small[['hp', 'cylinders']], vehicle_data_small["mpg"]),
                         columns = feature_select.get_feature_names_out())
best_four

Unnamed: 0,cylinders
0,-130.0
1,-165.0
2,-150.0
3,-150.0
4,-140.0
5,-198.0
6,-220.0


## Scale vs. No Scale Experiment

In [135]:
from sklearn.model_selection import GridSearchCV

In [136]:
model_with_no_scaling = Pipeline([
    ('josh_transform', PolynomialFeatures(degree = 3, include_bias = False)),
    ('scale', StandardScaler()),
    ('josh_regression', Ridge())
])
#model_with_no_scaling.fit(training_vehicle_data[numeric_features], training_vehicle_data["mpg"])

In [137]:
parameters = {'josh_regression__alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

lr_model_finder = GridSearchCV(model_with_no_scaling,
                               parameters,
                               scoring = "neg_mean_squared_error",
                               cv=[[training_indices, dev_indices]])

In [138]:
lr_model_finder.fit(vehicle_data[numeric_features],
                    vehicle_data["mpg"])

In [139]:
lr_model_finder.best_params_

{'josh_regression__alpha': 0.1}

In [140]:
print(f"MSE: {lr_model_finder.best_score_}")

MSE: -14.934514409527829


In [141]:
lr_model_finder.best_estimator_

# lr_model_finder.predict DON'T DO IT!!!

In [142]:
lr_model_finder.best_params_

{'josh_regression__alpha': 0.1}

In [None]:
mean_squared_error(lr_model_finder.predict(vehicle_data.iloc[training_indices][numeric_features]),
                vehicle_data.iloc[training_indices]["mpg"])

In [None]:
mean_squared_error(lr_model_finder.predict(vehicle_data.iloc[dev_indices][numeric_features]),
                vehicle_data.iloc[dev_indices]["mpg"])

In [None]:
model_with_no_scaling = Pipeline([
    ('josh_transform', PolynomialFeatures(degree = 3, include_bias = False)),
    ('scale', StandardScaler()),
    ('josh_regression', Ridge(alpha = 0.001))
])
model_with_no_scaling.fit(vehicle_data.iloc[training_indices][numeric_features], vehicle_data.iloc[training_indices]["mpg"])
mean_squared_error(model_with_no_scaling.predict(vehicle_data.iloc[dev_indices][numeric_features]),
                vehicle_data.iloc[dev_indices]["mpg"])

### Experiment to see if CV scores match for simpler case

In [None]:
parameters = {'fit_intercept': [False, True]}

lr_model_finder = GridSearchCV(LinearRegression(),
                               parameters,
                               scoring = "neg_mean_squared_error",
                               cv=[[training_indices, dev_indices]])

lr_model_finder.fit(vehicle_data[["hp"]], vehicle_data["mpg"])

In [None]:
lr_model_finder.best_params_

In [None]:
print(f"MSE: {lr_model_finder.best_score_}")

In [None]:
lr_model_finder.cv_results_

In [None]:
model = LinearRegression()
model.fit(vehicle_data.iloc[training_indices][["hp"]], vehicle_data.iloc[training_indices]["mpg"])

In [None]:
mean_squared_error(model.predict(vehicle_data.iloc[training_indices][["hp"]]), vehicle_data.iloc[training_indices]["mpg"])

In [None]:
mean_squared_error(model.predict(vehicle_data.iloc[dev_indices][["hp"]]), vehicle_data.iloc[dev_indices]["mpg"])

In [None]:
mean_squared_error(lr_model_finder.predict(vehicle_data.iloc[dev_indices][["hp"]]), vehicle_data.iloc[dev_indices]["mpg"])

##### HEY LOOK CROSS VALIDATION JUST LIKE WE EXPECT!

In [None]:
tiny_vehicle_data = vehicle_data.iloc[0:120]

In [None]:
tiny_indices = range(0, len(tiny_vehicle_data))
tiny_indices = shuffle(tiny_indices)
num_train = int(np.ceil(len(tiny_indices)*0.6))
tiny_training_indices, tiny_dev_indices = np.split(tiny_indices, [num_train])

In [None]:
parameters = {'fit_intercept': [False, True]}

lr_model_finder = GridSearchCV(LinearRegression(),
                               parameters,
                               scoring = "neg_mean_squared_error",
                               cv=[[tiny_training_indices, tiny_dev_indices]])

lr_model_finder.fit(tiny_vehicle_data[["hp"]], tiny_vehicle_data["mpg"])

In [None]:
lr_model_finder.cv_results_

In [None]:
mean_squared_error(lr_model_finder.predict(tiny_vehicle_data.iloc[tiny_dev_indices][["hp"]]), tiny_vehicle_data.iloc[tiny_dev_indices]["mpg"])

In [None]:
model = LinearRegression(fit_intercept = True)
model.fit(tiny_vehicle_data.iloc[tiny_training_indices][["hp"]], tiny_vehicle_data.iloc[tiny_training_indices]["mpg"])

In [None]:
mean_squared_error(model.predict(tiny_vehicle_data.iloc[tiny_training_indices][["hp"]]), tiny_vehicle_data.iloc[tiny_training_indices]["mpg"])

In [None]:
mean_squared_error(model.predict(tiny_vehicle_data.iloc[tiny_dev_indices][["hp"]]), tiny_vehicle_data.iloc[tiny_dev_indices]["mpg"])

### What wait what's the difference between two sections above?

In [None]:
parameters = {'fit_intercept': [False, True]}

lr_model_finder = GridSearchCV(LinearRegression(),
                               parameters,
                               scoring = "neg_mean_squared_error",
                               cv=[[training_indices, dev_indices]])

lr_model_finder.fit(vehicle_data[["hp"]], vehicle_data["mpg"])
print(f"best score native: {lr_model_finder.best_score_}")
print(f'training score recomputed: {mean_squared_error(lr_model_finder.predict(vehicle_data.iloc[training_indices][["hp"]]), vehicle_data.iloc[training_indices]["mpg"])}')
print(f'dev score recomputed: {mean_squared_error(lr_model_finder.predict(vehicle_data.iloc[dev_indices][["hp"]]), vehicle_data.iloc[dev_indices]["mpg"])}')

### An experiment in model variance

As a sidetrack while trying to verify something about cv score storage, I decided to see the MSE for models fit with the two styles, iloc vs with the data set aside.

What happened was that I saw the MSE you get on your dev set is enormously dependent on the split. This is no surprise since the model has enormous crazy huge variance.

In [None]:
pipelined_model = Pipeline([
    ('josh_transform', PolynomialFeatures(degree = 3, include_bias = False)),
    ('josh_regression', LinearRegression())
])
pipelined_model.fit(training_vehicle_data[numeric_features], training_vehicle_data["mpg"])

print(f'training MSE: {mean_squared_error(pipelined_model.predict(training_vehicle_data[numeric_features]), training_vehicle_data["mpg"])}')
print(f'dev MSE: {mean_squared_error(pipelined_model.predict(dev_vehicle_data[numeric_features]), dev_vehicle_data["mpg"])}')

In [None]:
pipelined_model = Pipeline([
    ('josh_transform', PolynomialFeatures(degree = 3, include_bias = False)),
    ('josh_regression', LinearRegression())
])
pipelined_model.fit(vehicle_data.iloc[training_indices][numeric_features], vehicle_data.iloc[training_indices]["mpg"])

print(f'training MSE: {mean_squared_error(pipelined_model.predict(vehicle_data.iloc[training_indices][numeric_features]), vehicle_data.iloc[training_indices]["mpg"])}')
print(f'dev MSE: {mean_squared_error(pipelined_model.predict(vehicle_data.iloc[dev_indices][numeric_features]), vehicle_data.iloc[dev_indices]["mpg"])}')

ahhh waht is happening??? how is MSE so much worse for the above thing which is EXACTLY THE SAME EXCEPT CHOSEN SAMPLES???

In [None]:
samples2 = range(0, len(vehicle_data))
samples2 = shuffle(samples2)
training_indices2, dev_indices2 = np.split(samples2, [320])

In [None]:
training_vehicle_data2 = vehicle_data.iloc[training_indices2]
dev_vehicle_data2 = vehicle_data.iloc[dev_indices2]

pipelined_model = Pipeline([
    ('josh_transform', PolynomialFeatures(degree = 3, include_bias = False)),
    ('josh_regression', LinearRegression())
])
pipelined_model.fit(training_vehicle_data2[numeric_features], training_vehicle_data2["mpg"])

print(f'training MSE: {mean_squared_error(pipelined_model.predict(training_vehicle_data2[numeric_features]), training_vehicle_data2["mpg"])}')
print(f'dev MSE: {mean_squared_error(pipelined_model.predict(dev_vehicle_data2[numeric_features]), dev_vehicle_data2["mpg"])}')

In [None]:
pipelined_model = Pipeline([
    ('josh_transform', PolynomialFeatures(degree = 3, include_bias = False)),
    ('josh_regression', LinearRegression())
])
pipelined_model.fit(vehicle_data.iloc[training_indices2][numeric_features], vehicle_data.iloc[training_indices2]["mpg"])

print(f'training MSE: {mean_squared_error(pipelined_model.predict(vehicle_data.iloc[training_indices2][numeric_features]), vehicle_data.iloc[training_indices2]["mpg"])}')
print(f'dev MSE: {mean_squared_error(pipelined_model.predict(vehicle_data.iloc[dev_indices2][numeric_features]), vehicle_data.iloc[dev_indices2]["mpg"])}')

## SANITY CHECKING CHECK YOUR UNDERSTANDING PROBLEM

In [None]:
df = pd.DataFrame({'phi1': [1, 1], 'phi2': [-2, 3], 'y': [-3, 12]})

In [None]:
df

In [None]:
y_hat = df["phi1"] * 3+ df["phi2"] * 2 - 2
y_hat

In [None]:
mean_squared_error(df["y"], y_hat)

In [None]:
2*3**2 + 2*2**2

In [None]:
26+12.5