# Week3 Logistic Regression - Categories

## Highlights
- sklearn train_test_split: split your dataset
- sklearn.metrics mutual_info_score: it tells us how much we can learn about one variable if we know the value of another
- one-hot encoding to deal with categorical data
- calculate the accuracy of your prediction

In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [49]:
# download data
# data = "https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv"
# !wget $data

In [50]:
# read data
df = pd.read_csv("data.csv")
df

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,premium unleaded (required),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,46120
11910,Acura,ZDX,2012,premium unleaded (required),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,56670
11911,Acura,ZDX,2012,premium unleaded (required),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,50620
11912,Acura,ZDX,2013,premium unleaded (recommended),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,50920


In [51]:
df.dtypes

Make                  object
Model                 object
Year                   int64
Engine Fuel Type      object
Engine HP            float64
Engine Cylinders     float64
Transmission Type     object
Driven_Wheels         object
Number of Doors      float64
Market Category       object
Vehicle Size          object
Vehicle Style         object
highway MPG            int64
city mpg               int64
Popularity             int64
MSRP                   int64
dtype: object

In [52]:
# only choose certain columns

selected_df  = df[
    ["Make",
    "Model",
    "Year",
    "Engine HP",
    "Engine Cylinders",
    "Transmission Type",
    "Vehicle Style",
    "highway MPG",
    "city mpg",
    "MSRP"]
].copy()
selected_df



Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle Style,highway MPG,city mpg,MSRP
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,46120
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,56670
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50620
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50920


In [53]:
# format the data
selected_df.columns = selected_df.columns.str.replace(" ", "_").str.lower()
selected_df_columns = list(selected_df.dtypes.index)
selected_df_columns



['make',
 'model',
 'year',
 'engine_hp',
 'engine_cylinders',
 'transmission_type',
 'vehicle_style',
 'highway_mpg',
 'city_mpg',
 'msrp']

In [54]:
for c in selected_df_columns:
    if selected_df[c].dtype == "object":
        selected_df[c] = selected_df[c].str.lower().str.replace(" ", "_")
    
selected_df.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,msrp
0,bmw,1_series_m,2011,335.0,6.0,manual,coupe,26,19,46135
1,bmw,1_series,2011,300.0,6.0,manual,convertible,28,19,40650
2,bmw,1_series,2011,300.0,6.0,manual,coupe,28,20,36350
3,bmw,1_series,2011,230.0,6.0,manual,coupe,28,18,29450
4,bmw,1_series,2011,230.0,6.0,manual,convertible,28,18,34500


In [55]:
# rename msrp to price
selected_df = selected_df.rename(columns={"msrp": "price"})
selected_df

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,bmw,1_series_m,2011,335.0,6.0,manual,coupe,26,19,46135
1,bmw,1_series,2011,300.0,6.0,manual,convertible,28,19,40650
2,bmw,1_series,2011,300.0,6.0,manual,coupe,28,20,36350
3,bmw,1_series,2011,230.0,6.0,manual,coupe,28,18,29450
4,bmw,1_series,2011,230.0,6.0,manual,convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,46120
11910,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,56670
11911,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,50620
11912,acura,zdx,2013,300.0,6.0,automatic,4dr_hatchback,23,16,50920


In [56]:
selected_df.isnull().sum()

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
price                 0
dtype: int64

In [57]:
# Fill in the missing values of the selected features with 0
selected_df.fillna(0, inplace=True)


In [58]:
selected_df.isnull().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
price                0
dtype: int64

In [59]:
#Q1 What is the most frequent observation (mode) for the column transmission_type?
selected_df["transmission_type"].mode()


0    automatic
Name: transmission_type, dtype: object

In [60]:
# turn the price variable from numeric into a binary format
selected_df["above_average"] = (selected_df.price > selected_df["price"].mean()).astype(int)
selected_df

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,bmw,1_series_m,2011,335.0,6.0,manual,coupe,26,19,46135,1
1,bmw,1_series,2011,300.0,6.0,manual,convertible,28,19,40650,1
2,bmw,1_series,2011,300.0,6.0,manual,coupe,28,20,36350,0
3,bmw,1_series,2011,230.0,6.0,manual,coupe,28,18,29450,0
4,bmw,1_series,2011,230.0,6.0,manual,convertible,28,18,34500,0
...,...,...,...,...,...,...,...,...,...,...,...
11909,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,46120,1
11910,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,56670,1
11911,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,50620,1
11912,acura,zdx,2013,300.0,6.0,automatic,4dr_hatchback,23,16,50920,1


In [61]:
# set up the validation framework
from sklearn.model_selection import train_test_split

selected_df_full_train, selected_df_test = train_test_split(selected_df, test_size=0.2, random_state=42)
selected_df_train, selected_df_val = train_test_split(selected_df_full_train, test_size=0.25, random_state=42)

In [62]:
len(selected_df_train), len(selected_df_val), len(selected_df_test)

(7148, 2383, 2383)

In [63]:
selected_df_train = selected_df_train.reset_index(drop=True)
selected_df_val = selected_df_val.reset_index(drop=True)
selected_df_test = selected_df_test.reset_index(drop=True)


In [64]:
y_train = selected_df_train["above_average"].values
y_val = selected_df_val["above_average"].values
y_test = selected_df_test["above_average"].values

del selected_df_train["price"]
del selected_df_val["price"]
del selected_df_test["price"]
del selected_df_train["above_average"]
del selected_df_val["above_average"]
del selected_df_test["above_average"]

In [65]:
selected_df_train

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg
0,mitsubishi,endeavor,2011,225.0,6.0,automatic,4dr_suv,19,15
1,kia,borrego,2009,276.0,6.0,automatic,4dr_suv,21,17
2,lamborghini,gallardo,2012,570.0,10.0,manual,convertible,20,12
3,chevrolet,colorado,2016,200.0,4.0,automatic,crew_cab_pickup,27,20
4,pontiac,vibe,2009,158.0,4.0,automatic,4dr_hatchback,26,20
...,...,...,...,...,...,...,...,...,...
7143,toyota,sienna,2016,266.0,6.0,automatic,passenger_minivan,25,18
7144,chevrolet,hhr,2009,260.0,4.0,manual,wagon,29,21
7145,hyundai,veracruz,2012,260.0,6.0,automatic,4dr_suv,22,17
7146,mitsubishi,expo,1993,136.0,4.0,manual,2dr_hatchback,26,19


Q2 What are the two features that have the biggest correlation in this dataset?

engine_hp and year
engine_hp and engine_cylinders
highway_mpg and engine_cylinders
highway_mpg and city_mpg

In [66]:
# correlation engine_hp and year
selected_df_train["engine_hp"].corr(selected_df_train["year"])

0.3345455568164391

In [67]:
# correlation engine_hp and engine_cylinders
selected_df_train["engine_hp"].corr(selected_df_train["engine_cylinders"])

0.7819780299482815

In [68]:
# correlation highway_mpg and engine_cylinders
selected_df_train["highway_mpg"].corr(selected_df_train["engine_cylinders"])

-0.595856440686465

In [69]:
# correlation highway_mpg and city_mpg
selected_df_train["highway_mpg"].corr(selected_df_train["city_mpg"])

0.8525891672814325

In [70]:
selected_df_train.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
dtype: object

In [71]:
categorical = list(selected_df_train.dtypes[selected_df_train.dtypes == "object"].index)
numerical = list(selected_df_train.dtypes[selected_df_train.dtypes != "object"].index)
categorical, numerical

(['make', 'model', 'transmission_type', 'vehicle_style'],
 ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg'])

In [72]:
# Q3 Which of these variables has the lowest mutual information score?
from sklearn.metrics import mutual_info_score

for c in categorical:
    score = mutual_info_score(selected_df_train[c], y_train)
    print(f"mutual inofrmation score between '{c}' and 'above_average': {round(score, 2)}")

mutual inofrmation score between 'make' and 'above_average': 0.24
mutual inofrmation score between 'model' and 'above_average': 0.46
mutual inofrmation score between 'transmission_type' and 'above_average': 0.02
mutual inofrmation score between 'vehicle_style' and 'above_average': 0.08


In [73]:
# one-hot encoding
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)

train_dict = selected_df_train[categorical + numerical].to_dict(orient="records")
X_train = dv.fit_transform(train_dict)

val_dict = selected_df_val[categorical + numerical].to_dict(orient="records")
X_val = dv.transform(val_dict)

In [74]:
# training logistic regression
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver="liblinear", C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [75]:
model.intercept_[0]

-0.36217058412908215

In [76]:
model.coef_[0].round(3)

array([ 7.800e-02, -1.210e-01,  3.700e-02, -3.000e-03,  1.354e+00,
        1.807e+00,  5.980e-01,  2.916e+00,  1.790e-01,  2.393e+00,
        0.000e+00, -4.910e-01,  2.360e+00, -1.502e+00, -1.418e+00,
       -3.858e+00,  3.770e-01, -5.140e-01, -1.915e+00,  6.240e-01,
       -9.780e-01, -1.316e+00, -2.450e-01, -2.695e+00,  3.270e-01,
       -1.532e+00,  1.100e-02,  1.956e+00,  1.328e+00,  1.204e+00,
        4.043e+00,  1.059e+00,  5.000e-03, -1.665e+00,  0.000e+00,
        9.410e-01, -2.019e+00, -9.940e-01, -1.443e+00, -3.520e-01,
       -3.204e+00,  2.022e+00,  9.830e-01,  9.160e-01, -2.360e-01,
        3.330e-01, -2.734e+00, -1.930e+00,  3.256e+00, -7.680e-01,
       -7.360e-01,  1.191e+00, -4.150e-01, -4.800e-02, -9.000e-03,
       -1.355e+00, -2.000e-03, -9.280e-01, -3.000e-03, -4.000e-03,
       -3.000e-03, -1.211e+00, -1.900e-01,  4.350e-01, -4.650e-01,
       -6.600e-02, -1.690e-01, -1.690e-01, -0.000e+00, -1.600e-02,
       -9.900e-01,  1.570e-01,  2.600e-02,  3.600e-02,  1.706e

In [77]:
# Q4 What accuracy did you get?
y_pred = model.predict_proba(X_val)[:, 1]
price_decision = (y_pred >= 0.5)
round((price_decision == y_val).mean(),2)

0.95

Q5 Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
Which of following feature has the smallest difference?

- year
- engine_hp
- transmission_type
- city_mpg

In [78]:
dv_no_year = DictVectorizer(sparse=False)

selected_df_train_no_year = selected_df_train[
    ["make",
    "model",
    "engine_hp",
    "engine_cylinders",
    "transmission_type",
    "vehicle_style",
    "highway_mpg",
    "city_mpg"
    ]
]

selected_df_train_no_year
selected_df_val_no_year = selected_df_val[
    ["make",
    "model",
    "engine_hp",
    "engine_cylinders",
    "transmission_type",
    "vehicle_style",
    "highway_mpg",
    "city_mpg"
    ]
]

train_dict_no_year = selected_df_train_no_year[["make",
    "model",
    "engine_hp",
    "engine_cylinders",
    "transmission_type",
    "vehicle_style",
    "highway_mpg",
    "city_mpg"
    ]].to_dict(orient="records")
X_train_no_year = dv_no_year.fit_transform(train_dict_no_year)

val_dict_no_year = selected_df_val_no_year[["make",
    "model",
    "engine_hp",
    "engine_cylinders",
    "transmission_type",
    "vehicle_style",
    "highway_mpg",
    "city_mpg"
    ]].to_dict(orient="records")
X_val_no_year = dv_no_year.transform(val_dict_no_year)


In [79]:
# training logistic regression no year

model_no_year = LogisticRegression(solver="liblinear", C=10, max_iter=1000, random_state=42)
model_no_year.fit(X_train_no_year, y_train)

In [80]:
# accuracy no year
y_pred_no_year = model_no_year.predict_proba(X_val_no_year)[:, 1]
price_decision_no_year = (y_pred_no_year >= 0.5)
round((price_decision_no_year == y_val).mean(),2)

0.95

In [81]:
# training logistic regression no_engine_hp
dv_no_engine_hp = DictVectorizer(sparse=False)

selected_df_train_no_engine_hp = selected_df_train[
    ["make",
     "year",
    "model",
    "engine_cylinders",
    "transmission_type",
    "vehicle_style",
    "highway_mpg",
    "city_mpg"
    ]
]

selected_df_val_no_engine_hp = selected_df_val[
    ["make",
     "year",
    "model",
    "engine_cylinders",
    "transmission_type",
    "vehicle_style",
    "highway_mpg",
    "city_mpg"
    ]
]

train_dict_no_engine_hp = selected_df_train_no_engine_hp[["make",
     "year",
    "model",
    "engine_cylinders",
    "transmission_type",
    "vehicle_style",
    "highway_mpg",
    "city_mpg"
    ]].to_dict(orient="records")
X_train_no_engine_hp = dv_no_engine_hp.fit_transform(train_dict_no_engine_hp)

val_dict_no_engine_hp = selected_df_val_no_engine_hp[["make",
     "year",
    "model",
    "engine_cylinders",
    "transmission_type",
    "vehicle_style",
    "highway_mpg",
    "city_mpg"
    ]].to_dict(orient="records")
X_val_no_engine_hp = dv_no_engine_hp.transform(val_dict_no_engine_hp)


model_no_engine_hp = LogisticRegression(solver="liblinear", C=10, max_iter=1000, random_state=42)
model_no_engine_hp.fit(X_train_no_engine_hp, y_train)

# accuracy _no_engine_hp
y_pred_no_engine_hp = model_no_engine_hp.predict_proba(X_val_no_engine_hp)[:, 1]
price_decision_no_engine_hp = (y_pred_no_engine_hp >= 0.5)
round((price_decision_no_engine_hp == y_val).mean(),2)

0.92

In [82]:
# training logistic regression no transmission_type
dv_no_transmission_type = DictVectorizer(sparse=False)

selected_df_train_no_transmission_type = selected_df_train[
    ["make",
     "year",
    "model",
    "engine_cylinders",
    "vehicle_style",
    "highway_mpg",
    "city_mpg"
    ]
]

selected_df_val_no_transmission_type = selected_df_val[
    ["make",
     "year",
    "model",
    "engine_cylinders",
    "vehicle_style",
    "highway_mpg",
    "city_mpg"
    ]
]

train_dict_no_transmission_type = selected_df_train_no_transmission_type[["make",
     "year",
    "model",
    "engine_cylinders",
    "vehicle_style",
    "highway_mpg",
    "city_mpg"
    ]].to_dict(orient="records")
X_train_no_transmission_type = dv_no_transmission_type.fit_transform(train_dict_no_transmission_type)

val_dict_no_transmission_type = selected_df_val_no_transmission_type[["make",
     "year",
    "model",
    "engine_cylinders",
    "vehicle_style",
    "highway_mpg",
    "city_mpg"
    ]].to_dict(orient="records")
X_val_no_transmission_type = dv_no_transmission_type.transform(val_dict_no_transmission_type)


model_no_transmission_type = LogisticRegression(solver="liblinear", C=10, max_iter=1000, random_state=42)
model_no_transmission_type.fit(X_train_no_transmission_type, y_train)

# accuracy _no_engine_hp
y_pred_no_transmission_type = model_no_transmission_type.predict_proba(X_val_no_transmission_type)[:, 1]
price_decision_no_transmission_type = (y_pred_no_transmission_type >= 0.5)
round((price_decision_no_transmission_type == y_val).mean(),2)

0.92

In [83]:
# training logistic regression no city_mpg
dv_no_city_mpg = DictVectorizer(sparse=False)

selected_df_train_no_city_mpg = selected_df_train[
    ["make",
     "year",
    "model",
    "engine_cylinders",
    "transmission_type",
    "vehicle_style",
    "highway_mpg"
    ]
]

selected_df_val_no_city_mpg = selected_df_val[
    ["make",
     "year",
    "model",
    "engine_cylinders",
    "transmission_type",
    "vehicle_style",
    "highway_mpg"
    ]
]

train_dict_no_city_mpg = selected_df_train_no_city_mpg[["make",
     "year",
    "model",
    "engine_cylinders",
    "transmission_type",
    "vehicle_style",
    "highway_mpg"
    ]].to_dict(orient="records")
X_train_no_city_mpg = dv_no_city_mpg.fit_transform(train_dict_no_city_mpg)

val_dict_no_city_mpg = selected_df_val_no_city_mpg[["make",
     "year",
    "model",
    "engine_cylinders",
    "transmission_type",
    "vehicle_style",
    "highway_mpg"
    ]].to_dict(orient="records")
X_val_no_city_mpg = dv_no_city_mpg.transform(val_dict_no_city_mpg)


model_no_city_mpg = LogisticRegression(solver="liblinear", C=10, max_iter=1000, random_state=42)
model_no_city_mpg.fit(X_train_no_city_mpg, y_train)

# accuracy no_city_mpg
y_pred_no_city_mpg = model_no_city_mpg.predict_proba(X_val_no_city_mpg)[:, 1]
price_decision_no_city_mpg = (y_pred_no_city_mpg >= 0.5)
round((price_decision_no_city_mpg == y_val).mean(),2)

0.92

Q6 For this question, we'll see how to use a linear regression model from Scikit-Learn.
We'll need to use the original column price. Apply the logarithmic transformation to this column.
Fit the Ridge regression model on the training data with a solver 'sag'. Set the seed to 42.
This model also has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10].
Round your RMSE scores to 3 decimal digits.
Which of these alphas leads to the best RMSE on the validation set?

0
0.01
0.1
1
10

In [84]:
# format the data
df_copy = df[
    ["Make",
    "Model",
    "Year",
    "Engine HP",
    "Engine Cylinders",
    "Transmission Type",
    "Vehicle Style",
    "highway MPG",
    "city mpg",
    "MSRP"]
].copy()

df_copy.columns = df_copy.columns.str.replace(" ", "_").str.lower()
df_copy_columns = list(df_copy.dtypes.index)

for c in df_copy_columns:
    if df_copy[c].dtype == "object":
        df_copy[c] = df_copy[c].str.lower().str.replace(" ", "_")
        
df_copy = df_copy.rename(columns={"msrp": "price"})
df_copy.fillna(0, inplace=True)
df_copy.isnull().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
price                0
dtype: int64

In [93]:
df_copy["price"] = np.log1p(df_copy["price"])

In [94]:
# split the data
df_copy_full_train, df_copy_test = train_test_split(df_copy, test_size=0.2, random_state=42)
df_copy_train, df_copy_val = train_test_split(df_copy_full_train, test_size=0.25, random_state=42)

df_copy_train = df_copy_train.reset_index(drop=True)
df_copy_val = df_copy_val.reset_index(drop=True)
df_copy_test = df_copy_test.reset_index(drop=True)

df_copy_y_train = df_copy_train["price"].values
df_copy_y_val = df_copy_val["price"].values
df_copy_y_test = df_copy_test["price"].values

del df_copy_train["price"]
del df_copy_val["price"]
del df_copy_test["price"]

len(df_copy_train)


7148

In [95]:
df_copy_y_train.shape, df_copy_y_val.shape

((7148,), (2383,))

In [96]:
df_copy_train.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
dtype: object

In [97]:
# one-hot encoding
dv_copy = DictVectorizer(sparse=False)
# new_columns = list(df_copy_train.dtypes.index)
train_dict_copy = df_copy_train.to_dict(orient="records")
X_train_copy = dv_copy.fit_transform(train_dict_copy)

val_dict_copy = df_copy_val.to_dict(orient="records")
X_val_copy = dv_copy.transform(val_dict_copy)

In [98]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")

scores = {}
alphas = [0, 0.01, 0.1, 1, 10]

for alpha in alphas:
    model = Ridge(alpha=alpha, solver="sag", max_iter=1000, random_state=42)
    model.fit(X_train_copy, df_copy_y_train)
    # predict on validation set
    y_pred_df_copy = model.predict(X_val_copy)
    # calculate rmse
    score = mean_squared_error(df_copy_y_val, y_pred_df_copy, squared=False)
    scores[alpha] = round(score, 3)
    print(f"alpha = {alpha}:\t RMSE = {score}")
    

alpha = 0:	 RMSE = 0.48679431324238887
alpha = 0.01:	 RMSE = 0.48679455192752613
alpha = 0.1:	 RMSE = 0.48679670001899733
alpha = 1:	 RMSE = 0.4868181745432738
alpha = 10:	 RMSE = 0.4870322832975125


In [99]:
scores

{0: 0.487, 0.01: 0.487, 0.1: 0.487, 1: 0.487, 10: 0.487}