In [1]:
# Step 1: Import Necessary Packages
# ------

import pandas as pd
from numpy import arange
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import RepeatedKFold

In [2]:
# Step 2: Load the Data
# ------

# read in data
model_data = pd.read_csv("../model_data.csv")


# drop rows with Nan values
model_data.dropna(inplace=True)

# reset keys of dataset
model_data.reset_index(drop=True, inplace=True)

# define predictor and response variables
X = model_data[["temp_pi_t0","hum_pi_t0","PM1_pi_t0","PM2.5_pi_t0","PM10_pi_t0","wind_speed_t0","temp_pi_t1","hum_pi_t1","PM1_pi_t1","PM2.5_pi_t1","PM10_pi_t1","wind_speed_t1","temp_pi_t2","hum_pi_t2","PM1_pi_t2","PM2.5_pi_t2","PM10_pi_t2","wind_speed_t2","temp_pi_t3","hum_pi_t3","PM1_pi_t3","PM2.5_pi_t3","PM10_pi_t3","wind_speed_t3","temp_pi_t4","hum_pi_t4","PM1_pi_t4","PM2.5_pi_t4","PM10_pi_t4","wind_speed_t4","temp_pi_t5","hum_pi_t5","PM1_pi_t5","PM2.5_pi_t5","PM10_pi_t5","wind_speed_t5"]]
y = model_data[["PM2.5","PM10"]]

In [3]:
# Step 3: Fit the Ridge Regression Model
# ------

# define cross-validation method to evaluate model
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# define model
model = RidgeCV(alphas=arange(0, 1e-4, 5e-6), cv=cv)

# fit model
model.fit(X, y)

# get coefficient of determination R² of the prediction.
R = model.score(X,y)

# display R
print(f'R = {R}')

# display lambda that produced the lowest test MSE
print(f'alpha = {model.alpha_}')

# display MSE
print(f'MSE = {model.best_score_}')


R = 0.6837366708912354
alpha = 2e-05
MSE = 0.17586700891706894


In [None]:
# Step 4: Use the Model to Make Predictions
# ------

# TODO: test prediction

#define new observation
new = [24, 2.5, 3.5, 18.5]

#predict hp value using ridge regression model
model.predict([new])

In [5]:
model.get_params()

{'alpha_per_target': False,
 'alphas': array([0.0e+00, 5.0e-06, 1.0e-05, 1.5e-05, 2.0e-05, 2.5e-05, 3.0e-05,
        3.5e-05, 4.0e-05, 4.5e-05, 5.0e-05, 5.5e-05, 6.0e-05, 6.5e-05,
        7.0e-05, 7.5e-05, 8.0e-05, 8.5e-05, 9.0e-05, 9.5e-05]),
 'cv': RepeatedKFold(n_repeats=3, n_splits=10, random_state=1),
 'fit_intercept': True,
 'gcv_mode': None,
 'normalize': False,
 'scoring': None,
 'store_cv_values': False}

In [6]:
2e-05

2e-05