MIT License

Copyright (c) Microsoft Corporation. All rights reserved.

This notebook is adapted from Francesca Lazzeri's Energy Demand Forecast Workbench workshop.

Copyright (c) 2021 PyLadies Amsterdam, Alyona Galyeva

# Ridge regression

In [None]:
%matplotlib inline
import os
import pickle
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st
from azureml.core import Workspace, Dataset
from azureml.core.experiment import Experiment
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Directory the kernel was started in; the pickled pipeline is written here later.
WORKDIR = os.getcwd()
# Base name (without extension) of the pickled model file.
MODEL_NAME = "ridge"

In [None]:
# Connect to the Azure ML workspace described by the local workspace config file.
ws = Workspace.from_config()

In [None]:
# Look up the registered training dataset in the workspace and show which
# name/version was resolved, so the data lineage is visible in the notebook output.
train_ds = Dataset.get_by_name(ws, name="train_nyc_demand_data")
print(f"{train_ds.name} {train_ds.version}")

In [None]:
# Materialise the registered dataset as a pandas DataFrame and preview the first rows.
train = train_ds.to_pandas_dataframe()
train.head()

Create model pipeline:
- **one-hot encode categorical variables**
- **randomized parameter search** with time-series cross-validation to find a good value for the ridge `alpha` (regularization strength) parameter

Fitting this pipeline should take less than a minute.

In [None]:
# Feature matrix: every column except the target ('demand', used as y when
# fitting) and the 'timeStamp' column.
X = train.drop(columns=['demand', 'timeStamp'])

In [None]:
# Start an interactive run under the "Ridge" experiment; everything logged
# below is attached to this run until run.complete() is called.
ridge_experiment = Experiment(ws, name="Ridge")
run = ridge_experiment.start_logging()

# Record which dataset (and which version of it) this run was trained on.
for metric_name, metric_value in (
    ("dataset name", train_ds.name),
    ("dataset version", train_ds.version),
):
    run.log(metric_name, metric_value)

In [None]:
# Columns that are one-hot encoded; all remaining columns of X are passed
# through to the regressor unchanged (remainder='passthrough').
cat_cols = ['hour', 'month', 'dayofweek']
# Positional indices of the categorical columns, in X's own column order.
cat_cols_idx = [X.columns.get_loc(c) for c in X.columns if c in cat_cols]
run.log_list("cat_cols", cat_cols)

# scikit-learn renamed OneHotEncoder's `sparse` argument to `sparse_output`
# in 1.2 and removed the old name in 1.4. Try the current keyword first and
# fall back to the legacy one so the notebook runs on either side of the rename.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

preprocessor = ColumnTransformer([('encoder', encoder, cat_cols_idx)], remainder='passthrough')
# Ridge regression with no explicit intercept term.
regr = Ridge(fit_intercept=False)
# Ordered 3-fold splitter: each validation fold comes after its training fold.
tscv = TimeSeriesSplit(n_splits=3)

In [None]:
# Search space for the ridge penalty. scipy's uniform(loc, scale) draws from
# the interval [loc, loc + scale].
alpha_loc, alpha_scale = 1e-4, 10.0
param_dist = {'alpha': st.uniform(alpha_loc, alpha_scale)}

# Randomized search over alpha, scored by negative MSE on the time-series
# splits. random_state pins the sampled alphas so a Restart & Run All
# reproduces the same candidates (the original search was unseeded).
regr_cv = RandomizedSearchCV(
    estimator=regr,
    param_distributions=param_dist,
    n_iter=100,
    scoring='neg_mean_squared_error',
    cv=tscv,
    verbose=2,
    n_jobs=-1,
    random_state=42,
)

# The search is the final pipeline step, so the preprocessor is fitted once on
# all of X and the cross-validation runs on the already-encoded features.
regr_pipe = Pipeline([('preprocessor', preprocessor), ('regr_cv', regr_cv)])
regr_pipe.fit(X, y=train['demand'])

# run.log is documented for scalar (numeric/string) values, so log a readable
# description of the steps instead of the mapping of estimator objects.
run.log(
    "pipeline steps",
    " -> ".join(f"{name} ({type(step).__name__})" for name, step in regr_pipe.steps),
)
# run.log_list expects a list of values; the original passed a dict holding a
# frozen scipy distribution. Log a textual description of the search space.
run.log_list("param_dist", [f"alpha ~ uniform(loc={alpha_loc}, scale={alpha_scale})"])

In [None]:
# Serialise the fitted pipeline to <WORKDIR>/<MODEL_NAME>.pkl.
model_path = os.path.join(WORKDIR, f"{MODEL_NAME}.pkl")
with open(model_path, mode='wb') as model_file:
    pickle.dump(regr_pipe, model_file)

Cross validation results

In [None]:
# Fitted RandomizedSearchCV step of the pipeline.
search = regr_pipe.named_steps['regr_cv']
cv_results = pd.DataFrame(search.cv_results_)

# cv_results_ is a dict of numpy/masked arrays (plus a list of param dicts),
# which is not a valid value for run.log_list. Log the columns of interest as
# a table of plain Python lists instead.
run.log_table("cv_results", {
    "alpha": cv_results['param_alpha'].astype(float).tolist(),
    "mean_test_score": cv_results['mean_test_score'].tolist(),
    "std_test_score": cv_results['std_test_score'].tolist(),
    "rank_test_score": cv_results['rank_test_score'].tolist(),
})
# Headline result of the search, as scalar metrics.
run.log("best_alpha", float(search.best_params_['alpha']))
run.log("best_mean_test_score", float(search.best_score_))

# Show the best-ranked candidates.
cv_results.sort_values(by='rank_test_score').head()

In [None]:
# Scatter of mean CV score against the sampled alpha values. Axis labels are
# set so the figure can be read on its own (also when viewed in the run's
# logged images).
fig, ax = plt.subplots()
ax.plot(cv_results['param_alpha'], cv_results['mean_test_score'], 'ro', markersize=1)
ax.set_title('CV negative mean squared error')
ax.set_xlabel('alpha')
ax.set_ylabel('mean test score (negative MSE)')
# Log the current figure before plt.show(); NOTE(review): with the inline
# backend the figure is presumably closed after being shown, so keep this order.
run.log_image("CV errors plot", plot=plt)
plt.show()

In [None]:
# Mark the interactive run as completed now that all metrics and images are logged.
run.complete()