In [None]:
%matplotlib inline
import pandas as apd
import numpy as np
import matplotlib.pyplot as plt
from datasets import dataset_info, dataset_load

## Data Experiments
---
Now let's take a look at a particular real-life data problem. In the following example we look at predicting ozone concentration as a function of other weather-based features. As with all data problems, it behooves us to review all of the information that we have about the dataset.

In [None]:
# Ask the local `datasets` helper module for its description of the 'laozone'
# dataset. NOTE(review): what gets shown (printed text vs. a returned object)
# depends on that helper, which is not visible from this notebook.
dataset_info('laozone')

Now let's take a look at what this dataset looks like.

In [None]:
# Load the 'laozone' dataset through the local `datasets` helper. The rest of
# the notebook treats `data` as a pandas DataFrame (column lookup by name,
# `.keys()` for the column labels).
data = dataset_load('laozone')

# Preview only the first few rows: a bare last expression is rendered with the
# rich table display, instead of printing the whole frame as plain text.
data.head()

Alright, we're ready to get started! Now, before we touch anything, we need to follow best practices. When faced with a new dataset, we need to set up some kind of objective comparison. To do this, we need to split our dataset into three parts: **Training** (and within that, **Validation**) and **Testing** sets.

The best practice here is to take the test data and lock it away somewhere. It is always tempting to tune your algorithms to give the best test performance. However, even if the regression isn't explicitly *trained* on the test data, as practitioners, we could be continually making changes in an effort to get our numbers up.

Instead, we should deep-freeze the test data, and then tune as much as we can via **cross-validation (CV)** on our training data.

In [None]:
from sklearn.model_selection import train_test_split, KFold

#--- Convert from DataFrame to NDArray ---#
# `DataFrame.as_matrix()` was removed in pandas 1.0; `.values` yields the same
# NumPy array and is available on both old and new pandas releases.
# We also ensure that we load in all data as floating point values
# so that we don't lose anything to truncation.
y = data['ozone'].values.astype(float)

# The nine predictors are taken by column *position* 1..9, the same positional
# slice the plotting cells use for their tick labels (`data.keys()[1:10]`).
# `.iloc` makes that explicit; `data[[1, ..., 9]]` only worked through an
# integer-position fallback that current pandas no longer performs.
X = data.iloc[:, 1:10].values.astype(float)

#--- Split Dataset ---#
# Hold out 25% of the samples for testing; the fixed seed keeps the split
# identical from run to run.
XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size=0.25, random_state=1)

print("Training Samples: ", len(yTrain))
print("Testing  Samples: ", len(yTest))

Now, before we start attempting to fit models, let's take a bit of care and apply some pre-processing to our dataset. The de-facto pre-processing is *centering and normalization*. Specifically, many flavors of estimators (OLS, RR, etc.) can be thrown off by large differences in scale and variation between the features. We can easily account for this in our estimators by simply normalizing the feature columns and removing averages. Scikit-Learn has some features for this.

In [None]:
from sklearn import preprocessing

#--- Adding Features ... ---#
# Can you think of any other possible features to include, here?
# What other pre-processing steps might you use?


#--- Center and Scale ---#
# The centering/scaling statistics are estimated on the TRAINING split only
# and that same transform is then applied to the test split. Scaling the test
# split with its own mean/std would (a) leak information from the held-out
# data and (b) put train and test features on different scales, so a model
# fit on one would not be applied consistently to the other.
# NOTE: this cell overwrites XTrain/XTest/yTrain/yTest in place; after a
# re-run of this cell alone the stored means no longer reflect the raw data,
# so re-run from the train/test split cell instead.
feature_scaler = preprocessing.StandardScaler().fit(XTrain)
XTrain = feature_scaler.transform(XTrain)
XTest = feature_scaler.transform(XTest)

#--- Center observations ---#
# Same principle for the response: only the training mean is "known" at
# prediction time, so it is the offset removed from both splits.
mean_yTrain = np.mean(yTrain)
# Kept under its original name because the plotting cells add `mean_yTest`
# back onto the test targets and predictions; it is now the training mean.
mean_yTest = mean_yTrain

yTrain = yTrain - mean_yTrain
yTest = yTest - mean_yTest

Now it is time for us to choose our estimator. What should we choose? 

## Attempt 1: OLS
---


In [None]:
#--- Calculate the OLS Estimate ---#
# Least-squares solution of XTrain @ beta ~= yTrain. `lstsq` returns the same
# coefficients as solving the normal equations when X^T X is invertible, but
# it still returns a (minimum-norm) answer when features are collinear or
# constant, where `np.linalg.solve` on X^T X raises a singular-matrix error.
regOLS = np.linalg.lstsq(XTrain, yTrain, rcond=None)[0]

yp = np.dot(XTrain, regOLS)
ypTest = np.dot(XTest, regOLS)

#--- Visualize ---#
plt.figure(figsize=(14,7))
# Left panel: predicted vs. true ozone, with the stored offsets added back so
# the axes are in the original units.
plt.subplot(121)
plt.plot(yTrain + mean_yTrain, yp + mean_yTrain, '.', label='Training')
# 'xr' = red x markers. Giving the marker once (in the format string) avoids
# matplotlib's "marker is redundantly defined" warning.
plt.plot(yTest + mean_yTest, ypTest + mean_yTest, 'xr', label='Testing')
plt.plot([0, 40], [0, 40], '-k', linewidth=0.7, label="Perfect")
plt.axis([0, 40, 0, 40])
plt.xlabel('Ozone (True)', fontsize=16)
plt.ylabel('Ozone (Predicted)', fontsize=16)
plt.legend(loc=2, fontsize=16)
plt.title('Prediction Performance', fontsize=18)
# Right panel: the learned coefficients, labelled with the feature columns.
plt.subplot(122)
plt.stem(regOLS)
plt.title('Learned Model $\\hat{\\beta}$', fontsize=18)
plt.xticks(range(9), data.keys()[1:10], rotation='vertical')

plt.tight_layout()

#--- Print RSS ---#
# Mean of the squared residuals, i.e. RSS divided by the number of samples.
rss_train = np.mean(np.power(yTrain - yp, 2))
rss_test = np.mean(np.power(yTest - ypTest, 2))
print("Normalized RSS (Train): %0.2f" % rss_train)
print("Normalized RSS  (Test): %0.2f" % rss_test)

## Attempt 2: Ridge Regression
---

In [None]:
from sklearn import linear_model

#--- Define Regression Estimator ---#
# Ridge (L2-penalised) linear regression. The alpha given here is only a
# starting value: the grid search further down sweeps `ridge__alpha`, and the
# selected value is written back onto the pipeline with `set_params`.
regRR = linear_model.Ridge(alpha=0.01)

Now, we need to perform some kind of CV to find the best set of parameters for our model. We will do this by constructing a **Pipeline**. A pipeline is a useful way of handling pre-processing on separate data partitions when performing CV. Let's take a look at that.

In [None]:
from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline 

#--- Make a Pre-processing + Fitting Pipeline ---#
# Chain a StandardScaler in front of the ridge estimator so that, inside CV,
# the scaler is fit on each training fold it is given. The pipeline wraps the
# `regRR` object defined above. The ridge step is addressed as 'ridge' when
# setting parameters (see the `ridge__alpha` key used in the grid below).
pipe_regRR = make_pipeline(preprocessing.StandardScaler(), regRR)

#--- Define a Scoring Metric ---#
# Fits are compared through their prediction error: the squared residuals
# averaged over the samples (RSS / N).
def neg_rss(reg, X, y):
    """Score a fitted estimator by its negated mean squared residual.

    The sign is flipped so that a larger value means a better fit.

    Parameters
    ----------
    reg : object
        Anything exposing ``predict(X)``.
    X : array-like
        Inputs handed to ``reg.predict``.
    y : array-like
        Targets that the predictions are compared against.

    Returns
    -------
    float
        ``-mean((y - reg.predict(X)) ** 2)``.
    """
    residuals = y - reg.predict(X)
    return -np.mean(np.square(residuals))

#--- Define CV Splitting ---#
# We can create an iterator which performs a set of randomized
# splits on the dataset into "train" and "validation". We have
# a natural tradeoff between the validation set size and the number of
# splits we should perform. Here: 20 random splits, each holding out 5%.
cv = ShuffleSplit(n_splits=20, test_size=0.05, random_state=0)

#--- Define the Parameters to Search ---#
# We need to specify the estimator name since we are performing a CV on
# a Pipeline. (e.g. the formatting of `<estimator>__<param>`).
# 50 log-spaced values of alpha between 1e-4 and 1e3.
param_grid = [
    {'ridge__alpha': np.logspace(-4, 3, 50)}
]

#--- Run the CV ---#
# `return_train_score=True` is required for `cv_results_` to contain
# 'mean_train_score' (it defaults to False), which the CV-curve cell plots
# next to the validation score.
cv_regRR = GridSearchCV(pipe_regRR, param_grid, scoring=neg_rss, cv=cv,
                        return_train_score=True)
cv_regRR.fit(XTrain, yTrain)

Great! Now let's take a look at the performance of our estimator. Here, as for any Ridge or Lasso regression, we chart the training and validation error over the values of the $\alpha$ parameter that we searched with CV.

In [None]:
#--- Record CV Optimizing Hyper-Params ---#
# `best_params_` is the entry of cv_results_['params'] at `best_index_`.
cvOptParams = cv_regRR.best_params_
cvOptAlpha = cvOptParams['ridge__alpha']

#--- Visualize ---#
ridge_cv_results = cv_regRR.cv_results_
# Build the x-axis as a plain float array from the per-candidate parameter
# dicts rather than plotting the masked `param_*` column directly.
testedAlpha = np.array([params['ridge__alpha'] for params in ridge_cv_results['params']],
                       dtype=float)
testScores = -ridge_cv_results['mean_test_score']     # Reverse Sign

plt.figure(figsize=(15,5))
# Training scores are only recorded when the search was created with
# `return_train_score=True`; skip that curve instead of raising a KeyError.
if 'mean_train_score' in ridge_cv_results:
    trainScores = -ridge_cv_results['mean_train_score']   # Reverse Sign
    plt.plot(testedAlpha, trainScores, '-', label='Training (Avg.)')
plt.plot(testedAlpha, testScores, '-', label='Validation (Avg.)')
plt.xlabel('Regularization Parameter $\\alpha$', fontsize=16)
plt.ylabel('$\\frac{1}{N}RSS(y - X\\beta)$', fontsize=16)
# Mark the alpha selected by cross-validation.
plt.axvline(cvOptAlpha, label='$\\alpha^*$', color='k', linestyle=':')
plt.xscale('log')
plt.xlim([1e-4, 1e3])
plt.tight_layout()
plt.legend(loc=2, fontsize=16);

Now, let's take a look at how well we are able to make our predictions on the training set using this CV-optimal value of $\alpha$.

In [None]:
#--- Get Training Predictions ---#
# Put the cross-validated alpha on the pipeline, refit it on the whole
# training split, and predict that same split.
pipe_regRR.set_params(ridge__alpha=cvOptAlpha)
pipe_regRR.fit(XTrain, yTrain)
yp = pipe_regRR.predict(XTrain)

#--- Visualize ---#
# Predicted vs. true ozone; the stored training mean is added back so both
# axes are in the original units. The diagonal is a perfect prediction.
fig, ax = plt.subplots(figsize=(7, 7))
ax.plot(yTrain + mean_yTrain, yp + mean_yTrain, '.', label='Training (CV-Opt)')
ax.plot([0, 40], [0, 40], '-k', linewidth=0.7, label="Perfect")
ax.axis([0, 40, 0, 40])
ax.set_xlabel('Ozone (True)', fontsize=16)
ax.set_ylabel('Ozone (Predicted)', fontsize=16)
ax.legend(loc=2, fontsize=16);

And now, finally, we are ready to take our test data out of deep-freeze. How did we do?

In [None]:
#--- Get Predictions ---#
# Recompute the training predictions here rather than relying on whatever
# `yp` currently holds: the name is reused by several cells, so a leftover
# value may belong to a different model.
yp = pipe_regRR.predict(XTrain)
ypTest = pipe_regRR.predict(XTest)

#--- Visualize ---#
plt.figure(figsize=(14,7))
# Left panel: predicted vs. true ozone for both splits.
plt.subplot(121)
plt.plot(yTrain + mean_yTrain, yp + mean_yTrain, '.', label='Training (CV-Opt)')
# 'xr' = red x markers. Giving the marker once (in the format string) avoids
# matplotlib's "marker is redundantly defined" warning.
plt.plot(yTest + mean_yTest, ypTest + mean_yTest, 'xr', label='Testing (CV-Opt)')
plt.plot([0, 40], [0, 40], '-k', linewidth=0.7, label="Perfect")
plt.axis([0, 40, 0, 40])
plt.xlabel('Ozone (True)', fontsize=16)
plt.ylabel('Ozone (Predicted)', fontsize=16)
plt.legend(loc=2, fontsize=16)
plt.title('Prediction Performance', fontsize=18)
# Plot the learned model. The coefficients are read from the pipeline's ridge
# step, i.e. from the estimator that produced the predictions above.
plt.subplot(122)
plt.stem(pipe_regRR.named_steps['ridge'].coef_)
plt.title('Learned Model $\\hat{\\beta}$', fontsize=18)
plt.xticks(range(9), data.keys()[1:10], rotation='vertical')

plt.tight_layout()

#--- Print RSS ---#
print("Normalized RSS (Train): %0.2f" % -neg_rss(pipe_regRR, XTrain, yTrain))
print("Normalized RSS  (Test): %0.2f" % -neg_rss(pipe_regRR, XTest, yTest))

## Attempt 3: Lasso
---


In [None]:
#--- Construct Estimator ---#
# The alpha given here is only a starting value; the grid search below sweeps
# `lasso__alpha`.
regLasso = linear_model.Lasso(alpha=1.0)

#--- Make a Pre-processing + Fitting Pipeline ---#
pipe_regLasso = make_pipeline(preprocessing.StandardScaler(), regLasso)

#--- Scoring Metric ---#
# The `neg_rss` scorer defined in the Ridge section is reused as-is; defining
# it a second time here would only shadow an identical function.

#--- Define the Parameters to Search ---#
# We need to specify the estimator name since we are performing a CV on
# a Pipeline. (e.g. the formatting of `<estimator>__<param>`).
param_grid = [
    {'lasso__alpha': np.logspace(-4, 3, 50)}
]

#--- Run the CV ---#
# `return_train_score=True` is required for `cv_results_` to contain
# 'mean_train_score' (it defaults to False), which is plotted below.
cv_regLasso = GridSearchCV(pipe_regLasso, param_grid, scoring=neg_rss, cv=cv,
                           return_train_score=True)
cv_regLasso.fit(XTrain, yTrain)

#--- Record CV Optimizing Hyper-Params ---#
# `best_params_` is the entry of cv_results_['params'] at `best_index_`.
cvOptParamsLasso = cv_regLasso.best_params_
cvOptAlphaLasso = cvOptParamsLasso['lasso__alpha']

#--- Visualize ---#
lasso_cv_results = cv_regLasso.cv_results_
# Build the x-axis as a plain float array from the per-candidate parameter
# dicts rather than plotting the masked `param_*` column directly.
testedAlpha = np.array([params['lasso__alpha'] for params in lasso_cv_results['params']],
                       dtype=float)
trainScores = -lasso_cv_results['mean_train_score']   # Reverse Sign
testScores = -lasso_cv_results['mean_test_score']     # Reverse Sign

plt.figure(figsize=(15,5))
plt.plot(testedAlpha, trainScores, '-', label='Training (Avg.)')
plt.plot(testedAlpha, testScores, '-', label='Validation (Avg.)')
plt.xlabel('Regularization Parameter $\\alpha$', fontsize=16)
plt.ylabel('$\\frac{1}{N}RSS(y - X\\beta)$', fontsize=16)
# Mark the alpha selected by cross-validation.
plt.axvline(cvOptAlphaLasso, label='$\\alpha^*$', color='k', linestyle=':')
plt.xscale('log')
plt.xlim([1e-4, 1e3])
plt.tight_layout()
plt.legend(loc=2, fontsize=16);

In [None]:
#--- Get Training Predictions ---#
# Put the cross-validated alpha on the pipeline, refit it on the whole
# training split, then predict both splits.
pipe_regLasso.set_params(lasso__alpha=cvOptAlphaLasso)
pipe_regLasso.fit(XTrain, yTrain)
yp = pipe_regLasso.predict(XTrain)
ypTest = pipe_regLasso.predict(XTest)

#--- Visualize ---#
# Plot prediction performance
plt.figure(figsize=(14,7))
plt.subplot(121)
plt.plot(yTrain + mean_yTrain, yp + mean_yTrain, '.', label='Training (CV-Opt)')
# 'xr' = red x markers. Giving the marker once (in the format string) avoids
# matplotlib's "marker is redundantly defined" warning.
plt.plot(yTest + mean_yTest, ypTest + mean_yTest, 'xr', label='Testing (CV-Opt)')
plt.plot([0, 40], [0, 40], '-k', linewidth=0.7, label="Perfect")
plt.axis([0, 40, 0, 40])
plt.xlabel('Ozone (True)', fontsize=16)
plt.ylabel('Ozone (Predicted)', fontsize=16)
plt.legend(loc=2, fontsize=16)
plt.title('Prediction Performance', fontsize=18)
# Plot the learned model. The coefficients are read from the pipeline's lasso
# step, i.e. from the estimator that produced the predictions above.
plt.subplot(122)
plt.stem(pipe_regLasso.named_steps['lasso'].coef_)
plt.title('Learned Model $\\hat{\\beta}$', fontsize=18)
plt.xticks(range(9), data.keys()[1:10], rotation='vertical')

plt.tight_layout()
#--- Print RSS ---#
print("Normalized RSS (Train): %0.2f" % -neg_rss(pipe_regLasso, XTrain, yTrain))
print("Normalized RSS  (Test): %0.2f" % -neg_rss(pipe_regLasso, XTest, yTest))

## Boosting
---
With boosting methods, over-training becomes a very real possibility. In this case we need to make sure to use our CV to decide when to stop fine-tuning our boosting approach, namely once the error on the validation data starts to get worse.

We also have many possible different metrics to use in this case, not just the RSS. So, it is important to use our CV parameter grid to check many possible values.

In [None]:
from sklearn import ensemble

#--- Create Estimator Object ---#
regGB = ensemble.GradientBoostingRegressor()

#--- Make a Pre-processing + Fitting Pipeline ---#
pipe_regGB = make_pipeline(preprocessing.StandardScaler(), regGB)

#--- Define the Parameters to Search ---#
# We need to specify the estimator name since we are performing a CV on
# a Pipeline. (e.g. the formatting of `<estimator>__<param>`).
# Loss names: 'ls' and 'lad' were renamed to 'squared_error' and
# 'absolute_error' in scikit-learn 1.0 and the old spellings were removed in
# 1.2. (On scikit-learn < 1.0, use ['ls', 'lad'] instead.)
param_grid = [
    {'gradientboostingregressor__loss': ['squared_error', 'absolute_error'],
     'gradientboostingregressor__learning_rate': np.logspace(-3, 0, 10),
     'gradientboostingregressor__n_estimators': list(range(50, 200, 50))}
]

#--- Run the CV ---#
# 2 losses x 10 learning rates x 3 ensemble sizes = 60 candidates, each scored
# on every split of `cv` with the same `neg_rss` metric as the linear models.
cv_regGB = GridSearchCV(pipe_regGB, param_grid, scoring=neg_rss, cv=cv)
cv_regGB.fit(XTrain, yTrain)

In [None]:
# Only the last bare expression of a cell is displayed, so the winning
# parameter set is printed explicitly; otherwise it is silently discarded.
# `best_params_` is the entry of cv_results_['params'] at `best_index_`.
print(cv_regGB.best_params_)
cv_regGB.best_estimator_

In [None]:
#--- Get Training Predictions ---#
# With GridSearchCV's default `refit=True`, `best_estimator_` has already been
# refit on the full data passed to `fit` (XTrain, yTrain) using the best
# parameters, so it is used directly without fitting it again.
cvopt_regGB = cv_regGB.best_estimator_
yp = cvopt_regGB.predict(XTrain)
ypTest = cvopt_regGB.predict(XTest)

#--- Visualize ---#
# Plot prediction performance
plt.figure(figsize=(14,7))
plt.subplot(121)
plt.plot(yTrain + mean_yTrain, yp + mean_yTrain, '.', label='Training (CV-Opt)')
# 'xr' = red x markers. Giving the marker once (in the format string) avoids
# matplotlib's "marker is redundantly defined" warning.
plt.plot(yTest + mean_yTest, ypTest + mean_yTest, 'xr', label='Testing (CV-Opt)')
plt.plot([0, 40], [0, 40], '-k', linewidth=0.7, label="Perfect")
plt.axis([0, 40, 0, 40])
plt.xlabel('Ozone (True)', fontsize=16)
plt.ylabel('Ozone (Predicted)', fontsize=16)
plt.legend(loc=2, fontsize=16)
plt.title('Prediction Performance', fontsize=18)
# Plot the learned model. A boosted ensemble has no coefficient vector, so the
# per-feature importances of the boosting step are shown instead.
plt.subplot(122)
plt.stem(cvopt_regGB.named_steps['gradientboostingregressor'].feature_importances_)
plt.title('Learned Model $\\hat{\\beta}$', fontsize=18)
plt.ylabel('Feature Importances', fontsize=16)
plt.xticks(range(9), data.keys()[1:10], rotation='vertical')

plt.tight_layout()
#--- Print RSS ---#
print("Normalized RSS (Train): %0.2f" % -neg_rss(cvopt_regGB, XTrain, yTrain))
print("Normalized RSS  (Test): %0.2f" % -neg_rss(cvopt_regGB, XTest, yTest))