# Cross Validation and Hyperparameter Optimization

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge, SGDRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.datasets import load_boston, load_iris

## Dataset

In [None]:
# Load the Boston housing dataset shipped with scikit-learn.
# NOTE(review): load_boston is reported as deprecated/removed in newer
# scikit-learn releases — confirm the installed version still provides it.
boston = load_boston()
# Display the keys exposed by the returned dataset object.
boston.keys()

In [None]:
# Put the feature matrix into a DataFrame with named columns, then append
# the regression target as an extra 'MEDV' column.
feature_names = boston['feature_names']
boston_df = pd.DataFrame(data=boston['data'], columns=feature_names)
boston_df['MEDV'] = boston['target']

In [None]:
# Preview the first rows of the assembled DataFrame (features plus MEDV).
boston_df.head()

In [None]:
# Draw the pairwise column correlations as an annotated square heatmap.
fig, ax = plt.subplots(figsize=(10, 10))
cm = boston_df.corr()
sns.heatmap(cm, ax=ax, annot=True, square=True, cmap='RdBu');

In [None]:
# Scatter-matrix of a hand-picked subset of columns, including the target.
pairplot_columns = ['LSTAT', 'PTRATIO', 'RM', 'TAX', 'MEDV']
sns.pairplot(boston_df[pairplot_columns]);

### Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the predictors.
y = boston_df['MEDV']
X = boston_df.drop('MEDV', axis=1)

# Hold out 30% of the rows as a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Fit a linear regression on the training portion only.
model = LinearRegression().fit(X_train, y_train)

# Predictions for both portions, so the two can be compared.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Error and goodness-of-fit on the data the model has seen ...
train_mse = mean_squared_error(y_train, y_train_pred)
train_score = r2_score(y_train, y_train_pred)

# ... and on the held-out data.
test_mse = mean_squared_error(y_test, y_test_pred)
test_score = r2_score(y_test, y_test_pred)

print('Train Set')
print('Train MSE : ', train_mse)
print('Train R2 Score: ', train_score)
print('Test Set')
print('Test MSE : ', test_mse)
print('Test R2 Score: ', test_score)

# Bar chart of the fitted coefficients, one bar per feature column.
coef_positions = range(len(model.coef_))
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(coef_positions, height=model.coef_)
ax.set_xticks(coef_positions)
ax.set_xticklabels(X_train.columns);

## K-Folds Cross-Validation

<img src='images/kfold.svg'>

In [None]:
from sklearn.model_selection import KFold

# Five folds, with the rows shuffled before they are partitioned.
kfold = KFold(shuffle=True, n_splits=5)

# Extract the underlying arrays once; the fold indices are applied to them.
X_arr, y_arr = X.values, y.values

for train, test in kfold.split(X, y):
    print('-------Fold-------')
    print()

    # A fresh model per fold, fitted on that fold's training rows only.
    model = LinearRegression().fit(X_arr[train], y_arr[train])

    y_train_pred = model.predict(X_arr[train])
    y_test_pred = model.predict(X_arr[test])

    # Metrics on the rows used for fitting.
    train_mse = mean_squared_error(y_arr[train], y_train_pred)
    train_score = r2_score(y_arr[train], y_train_pred)

    # Metrics on the rows held out in this fold.
    test_mse = mean_squared_error(y_arr[test], y_test_pred)
    test_score = r2_score(y_arr[test], y_test_pred)

    print('- Train Set')
    print('Train MSE : ', train_mse)
    print('Train R2 Score: ', train_score)
    print('\n- Test Set')
    print('Test MSE : ', test_mse)
    print('Test R2 Score: ', test_score)

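When `cv` is an integer, `cross_val_score` uses Stratified K-Fold only if the estimator is a classifier and the target is binary or multiclass. In every other case, including the regression example below, it uses plain K-Fold without shuffling.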

In [None]:
from sklearn.model_selection import cross_val_score

model = LinearRegression()

# Mean squared error per fold. The scorer reports the negated MSE, so the
# sign is flipped back before printing.
scores = cross_val_score(model, X.values, y.values,
                         scoring='neg_mean_squared_error', cv=5)
mse = -scores

# R2 per fold, using the same integer cv setting.
r2_scores = cross_val_score(model, X.values, y.values, scoring='r2', cv=5)

print('MSE: ', mse)
print('Mean: ', mse.mean())
print('Std: ', mse.std())
print('-'*70)
print('Scores: ', r2_scores)
print('Mean: ', r2_scores.mean())
print('Std: ', r2_scores.std())

In [None]:
from sklearn.model_selection import cross_val_score, cross_validate

model = LinearRegression()

# Evaluate both metrics in ONE cross-validation run. The kfold splitter used
# here is created with shuffle=True and no random_state, so two separate
# cross_val_score calls would each draw a different random partition and the
# MSE and R2 figures would not describe the same folds.
cv_results = cross_validate(model, X.values, y.values, cv=kfold,
                            scoring=('neg_mean_squared_error', 'r2'))

# With several scorers, results are keyed as 'test_<scorer name>'.
scores = cv_results['test_neg_mean_squared_error']
r2_scores = cv_results['test_r2']

# The scorer reports negated MSE; flip the sign back for display.
mse = -scores
print('MSE: ', mse)
print('Mean: ', mse.mean())
print('Std: ', mse.std())
print('-'*70)
print('Scores: ', r2_scores)
print('Mean: ', r2_scores.mean())
print('Std: ', r2_scores.std())

## Stratified K-Fold Cross-Validation

In [None]:
# Load the iris dataset and assemble features plus the integer class label
# into one DataFrame.
iris = load_iris()
iris_columns = iris['feature_names']
iris_df = pd.DataFrame(data=iris['data'], columns=iris_columns)
iris_df['class'] = iris['target']
# Preview the first rows.
iris_df.head()

In [None]:
# Show the table dimensions, then the share of rows belonging to each class.
display(iris_df.shape)
iris_df['class'].value_counts() / len(iris_df)

In [None]:
from sklearn.model_selection import StratifiedKFold

# Ten folds whose class proportions follow those of the full label column.
skf = StratifiedKFold(n_splits=10)

X = iris_df.drop('class', axis=1)
y = iris_df['class']

for train, test in skf.split(X, y):
    print('-'*30 + 'Fold' + '-'*30)

    # split() yields positional row indices, so select with .iloc. Plain
    # y[train] is a label lookup and only matches positions while the
    # Series keeps its default 0..n-1 index.
    train_labels = y.iloc[train]
    test_labels = y.iloc[test]

    # Size and per-class share of the training part of this fold.
    print('Train Size:', train_labels.shape[0])
    display(train_labels.value_counts() / train_labels.shape[0])

    # Size and per-class share of the held-out part of this fold.
    print('Test Size:', test_labels.shape[0])
    display(test_labels.value_counts() / test_labels.shape[0])

In [None]:
# Multinomial logistic regression scored by accuracy on the stratified folds.
# NOTE(review): recent scikit-learn releases are reported to deprecate the
# multi_class argument — confirm against the installed version.
clf = LogisticRegression(solver='newton-cg', multi_class='multinomial')

# Both names refer to the same array of per-fold accuracies.
acc = scores = cross_val_score(clf, X, y, scoring='accuracy', cv=skf)
print('Accuracy: ', acc)
print('Mean: ', acc.mean())
print('Std: ', acc.std())

## Leave One Out

Note: LeaveOneOut() is equivalent to KFold(n_splits=n) and LeavePOut(p=1) where n is the number of samples.

In [None]:
from sklearn.model_selection import LeaveOneOut

# Switch back to the Boston data: predictors and the MEDV target.
y = boston_df['MEDV']
X = boston_df.drop('MEDV', axis=1)

loo = LeaveOneOut()
model = LinearRegression()

# One score per split produced by the leave-one-out splitter. The scorer
# reports negated MSE, so the sign is flipped back.
scores = cross_val_score(model, X.values, y.values,
                         scoring='neg_mean_squared_error', cv=loo)
mse = -scores
print('Mean: ', mse.mean())
print('Std: ', mse.std())

## Shuffle Split

In [None]:
from sklearn.model_selection import ShuffleSplit, cross_validate

# Five independent random train/test partitions.
ss = ShuffleSplit(n_splits=5)

# Score both metrics in ONE cross-validation run. The splitter has no
# random_state, so each separate cross_val_score call would draw its own
# random partitions and the MSE and R2 figures would not be comparable.
cv_results = cross_validate(model, X.values, y.values, cv=ss,
                            scoring=('neg_mean_squared_error', 'r2'))

# With several scorers, results are keyed as 'test_<scorer name>'.
scores = cv_results['test_neg_mean_squared_error']
r2_scores = cv_results['test_r2']

# The scorer reports negated MSE; flip the sign back for display.
mse = -scores
print('MSE: ', mse)
print('Mean: ', mse.mean())
print('Std: ', mse.std())
print('-'*70)
print('Scores: ', r2_scores)
print('Mean: ', r2_scores.mean())
print('Std: ', r2_scores.std())

## Grid Search CV

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Hyperparameters to search. The 'sgd__' prefix routes each one to the
# SGDRegressor step of the pipeline defined below.
param_grid = [{'sgd__eta0': [0.001, 0.01, 0.1, 0.5],
               'sgd__fit_intercept': [True, False],
               'sgd__max_iter': [100, 1000, 10000],
               }]
sgd_reg = SGDRegressor()

# Gradient descent is sensitive to feature scale, and the raw columns here
# are on very different scales, so standardise them first. Keeping the
# scaler inside the pipeline means it is fitted on each fold's training
# rows only.
sgd_pipeline = Pipeline([('scaler', StandardScaler()), ('sgd', sgd_reg)])

grid_search = GridSearchCV(sgd_pipeline, param_grid, cv=KFold(n_splits=5),
                           scoring='neg_mean_squared_error')

# Run the search; as the last expression, the fitted object is displayed.
grid_search.fit(X, y)

In [None]:
# Display the best parameter combination found by the grid search.
grid_search.best_params_

In [None]:
# Display the estimator associated with the best parameter combination.
grid_search.best_estimator_