In [None]:
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
#import mglearn
from sklearn.model_selection import KFold
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
import seaborn as sns; sns.set()
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# 6.4 Cross validation - solution

Sources for Notebook:
- Andreas Mueller, Introduction to ML with Python
- Andreas Mueller, Scipy 2016 sklearn
- Sklearn Documentation

## 6.4.1 Concept

In [None]:
# Load the iris dataset, then pull out the feature matrix X and the label vector y
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target

In [None]:
# Display the label vector in dataset order, to inspect how the classes are arranged
y

Hmm, our data is not shuffled. Let's try to do a 3 fold cross validation and see what happens.

The ``sklearn.model_selection`` module has all functions related to cross validation. The easiest function is ``cross_val_score``, which takes an estimator and a dataset and will do all of the splitting for you.

In [None]:
# Score a default logistic regression with 3-fold cross-validation;
# cross_val_score handles the splitting, fitting and scoring of each fold.
classifier = LogisticRegression()
results = cross_val_score(estimator=classifier, X=X, y=y, cv=3)
results

By default, cross_val_score will use ``StratifiedKFold`` **for classification**, which ensures that the class proportions in the dataset are reflected in each fold. If you have a binary classification dataset with 90% of data points belonging to class 0, that would mean that in each fold, 90% of data points would belong to class 0.
If you just used plain KFold cross-validation instead, you could easily generate a split that only contains class 0.
It is generally a good idea to use ``StratifiedKFold`` whenever you do classification.

<img src="figures/Crossval.png" alt="Crossval" style="width: 95%;"/>

What happens if we don't shuffle our data for classification?

In [None]:
# Plain (non-stratified) 3-fold splitter that keeps the rows in their original order
kfold = KFold(n_splits=3, shuffle=False)
# Re-score the same classifier, this time with the explicit splitter instead of cv=3
print("Cross-validation scores:\n{}".format(
    cross_val_score(estimator=classifier, X=X, y=y, cv=kfold)))

## 6.4.2 Grid search with cross validation

Using CV for finding the best parameters to set for a model:
- Define a range of parameters with values you would like to optimize.
- Split your dataset into train / test.
- Perform CV on the train set with all possible combinations of parameters.
- Save the parameters of the model with the highest cross-validation score.
- Make a final model on the entire training data with those parameters.
- Do a final evaluation with the test data.



<img src="figures/Gridsearch.png" alt="Gridsearch" style="width: 55%;"/>

In [None]:
# Utility function to report best scores
def report(results, n_top=3):
    """Print a summary of the best-ranked candidates of a parameter search.

    Parameters
    ----------
    results : mapping
        Search results (e.g. ``GridSearchCV.cv_results_``). The keys
        'rank_test_score', 'mean_test_score', 'std_test_score' and 'params'
        are read; 'rank_test_score' must support element-wise ``== int``.
    n_top : int, default 3
        Ranks 1..n_top are reported. Every candidate holding one of these
        ranks is printed, so ties produce more than one entry per rank.

    Returns
    -------
    None. The summary is written to stdout.
    """
    for rank in range(1, n_top + 1):
        # Positions of all candidates that share this rank
        for idx in np.flatnonzero(results['rank_test_score'] == rank):
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  mean_score, std_score))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")

In [None]:
# Load the digits dataset as feature matrix X and label vector y
digits = load_digits()
X, y = digits.data, digits.target

# Hold out 20% as a stratified test set; the grid search only ever sees the training part
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

# Seed the forest so the search results (and the ranking printed below)
# are reproducible after Restart Kernel -> Run All
clf = RandomForestClassifier(n_estimators=20, random_state=1)

# use a full grid over all parameters
param_grid = {"max_depth": [3, None],
              "max_features": [1, 2, 3, 4],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run grid search with k-fold cross-validation on the training data
# refit=True (the default) retrains the best parameter combination on all of
# the data passed to fit(), i.e. the full training set
k = 5
grid_search = GridSearchCV(clf, param_grid=param_grid, refit=True, cv=k)
grid_search.fit(X_train, y_train)

# Print the top-ranked parameter combinations
report(grid_search.cv_results_)

In [None]:
# Select the best model found by the cross-validated grid search
final_model = grid_search.best_estimator_

# No manual retraining is needed: because refit=True was passed to GridSearchCV,
# best_estimator_ has already been refitted on the data given to grid_search.fit()

# Final evaluation: score of the selected model on the held-out test set
final_model.score(X_test, y_test)

## 6.4.3 Task 4

We will use the same data as in the SVM exercise.
- Look at the data again
- Make a logistic regression model
- Evaluate the model using k fold cross validation. Try some k's.
- Make an SVM model.
    - Think what parameters need to be decided upon
    - Pick the best model using grid search
- Which model is, according to you, the best?

## 6.4.4 Solution

The following is identical to the SVM exercise.

In [None]:
# Load the SVM exercise data; the first CSV column is used as the row index
svm_df = pd.read_csv('data/SVM.csv', index_col=0)

In [None]:
# Scatter plot of the two features, coloured by class label (fit_reg=False: no regression line).
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.lmplot(x='x1', y='x2', data=svm_df, hue='y', fit_reg=False)

In [None]:
# NOTE(review): this re-reads the same file with the same arguments as the earlier
# read_csv cell, so it looks redundant; kept so the cell order is unchanged
svm_df = pd.read_csv('data/SVM.csv', index_col=0)

In [None]:
# Peek at three randomly chosen rows (unseeded, so the rows differ between runs)
svm_df.sample(3)

In [None]:
# Convert the frame to a plain NumPy array.
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
svm_mx = svm_df.to_numpy()

In [None]:
# Features: the first two columns of the matrix
# (presumably x1 and x2 — verify the column order in the CSV)
X_svm = svm_mx[:,:2]

In [None]:
# Sanity check: (number of samples, number of feature columns)
X_svm.shape

In [None]:
# Convert the dependent variable into binary class labels:
# take the third column, cast it to int and shift it down by one
# (presumably mapping labels 1/2 to 0/1 — TODO confirm against the CSV)
Y_svm = svm_mx[:,2].astype(int) - 1

In [None]:
# Show the first ten converted labels
Y_svm[:10]

In [None]:
# Sanity check: one label per row
Y_svm.shape

Now comes the new part

In [None]:
# 3-fold cross-validation of a default logistic regression on the SVM exercise data
classifier = LogisticRegression()
results = cross_val_score(estimator=classifier, X=X_svm, y=Y_svm, cv=3)
results

In [None]:
# Repeat with k=15 and the lbfgs solver. The test folds get quite small,
# so it's easier for an individual fold to produce an extreme score.
classifier = LogisticRegression(solver='lbfgs')
results = cross_val_score(estimator=classifier, X=X_svm, y=Y_svm, cv=15)
results

For the grid search, we do need our own train and test set. The grid search will perform cross-validation on the train set (splitting that one again); afterwards we calculate an overall score on the held-out test set ourselves.

In [None]:
# Stratified 75/25 train/test split with a fixed seed
X_train, X_test, Y_train, Y_test = train_test_split(
    X_svm, Y_svm, test_size=0.25, random_state=42, stratify=Y_svm
)

# Fit the scaler on the training features only; it is applied to train and test later
ss = StandardScaler()
ss.fit(X_train)

### WARNING. 
The following can be very computationally heavy, don't hesitate to delete some options.

In [None]:
# Candidate values shared by every kernel
C_values = [10**x for x in range(-1, 6)]
gamma_values = [10**x for x in range(-2, 6)]
shrinking_options = [True, False]

# One sub-grid per kernel, listing only the hyper-parameters that kernel uses.
# SVC ignores `degree` for every kernel except 'poly' and ignores `gamma` for
# 'linear', so crossing them with all kernels (as one flat dict does) only
# repeats identical fits: 672 candidates instead of the 126 distinct ones below.
# GridSearchCV accepts a list of dicts and searches each sub-grid in turn.
p_grid = [
    {'kernel': ['linear'], 'C': C_values, 'shrinking': shrinking_options},
    {'kernel': ['rbf'], 'C': C_values, 'gamma': gamma_values,
     'shrinking': shrinking_options},
    # Optional and slower - uncomment to also search these kernels:
    # {'kernel': ['poly'], 'C': C_values, 'gamma': gamma_values,
    #  'degree': [3, 4, 5], 'shrinking': shrinking_options},
    # {'kernel': ['sigmoid'], 'C': C_values, 'gamma': gamma_values,
    #  'shrinking': shrinking_options},
]

In [None]:
# Exhaustive search over p_grid with 5-fold CV, using all cores and progress logging.
# refit=True retrains the best configuration on all data passed to fit().
svc = SVC()
grid_search = GridSearchCV(estimator=svc, param_grid=p_grid, cv=5,
                           refit=True, n_jobs=-1, verbose=True)

In [None]:
# Run the search on the standardised training features.
# NOTE(review): the scaler was fitted on all of X_train, so each CV validation fold
# has contributed to the scaling statistics; wrapping scaler + SVC in a Pipeline
# would keep the scaling inside each fold.
grid_search.fit(ss.transform(X_train), Y_train)

In [None]:
# Show the refitted estimator with the best cross-validation score
grid_search.best_estimator_

In [None]:
# Keep the best estimator from the search as the final model
final_model = grid_search.best_estimator_

In [None]:
# Final evaluation on the held-out test set, transformed with the scaler fitted on the training data
final_model.score(ss.transform(X_test), Y_test)