In [1]:
# dataset loader
from sklearn import datasets

# model training and evalutation utilities 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold # this is one way to generate folds
from sklearn.model_selection import KFold

# models
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn import linear_model

# toy data
X, y = datasets.load_iris(return_X_y=True)
X.shape, y.shape

((150, 4), (150,))

## What you should learn/be aware of based on this lecture
Key `sklearn` functions:

* `train_test_split`
* `cross_validate`
* Fold generators: `KFold` and `StratifiedKFold`
* Scoring functions per last lecture and how to pass to `cross_validate`
* How to compare different models by looping over them with `cross_validate`, `GridSearchCV`, or `RandomizedSearchCV`

Not covered today but you should check out:

* `confusion_matrix` and `classification_report` (helpful to evaluate models)

## A simple "split, train, evaluate" example

In [2]:
# split the data with 50% in each set
X1, X2, y1, y2 = train_test_split(X, y, random_state=0,
                                  train_size=0.5)

# fit the model on one set of data
# ignore the model I choose here, its not important what
model = KNeighborsClassifier(n_neighbors=1)
model.fit(X1, y1) # fit on the "training data" X1 and  y1

# evaluate the model on the second set of data
y2_model = model.predict(X2) # using X2 (out-of-sample data), predict y2
accuracy_score(y2, y2_model) # see how close y2 is to prediction (fraction of all pred that are exactly right)

0.9066666666666666

## What to do k-fold? It's like repeating the above. In pseudo code, it looks like:

1. Break the X and y data into $k$ subsamples
2. For each subsample, fit the model, predict OOS, score predictions, and save those

Ok?

## K-Fold in Python: The explicit way, and the wrapped way

Watch me do the explicit way

In [3]:
# you can take quick notes here, but I'm not going to write this code slow enough to copy
# the point here is to illustrate

accuracy = []

# lop over folds
for train_index, test_index in StratifiedKFold(n_splits = 5).split(X, y):
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    accuracy.append(accuracy_score(y_test, y_predict))
    
accuracy
import numpy as np
np.mean(accuracy)

0.96

Now try the wrapper below! We are going to see how to use that function to:

* try multiple models
* try different sets of X variables
* try different ways to specific folds

In [4]:
# try the function here

cross_validate(model, X, y)

# exactly as what we did manually



{'fit_time': array([0.00059628, 0.00059867, 0.00039506]),
 'score_time': array([0.00462389, 0.00569701, 0.00164199]),
 'test_score': array([0.98039216, 0.92156863, 1.        ])}

In [5]:
# try here with diff score

cross_validate(model, X, y, scoring = ['accuracy', 'r2', 'precision_macro'])



{'fit_time': array([0.0005579 , 0.00059104, 0.00046206]),
 'score_time': array([0.01174307, 0.00855613, 0.00960994]),
 'test_accuracy': array([0.98039216, 0.92156863, 1.        ]),
 'test_precision_macro': array([0.98148148, 0.9251462 , 1.        ]),
 'test_r2': array([0.97058824, 0.88235294, 1.        ])}

All the metrics it can compute out of the box are here: https://scikit-learn.org/stable/modules/model_evaluation.html

Notice that many of these were discussed in our last lecture!

__*Warning/Note*__: the metric names on that link and what you put in the scoring dictionary don't seem to match up.

## question:

Using 5 folds, what is the average (across the folders) out-of-sample (training) F1 (metric)?

In [6]:
# answer here

cross_validate(model, X, y, scoring = 'f1_macro')['test_score'].mean()



0.9672238255571589

## Exploring the `cross_validate` parameters

### The model parameter

In [7]:
# change the model

# by changing the model paramter, you can adjust the type of model and the model's parameters
cross_validate(SVC(gamma='auto'), X, y, scoring = 'f1_macro')
cross_validate(SVC(C = 5), X, y, scoring = 'f1_macro')



{'fit_time': array([0.00275087, 0.00096822, 0.00188112]),
 'score_time': array([0.00068402, 0.00105715, 0.000736  ]),
 'test_score': array([0.98037518, 0.96064815, 1.        ])}

In [8]:
# change the model

# by changing the model paramter, you can adjust the type of model and the model's parameters
cross_validate(LinearRegression(), X, y, scoring = 'r2')['test_score'].mean()



0.0

### question:

In [9]:
# answer here

`linear_model` submodule contains lots of useful alternate options

In [10]:
# for example:
linear_model.Lasso
linear_model.Ridge
linear_model.LogisticRegression

linear_model.LassoCV() # Returns a Lasso (L1 Regularization) linear model with picking the best model by cross validation
linear_model.RidgeCV() # Returns a Ridge (L2 Regularization) linear model with picking the best model by cross validation
linear_model.LogisticRegressionCV() # return best logit model by CV

LogisticRegressionCV(Cs=10, class_weight=None, cv='warn', dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=100, multi_class='warn', n_jobs=None,
                     penalty='l2', random_state=None, refit=True, scoring=None,
                     solver='lbfgs', tol=0.0001, verbose=0)

Looping over models

In [11]:
# set up models to try
models = []
models.append(('svc_1', SVC(gamma='auto') ))
models.append(('svc_2', SVC(C=5) ))
models.append(('neighbor',  KNeighborsClassifier(n_neighbors=1)))

# loop and print
for name, model in models:
    scores = cross_validate(model, X, y, scoring='accuracy')
    print('%s: %.3f (%.3f)' % (name.ljust(10), 
                                   scores['test_score'].mean(), 
                                   scores['test_score'].std()
                                   )
         )

svc_1     : 0.980 (0.016)
svc_2     : 0.987 (0.016)
neighbor  : 0.960 (0.025)


These are built in methods that help you pick the actual, specific parameters in the model to help it perform the best:
* grid search CV
* randomizedsearchCV

### The X parameter

You can loop over Xs

In [31]:
# define a smaller X and a bigger X
X_small = X[:,:2] # just first two columns

from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3, include_bias=False)
X3 = poly.fit_transform(X) # has not 4 vars, but 34


In [37]:
# set up Xs to try
Xs = []
Xs.append(('X', X))
Xs.append(('X_small', X_small))
Xs.append(('X3', X3)) 

# loop and print
model = KNeighborsClassifier(n_neighbors = 1)

for X_name, X in Xs:
    scores = cross_validate(model, X, y, scoring='accuracy')
    print('%s: %.3f (%.3f)' % (name.ljust(10), 
                                   scores['test_score'].mean(), 
                                   scores['test_score'].std()
                                   )
         )

neighbor  : 0.960 (0.025)
neighbor  : 0.727 (0.061)
neighbor  : 0.947 (0.016)


### Xs and Models

In [32]:
poly.fit_transform(X[:,:1])

array([[  5.1  ,  26.01 , 132.651],
       [  4.9  ,  24.01 , 117.649],
       [  4.7  ,  22.09 , 103.823],
       [  4.6  ,  21.16 ,  97.336],
       [  5.   ,  25.   , 125.   ],
       [  5.4  ,  29.16 , 157.464],
       [  4.6  ,  21.16 ,  97.336],
       [  5.   ,  25.   , 125.   ],
       [  4.4  ,  19.36 ,  85.184],
       [  4.9  ,  24.01 , 117.649],
       [  5.4  ,  29.16 , 157.464],
       [  4.8  ,  23.04 , 110.592],
       [  4.8  ,  23.04 , 110.592],
       [  4.3  ,  18.49 ,  79.507],
       [  5.8  ,  33.64 , 195.112],
       [  5.7  ,  32.49 , 185.193],
       [  5.4  ,  29.16 , 157.464],
       [  5.1  ,  26.01 , 132.651],
       [  5.7  ,  32.49 , 185.193],
       [  5.1  ,  26.01 , 132.651],
       [  5.4  ,  29.16 , 157.464],
       [  5.1  ,  26.01 , 132.651],
       [  4.6  ,  21.16 ,  97.336],
       [  5.1  ,  26.01 , 132.651],
       [  4.8  ,  23.04 , 110.592],
       [  5.   ,  25.   , 125.   ],
       [  5.   ,  25.   , 125.   ],
       [  5.2  ,  27.04 , 14

In [34]:
X3.shape

(150, 34)

In [39]:
for X_name, X in Xs:
    for name, model in models: 
        scores = cross_validate(model, X, y, scoring='accuracy', cv = 5)
        print('%s + %s: %.3f (%.3f)' % (name.ljust(10), 
                                   X_name.ljust(10),
                                   scores['test_score'].mean(), 
                                   scores['test_score'].std()
                                  )
             )

svc_1      + X         : 0.980 (0.016)
svc_2      + X         : 0.987 (0.016)
neighbor   + X         : 0.960 (0.025)
svc_1      + X_small   : 0.820 (0.058)
svc_2      + X_small   : 0.813 (0.054)
neighbor   + X_small   : 0.727 (0.061)
svc_1      + X3        : 0.527 (0.077)
svc_2      + X3        : 0.973 (0.025)
neighbor   + X3        : 0.947 (0.016)


### CV parameter and folds

Just watch.

In [40]:
cross_validate(model, X, y, scoring='accuracy', cv = 5)

{'fit_time': array([0.00227427, 0.00161672, 0.00191736, 0.00095201, 0.00196099]),
 'score_time': array([0.00512743, 0.00399065, 0.00402427, 0.00395656, 0.00342226]),
 'test_score': array([0.93333333, 0.96666667, 0.93333333, 0.93333333, 0.96666667])}

In [41]:
# silly data for illustration

 y = ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c']

In [42]:
kf = KFold(n_splits = 3)
kf = KFold(n_splits = 3, shuffle = True, random_state = 1)

for train, test in kf.splits[y]:
    print('train: %s test: %s' % [str(train).ljust(32), test])
    print("%s %s" % [str(y[j] for j in train).ljust(32), [y[j]] for j in test])

SyntaxError: invalid syntax (<ipython-input-42-607e3fc1ee2e>, line 6)

## Links, resources, and next week

Only two resources needed

* sklearn docs are GREAT https://scikit-learn.org/stable/user_guide.html
* Python Data Science Handbook (note some module calls are obsolete, so you might need to update code) https://jakevdp.github.io/PythonDataScienceHandbook/index.html

Next week:

* preprocessing
* data transformations
* feasture selection