In [1]:
# dataset loader
from sklearn import datasets

# model training and evalutation utilities 
from sklearn.model_selection import train_test_split # very important!
from sklearn.model_selection import cross_validate   # v nice!
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold # this is one way to generate folds
from sklearn.model_selection import KFold

# models
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn import linear_model

# toy data
X, y = datasets.load_iris(return_X_y=True)
X.shape, y.shape

((150, 4), (150,))

## What you should learn/be aware of based on this lecture 

Key `sklearn` functions:
- `train_test_split`
- `cross_validate`
- Fold generators: `KFold` and `StratifiedKFold`
- Scoring functions per last lecture and how to pass to `cross_validate`
- How to compare different models by looping over them with `cross_validate`, `GridSearchCV`, or `RandomizedSearchCV` 

Not covered today but you should check out:
- `confusion_matrix` and `classification_report` (helpful to evaluate models)


## A simple "split, train, evaluate" example

In [2]:
# split the data with 50% in each set
X1, X2, y1, y2 = train_test_split(X, y, random_state=0,
                                  train_size=0.5)

# fit the model on one set of data
# ignore the model I choose here, its not important what
model = KNeighborsClassifier(n_neighbors=1)
model.fit(X1, y1) # fit on the "training data" X1 and  y1

# evaluate the model on the second set of data
y2_model = model.predict(X2) # using X2 (out-of-sample data), predict y2
accuracy_score(y2, y2_model) # see how close y2 is to prediction (fraction of all pred that are exactly right)

0.9066666666666666

## Want to do k-fold? It's like repeating the above. In pseudo code, it looks like:
1. Break the X and y data into $k$ subsamples
2. For each subsample, fit the model, predict OOS, score predictions, and save those

Ok?


## K-Fold in Python: The explicit way, and the wrapped way

Watch me do the explicit way

In [11]:
# you can take quick notes here, but I'm not going to write this code slow enough to copy
# the point here is to illustrate

accuracy = [] # to store accuracies

# loop over folds 
for train_index, test_index in StratifiedKFold(n_splits=5).split(X,y):

    # .split() yields the indices in train/test sets. use those to get 
    # the x/y vars for each separated out:
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # fit/estimate, predict OOS, evaluate and store
    model.fit(X_train,y_train)
    y_predict = model.predict(X_test)
    accuracy.append(   accuracy_score(y_test, y_predict)      )
    
accuracy # print   
# import numpy as np
# np.mean(accuracy)

[0.9666666666666667,
 0.9666666666666667,
 0.9333333333333333,
 0.9333333333333333,
 1.0]

Now try the wrapper below! We are going to see how to use that function to:
- try multiple models
- try different sets of X variables
- try different ways to specific folds

In [10]:
# try the function here
cross_validate(model, X, y)

{'fit_time': array([0.        , 0.00099707, 0.00098538, 0.        , 0.00099969]),
 'score_time': array([0.0029757 , 0.00051785, 0.00099826, 0.00199819, 0.00199223]),
 'test_score': array([0.96666667, 0.96666667, 0.93333333, 0.93333333, 1.        ])}

In [14]:
# try here with diff scores

cross_validate(model, X, y, scoring=['accuracy','r2','precision_macro'])  

{'fit_time': array([0.        , 0.        , 0.0009973 , 0.        , 0.00099659]),
 'score_time': array([0.00299144, 0.00299263, 0.00199223, 0.00299287, 0.00199533]),
 'test_accuracy': array([0.96666667, 0.96666667, 0.93333333, 0.93333333, 1.        ]),
 'test_r2': array([0.95, 0.95, 0.9 , 0.9 , 1.  ]),
 'test_precision_macro': array([0.96969697, 0.96969697, 0.94444444, 0.93333333, 1.        ])}

All the metrics it can compute out of the box are here: https://scikit-learn.org/stable/modules/model_evaluation.html

Notice that many of these were discussed in our last lecture!

_**Warning/Note:**_ the metric names on that link and what you put in the `scoring` dictionary don't seem to match up.  

## question:

Using 5 folds, what is the average (across the folds) out-of-sample (training) F1?


In [18]:
cross_validate(model,X,y,scoring='f1_macro') ['test_score'].mean()

0.9598319029897976

## Exploring the `cross_validate` parameters

### The model parameter 

In [27]:
# change the model

# yb changing the model parameter, you can adj the tyupe of model and the models parameters
cross_validate( SVC(gamma='auto'),X,y,scoring='f1_macro')
cross_validate( SVC(C=5),X,y,scoring='f1_macro')

{'fit_time': array([0.0009973 , 0.00099897, 0.0009954 , 0.00099754, 0.        ]),
 'score_time': array([0.0009985 , 0.00099659, 0.0009985 , 0.        , 0.00099707]),
 'test_score': array([0.96658312, 1.        , 1.        , 0.96658312, 1.        ])}

### question:

try to use a regression model, (you can't use f1 on this, so evaluate on r2)

In [25]:
# answer here
cross_validate( LinearRegression() ,X,y,scoring='r2') ['test_score'].mean()

0.3225607248900085

`linear_model` submodule contains lots of useful alternate options

In [None]:
linear_model.

# for example:
linear_model.Lasso
linear_model.Ridge
linear_model.LogisticRegression

linear_model.LassoCV() # Returns a Lasso (L1 Regularization) linear model with picking the best model by cross validation
linear_model.RidgeCV() # Returns a Ridge (L2 Regularization) linear model with picking the best model by cross validation
linear_model.LogisticRegressionCV() # return best logit model by CV


Looping over models

In [31]:
# set up models to try
models = []
models.append(('svc_1', SVC(gamma='auto') ))
models.append(('svc_2', SVC(C=5) ))
models.append(('neighbor1',  KNeighborsClassifier(n_neighbors=1)))
models[0][1]

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [33]:
# loop and print
for name, model in models:
    scores = cross_validate(model, X, y, scoring='accuracy', cv=5)
    print('%s: %.3f (%.3f)' % (name.ljust(10), 
                                   scores['test_score'].mean(), 
                                   scores['test_score'].std()
                                   )
         )

svc_1     : 0.980 (0.016)
svc_2     : 0.987 (0.016)
neighbor1 : 0.960 (0.025)


In [None]:
# biult in methods 

gridsearchCV

randomizedsearchCV

### The X parameter

You can loop over Xs

In [36]:
# define a smaller X and a bigger X
X_small = X[:,:2] # just first two columns

from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3, include_bias=False)
X3 = poly.fit_transform(X) # has not 4 vars, but 36

In [46]:
# set up Xs to try
Xs = []
Xs.append( ('X' ,X ))
Xs.append( ('X_small' ,X_small   ))
Xs.append( ('X3'   ,X3     ))

# loop and print
model = KNeighborsClassifier(n_neighbors=1)

for X_name, X in Xs:
    scores = cross_validate(model, X, y, scoring='accuracy', cv=5)
    print('%s: %.3f (%.3f)' % (X_name.ljust(10), 
                                   scores['test_score'].mean(), 
                                   scores['test_score'].std()
                                   )
         )

X         : 0.960 (0.025)
X_small   : 0.727 (0.061)
X3        : 0.947 (0.016)


### Xs and Models

In [47]:
# i willl post this!

for X_name, X in Xs:
    for name, model in models: 
        scores = cross_validate(model, X, y, scoring='accuracy', cv=5)
        print('%s + %s: %.3f (%.3f)' % (name.ljust(10),
                                        X_name.ljust(10), 
                                       scores['test_score'].mean(), 
                                       scores['test_score'].std()
                                       )
             )

svc_1      + X         : 0.980 (0.016)
svc_2      + X         : 0.987 (0.016)
neighbor1  + X         : 0.960 (0.025)
svc_1      + X_small   : 0.820 (0.058)
svc_2      + X_small   : 0.813 (0.054)
neighbor1  + X_small   : 0.727 (0.061)
svc_1      + X3        : 0.527 (0.077)
svc_2      + X3        : 0.973 (0.025)
neighbor1  + X3        : 0.947 (0.016)


### CV parameter and folds

Just  watch.

In [50]:
cross_validate(model, X, y, scoring='accuracy', cv=5)

{'fit_time': array([0.00101757, 0.00099778, 0.00099778, 0.0009954 , 0.        ]),
 'score_time': array([0.00297141, 0.00199604, 0.0019958 , 0.00099826, 0.00098968]),
 'test_score': array([0.93333333, 0.96666667, 0.93333333, 0.93333333, 0.96666667])}

In [51]:
# silly data for illustration
y= ["a",'a','a','b','b','b','c','c','c']

# a/b/c are equal % of data

In [53]:
kf = KFold(n_splits=3)    
kf = KFold(n_splits=3,shuffle=True,random_state=1)   # must give state!  

for train, test in kf.split(y):       # for each fold,
    print("train: %s test: %s" % (str(train).ljust(32), test))   # but here, just show 
    print("       %s       %s" % (str([y[j] for j in train]).ljust(32), [y[j] for j in test]))
    print() #blank line
    
# kfold --> splits it BY INDEX
    

train: [0 1 3 4 5 7]                    test: [2 6 8]
       ['a', 'a', 'b', 'b', 'b', 'c']         ['a', 'c', 'c']

train: [2 3 4 5 6 8]                    test: [0 1 7]
       ['a', 'b', 'b', 'b', 'c', 'c']         ['a', 'a', 'c']

train: [0 1 2 6 7 8]                    test: [3 4 5]
       ['a', 'a', 'a', 'c', 'c', 'c']         ['b', 'b', 'b']



In [54]:
skf = StratifiedKFold(n_splits=3)
# skf = StratifiedKFold(n_splits=3,shuffle=True,random_state=1) # now random
X = y # skf needs an X and y variable
for train, test in skf.split(X,y):       # for each fold,
    print("train: %s test: %s" % (str(train).ljust(32), test))   # but here, just show 
    print("       %s       %s" % (str([y[j] for j in train]).ljust(32), [y[j] for j in test]))
    print() #blank line

# skf keeps fractions of a/b/c as equal as possible in test/train to overall data

train: [1 2 4 5 7 8]                    test: [0 3 6]
       ['a', 'a', 'b', 'b', 'c', 'c']         ['a', 'b', 'c']

train: [0 2 3 5 6 8]                    test: [1 4 7]
       ['a', 'a', 'b', 'b', 'c', 'c']         ['a', 'b', 'c']

train: [0 1 3 4 6 7]                    test: [2 5 8]
       ['a', 'a', 'b', 'b', 'c', 'c']         ['a', 'b', 'c']



In [55]:
# reload the X and y variables (we jsut overwrote)
X, y = datasets.load_iris(return_X_y=True)
model = KNeighborsClassifier(n_neighbors=1)
cross_validate(model, X, y, cv=StratifiedKFold(n_splits=3)  , scoring='accuracy')

# set up folds to try
folds = []
folds.append(('kf', KFold(n_splits=3)   ))
folds.append(('kf_rand', KFold(n_splits=3,shuffle=True,random_state=1)  ))
folds.append(('skf_3',       StratifiedKFold(n_splits=3)))
folds.append(('skf_3_rand',  StratifiedKFold(n_splits=3,shuffle=True,random_state=1)))
folds.append(('skf_5',       StratifiedKFold(n_splits=5)))

model = KNeighborsClassifier(n_neighbors=1)
# loop and print
for fold_name, fold in folds:
    scores = cross_validate(model, X, y, cv=fold, scoring='accuracy')
    print('%s: %.3f (%.3f)' % (fold_name.ljust(10), 
                                   scores['test_score'].mean(), 
                                   scores['test_score'].std()
                                   )
         )

# lol the kf completely failed... probably had new types OOS    

kf        : 0.000 (0.000)
kf_rand   : 0.947 (0.025)
skf_3     : 0.960 (0.016)
skf_3_rand: 0.953 (0.009)
skf_5     : 0.960 (0.025)


# Links, resoruces, and next week

Only two resources needed
- sklearn docs are GREAT https://scikit-learn.org/stable/user_guide.html 
- Python Data Science Handbook (note some module calls are obsolete, so you might need to update code) https://jakevdp.github.io/PythonDataScienceHandbook/index.html

Next week:
- preprocessing
- data transformations
- feasture selection
