In [11]:
from sklearn.datasets import load_iris
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn.model_selection import cross_validate
from sklearn import svm
import pandas as pd
from sklearn.impute import SimpleImputer
iris = load_iris() # data

# set up the pipeline, which will, given a set of observations 
# 1. fit and apply these steps to the training fold
# 2. in the testing fold, apply the transform and model to predict (no estimation)

classifier_pipeline = make_pipeline(
                                    preprocessing.StandardScaler(),  # clean the data
                                    svm.SVC(C=1)                     # model
                                    )

cross_validate(classifier_pipeline, iris.data, iris.target, cv=5)

{'fit_time': array([0.00939322, 0.00108123, 0.00123191, 0.00112987, 0.00109386]),
 'score_time': array([0.00051379, 0.00032163, 0.00030732, 0.00035477, 0.00036001]),
 'test_score': array([0.96666667, 0.96666667, 0.96666667, 0.93333333, 1.        ])}

In [6]:
# question 1: try this with a Nearest Neighbors Classifier (5 min)

from sklearn.neighbors import KNeighborsClassifier
knn_pipe = make_pipeline(
                        preprocessing.StandardScaler(),  # clean the data
                        KNeighborsClassifier()           # model
                        )

cross_validate(classifier_pipeline, iris.data, iris.target, cv=5)

{'fit_time': array([0.00282097, 0.00207782, 0.00100088, 0.00236607, 0.00102186]),
 'score_time': array([0.00062394, 0.00036025, 0.00036907, 0.00048399, 0.00043392]),
 'test_score': array([0.96666667, 0.96666667, 0.96666667, 0.93333333, 1.        ])}

In [12]:
iris2 = load_iris()
X2 = pd.DataFrame(iris2.data)
X2.columns = [1,2,3,4]
X2[2] = X2[2].sample(frac=0.5,random_state=14)
X2[2].describe()
iris2.data = X2

# print the scores using IRIS2.data (not iris.data)
# this produces an error because of the missing values!
# cross_validate(knn_pipe, iris2.data, iris.target, cv=5)

# so add an imputation step to the pipeline! (5 min, use lecture page!)
knn_pipe2 = make_pipeline(
                        SimpleImputer(strategy = 'mean'),
                        preprocessing.StandardScaler(),
                        KNeighborsClassifier()
                        )
cross_validate(knn_pipe2, iris2.data, iris.target, cv=5)

{'fit_time': array([0.00319219, 0.0026679 , 0.00444913, 0.00272608, 0.00356293]),
 'score_time': array([0.0021019 , 0.00201702, 0.00834894, 0.00208402, 0.00197792]),
 'test_score': array([0.9       , 0.96666667, 0.9       , 0.96666667, 1.        ])}

# Optimize a model - here, KNN, with `GridSearchCV`

In [13]:
knn_pipe2

Pipeline(memory=None,
         steps=[('simpleimputer',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='mean',
                               verbose=0)),
                ('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('kneighborsclassifier',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=None, n_neighbors=5, p=2,
                                      weights='uniform'))],
         verbose=False)

In [36]:
# grid search will let you specify all the parameters of the model
# you want to tweak, and the values you want to try

from sklearn.model_selection import GridSearchCV

# set up parameter grid to try
# the parameter grid is a dictionary where key:value pairs are built like:
#     stepName<two underlines>paramName : [list of settings to try]
param_grid = {'kneighborsclassifier__n_neighbors':[1,5,6,7,8,9,10]}

# like a normal estimator, this has not yet been applied to any data
grid = GridSearchCV(knn_pipe2, param_grid=param_grid)
grid.fit(iris2.data, iris.target)
grid.best_params_

# now save that pipeline as a model object!
optimal_knn_model = grid.best_estimator_



{'kneighborsclassifier__n_neighbors': 9}

In [37]:
# grid search will let you specify all the parameters of the model
# you want to tweak, and the values you want to try

from sklearn.model_selection import GridSearchCV

# set up parameter grid to try
# the parameter grid is a dictionary where key:value pairs are built like:
#     stepName<two underlines>paramName : [list of settings to try]
param_grid = {'kneighborsclassifier__n_neighbors':[1,5,6,7,8,9,10]}

# like a normal estimator, this has not yet been applied to any data
grid = GridSearchCV(knn_pipe2, param_grid=param_grid)
grid.fit(iris2.data, iris.target)
grid.best_params_



{'kneighborsclassifier__n_neighbors': 9}

In [38]:
#add to the param grid to see if we should change these two params: StandardScaler(with_mean = True, with_std = True)
# set up parameter grid to try
# the parameter grid is a dictionary where key:value pairs are built like:
#     stepName<two underlines>paramName : [list of settings to try]
param_grid = {'kneighborsclassifier__n_neighbors':[1,5,6,7,8,9,10],
             'standardscaler__with_mean':['True','False'],
             'standardscaler__with_std':['True','False']}

# like a normal estimator, this has not yet been applied to any data
grid = GridSearchCV(knn_pipe2, param_grid=param_grid)
grid.fit(iris2.data, iris.target)
grid.best_params_



{'kneighborsclassifier__n_neighbors': 9,
 'standardscaler__with_mean': 'True',
 'standardscaler__with_std': 'True'}

In [39]:
grid.cv_results_ #it did 28 different tests to find the best parameters

{'mean_fit_time': array([0.00489036, 0.00489322, 0.00326983, 0.00289202, 0.00364367,
        0.00382106, 0.00374603, 0.00849589, 0.00300781, 0.00369406,
        0.00279562, 0.00329105, 0.00504994, 0.00359186, 0.00498708,
        0.00387859, 0.00458566, 0.00405518, 0.00290116, 0.00337807,
        0.00313147, 0.00366402, 0.00368063, 0.00300686, 0.00311375,
        0.00272711, 0.00444166, 0.00438428]),
 'mean_score_time': array([0.00401529, 0.00541004, 0.00340104, 0.00312527, 0.00299366,
        0.00279435, 0.0026354 , 0.00377973, 0.00290672, 0.00581098,
        0.00263739, 0.00323002, 0.00443935, 0.00388972, 0.00273259,
        0.00320021, 0.00276534, 0.0032057 , 0.0037481 , 0.0033706 ,
        0.00329725, 0.00354226, 0.00295377, 0.00373681, 0.00344165,
        0.00305764, 0.00312432, 0.00284743]),
 'mean_test_score': array([0.92666667, 0.92666667, 0.92666667, 0.92666667, 0.92      ,
        0.92      , 0.92      , 0.92      , 0.93333333, 0.93333333,
        0.93333333, 0.93333333, 0.933

# Some Post Optimization Diagnostics

In [40]:
#print k-fold scoring like before
cross_validate(optimal_knn_model, iris2.data, iris.target, cv = 5)

{'fit_time': array([0.00467706, 0.00498295, 0.01372099, 0.00869918, 0.00310612]),
 'score_time': array([0.01202202, 0.00378227, 0.01355386, 0.00211596, 0.00201488]),
 'test_score': array([0.93333333, 0.9       , 0.93333333, 0.93333333, 1.        ])}

In [41]:
# print k-fold scoring (like before)
cross_validate(optimal_knn_model, iris2.data, iris.target, cv=5)

###########################################################
# use classification_report to see which types of Y values 
# your prediction performs better/worse on
###########################################################

# to use class_report, we need some predicted y values, so
# make a fold and generate predicted values

from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(iris2.data, iris.target, random_state=9,train_size=.5)
y_pred = optimal_knn_model.fit(Xtrain, ytrain).predict(Xtest)

from sklearn.metrics import classification_report
print(classification_report(ytest,
                            y_pred,
                            target_names=iris.target_names)) 

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        27
  versicolor       0.93      0.96      0.94        26
   virginica       0.95      0.91      0.93        22

    accuracy                           0.96        75
   macro avg       0.96      0.96      0.96        75
weighted avg       0.96      0.96      0.96        75



In [None]:
#pipeline does data validation without data leakage, you'll get predictions that can generalize outside the sample
#computed by mean and did a standard deviation (there are more options)

In [42]:
#################################################################
# use confusion_matrix see exactly model gets predictions wrong
#################################################################

from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt

plot_confusion_matrix(optimal_knn_model, Xtest, ytest,   # model and test data
                      display_labels=iris.target_names,  # labels
                      cmap=plt.cm.Blues,                 # colors
                      normalize=None)                    # turns on/off fractions (within row) 

ImportError: cannot import name 'plot_confusion_matrix' from 'sklearn.metrics' (/Users/mikestragapede/opt/anaconda3/lib/python3.7/site-packages/sklearn/metrics/__init__.py)