In [811]:
import numpy as np
import pandas as pd
import patsy

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.grid_search import GridSearchCV

from IPython.core.display import HTML, Image

# Advanced Model Tuning

In [816]:
Image(url='http://pix-media.s3.amazonaws.com/blog/1086/t-pain.jpg', width=400)

## AKA Autotune...

## AKA...<br><br>One simple trick to minimizing your loss functions!

# We are going to build a model using SF crime data. It will use day, time, and district predict the crime type.

## Load our data set 

In [809]:
sf_crime = pd.read_csv('./assets/datasets/sf_crime_train.csv')
sf_crime = sf_crime.dropna()

In [810]:
sf_crime.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,5/13/15 23:53,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,5/13/15 23:53,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,5/13/15 23:33,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,5/13/15 23:30,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,5/13/15 23:30,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


## Data type conversions and transformations

In [3]:
sf_crime['Dates'] = pd.to_datetime(sf_crime.Dates)
sf_crime_dates = pd.DatetimeIndex(sf_crime.Dates.values, dtype='datetime64[ns]', freq=None)

sf_crime['hour'] = sf_crime_dates.hour
sf_crime['month'] = sf_crime_dates.month
sf_crime['year'] = sf_crime_dates.year

## Let's see what all the listed crimes are

In [4]:
sf_crime['Category'].unique()

array(['WARRANTS', 'OTHER OFFENSES', 'LARCENY/THEFT', 'VEHICLE THEFT',
       'VANDALISM', 'NON-CRIMINAL', 'ROBBERY', 'ASSAULT', 'WEAPON LAWS',
       'BURGLARY', 'SUSPICIOUS OCC', 'DRUNKENNESS',
       'FORGERY/COUNTERFEITING', 'DRUG/NARCOTIC', 'STOLEN PROPERTY',
       'SECONDARY CODES', 'TRESPASS', 'MISSING PERSON', 'FRAUD',
       'KIDNAPPING', 'RUNAWAY', 'DRIVING UNDER THE INFLUENCE',
       'SEX OFFENSES FORCIBLE', 'PROSTITUTION', 'DISORDERLY CONDUCT',
       'ARSON', 'FAMILY OFFENSES', 'LIQUOR LAWS', 'BRIBERY',
       'EMBEZZLEMENT', 'SUICIDE', 'LOITERING', 'SEX OFFENSES NON FORCIBLE',
       'EXTORTION', 'GAMBLING', 'BAD CHECKS'], dtype=object)

## Select a subsection of the listed crimes

In [5]:
subset = ['VEHICLE THEFT','BURGLARY','DRUG/NARCOTIC']
sf_crime_sub = sf_crime[sf_crime['Category'].isin(subset)]

In [6]:
sf_crime_sub

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,hour,month,year
6,2015-05-13 23:30:00,VEHICLE THEFT,STOLEN AUTOMOBILE,Wednesday,INGLESIDE,NONE,AVALON AV / PERU AV,-122.423327,37.725138,23,5,2015
7,2015-05-13 23:30:00,VEHICLE THEFT,STOLEN AUTOMOBILE,Wednesday,BAYVIEW,NONE,KIRKWOOD AV / DONAHUE ST,-122.371274,37.727564,23,5,2015
46,2015-05-13 20:00:00,VEHICLE THEFT,STOLEN MOTORCYCLE,Wednesday,INGLESIDE,NONE,0 Block of CRESCENT AV,-122.423702,37.735233,20,5,2015
49,2015-05-13 19:52:00,BURGLARY,"BURGLARY, VEHICLE (ARREST MADE)",Wednesday,PARK,"ARREST, BOOKED",1500 Block of HAIGHT ST,-122.447761,37.769846,19,5,2015
59,2015-05-13 19:28:00,VEHICLE THEFT,STOLEN AND RECOVERED VEHICLE,Wednesday,CENTRAL,NONE,0 Block of SANSOME ST,-122.400720,37.790712,19,5,2015
60,2015-05-13 19:28:00,VEHICLE THEFT,STOLEN AUTOMOBILE,Wednesday,CENTRAL,NONE,0 Block of SANSOME ST,-122.400720,37.790712,19,5,2015
73,2015-05-13 19:00:00,VEHICLE THEFT,STOLEN AUTOMOBILE,Wednesday,MISSION,NONE,26TH ST / GUERRERO ST,-122.422572,37.748774,19,5,2015
87,2015-05-13 18:30:00,BURGLARY,"BURGLARY OF RESIDENCE, ATTEMPTED FORCIBLE ENTRY",Wednesday,BAYVIEW,NONE,1300 Block of FELTON ST,-122.417938,37.726605,18,5,2015
97,2015-05-13 18:00:00,BURGLARY,"BURGLARY OF APARTMENT HOUSE, UNLAWFUL ENTRY",Wednesday,SOUTHERN,NONE,0 Block of 6TH ST,-122.409504,37.781526,18,5,2015
104,2015-05-13 17:55:00,BURGLARY,"BURGLARY,STORE UNDER CONSTRUCTION, UNLAWFUL ENTRY",Wednesday,SOUTHERN,"ARREST, BOOKED",1200 Block of MARKET ST,-122.415449,37.778294,17,5,2015


## Check the total number of districts

In [7]:
sf_crime_sub['PdDistrict'].unique()

array(['INGLESIDE', 'BAYVIEW', 'PARK', 'CENTRAL', 'MISSION', 'SOUTHERN',
       'NORTHERN', 'RICHMOND', 'TARAVAL', 'TENDERLOIN'], dtype=object)

In [8]:
sf_crime_sub['PdDistrict'].nunique()

10

## Set up our design matrix and target vector with Patsy

### Patsy allows us to use R-style formulas to do this 

In [145]:
X = patsy.dmatrix('~ C(hour) + C(DayOfWeek) + C(PdDistrict)', sf_crime_sub)
y = sf_crime_sub['Category'].values

In [146]:
y

array(['VEHICLE THEFT', 'VEHICLE THEFT', 'VEHICLE THEFT', ...,
       'VEHICLE THEFT', 'BURGLARY', 'VEHICLE THEFT'], dtype=object)

In [147]:
X.design_info.column_names

['Intercept',
 'C(hour)[T.1]',
 'C(hour)[T.2]',
 'C(hour)[T.3]',
 'C(hour)[T.4]',
 'C(hour)[T.5]',
 'C(hour)[T.6]',
 'C(hour)[T.7]',
 'C(hour)[T.8]',
 'C(hour)[T.9]',
 'C(hour)[T.10]',
 'C(hour)[T.11]',
 'C(hour)[T.12]',
 'C(hour)[T.13]',
 'C(hour)[T.14]',
 'C(hour)[T.15]',
 'C(hour)[T.16]',
 'C(hour)[T.17]',
 'C(hour)[T.18]',
 'C(hour)[T.19]',
 'C(hour)[T.20]',
 'C(hour)[T.21]',
 'C(hour)[T.22]',
 'C(hour)[T.23]',
 'C(DayOfWeek)[T.Monday]',
 'C(DayOfWeek)[T.Saturday]',
 'C(DayOfWeek)[T.Sunday]',
 'C(DayOfWeek)[T.Thursday]',
 'C(DayOfWeek)[T.Tuesday]',
 'C(DayOfWeek)[T.Wednesday]',
 'C(PdDistrict)[T.CENTRAL]',
 'C(PdDistrict)[T.INGLESIDE]',
 'C(PdDistrict)[T.MISSION]',
 'C(PdDistrict)[T.NORTHERN]',
 'C(PdDistrict)[T.PARK]',
 'C(PdDistrict)[T.RICHMOND]',
 'C(PdDistrict)[T.SOUTHERN]',
 'C(PdDistrict)[T.TARAVAL]',
 'C(PdDistrict)[T.TENDERLOIN]']

## Let's look at our design matrix as a DataFrame

In [148]:
pdf = pd.DataFrame(X, columns=X.design_info.column_names)
pdf['Target'] = y
pdf

Unnamed: 0,Intercept,C(hour)[T.1],C(hour)[T.2],C(hour)[T.3],C(hour)[T.4],C(hour)[T.5],C(hour)[T.6],C(hour)[T.7],C(hour)[T.8],C(hour)[T.9],...,C(PdDistrict)[T.CENTRAL],C(PdDistrict)[T.INGLESIDE],C(PdDistrict)[T.MISSION],C(PdDistrict)[T.NORTHERN],C(PdDistrict)[T.PARK],C(PdDistrict)[T.RICHMOND],C(PdDistrict)[T.SOUTHERN],C(PdDistrict)[T.TARAVAL],C(PdDistrict)[T.TENDERLOIN],Target
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,VEHICLE THEFT
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,VEHICLE THEFT
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,VEHICLE THEFT
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,BURGLARY
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,VEHICLE THEFT
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,VEHICLE THEFT
6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,VEHICLE THEFT
7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BURGLARY
8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,BURGLARY
9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,BURGLARY


## Let's see how many districts are listed in our design matrix 

In [149]:
sf_crime_sub['PdDistrict'].nunique()

10

In [150]:
[x for x in pdf.columns if 'PdDistrict' in x]

['C(PdDistrict)[T.CENTRAL]',
 'C(PdDistrict)[T.INGLESIDE]',
 'C(PdDistrict)[T.MISSION]',
 'C(PdDistrict)[T.NORTHERN]',
 'C(PdDistrict)[T.PARK]',
 'C(PdDistrict)[T.RICHMOND]',
 'C(PdDistrict)[T.SOUTHERN]',
 'C(PdDistrict)[T.TARAVAL]',
 'C(PdDistrict)[T.TENDERLOIN]']

In [151]:
pd.Series([x for x in pdf.columns if 'PdDistrict' in x]).nunique()

9

## And how many hours?

In [152]:
sf_crime_sub['hour'].nunique()

24

In [153]:
pd.Series([x for x in pdf.columns if 'hour' in x]).nunique()

23

## Check: Why is there one less on both?

## Set up our training and testing sets

In [219]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=77)

## Now let's fit a standard logistic regression model

In [360]:
lr = LogisticRegression(solver='liblinear')

In [361]:
lr_model = lr.fit(X_train, y_train)

### Make our predictions

In [362]:
lr_ypred = lr_model.predict(X_test)

### Check our misclassifications with a confusion matrix

In [373]:
lr_cm = confusion_matrix(y_test, lr_ypred, labels=lr.classes_)
lr_cm = pd.DataFrame(lr_cm, columns=lr.classes_, index=lr.classes_)
lr_cm

Unnamed: 0,BURGLARY,DRUG/NARCOTIC,VEHICLE THEFT
BURGLARY,96,19,134
DRUG/NARCOTIC,45,49,76
VEHICLE THEFT,59,11,236


### Check our precision, recall, and f1

In [374]:
print(classification_report(y_test, lr_ypred, labels=lr.classes_))

             precision    recall  f1-score   support

   BURGLARY       0.48      0.39      0.43       249
DRUG/NARCOTIC       0.62      0.29      0.39       170
VEHICLE THEFT       0.53      0.77      0.63       306

avg / total       0.53      0.53      0.50       725



## Check the CV Score

In [517]:
cross_val_score(lr, X, y, cv=3).mean()

0.51530013953612797

## Let's now use a penalized regression - we'll use lasso (l1)

In [508]:
lr_l1 = LogisticRegression(C=1.5, penalty='l1', solver='liblinear')
lr_l1_model = lr_l1.fit(X_train, y_train)

In [509]:
lr_l1_model = lr_l1.fit(X_train, y_train)

In [510]:
lr_l1_ypred = lr_l1_model.predict(X_test)

## Get confusion matrix

In [505]:
lr_l1_cm = confusion_matrix(y_test, lr_l1_ypred, labels=lr_l1.classes_)
lr_l1_cm = pd.DataFrame(lr_l1_cm, columns=lr_l1.classes_, index=lr_l1.classes_)
lr_l1_cm

Unnamed: 0,BURGLARY,DRUG/NARCOTIC,VEHICLE THEFT
BURGLARY,98,18,133
DRUG/NARCOTIC,45,49,76
VEHICLE THEFT,58,10,238


## Get classification report

In [506]:
print(classification_report(y_test, lr_l1_ypred, labels=lr_l1.classes_))

             precision    recall  f1-score   support

   BURGLARY       0.49      0.39      0.44       249
DRUG/NARCOTIC       0.64      0.29      0.40       170
VEHICLE THEFT       0.53      0.78      0.63       306

avg / total       0.54      0.53      0.51       725



## Get mean cross val score

In [516]:
cross_val_score(lr_l1, X, y, cv=3).mean()

0.51803238507243377

 ## Looks like a minimal improvement with L1 penalty at 1.5, how about other values?

## We can build a function to test this

In [569]:
def test_penalties(c_val):
    lr_l1 = LogisticRegression(C=c_val, penalty='l1', solver='liblinear')
    cvs = cross_val_score(lr_l1, X, y, cv=3)
    return cvs

In [570]:
test_cs = pd.Series([.001, .01, .1, 1, 1.5, 2.5, 5, 10, 100]).to_frame('c_vals')
score_frame = pd.DataFrame([test_penalties(x) for x in test_cs['c_vals']]).mean(axis=1).to_frame('score')

pd.concat([test_cs, score_frame], axis=1)

Unnamed: 0,c_vals,score
0,0.001,0.333486
1,0.01,0.440547
2,0.1,0.495249
3,1.0,0.512573
4,1.5,0.518032
5,2.5,0.515756
6,5.0,0.520765
7,10.0,0.517118
8,100.0,0.51484


## Sklearn has a function that will do this for us already

In [620]:
# fit model with five folds and lasso regularization
# use Cs=15 to test a grid of 15 distinct parameters
# remeber: Cs describes the inverse of regularization strength
logreg_cv = LogisticRegressionCV(Cs=20, solver='liblinear', cv=3, penalty='l1', scoring='f1')
cv_model = logreg_cv.fit(X_train, y_train)

## Find best C per class

In [621]:
print('best C for class:')
best_C = {logreg_cv.classes_[i]:x for i, (x, c) in enumerate(zip(logreg_cv.Cs_, logreg_cv.classes_))}
print(best_C)

best C for class:
{'BURGLARY': 0.0001, 'VEHICLE THEFT': 0.00069519279617756048, 'DRUG/NARCOTIC': 0.00026366508987303583}


## Get the classification report for best model

In [622]:
print(classification_report(y_test, logreg_cv.predict(X_test)))

             precision    recall  f1-score   support

   BURGLARY       0.48      0.40      0.44       249
DRUG/NARCOTIC       0.69      0.25      0.36       170
VEHICLE THEFT       0.53      0.78      0.63       306

avg / total       0.55      0.53      0.50       725



## E1. 

## Using the data set (pdf), fit a model to predict between "Burglary" and "Drug/Narcotic" crimes
## One model should use l1 and the other should use an l2 penalty
## Make sure to use train_test_split
## Print out a confusion matrix and a classification report for both
## Finally, build a third model that uses LogisticRegressionCV
## Print our a confusion matrix, classification report and the best value of C

## Select the appropriate rows

In [636]:
vice = pdf[pdf['Target'].isin(['BURGLARY', 'DRUG/NARCOTIC'])]

In [637]:
vice_X = vice.iloc[:,:-1].as_matrix()
vice_y = vice.iloc[:,-1].as_matrix()

## Apply tts

In [638]:
vice_X_train, vice_X_test, vice_y_train, vice_y_test =\
train_test_split(vice_X, vice_y, test_size=0.33, random_state=50)

## Fit our 2 models

In [641]:
lr1 = LogisticRegression(penalty='l1')
lr2 = LogisticRegression(penalty='l2')

lr1_model = lr1.fit(vice_X_train, vice_y_train)
lr2_model = lr2.fit(vice_X_train, vice_y_train)

## Use our fitted models to make predictions

In [642]:
vice_y1_pred = lr1.predict(vice_X_test)
vice_y2_pred = lr2.predict(vice_X_test)

## Get our confusion matrices

In [645]:
cm1 = confusion_matrix(vice_y_test, vice_y1_pred, labels=lr1.classes_)
cm1 = pd.DataFrame(cm1, columns=lr1.classes_, index=lr1.classes_)

cm2 = confusion_matrix(vice_y_test, vice_y2_pred, labels=lr2.classes_)
cm2 = pd.DataFrame(cm2, columns=lr2.classes_, index=lr2.classes_)

## L1 confusion matrix

In [647]:
cm1

Unnamed: 0,BURGLARY,DRUG/NARCOTIC
BURGLARY,217,34
DRUG/NARCOTIC,89,66


## L2 matrix

In [648]:
cm2

Unnamed: 0,BURGLARY,DRUG/NARCOTIC
BURGLARY,210,41
DRUG/NARCOTIC,84,71


## L1 model classification report

In [818]:
print(classification_report(vice_y_test, vice_y1_pred, labels=lr2.classes_))

             precision    recall  f1-score   support

   BURGLARY       0.71      0.86      0.78       251
DRUG/NARCOTIC       0.66      0.43      0.52       155

avg / total       0.69      0.70      0.68       406



## L2 classification report

In [817]:
print(classification_report(vice_y_test, vice_y2_pred, labels=lr1.classes_))

             precision    recall  f1-score   support

   BURGLARY       0.71      0.84      0.77       251
DRUG/NARCOTIC       0.63      0.46      0.53       155

avg / total       0.68      0.69      0.68       406



## Now using LRCV

In [666]:
lrcv = LogisticRegressionCV(penalty='l1', solver='liblinear')

In [667]:
lrcv_model = lrcv.fit(vice_X_train, vice_y_train)

In [668]:
lrcv_ypred = lrcv_model.predict(vice_X_test)

## Get our best C

In [669]:
lrcv_model.C_

array([ 0.35938137])

## Our confusion matrix

In [675]:
lrcv_cm = confusion_matrix(vice_y_test, lrcv_ypred, labels=lrcv.classes_)
lrcv_cm = pd.DataFrame(lrcv_cm, columns=lrcv.classes_, index=lrcv.classes_)
lrcv_cm

Unnamed: 0,BURGLARY,DRUG/NARCOTIC
BURGLARY,225,26
DRUG/NARCOTIC,95,60


## Our classification report

In [676]:
print(classification_report(vice_y_test, lrcv_ypred, labels=lrcv.classes_))

             precision    recall  f1-score   support

   BURGLARY       0.70      0.90      0.79       251
DRUG/NARCOTIC       0.70      0.39      0.50       155

avg / total       0.70      0.70      0.68       406



## Introducing GridSearchCV

## To start we'll select a model and penalties and some hyperparameters 
## Then will pass those to GridSearchCV

In [735]:
logreg = LogisticRegression(solver='liblinear')
C_vals = [0.0001, 0.001, 0.01, 0.1, .15, .25, .275, .33, 0.5, .66, 0.75, 1.0, 2.5, 5.0, 10.0, 100.0, 1000.0]
penalties = ['l1','l2']

gs = GridSearchCV(logreg, {'penalty': penalties, 'C': C_vals}, verbose=False, cv=15)
gs.fit(X, y)

GridSearchCV(cv=15, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.0001, 0.001, 0.01, 0.1, 0.15, 0.25, 0.275, 0.33, 0.5, 0.66, 0.75, 1.0, 2.5, 5.0, 10.0, 100.0, 1000.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=False)

## Now let's find the best parameters

In [736]:
gs.best_params_

{'C': 0.1, 'penalty': 'l2'}

## Use this parameter to .fit, .predict, and print a classification_report for our X and Y

In [737]:
logreg = LogisticRegression(C=gs.best_params_['C'], penalty=gs.best_params_['penalty'])
cv_model = logreg.fit(vice_X_train, vice_y_train)

In [738]:
cv_pred = cv_model.predict(vice_X_test)

## Now let's check our stats...

In [739]:
cm3 = confusion_matrix(vice_y_test, cv_pred, labels=logreg.classes_)
cm3 = pd.DataFrame(cm3, columns=logreg.classes_, index=logreg.classes_)

In [740]:
cm3

Unnamed: 0,BURGLARY,DRUG/NARCOTIC
BURGLARY,227,24
DRUG/NARCOTIC,95,60


In [741]:
print(classification_report(vice_y_test, cv_pred, labels=logreg.classes_))

             precision    recall  f1-score   support

   BURGLARY       0.70      0.90      0.79       251
DRUG/NARCOTIC       0.71      0.39      0.50       155

avg / total       0.71      0.71      0.68       406



## Independent Practice

## Use GridSearchCV with knn on the iris data set
## Use train_test_split with a test size of .66
## Set a parameter diction with the number of neighbors and one other parameter
## Get your best estimator and print out a classification report

## First, we load our data

In [745]:
from sklearn.datasets import load_iris
iris = load_iris()

## Set our X matrix and y vector

In [762]:
X = iris.data
y = iris.target

## Train test split our data

In [795]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.66)

## Next we set up our possible params and choose a model

In [796]:
from sklearn.neighbors import KNeighborsClassifier

In [797]:
knn = KNeighborsClassifier()

In [798]:
param_dict = dict(n_neighbors=range(1, 31), weights=['uniform', 'distance'])

## These are then passed into GridSearchCV and fit

In [799]:
gscv = GridSearchCV(knn, param_dict, scoring='accuracy')

In [800]:
gscv_model = gscv.fit(X_train, y_train)

## Let's see our best model

In [801]:
gscv_model.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=6, p=2,
           weights='uniform')

## We could actually call fit on this model as it is a model object

## Can also just retrieve the params

In [802]:
gscv.best_params_

{'n_neighbors': 6, 'weights': 'uniform'}

## Using the gridsearched params we can get predictions

In [803]:
gscv_ypred = gscv.predict(X_test)

## Now get our reports

In [804]:
print(classification_report(y_test, gscv_ypred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        32
          1       0.89      0.97      0.93        35
          2       0.97      0.88      0.92        32

avg / total       0.95      0.95      0.95        99



In [805]:
gscv_model.best_estimator_.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': 1,
 'n_neighbors': 6,
 'p': 2,
 'weights': 'uniform'}