# Cross Validation

# Exercises
Within your codeup-data-science directory, create a new repo named advanced-topics. This will be where you do your work for this module. Create a repository on GitHub with the same name, and link your local repository to GitHub.

Save this work in your advanced-topics repo. Then add, commit, and push your changes.

Do your work for this exercise in a jupyter notebook or python script named cross_validation.

Use the cross validation techniques discussed in the lesson to figure out what kind of model works best with the cars dataset used in the lesson.

# Imports

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

## Acquire

In [2]:
#load mpg data
df = data('mpg')

In [3]:
df

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact
...,...,...,...,...,...,...,...,...,...,...,...
230,volkswagen,passat,2.0,2008,4,auto(s6),f,19,28,p,midsize
231,volkswagen,passat,2.0,2008,4,manual(m6),f,21,29,p,midsize
232,volkswagen,passat,2.8,1999,6,auto(l5),f,16,26,p,midsize
233,volkswagen,passat,2.8,1999,6,manual(m5),f,18,26,p,midsize


## Prepare

In [4]:
#change trans column to auto and manual
df.trans = np.where(df.trans.str.startswith('auto'), 'auto', 'manual')

In [5]:
df.trans.value_counts()

auto      157
manual     77
Name: trans, dtype: int64

## Model

In [6]:
#split into X and y
X = df[['displ', 'year', 'cyl', 'cty', 'hwy']]
X.head()

Unnamed: 0,displ,year,cyl,cty,hwy
1,1.8,1999,4,18,29
2,1.8,1999,4,21,29
3,2.0,2008,4,20,31
4,2.0,2008,4,21,30
5,2.8,1999,6,16,26


In [7]:
y = df.model
y.value_counts()

caravan 2wd               11
ram 1500 pickup 4wd       10
civic                      9
jetta                      9
dakota pickup 4wd          9
mustang                    9
impreza awd                8
a4 quattro                 8
grand cherokee 4wd         8
a4                         7
toyota tacoma 4wd          7
camry solara               7
camry                      7
tiburon                    7
sonata                     7
passat                     7
f150 pickup 4wd            7
durango 4wd                7
explorer 4wd               6
new beetle                 6
altima                     6
4runner 4wd                6
forester awd               6
gti                        5
c1500 suburban 2wd         5
corolla                    5
corvette                   5
malibu                     5
grand prix                 5
k1500 tahoe 4wd            4
pathfinder 4wd             4
mountaineer 4wd            4
range rover                4
maxima                     3
navigator 2wd 

In [8]:
#split into trains and tests
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20,
                                                    random_state=123)

## BASELINE

In [9]:
y_train.value_counts()

civic                     9
caravan 2wd               9
ram 1500 pickup 4wd       8
mustang                   8
a4 quattro                8
dakota pickup 4wd         7
camry solara              7
altima                    6
camry                     6
a4                        6
impreza awd               6
passat                    6
f150 pickup 4wd           6
new beetle                6
jetta                     6
forester awd              5
tiburon                   5
grand cherokee 4wd        5
durango 4wd               5
gti                       5
toyota tacoma 4wd         5
sonata                    4
malibu                    4
corolla                   4
grand prix                4
4runner 4wd               4
corvette                  3
range rover               3
expedition 2wd            3
a6 quattro                3
navigator 2wd             3
maxima                    3
k1500 tahoe 4wd           3
pathfinder 4wd            3
mountaineer 4wd           3
land cruiser wagon 4

In [10]:
# BASELINE
9/(9+178)

0.0481283422459893

#### typically split into a validation set

In [11]:
#get the validates
X_train, X_validate, y_train, y_validate = train_test_split(X_train,
                            y_train, test_size=.25, random_state=123)

In [12]:
print('X_train:', X_train.shape)
print('X_validate:', X_validate.shape)
print('X_test:', X_test.shape)

X_train: (140, 5)
X_validate: (47, 5)
X_test: (47, 5)


In [13]:
X_train.head()

Unnamed: 0,displ,year,cyl,cty,hwy
228,1.8,1999,4,21,29
207,4.0,2008,6,16,20
41,3.3,1999,6,16,22
105,1.8,2008,4,26,34
210,2.0,2008,4,21,29


#### use decision tree for predicting target variable

In [14]:
#make it
tree = DecisionTreeClassifier(max_depth=3)
tree

In [15]:
#fit it
tree.fit(X_train, y_train)

In [16]:
#evaluate on train
tree.score(X_train, y_train)

0.2571428571428571

In [17]:
#evaluate on validate
tree.score(X_validate, y_validate)

0.1276595744680851

In [18]:
#try a different hyperparameter
tree = DecisionTreeClassifier(max_depth=20)
tree.fit(X_train, y_train)
tree.score(X_train, y_train)

0.8642857142857143

In [19]:
#evaluate on validate
tree.score(X_validate, y_validate)

0.23404255319148937

## `cross_val_score`

#### fresh split without validate

In [20]:
#split into trains and tests
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=123)

#### model and score

In [21]:
#make it
tree = DecisionTreeClassifier(max_depth=4, random_state=123)

In [22]:
#fit and score
cross_val_score(tree, X_train, y_train, cv=5)



array([0.14285714, 0.22857143, 0.2       , 0.17142857, 0.08571429])

In [23]:
cross_val_score(tree, X_train, y_train, cv=5).mean()



0.16571428571428573

## Grid Search

`GridSearchCV` (grid search cross validation) class lets us quickly try out many different combinations of hyper parameters by also using cross validation

#### cycle through 1 - 20 neighbors

In [24]:
#make our modeling object
knn = KNeighborsClassifier()

Set up grid search 
- build a dictionary of parameters
    - keys are the names of hyperparameters
    - values are the range of values to search through
- send in model, parameters, number of folds

In [25]:
#define hyperparameters to send in
parameters = {'n_neighbors': range(1,21)}
parameters

{'n_neighbors': range(1, 21)}

In [26]:
#make the gridsearch object
grid = GridSearchCV(knn, parameters, cv=5)
grid

In [27]:
#fit the gridsearch object
grid.fit(X_train, y_train)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mo

In [28]:
#use the gridsearch object
grid.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_n_neighbors', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [29]:
grid.cv_results_['params']

[{'n_neighbors': 1},
 {'n_neighbors': 2},
 {'n_neighbors': 3},
 {'n_neighbors': 4},
 {'n_neighbors': 5},
 {'n_neighbors': 6},
 {'n_neighbors': 7},
 {'n_neighbors': 8},
 {'n_neighbors': 9},
 {'n_neighbors': 10},
 {'n_neighbors': 11},
 {'n_neighbors': 12},
 {'n_neighbors': 13},
 {'n_neighbors': 14},
 {'n_neighbors': 15},
 {'n_neighbors': 16},
 {'n_neighbors': 17},
 {'n_neighbors': 18},
 {'n_neighbors': 19},
 {'n_neighbors': 20}]

In [30]:
grid.cv_results_['mean_test_score']

array([0.20571429, 0.21142857, 0.2       , 0.14857143, 0.14285714,
       0.12      , 0.13142857, 0.13714286, 0.12571429, 0.12      ,
       0.12      , 0.11428571, 0.13142857, 0.14285714, 0.14285714,
       0.13714286, 0.13714286, 0.14285714, 0.14857143, 0.15428571])

In [31]:
grid.best_score_

0.21142857142857144

In [32]:
grid.best_estimator_

In [33]:
#save into varible to run on test data
knn_model = grid.best_estimator_

In [34]:
knn_model.score(X_train, y_train)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


0.5485714285714286

#### cycle through multiple parameters

In [35]:
#make our modeling object
dt = DecisionTreeClassifier(random_state=123)

In [36]:
#define our parameters to rotate through
parameters = {'max_depth': range(1,21), 'min_samples_leaf': range(1,11), 'criterion': ['gini', 'entropy', 'log_loss']}

In [37]:
#make gridsearch object
grid = GridSearchCV(dt, parameters, cv=5)

In [38]:
#fit gridsearch object
grid.fit(X_train, y_train)



In [39]:
#use gridsearch object
len(grid.cv_results_['params'])

600

In [40]:
grid.cv_results_['params'][:15]

[{'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 1},
 {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 2},
 {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 3},
 {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 4},
 {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 5},
 {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 6},
 {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 7},
 {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 8},
 {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 9},
 {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 10},
 {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1},
 {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 2},
 {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 3},
 {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 4},
 {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5}]

In [41]:
grid.cv_results_['mean_test_score'][:15]

array([0.08      , 0.08      , 0.08      , 0.08      , 0.08      ,
       0.08571429, 0.08571429, 0.08571429, 0.08571429, 0.08571429,
       0.11428571, 0.11428571, 0.11428571, 0.10857143, 0.10857143])

In [42]:
for p, score in zip(grid.cv_results_['params'], grid.cv_results_['mean_test_score']):
    p['score'] = score

In [43]:
grid.cv_results_['params'][:15]

[{'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 1, 'score': 0.08},
 {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 2, 'score': 0.08},
 {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 3, 'score': 0.08},
 {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 4, 'score': 0.08},
 {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 5, 'score': 0.08},
 {'criterion': 'gini',
  'max_depth': 1,
  'min_samples_leaf': 6,
  'score': 0.08571428571428572},
 {'criterion': 'gini',
  'max_depth': 1,
  'min_samples_leaf': 7,
  'score': 0.08571428571428572},
 {'criterion': 'gini',
  'max_depth': 1,
  'min_samples_leaf': 8,
  'score': 0.08571428571428572},
 {'criterion': 'gini',
  'max_depth': 1,
  'min_samples_leaf': 9,
  'score': 0.08571428571428572},
 {'criterion': 'gini',
  'max_depth': 1,
  'min_samples_leaf': 10,
  'score': 0.08571428571428572},
 {'criterion': 'gini',
  'max_depth': 2,
  'min_samples_leaf': 1,
  'score': 0.11428571428571428},
 {'criterion': 

In [44]:
pd.DataFrame(grid.cv_results_['params']).sort_values('score', ascending=False)

Unnamed: 0,criterion,max_depth,min_samples_leaf,score
132,gini,14,3,0.268571
192,gini,20,3,0.268571
122,gini,13,3,0.268571
142,gini,15,3,0.268571
112,gini,12,3,0.268571
...,...,...,...,...
1,gini,1,2,0.080000
4,gini,1,5,0.080000
3,gini,1,4,0.080000
2,gini,1,3,0.080000


#### Use `grid.best_estimator` to save best hyperparameters

In [45]:
grid.best_params_

{'criterion': 'gini',
 'max_depth': 10,
 'min_samples_leaf': 3,
 'score': 0.26857142857142857}

In [46]:
dt_model = grid.best_estimator_
dt_model

#### Try your model on unseen data. Test only on one model!!!

In [47]:
#evaluate on test
dt_model.score(X_test, y_test)

0.22033898305084745