In [1]:
# train_test_split -- takes an X (inputs/examples) and y (outputs/answers)
#  returns X_train, X_test, y_train, y_test
#  train with X_train and y_train
#  test with X_test and y_test
#   we can check -- are the results (y_pred) the same as y_test?
#   if not the same, how close?

# cross_val_score -- function that does the same sort of testing
#  as train_test_split, but we don't have to do as much: it splits
#  the data, trains, and scores for us, once per split.  We can
#  swap in a different splitting (testing) strategy via the cv argument

# for a classifier, the default strategy is StratifiedKFold, which
#  means: create n groups (folds), each keeping roughly the same
#  proportion of every class (category) as the full data set

# another strategy is LeaveOneOut -- train with all but one
#  of the data points, and test with one data point.  With n
#  data points, we'll do n iterations, each giving 0/1 for 
#  successful prediction

In [3]:
%pylab inline
import pandas as pd
from pandas import Series, DataFrame

from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# Load the iris data and wrap it in pandas containers: a DataFrame of
# features (one named column per measurement) and a Series of class labels.
iris = load_iris()
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
y = pd.Series(iris.target)

# k-nearest-neighbours classifier; n_neighbors=5 is the library default,
# spelled out here so the value of k is visible.
model = KNeighborsClassifier(n_neighbors=5)

# cv=5 -> five folds, so `results` holds five scores (one per fold).
results = cross_val_score(estimator=model, X=X, y=y, cv=5)
pd.Series(results).describe()

Populating the interactive namespace from numpy and matplotlib


count    5.000000
mean     0.973333
std      0.027889
min      0.933333
25%      0.966667
50%      0.966667
75%      1.000000
max      1.000000
dtype: float64

In [4]:
%pylab inline
import pandas as pd
from pandas import Series, DataFrame

from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, LeaveOneOut

# Splitting strategy: stratified k-fold keeps the class proportions of y
# roughly equal in every fold.  n_splits is given explicitly because the
# library default changed from 3 to 5 in scikit-learn 0.22, and this cell's
# comment and recorded output both assume five folds.
strategy = StratifiedKFold(n_splits=5)

iris = load_iris()
X = DataFrame(iris.data, columns=iris.feature_names)
y = Series(iris.target)

model = KNeighborsClassifier()  # default: k=5

# Passing a splitter object as cv (instead of an int) names the strategy
# explicitly; the result is one score per fold.
results = cross_val_score(model, X, y, cv=strategy)  # 5 stratified folds
Series(results).describe()

Populating the interactive namespace from numpy and matplotlib


count    5.000000
mean     0.973333
std      0.027889
min      0.933333
25%      0.966667
50%      0.966667
75%      1.000000
max      1.000000
dtype: float64

In [7]:
%pylab inline
import pandas as pd
from pandas import Series, DataFrame

from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, LeaveOneOut

# Plain (non-stratified) k-fold with three folds.  shuffle=True randomises
# the row order before splitting; random_state pins that shuffle so that
# re-running the cell reproduces the same folds and therefore the same scores.
# NOTE: any output recorded from an earlier, unseeded run will differ --
# re-run the cell to refresh it.
strategy = KFold(n_splits=3, shuffle=True, random_state=42)

iris = load_iris()
X = DataFrame(iris.data, columns=iris.feature_names)
y = Series(iris.target)

model = KNeighborsClassifier()  # default: k=5

# One score per fold -> three values in `results`.
results = cross_val_score(model, X, y, cv=strategy)  # divide into 3 parts
Series(results).describe()

Populating the interactive namespace from numpy and matplotlib


count    3.000000
mean     0.966667
std      0.011547
min      0.960000
25%      0.960000
50%      0.960000
75%      0.970000
max      0.980000
dtype: float64

In [8]:
%pylab inline
import pandas as pd
from pandas import Series, DataFrame

from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, LeaveOneOut

# Leave-one-out: every sample is held out once as a one-element test set,
# so there are as many train/test rounds as there are rows in X.
strategy = LeaveOneOut()

iris = load_iris()
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
y = pd.Series(iris.target)

# Default k; to try another value pass n_neighbors, e.g.
# KNeighborsClassifier(n_neighbors=11).
model = KNeighborsClassifier()

# One score per held-out sample (1.0 = predicted correctly, 0.0 = not).
results = cross_val_score(estimator=model, X=X, y=y, cv=strategy)
pd.Series(results).describe()

# Use a "for" loop to iterate over all k values from 1 to 23
# Use cross_val_score and LeaveOneOut to identify the best
# value of k (highest mean, lowest std, lowest k)

Populating the interactive namespace from numpy and matplotlib


count    150.000000
mean       0.966667
std        0.180107
min        0.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        1.000000
dtype: float64