In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# train_test_split

This function takes one or more arrays and splits each of them into two arrays (a training set and a test set) by randomly selecting rows or values.

In [None]:
# X: predictor matrix with 6 rows and 2 columns (0..5 and 6..11)
# y: numeric output, as used by regression methods
# z: categorical output (two classes), as used by classification methods
X = np.arange(12).reshape(6, 2, order="F")
y = np.arange(6)
z = np.repeat([0, 1], 3)
print(X, y, z, sep="\n")

We can use train_test_split on each array individually.

It returns a tuple that can be unpacked into train and test arrays.

In [None]:
# Split the predictor matrix alone: one third of its rows become the test set.
# random_state fixes the shuffle so the split is the same on every run.
X_train, X_test = train_test_split(X, random_state=1, test_size=1/3)
print(X_train, X_test, sep="\n")

In [None]:
# The same call works on a one-dimensional array: y is split by value.
y_train, y_test = train_test_split(y, random_state=1, test_size=1/3)
print(y_train, y_test, sep="\n")

We can also apply it to multiple arrays simultaneously. The same rows are selected from every array, so the predictors and the outputs stay aligned.

In [None]:
# Split X and y in one call; the result unpacks as
# (X train, X test, y train, y test).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=1/3
)
print(X_train, X_test, y_train, y_test, sep="\n")

In [None]:
# With a categorical output, passing it as `stratify` makes sure each
# category is represented appropriately in both resulting splits.
X_train, X_test, z_train, z_test = train_test_split(
    X, z, test_size=1/3, stratify=z, random_state=1
)
print(X_train, X_test, z_train, z_test, sep="\n")

# Classification example with iris data


In [None]:
# Load the iris dataset that ships with scikit-learn.
iris = load_iris()
type(iris) # iris is a Bunch, which can be used like a dictionary

In [None]:
# List the entries available on the iris Bunch.
iris.keys()

In [None]:
# Peek at the first 8 rows of the feature matrix: 4 columns, one per variable.
print(iris.data[:8])

In [None]:
# Names of the predictor variables (features): four numeric measurements.
print(iris.feature_names)

In [None]:
# The target is a categorical variable with three levels, stored as integer codes.
iris.target

In [None]:
# Each observation belongs to one of these three species.
print(iris.target_names)

In [None]:
# Print the description text bundled with the dataset.
print(iris.DESCR)

In [None]:
# Hold out 20% of the iris observations as a test set. Stratifying on the
# species labels gives each species an appropriate share of both splits.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    stratify=iris.target,
    random_state=1,
)

In [None]:
# Dimensions of the training feature matrix: (number of rows, number of columns).
X_train.shape

In [None]:
# Count how many test observations carry each class label.
print(np.bincount(y_test))

# We will fit a k-nearest neighbors classifier

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Put the training features in a DataFrame labelled with the feature names,
# so that every panel of the plot gets readable axis labels.
dat = pd.DataFrame(X_train, columns=iris.feature_names)

# Draw the scatter-plot matrix of the four features. The figure itself is the
# output: printing the returned PairGrid would only add its text repr
# (including a memory address that changes between runs) to the cell output.
sns.pairplot(dat)
plt.show()

In [None]:
# Create a k-nearest neighbors classifier that uses the 5 nearest neighbors.
knn = KNeighborsClassifier(n_neighbors=5)

In [None]:
# Print the estimator's text representation.
print(knn)

In [None]:
# Fit the classifier using the training split only.
knn.fit(X_train, y_train)

In [None]:
# Predictions for the test observations, based on the fitted model.
print(knn.predict(X_test))

In [None]:
# The actual class labels of the test observations, for comparison.
print(y_test)

In [None]:
from sklearn.metrics import confusion_matrix

# The true answer is in the rows and the prediction is in the columns,
# so a model with no errors would give a diagonal matrix.
# We see that when the true answer is 2, the prediction was 1 one time.
test_pred = knn.predict(X_test)
confusion_matrix(y_test, test_pred)

In [None]:
# Try a different knn model that uses only the single nearest neighbor.
# fit() returns the estimator itself, so creation and fitting can be chained.
knn1 = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
confusion_matrix(y_test, knn1.predict(X_test))
# same performance

In [None]:
# Another knn model, this time using the 25 nearest neighbors.
knn25 = KNeighborsClassifier(n_neighbors=25).fit(X_train, y_train)
confusion_matrix(y_test, knn25.predict(X_test))
# Slightly worse performance with 25 nearest neighbors:
# we have 2 errors. The true value is 2 and we predicted 1 twice.

# Automated parameter search using GridSearchCV

What if I want to test out a bunch of different nearest neighbor values?

We could try a knn with 1 nearest neighbor,

a knn with 2 neighbors,

a knn with 3 neighbors, etc.

Rather than manually fitting dozens of different models, we can have Python automatically fit all of them for us using GridSearchCV.

GridSearchCV will try a bunch of different models (you can vary more than one parameter at a time) and will report back each model's score.

Keep in mind that the more parameters you try out, the longer it will take to test and fit all of the models.

In [None]:
from sklearn.model_selection import GridSearchCV

# param_grid is a dictionary: each key is the name of an argument that goes
# into the model, and each value lists the settings to try for it.
# Here: n_neighbors from 1 through 49, combined with both weighting schemes.
param_grid = dict(
    n_neighbors=np.arange(1, 50),
    weights=['uniform', 'distance'],
)

# Note that n_neighbors is not specified here; the search supplies it.
knn = KNeighborsClassifier()

# GridSearchCV is given our model 'knn' and the parameters to search through.
# cv=5 means each combination is scored with 5-fold cross validation.
# The search is meant to use only the training data, never the test data.
knn_cv = GridSearchCV(knn, param_grid, cv=5)

In [None]:
# Run the grid search on the training split: every parameter combination
# in the grid is fitted and scored by cross-validation.
knn_cv.fit(X_train, y_train)

In [None]:
# Which set of parameters produced the best performing model?
knn_cv.best_params_

In [None]:
# Mean cross-validated score of the best parameter set.
# The score used is accuracy, which may not be the best metric.
knn_cv.best_score_

In [None]:
# Refit a single model with the number of neighbors chosen by the grid search.
# The value is read from knn_cv.best_params_ rather than typed in by hand, so
# this cell stays in sync with the search if the data or the grid changes.
# (weights is left at its default, as in a plain KNeighborsClassifier.)
best_k = knn_cv.best_params_['n_neighbors']
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)

# Confusion matrix on the training data itself: rows are the true classes,
# columns are the predicted classes.
print(confusion_matrix(y_train, knn.predict(X_train)))

In [None]:
# Training accuracy: the fraction of training observations that the fitted
# model classifies correctly. It is computed from the model rather than
# typed in from the confusion matrix, so it stays right on every re-run.
knn.score(X_train, y_train)

In [None]:
# Confusion matrix of the fitted knn model on the held-out test data
# (rows: true classes, columns: predicted classes).
print(confusion_matrix(y_test, knn.predict(X_test)))

In [None]:
# Cross-validate a model that uses the number of neighbors chosen by the grid
# search (read from knn_cv.best_params_ instead of a hard-coded value).
# cross_val_score fits copies of the estimator on 5 folds of the training
# data and returns one score per fold.
knn = KNeighborsClassifier(n_neighbors=knn_cv.best_params_['n_neighbors'])
cv_scores = cross_val_score(knn, X_train, y_train, cv=5)
print(cv_scores)
print(np.mean(cv_scores))  # average score across the 5 folds

## Gaussian Naive Bayes

In [None]:
# Create a Gaussian naive Bayes classifier with its default settings.
nbclass = GaussianNB()

In [None]:
# Fit the naive Bayes classifier using the training split only.
nbclass.fit(X_train, y_train)

In [None]:
# Predicted classes for the test observations.
nbclass.predict(X_test)

In [None]:
# Confusion matrix for naive Bayes on the test data
# (rows: true classes, columns: predicted classes).
print(confusion_matrix(y_test, nbclass.predict(X_test)))

In [None]:
# One score per fold from 5-fold cross-validation on the training data.
cross_val_score(nbclass, X_train, y_train, cv = 5)

In [None]:
# Average of the five cross-validation fold scores for naive Bayes.
cross_val_score(nbclass, X_train, y_train, cv=5).mean()