In [32]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier 
from sklearn.model_selection import cross_validate, GridSearchCV
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import mean_squared_error

In [33]:
df = pd.read_csv('./data/PCA_Output.csv').drop('Unnamed: 0', axis=1)
df.head()

Unnamed: 0,PC-1,PC-2,PC-3,PC-4,PC-5,PC-6,PC-7,PC-8,PC-9,PC-10,...,PC-14,PC-15,PC-16,PC-17,PC-18,PC-19,PC-20,Year,Round,School
0,44.488013,13.451983,17.215303,15.516061,-1.392,10.32212,-0.833705,5.512971,9.932289,0.306404,...,-1.952358,1.265309,2.922808,-0.524872,1.021296,-0.708134,-0.519683,1,0,Duke
1,2.141548,-8.058277,5.556478,-10.172859,9.089618,1.121069,7.020399,1.294642,5.613959,1.230927,...,-1.255755,-2.200641,-0.754523,0.183981,1.142544,-1.451628,0.074957,4,0,Connecticut
2,22.14822,3.858848,-4.308118,2.385105,-7.294075,-3.227119,-2.024828,-1.64213,1.684555,-0.747842,...,-0.42208,-0.253577,0.585204,0.350586,-1.001076,-1.680653,1.665078,10,0,Duke
3,8.91111,-11.482865,9.480825,-10.883555,-1.270769,1.009218,2.371393,-1.351887,-4.145126,-2.918149,...,0.48558,-1.087791,-2.293007,1.35617,1.099706,-1.668532,0.767453,0,0,Michigan State
4,28.242471,5.259683,10.714157,3.375753,-5.324363,-6.114989,6.478768,-4.972679,2.294216,-2.136309,...,1.349417,2.037605,2.970757,-3.331529,-3.134163,-2.338882,-0.178971,6,0,Florida


In [43]:
df_test = df.sample(frac=.2, replace=False).copy()
df_train = df.drop(df_test.index).copy()

features_train = df_train.drop(['Year', 'Round', 'School'], axis=1)
features_test = df_test.drop(['Year', 'Round', 'School'], axis=1)

features_train = (features_train - features_train.mean())/features_train.std()
features_test = (features_test - features_test.mean())/features_test.std()

targets_train = df_train.Round
targets_test = df_test.Round

In [44]:
knn = KNeighborsRegressor(n_neighbors = 5)
knn.fit(features_train, targets_train)
target_predicted = knn.predict(features_test)
RMSE_test = np.sqrt(mean_squared_error(targets_test, target_predicted))

In [45]:
print(RMSE_test)
print(knn.score(features_test, targets_test))

2.306906626159408
-0.1336971830985918


In [46]:
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(features_train, targets_train)
target_predicted = knn.predict(features_test)
RMSE_test = np.sqrt(mean_squared_error(targets_test, target_predicted))

In [47]:
print(RMSE_test)
print(knn.score(features_test, targets_test))

3.4575345919521117
0.09090909090909091


In [48]:
grid = {'n_neighbors': np.arange(1, 50)}

knn = KNeighborsClassifier()
knnCV = GridSearchCV(knn, param_grid=grid, return_train_score=True)
knnCV.fit(features_train, targets_train)

print(knnCV.best_params_, knnCV.best_score_)

knnCV.score(features_test, targets_test)

{'n_neighbors': 1} 0.26666666666666666


0.22727272727272727

In [49]:
pred = knnCV.predict(features_test)

print(np.mean(abs(pred - targets_test)))

2.1363636363636362


In [41]:
grid = {'n_neighbors': np.arange(1, 50)}

knn = KNeighborsRegressor()
knnCV = GridSearchCV(knn, param_grid=grid, return_train_score=True)
knnCV.fit(features_train, targets_train)

print(knnCV.best_params_, knnCV.best_score_)

knnCV.score(features_test, targets_test)

{'n_neighbors': 16} -11.75229614672896


0.12139423076923074

In [42]:
pred = knnCV.predict(features_test)

print(np.mean(abs(pred - targets_test)))

1.9090909090909092


Classifier: 27, .47, 1.02