# Implementing KNN Classifier

A simple Knn Classifier using NBA data. This is following along a tutorial from https://www.dataquest.io/blog/k-nearest-neighbors/

In [82]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [83]:
# load in file path
player_data_file = "/Users/scottlorimor/desktop/python/datasets/nba_2013.csv"
plr_df = pd.read_csv(player_data_file)

# some basics about the dataset
print plr_df.shape
plr_df.head()

(481, 31)


Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,144,216,28,23,26,30,122,171,2013-2014,2013
1,Steven Adams,C,20,OKC,81,20,1197,93,185,0.503,...,190,332,43,40,57,71,203,265,2013-2014,2013
2,Jeff Adrien,PF,27,TOT,53,12,961,143,275,0.52,...,204,306,38,24,36,39,108,362,2013-2014,2013
3,Arron Afflalo,SG,28,ORL,73,73,2552,464,1011,0.459,...,230,262,248,35,3,146,136,1330,2013-2014,2013
4,Alexis Ajinca,C,25,NOP,56,30,951,136,249,0.546,...,183,277,40,23,46,63,187,328,2013-2014,2013


Equation for computing Euclidean Distance: $ \sqrt{(q_1 - p_1)^2 + (q_2 - p_2)^2  + .. +  (q_n - p_n)^2} $

Euclidean distance is used to compute the distance for "k".

In [50]:
# find similar players to LeBron
lebron = plr_df[plr_df['player'] == 'LeBron James']
distance_columns = ['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 
                    'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 
                    'efg.', 'ft', 'fta', 'ft.', 'orb',
                    'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts']


In [54]:
# implements euclidean distance algo

def euclid_distance(row):
    '''
    very simple euclidean distance function
    '''
    inner_value = 0
    for k in distance_columns:
        inner_value += (row[k] - lebron[k])**2
        return np.sqrt(inner_value)

    

In [78]:
# find similar players to LeBron
lebron_distance = plr_df.apply(euclid_distance,axis=1)
lebron_distance.head()

Unnamed: 0,225
0,22.406744
1,19.406744
2,26.406744
3,27.406744
4,24.406744


In [79]:
# normalize columns to keep from inherently larger column's values skewing the distance function

plr_numeric = plr_df[distance_columns]

# normalize data so mean = 0 and std = 1 
plr_normed = (plr_numeric - plr_numeric.mean())/(plr_numeric.std())

In [100]:
# finding nearest neighbors with scipy
from scipy.spatial import distance as distance

plr_normed.fillna(0, inplace=True)
lebron_normed = plr_normed[plr_df['player'] == "LeBron James"]

euc_dist_lebron = plr_normed.apply(lambda row: distance.euclidean(row,lebron_normed),axis=1)

simi_plrs = pd.DataFrame(data={"distance": euc_dist_lebron, "index": euc_dist_lebron.index})
simi_plrs.sort_values("distance", inplace=True)
nn = simi_plrs.iloc[1]['index']
most_similar = plr_df.loc[int(nn)]['player']

print "The Most similar player to LeBron James: {0}".format(most_similar)

The Most similar player to LeBron James: Carmelo Anthony


In [115]:
# create test and training sets
import random
from numpy.random import permutation

rand_idx = permutation(plr_df.index)
rand_idx.shape

cutoff = int(np.floor(len(plr_df)/3))

test = plr_df.loc[rand_idx[1:cutoff]]
test.fillna(0,inplace=True)
train = plr_df.loc[rand_idx[cutoff:]]
train.fillna(0,inplace=True)

In [128]:
from sklearn.neighbors import KNeighborsRegressor
# note: weights='distance' <-- assigns weights proportional to the inverse of the distance to the query point
features = ['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 
                        'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.',
                        'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf']
y = ['pts']

# train model
# model = KNeighborsRegressor(n_neighbors=5)
model = KNeighborsRegressor(n_neighbors=5,weights='distance')
model.fit(train[features], train[y])

# predictions 
predictions = model.predict(test[features])

In [129]:
# computer error MSE
actual = test[y]

mse = (((predictions - actual) ** 2).sum()) / len(predictions)
mse

pts    7120.28003
dtype: float64