# An Introduction to K-Nearest Neighbors

Each row of the data set contains information on how a player performed during the 2013-2014 NBA season.
Here are a few of the columns:
- `player` - The player's name
- `pos` - The player's position
- `g` - The number of games the player appeared in
- `gs` - The number of games in which the player started
- `pts` - The total points the player scored

In [1]:
import math
import random

import numpy as np
import pandas as pd
from numpy.random import permutation
from scipy.spatial import distance
from sklearn.neighbors import KNeighborsRegressor

### Exploring the Data

In [2]:
# Load the per-player season statistics; the CSV is read from the notebook's working directory.
nba = pd.read_csv(filepath_or_buffer="nba_2013.csv")

In [3]:
# (number of rows, number of columns) of the loaded table.
nba.shape

(481, 31)

In [4]:
# Preview the first five rows to see what a player record looks like.
nba.head(n=5)

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,144,216,28,23,26,30,122,171,2013-2014,2013
1,Steven Adams,C,20,OKC,81,20,1197,93,185,0.503,...,190,332,43,40,57,71,203,265,2013-2014,2013
2,Jeff Adrien,PF,27,TOT,53,12,961,143,275,0.52,...,204,306,38,24,36,39,108,362,2013-2014,2013
3,Arron Afflalo,SG,28,ORL,73,73,2552,464,1011,0.459,...,230,262,248,35,3,146,136,1330,2013-2014,2013
4,Alexis Ajinca,C,25,NOP,56,30,951,136,249,0.546,...,183,277,40,23,46,63,187,328,2013-2014,2013


In [5]:
# Full list of column names (the preview above elides the middle columns).
nba.columns.values

array(['player', 'pos', 'age', 'bref_team_id', 'g', 'gs', 'mp', 'fg',
       'fga', 'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.',
       'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov',
       'pf', 'pts', 'season', 'season_end'], dtype=object)

### Using Euclidean Distance to find similar rows

In [6]:
# Numeric stat columns used to measure how similar two players are.
# Non-numeric columns (player, pos, bref_team_id, season) and season_end are left out.
distance_columns = [
    'age', 'g', 'gs', 'mp',
    'fg', 'fga', 'fg.',
    'x3p', 'x3pa', 'x3p.',
    'x2p', 'x2pa', 'x2p.', 'efg.',
    'ft', 'fta', 'ft.',
    'orb', 'drb', 'trb',
    'ast', 'stl', 'blk', 'tov', 'pf', 'pts',
]

In [7]:
def euclidean_dist(row, target=None, columns=None):
    """Return the Euclidean distance between ``row`` and a reference row.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        The record to measure; must support ``row[column]`` lookups.
    target : mapping-like, optional
        The reference record. Defaults to the notebook-level ``selected_player``.
    columns : iterable of str, optional
        The columns to include in the distance. Defaults to the notebook-level
        ``distance_columns``.

    Returns
    -------
    float
        Square root of the summed squared differences over ``columns``.
        If any compared value is NaN the result is NaN.
    """
    # Resolve the defaults at call time rather than definition time:
    # `selected_player` is assigned in a later cell than this definition.
    if target is None:
        target = selected_player
    if columns is None:
        columns = distance_columns
    squared_diffs = ((row[col] - target[col]) ** 2 for col in columns)
    return math.sqrt(sum(squared_diffs))

In [8]:
# Pick the reference player: filter on the name, then take the first matching row as a Series.
selected_player = nba.loc[nba["player"].eq("LeBron James")].iloc[0]
selected_player

player          LeBron James
pos                       PF
age                       29
bref_team_id             MIA
g                         77
gs                        77
mp                      2902
fg                       767
fga                     1353
fg.                    0.567
x3p                      116
x3pa                     306
x3p.                0.379085
x2p                      651
x2pa                    1047
x2p.                0.621777
efg.                    0.61
ft                       439
fta                      585
ft.                     0.75
orb                       81
drb                      452
trb                      533
ast                      488
stl                      121
blk                       26
tov                      270
pf                       126
pts                     2089
season             2013-2014
season_end              2013
Name: 225, dtype: object

In [9]:
# Distance from every player (one row each) to the selected player, on the raw stats.
# Rows with a missing value in any compared column come out as NaN.
lebron_distance = nba.apply(euclidean_dist, axis="columns")
lebron_distance

0      3475.792868
1              NaN
2              NaN
3      1189.554979
4      3216.773098
5              NaN
6       960.443178
7      3131.071083
8      2326.129199
9      2806.955657
10     2277.933945
11             NaN
12     2819.058890
13     2534.074598
14     1970.085795
15     3262.065464
16     2451.378405
17      485.856006
18             NaN
19     3246.515831
20     1539.172839
21             NaN
22     2969.043638
23             NaN
24     2023.603985
25             NaN
26             NaN
27             NaN
28     3754.041967
29     3835.882699
          ...     
451     716.243023
452    2996.450583
453    4135.156714
454    3023.456473
455    4138.570811
456            NaN
457    2206.524879
458    1347.758158
459    2136.309449
460            NaN
461            NaN
462    1922.713718
463    2364.771676
464    3033.755934
465    2625.998112
466    2495.296784
467    2232.354830
468            NaN
469    3525.434026
470    3574.911070
471    2873.509019
472    3831.

### Normalizing columns

In [10]:
# Keep only the numeric stat columns that take part in the distance computation.
nba_numeric = nba.loc[:, distance_columns]
nba_numeric.head(n=5)

Unnamed: 0,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,...,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts
0,23,63,0,847,66,141,0.468,4,15,0.266667,...,0.66,72,144,216,28,23,26,30,122,171
1,20,81,20,1197,93,185,0.503,0,0,,...,0.581,142,190,332,43,40,57,71,203,265
2,27,53,12,961,143,275,0.52,0,0,,...,0.639,102,204,306,38,24,36,39,108,362
3,28,73,73,2552,464,1011,0.459,128,300,0.426667,...,0.815,32,230,262,248,35,3,146,136,1330
4,25,56,30,951,136,249,0.546,0,1,0.0,...,0.836,94,183,277,40,23,46,63,187,328


In [11]:
# Per-column mean across all players.
mean = nba_numeric.mean(axis=0)
mean

age       26.509356
g         53.253638
gs        25.571726
mp      1237.386694
fg       192.881497
fga      424.463617
fg.        0.436436
x3p       39.613306
x3pa     110.130977
x3p.       0.285111
x2p      153.268191
x2pa     314.332640
x2p.       0.466947
efg.       0.480752
ft        91.205821
fta      120.642412
ft.        0.722419
orb       55.810811
drb      162.817048
trb      218.627859
ast      112.536383
stl       39.280665
blk       24.103950
tov       71.862786
pf       105.869023
pts      516.582121
dtype: float64

In [12]:
# Per-column sample standard deviation (ddof=1 is the pandas default, spelled out here).
std = nba_numeric.std(axis=0, ddof=1)
std

age       4.198265
g        25.322711
gs       29.658465
mp      897.258840
fg      171.832793
fga     368.850833
fg.       0.098672
x3p      50.855639
x3pa    132.751732
x3p.      0.157633
x2p     147.223161
x2pa    294.174554
x2p.      0.104448
efg.      0.099552
ft      103.667725
fta     131.240639
ft.       0.160166
orb      62.101191
drb     145.348116
trb     200.356507
ast     131.019557
stl      34.783590
blk      30.875381
tov      62.701690
pf       71.213627
pts     470.422228
dtype: float64

In [13]:
# Z-score every column (subtract its mean, divide by its standard deviation)
# so that large-valued stats do not dominate the distance.
nba_normalized = nba_numeric.sub(mean, axis="columns").div(std, axis="columns")
nba_normalized.head(n=5)

Unnamed: 0,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,...,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts
0,-0.835906,0.384886,-0.862207,-0.435088,-0.738401,-0.768505,0.319884,-0.700282,-0.716608,-0.117009,...,-0.389712,0.26069,-0.129462,-0.013116,-0.64522,-0.468056,0.06141,-0.66765,0.226515,-0.734621
1,-1.550487,1.095711,-0.187863,-0.045011,-0.581271,-0.649215,0.674593,-0.778936,-0.829601,,...,-0.88295,1.387883,0.18702,0.565852,-0.530733,0.02068,1.065446,-0.01376,1.363938,-0.534801
2,0.116868,-0.010016,-0.4576,-0.308035,-0.290291,-0.405214,0.84688,-0.778936,-0.829601,,...,-0.520826,0.743773,0.28334,0.436083,-0.568895,-0.439307,0.385292,-0.524113,0.029924,-0.328603
3,0.355062,0.779789,1.599148,1.465144,1.577804,1.590172,0.228673,1.737992,1.430256,0.898007,...,0.578033,-0.38342,0.462221,0.216475,1.033919,-0.123066,-0.68352,1.18238,0.423107,1.729123
4,-0.359519,0.108454,0.149309,-0.31918,-0.331028,-0.475703,1.110379,-0.778936,-0.822068,-1.808704,...,0.709147,0.614951,0.138859,0.291341,-0.55363,-0.468056,0.709175,-0.141348,1.139262,-0.400878


### Finding the nearest neighbor of a given row

In [14]:
# Replace missing values with 0. Because the columns are z-scored, 0 is the column
# mean, so a missing stat is treated as "average". Reassign instead of mutating in
# place so the cell produces its result from its input without side effects.
nba_normalized = nba_normalized.fillna(0)

In [15]:
# Confirm the previously missing entries now read 0.
nba_normalized.head(n=5)

Unnamed: 0,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,...,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts
0,-0.835906,0.384886,-0.862207,-0.435088,-0.738401,-0.768505,0.319884,-0.700282,-0.716608,-0.117009,...,-0.389712,0.26069,-0.129462,-0.013116,-0.64522,-0.468056,0.06141,-0.66765,0.226515,-0.734621
1,-1.550487,1.095711,-0.187863,-0.045011,-0.581271,-0.649215,0.674593,-0.778936,-0.829601,0.0,...,-0.88295,1.387883,0.18702,0.565852,-0.530733,0.02068,1.065446,-0.01376,1.363938,-0.534801
2,0.116868,-0.010016,-0.4576,-0.308035,-0.290291,-0.405214,0.84688,-0.778936,-0.829601,0.0,...,-0.520826,0.743773,0.28334,0.436083,-0.568895,-0.439307,0.385292,-0.524113,0.029924,-0.328603
3,0.355062,0.779789,1.599148,1.465144,1.577804,1.590172,0.228673,1.737992,1.430256,0.898007,...,0.578033,-0.38342,0.462221,0.216475,1.033919,-0.123066,-0.68352,1.18238,0.423107,1.729123
4,-0.359519,0.108454,0.149309,-0.31918,-0.331028,-0.475703,1.110379,-0.778936,-0.822068,-1.808704,...,0.709147,0.614951,0.138859,0.291341,-0.55363,-0.468056,0.709175,-0.141348,1.139262,-0.400878


In [16]:
# Normalized stats for the reference player, selected with a boolean mask built from
# the original frame (both frames share the same index). The result is a one-row DataFrame.
lebron_normalized = nba_normalized.loc[nba["player"].eq("LeBron James")]
lebron_normalized

Unnamed: 0,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,...,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts
225,0.593256,0.93775,1.734017,1.855221,3.341146,2.517376,1.323204,1.50203,1.475454,0.596156,...,0.172205,0.405615,1.989589,1.569064,2.865707,2.349365,0.06141,3.159998,0.282684,3.342567


In [17]:
# scipy's distance.euclidean is defined for 1-D vectors, but `lebron_normalized` is a
# one-row (2-D) frame; recent SciPy releases reject that with "Input vector should be 1-D".
# Flatten the reference once, outside the per-row lambda.
lebron_vector = lebron_normalized.values.ravel()
euclidean_distances_to_lebron = nba_normalized.apply(
    lambda row: distance.euclidean(row, lebron_vector),
    axis=1,
)
euclidean_distances_to_lebron

0      13.131600
1      12.180993
2      11.850978
3       6.473960
4      12.182853
5      13.881297
6       6.745977
7      12.403945
8      10.395894
9      10.649743
10      9.785699
11     14.803723
12     11.502708
13     10.863219
14      9.116650
15     12.647002
16     10.353420
17      4.171854
18     15.109541
19     12.044387
20      7.613101
21     14.902435
22     12.021561
23     12.056919
24      9.056844
25     13.810640
26     12.382987
27     15.323273
28     13.977957
29     14.084593
         ...    
451     4.937466
452    11.488794
453    14.824086
454    11.674824
455    15.610856
456    15.214291
457    10.371671
458     7.283207
459     7.381408
460    18.235339
461    18.306939
462     7.918121
463    10.132054
464    11.825950
465    10.510909
466    10.392935
467     9.827142
468    14.993251
469    13.268025
470    13.389306
471    11.530440
472    14.058841
473    11.574653
474    14.994545
475    12.376936
476     8.083717
477     8.543626
478     6.2541

In [18]:
# Pair each distance with its row label, then order from closest to farthest.
d = pd.DataFrame(
    data={
        "dist": euclidean_distances_to_lebron,
        "idx": euclidean_distances_to_lebron.index,
    }
).sort_values("dist")
d

Unnamed: 0,dist,idx
225,0.000000,225
17,4.171854,17
136,4.206786,136
128,4.382582,128
185,4.489928,185
133,4.619280,133
123,4.673849,123
162,4.844802,162
332,4.893563,332
451,4.937466,451


In [19]:
# Position 0 of the sorted table is the reference player himself (distance 0),
# so position 1 holds the nearest other player; look up that row's name.
most_similar_to_lebron = nba.loc[int(d["idx"].iloc[1]), "player"]
most_similar_to_lebron

'Carmelo Anthony'

### Making predictions on a test set

In [20]:
# Seed NumPy's global RNG so the shuffle, and the train/test split built from it,
# is the same on every Restart & Run All.
np.random.seed(1)
# Shuffled copy of the row labels.
random_indices = permutation(nba.index)
# Number of rows to hold out for testing: one third of the data, rounded down.
test_cutoff = math.floor(len(nba) / 3)

In [21]:
# Split the shuffled labels into two disjoint sets that together cover every row:
# the first `test_cutoff` labels form the test set, the remainder the training set.
# The test slice must start at 0; starting at 1 drops one player from both sets.
test = nba.loc[random_indices[:test_cutoff]]
train = nba.loc[random_indices[test_cutoff:]]

In [22]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
214,Kris Humphries,PF,28,BOS,69,30,1376,231,461,0.501,...,282,409,67,31,61,64,136,579,2013-2014,2013
388,Mustafa Shakur,PG,29,OKC,3,0,11,0,3,0.0,...,0,0,4,0,0,2,2,1,2013-2014,2013
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,144,216,28,23,26,30,122,171,2013-2014,2013
411,Amar'e Stoudemire,C,31,NYK,65,21,1466,311,558,0.557,...,208,320,34,23,37,91,159,772,2013-2014,2013
96,Jason Collins,C,35,BRK,22,1,172,11,24,0.458,...,12,19,4,8,1,7,30,25,2013-2014,2013


In [23]:
# Preview the first five held-out test rows.
test.head(n=5)

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
261,Shane Larkin,PG,21,DAL,48,0,489,52,137,0.38,...,31,42,71,26,1,39,46,132,2013-2014,2013
117,Glen Davis,PF,28,TOT,68,44,1662,271,593,0.457,...,243,351,77,58,29,76,159,641,2013-2014,2013
168,Archie Goodwin,SG,19,PHO,52,0,533,76,167,0.455,...,62,87,20,20,11,44,45,194,2013-2014,2013
452,Gerald Wallace,SF,31,BOS,58,16,1416,116,230,0.504,...,176,212,143,73,14,97,79,298,2013-2014,2013
304,Mike Miller,SF,33,MEM,82,4,1707,213,443,0.481,...,177,207,130,26,5,77,95,579,2013-2014,2013


In [24]:
# Predictor columns: every numeric stat except the target.
x_columns = [
    'age', 'g', 'gs', 'mp',
    'fg', 'fga', 'fg.',
    'x3p', 'x3pa', 'x3p.',
    'x2p', 'x2pa', 'x2p.', 'efg.',
    'ft', 'fta', 'ft.',
    'orb', 'drb', 'trb',
    'ast', 'stl', 'blk', 'tov', 'pf',
]
# Target column, kept as a one-element list so that selecting it yields a DataFrame.
y_column = ["pts"]

In [25]:
# The regressor cannot be fit on missing values, so replace them with 0 in both splits.
# Reassign instead of using inplace=True so each frame is produced from its input
# without mutating an object that earlier cells displayed.
train = train.fillna(0)
test = test.fillna(0)

In [26]:
# Predict from the 5 nearest training players.
# NOTE(review): the features here are the raw stats, not the z-scored ones from the
# normalization section, so large-valued columns weigh more in the neighbor search.
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X=train[x_columns], y=train[y_column])

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

In [27]:
# Predicted points for every held-out player.
predictions = knn.predict(X=test[x_columns])

In [28]:
# True points for the same held-out players (a one-column DataFrame, since y_column is a list).
actual = test.loc[:, y_column]

In [29]:
# Mean squared error: average squared difference between predicted and actual points.
# The result is a one-element Series keyed by "pts" because `actual` is a DataFrame.
mse = (predictions - actual).pow(2).sum() / len(actual)