In [1]:
import pandas as pd
import math
from scipy.spatial import distance
import random
from numpy.random import permutation
from sklearn.neighbors import KNeighborsRegressor

with open("nba_2013.csv", 'r') as csvfile:
    nba = pd.read_csv(csvfile)

# Select a player from dataset nba
player_chosen = nba[nba["player"] == "James Anderson"].iloc[0]

In [2]:
nba.columns

Index(['player', 'pos', 'age', 'bref_team_id', 'g', 'gs', 'mp', 'fg', 'fga',
       'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft',
       'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf',
       'pts', 'season', 'season_end'],
      dtype='object')

In [3]:
nba.head()

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,144,216,28,23,26,30,122,171,2013-2014,2013
1,Steven Adams,C,20,OKC,81,20,1197,93,185,0.503,...,190,332,43,40,57,71,203,265,2013-2014,2013
2,Jeff Adrien,PF,27,TOT,53,12,961,143,275,0.52,...,204,306,38,24,36,39,108,362,2013-2014,2013
3,Arron Afflalo,SG,28,ORL,73,73,2552,464,1011,0.459,...,230,262,248,35,3,146,136,1330,2013-2014,2013
4,Alexis Ajinca,C,25,NOP,56,30,951,136,249,0.546,...,183,277,40,23,46,63,187,328,2013-2014,2013


In [4]:
# Chose columns with numbers as values
num_cols = ['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts']

def distance_to_player(row):
    value_inner = 0
    for k in num_cols:
        value_inner += (row[k] - player_chosen[k]) ** 2
    return math.sqrt(value_inner)

# Find the distance from each player
dist_james = nba.apply(distance_to_player, axis=1)

In [5]:
dist_james

0      1784.998042
1              NaN
2              NaN
3       830.619661
4      1582.680996
5              NaN
6      1780.529425
7      1468.702163
8       480.330132
9      1181.967014
10      594.270150
11             NaN
12     1158.916342
13      697.597311
14        0.000000
15     1663.735274
16      667.463866
17     2148.231837
18             NaN
19     1576.556691
20      607.077435
21             NaN
22     1315.005721
23             NaN
24      511.217215
25             NaN
26             NaN
27             NaN
28     2118.075311
29     2233.520543
          ...     
451    1594.227090
452    1223.526084
453    2534.648499
454    1321.360286
455    2518.965304
456            NaN
457     311.197719
458     915.275922
459    1011.944669
460            NaN
461            NaN
462     445.257238
463     679.449050
464    1348.340092
465     976.154713
466     723.108571
467     580.401614
468            NaN
469    1836.799664
470    1927.998216
471    1196.580132
472    2212.

In [6]:
# numeric columns
nba_num_cols = nba[num_cols]

# normalize columns nba
nba_norm = (nba_num_cols - nba_num_cols.mean()) / nba_num_cols.std()

In [7]:
nba_norm.head(5)

Unnamed: 0,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,...,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts
0,-0.835906,0.384886,-0.862207,-0.435088,-0.738401,-0.768505,0.319884,-0.700282,-0.716608,-0.117009,...,-0.389712,0.26069,-0.129462,-0.013116,-0.64522,-0.468056,0.06141,-0.66765,0.226515,-0.734621
1,-1.550487,1.095711,-0.187863,-0.045011,-0.581271,-0.649215,0.674593,-0.778936,-0.829601,,...,-0.88295,1.387883,0.18702,0.565852,-0.530733,0.02068,1.065446,-0.01376,1.363938,-0.534801
2,0.116868,-0.010016,-0.4576,-0.308035,-0.290291,-0.405214,0.84688,-0.778936,-0.829601,,...,-0.520826,0.743773,0.28334,0.436083,-0.568895,-0.439307,0.385292,-0.524113,0.029924,-0.328603
3,0.355062,0.779789,1.599148,1.465144,1.577804,1.590172,0.228673,1.737992,1.430256,0.898007,...,0.578033,-0.38342,0.462221,0.216475,1.033919,-0.123066,-0.68352,1.18238,0.423107,1.729123
4,-0.359519,0.108454,0.149309,-0.31918,-0.331028,-0.475703,1.110379,-0.778936,-0.822068,-1.808704,...,0.709147,0.614951,0.138859,0.291341,-0.55363,-0.468056,0.709175,-0.141348,1.139262,-0.400878


In [8]:
# fill in empty values
nba_norm.fillna(0, inplace=True)

# normalized vector.
norm_james = nba_norm[nba["player"] == "James Anderson"]

# distance from james to others
distance_all = nba_norm.apply(lambda row: distance.euclidean(row, norm_james), axis=1)

# dataframe with distances
distance_frame = pd.DataFrame(data={"dist": distance_all, "idx": distance_all.index})
distance_frame.sort_values("dist", inplace=True)

# find the similar player to james
small2 = distance_frame.iloc[1]["idx"]
similar_james = nba.loc[int(small2)]

In [9]:
similar_james

player          Wilson Chandler
pos                          SF
age                          26
bref_team_id                DEN
g                            62
gs                           55
mp                         1927
fg                          307
fga                         738
fg.                       0.416
x3p                         122
x3pa                        351
x3p.                   0.347578
x2p                         185
x2pa                        387
x2p.                   0.478036
efg.                      0.499
ft                          110
fta                         152
ft.                       0.724
orb                          58
drb                         236
trb                         294
ast                         114
stl                          46
blk                          31
tov                          79
pf                          193
pts                         846
season                2013-2014
season_end                 2013
Name: 89

In [10]:
# random nba
random_idx = permutation(nba.index)

# cutoff of test set
test_cutoff = math.floor(len(nba)/3)

# generate test
test_set = nba.loc[random_idx[1:test_cutoff]]

# train set
train_set = nba.loc[random_idx[test_cutoff:]]

In [11]:
# cols for prediction
x_cols = ['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf']
# cols to predict
y_cols = ["pts"]


train_set.fillna(0, inplace=True)
test_set.fillna(0, inplace=True)

In [12]:
train_set[x_cols].head()

Unnamed: 0,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,...,fta,ft.,orb,drb,trb,ast,stl,blk,tov,pf
403,22,77,12,1231,129,336,0.384,57,178,0.320225,...,41,0.756,19,105,124,68,29,15,44,84
356,31,1,0,19,2,6,0.333,0,0,0.0,...,0,0.0,0,5,5,0,0,1,1,1
174,26,68,59,1651,218,505,0.432,132,318,0.415094,...,63,0.794,25,204,229,104,65,61,76,107
210,24,2,0,7,0,4,0.0,0,2,0.0,...,2,0.5,0,0,0,1,1,0,0,0
223,28,30,0,146,11,23,0.478,0,1,0.0,...,11,0.545,15,27,42,3,3,8,10,24


In [20]:
# knn model
knn = KNeighborsRegressor(n_neighbors=5)

# Fit the model on the training data.
knn.fit(train_set[x_cols], train_set[y_cols])

# Make point predictions on the test set using the fit model.
predictions = knn.predict(test_set[x_cols])

In [21]:
predictions[:5]

array([[ 71.6],
       [ 33.2],
       [570.8],
       [165. ],
       [ 14.6]])

In [24]:
# actual values of test set
actual_test = test_set[y_cols]

# mean squared error of predictions
mean_sq_err = (((predictions - actual_test) ** 2).sum()) / len(predictions)

In [19]:
# points scored by the players in 2013-14 season
actual_test.head(20)

Unnamed: 0,pts
323,76
18,28
280,544
470,190
182,26
61,419
384,767
158,859
84,967
394,75


In [25]:
mean_sq_err

pts    6342.386667
dtype: float64