In [2]:
#We can implement a KNN model by following the below steps:

#Load the data
#Initialise the value of k
#For getting the predicted class, iterate from 1 to total number of training data points
#Calculate the distance between test data and each row of training data. Here we will use Euclidean distance as our distance metric since it’s the most popular method. The other metrics that can be used are Chebyshev, cosine, etc.
#Sort the calculated distances in ascending order based on distance values
#Get top k rows from the sorted array
#Get the most frequent class of these rows
#Return the predicted class

In [3]:
#importing library
import pandas as pd
import numpy as np
import math

In [4]:
import operator 
# Read the NBA player statistics CSV into a DataFrame.
with open("nba_2013.csv", mode='r') as nba_file:
    nba = pd.read_csv(nba_file)

# Show every column name available in the dataset.
print(nba.columns.values)

['player' 'pos' 'age' 'bref_team_id' 'g' 'gs' 'mp' 'fg' 'fga' 'fg.' 'x3p'
 'x3pa' 'x3p.' 'x2p' 'x2pa' 'x2p.' 'efg.' 'ft' 'fta' 'ft.' 'orb' 'drb'
 'trb' 'ast' 'stl' 'blk' 'tov' 'pf' 'pts' 'season' 'season_end']


In [5]:
# Missing entries would break the distance arithmetic below, so substitute 0
# for every NaN in the frame.
nba = nba.fillna(value=0)

In [6]:
# Preview the first five rows of the cleaned frame.
nba.head(n=5)

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,144,216,28,23,26,30,122,171,2013-2014,2013
1,Steven Adams,C,20,OKC,81,20,1197,93,185,0.503,...,190,332,43,40,57,71,203,265,2013-2014,2013
2,Jeff Adrien,PF,27,TOT,53,12,961,143,275,0.52,...,204,306,38,24,36,39,108,362,2013-2014,2013
3,Arron Afflalo,SG,28,ORL,73,73,2552,464,1011,0.459,...,230,262,248,35,3,146,136,1330,2013-2014,2013
4,Alexis Ajinca,C,25,NOP,56,30,951,136,249,0.546,...,183,277,40,23,46,63,187,328,2013-2014,2013


In [7]:
#We can use Euclidean distance to find the NBA players most similar to LeBron James.

In [8]:
# Pull LeBron James's row out of the dataset (first match on the player name).
selected_player = nba.loc[nba["player"] == "LeBron James"].iloc[0]

# Numeric columns that take part in the euclidean distance computation.
distance_columns = [
    'age', 'g', 'gs', 'mp',
    'fg', 'fga', 'fg.',
    'x3p', 'x3pa', 'x3p.',
    'x2p', 'x2pa', 'x2p.',
    'efg.',
    'ft', 'fta', 'ft.',
    'orb', 'drb', 'trb',
    'ast', 'stl', 'blk', 'tov', 'pf', 'pts',
]

def euclidean_distance(row):
    """
    Return the euclidean distance between `row` and the selected player.

    Reads the module-level `distance_columns` (the keys to compare) and
    `selected_player` (the reference record). `row` must be indexable by
    every name in `distance_columns`.
    """
    squared_total = 0
    for column in distance_columns:
        difference = row[column] - selected_player[column]
        squared_total += difference ** 2
    return math.sqrt(squared_total)

# Apply the distance function row by row to get every player's distance to LeBron.
lebron_distance = nba.apply(euclidean_distance, axis="columns")
lebron_distance

0      3475.792868
1      3148.395020
2      3161.567361
3      1189.554979
4      3216.773098
5      3919.277251
6       960.443178
7      3131.071083
8      2326.129199
9      2806.955657
10     2277.933945
11     4083.902230
12     2819.058890
13     2534.074598
14     1970.085795
15     3262.065464
16     2451.378405
17      485.856006
18     4096.585451
19     3246.515831
20     1539.172839
21     4162.749727
22     2969.043638
23     3259.582822
24     2023.603985
25     3815.822230
26     3326.666522
27     4149.198445
28     3754.041967
29     3835.882699
          ...     
451     716.243023
452    2996.450583
453    4135.156714
454    3023.456473
455    4138.570811
456    4258.518608
457    2206.524879
458    1347.758158
459    2136.309449
460    4260.053965
461    4261.181149
462    1922.713718
463    2364.771676
464    3033.755934
465    2625.998112
466    2495.296784
467    2232.354830
468    4244.598517
469    3525.434026
470    3574.911070
471    2873.509019
472    3831.

In [9]:
#Normalizing columns
#A simple way to deal with this is to normalize all the columns to have a mean of 0, and a standard deviation of 1. 
#This will ensure that no single column has a dominant impact on the euclidean distance calculations.
#To set the mean to 0, we have to find the mean of a column, then subtract the mean from every value in the column. 
#To set the standard deviation to 1, we divide every value in the column by the standard deviation. 
#The formula is $z = \frac{x - \mu}{\sigma}$, where $\mu$ is the column mean and $\sigma$ its standard deviation.

In [10]:
# Keep just the numeric columns used for distance calculations.
nba_numeric = nba.loc[:, distance_columns]
nba_numeric.head()


Unnamed: 0,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,...,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts
0,23,63,0,847,66,141,0.468,4,15,0.266667,...,0.66,72,144,216,28,23,26,30,122,171
1,20,81,20,1197,93,185,0.503,0,0,0.0,...,0.581,142,190,332,43,40,57,71,203,265
2,27,53,12,961,143,275,0.52,0,0,0.0,...,0.639,102,204,306,38,24,36,39,108,362
3,28,73,73,2552,464,1011,0.459,128,300,0.426667,...,0.815,32,230,262,248,35,3,146,136,1330
4,25,56,30,951,136,249,0.546,0,1,0.0,...,0.836,94,183,277,40,23,46,63,187,328


In [11]:
# Standardize each numeric column: subtract the column mean, then divide by
# the column standard deviation.
nba_normalized = nba_numeric.sub(nba_numeric.mean()).div(nba_numeric.std())
nba_normalized.head()

Unnamed: 0,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,...,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts
0,-0.835906,0.384886,-0.862207,-0.435088,-0.738401,-0.768505,0.325957,-0.700282,-0.716608,0.12052,...,-0.151926,0.26069,-0.129462,-0.013116,-0.64522,-0.468056,0.06141,-0.66765,0.226515,-0.734621
1,-1.550487,1.095711,-0.187863,-0.045011,-0.581271,-0.649215,0.667749,-0.778936,-0.829601,-1.390497,...,-0.522588,1.387883,0.18702,0.565852,-0.530733,0.02068,1.065446,-0.01376,1.363938,-0.534801
2,0.116868,-0.010016,-0.4576,-0.308035,-0.290291,-0.405214,0.833763,-0.778936,-0.829601,-1.390497,...,-0.250457,0.743773,0.28334,0.436083,-0.568895,-0.439307,0.385292,-0.524113,0.029924,-0.328603
3,0.355062,0.779789,1.599148,1.465144,1.577804,1.590172,0.238067,1.737992,1.430256,1.02713,...,0.57532,-0.38342,0.462221,0.216475,1.033919,-0.123066,-0.68352,1.18238,0.423107,1.729123
4,-0.359519,0.108454,0.149309,-0.31918,-0.331028,-0.475703,1.087666,-0.778936,-0.822068,-1.390497,...,0.673851,0.614951,0.138859,0.291341,-0.55363,-0.468056,0.709175,-0.141348,1.139262,-0.400878


In [12]:
#Finding the nearest neighbor
#We now know enough to find the nearest neighbor of a given row in the NBA dataset. 
#We can use the distance.euclidean function from scipy.spatial, 
#a much faster way to calculate euclidean distance.

In [13]:
from scipy.spatial import distance

# Replace any NaN produced by the normalization with 0. The result is
# reassigned rather than filled in place, so the frame object displayed by
# the earlier cell is not mutated behind the reader's back.
nba_normalized = nba_normalized.fillna(0)

# Normalized feature vector for LeBron James. `.iloc[0]` turns the one-row
# DataFrame returned by the boolean mask into a 1-D Series:
# `distance.euclidean` is documented for 1-D vectors, and newer SciPy
# releases raise "Input vector should be 1-D" when handed a (1, n) frame.
lebron_normalized = nba_normalized[nba["player"] == "LeBron James"].iloc[0]

# Distance between LeBron James and every player (himself included).
euclidean_distances = nba_normalized.apply(
    lambda row: distance.euclidean(row, lebron_normalized), axis=1
)

# Pair each distance with its row label and order from nearest to farthest.
distance_frame = pd.DataFrame(
    data={"dist": euclidean_distances, "idx": euclidean_distances.index}
).sort_values("dist")

# Position 0 is LeBron himself (distance 0), so position 1 is the most
# similar *other* player. Selecting a single row mixes the float "dist" with
# the integer "idx", which upcasts "idx" to float, hence the int() cast.
second_smallest = distance_frame.iloc[1]["idx"]
most_similar_to_lebron = nba.loc[int(second_smallest)]["player"]
most_similar_to_lebron

'Carmelo Anthony'

In [14]:
#Generating training and testing sets
#Now that we know how to find the nearest neighbors, 
#we can make predictions on a test set. We'll try to 
#predict how many points a player scored using the 5 closest neighbors. 
#We'll find neighbors by using all the numeric columns in the dataset 
#to generate similarity scores.

In [15]:
import random
from numpy.random import permutation

# Randomly shuffle the index of nba.
# NOTE: the shuffle is unseeded, so the split (and every number derived from
# it) changes on each run; call np.random.seed(<int>) first if a reproducible
# split is needed.
random_indices = permutation(nba.index)
# Size of the test set: one third of the rows, rounded down.
test_cutoff = len(nba) // 3
# The test set is the first third of the shuffled indices. The slice starts
# at 0: starting at 1 would silently drop the first shuffled row from both
# the test and the train set.
test = nba.loc[random_indices[:test_cutoff]]
# The train set is everything after the cutoff, so the two sets are disjoint
# and together cover every row.
train = nba.loc[random_indices[test_cutoff:]]
train.head()

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
368,Luke Ridnour,PG,32,TOT,61,14,1141,126,327,0.385,...,74,97,176,32,8,66,92,306,2013-2014,2013
6,LaMarcus Aldridge,PF,28,POR,69,69,2498,652,1423,0.458,...,599,765,178,63,68,123,147,1603,2013-2014,2013
19,Pero Antic,PF,31,ATL,50,26,925,123,294,0.418,...,152,209,58,19,12,55,126,352,2013-2014,2013
249,Sergey Karasev,SF,20,CLE,22,1,156,12,35,0.343,...,16,16,6,3,1,11,22,37,2013-2014,2013
211,Al Horford,C,27,ATL,29,29,958,238,420,0.567,...,178,244,76,27,44,64,56,538,2013-2014,2013


In [16]:
#Using sklearn for k nearest neighbors
#We can use the k-nearest neighbors implementation in scikit-learn.
#There's a regressor and a classifier available, but we'll be using the regressor, 
#as we have continuous values to predict on.

In [17]:
# Feature columns fed to the model (every numeric column except the target).
x_columns = [
    'age', 'g', 'gs', 'mp',
    'fg', 'fga', 'fg.',
    'x3p', 'x3pa', 'x3p.',
    'x2p', 'x2pa', 'x2p.',
    'efg.',
    'ft', 'fta', 'ft.',
    'orb', 'drb', 'trb',
    'ast', 'stl', 'blk', 'tov', 'pf',
]
# Target column. Kept as a one-element list so the target is selected as a
# single-column DataFrame rather than a Series.
y_column = ["pts"]

from sklearn.neighbors import KNeighborsRegressor
# Build a regressor that uses the five nearest neighbours.
knn = KNeighborsRegressor(n_neighbors=5)
# Train on the training split.
knn.fit(X=train[x_columns], y=train[y_column])
# Predict points for every player in the test split and show the raw values.
predictions = knn.predict(X=test[x_columns])
print(predictions)

[[1.6204e+03]
 [8.0180e+02]
 [7.9900e+02]
 [1.0000e+00]
 [5.9240e+02]
 [1.4420e+02]
 [1.9320e+02]
 [9.0000e+00]
 [3.9380e+02]
 [1.0568e+03]
 [2.3020e+02]
 [6.8440e+02]
 [3.2800e+01]
 [5.5300e+02]
 [1.7200e+01]
 [1.1180e+03]
 [7.6040e+02]
 [1.0200e+01]
 [2.1000e+02]
 [2.8800e+01]
 [1.0692e+03]
 [4.0280e+02]
 [5.2000e+00]
 [4.0280e+02]
 [3.2400e+01]
 [6.7960e+02]
 [1.1460e+02]
 [5.4680e+02]
 [1.7202e+03]
 [9.1000e+01]
 [1.0514e+03]
 [6.6280e+02]
 [1.4400e+01]
 [1.0000e+00]
 [9.5000e+01]
 [8.5000e+01]
 [6.1440e+02]
 [9.8240e+02]
 [3.5040e+02]
 [1.9280e+02]
 [7.7320e+02]
 [9.6720e+02]
 [1.0834e+03]
 [2.0000e+00]
 [8.1040e+02]
 [3.0660e+02]
 [8.9800e+01]
 [7.0920e+02]
 [3.7800e+01]
 [2.7400e+01]
 [2.0060e+02]
 [1.8800e+01]
 [9.7200e+01]
 [1.6920e+02]
 [1.2890e+03]
 [6.4660e+02]
 [6.9820e+02]
 [7.5320e+02]
 [5.8740e+02]
 [1.8340e+02]
 [1.1296e+03]
 [1.3160e+02]
 [4.9380e+02]
 [1.1800e+01]
 [9.0400e+01]
 [2.2660e+02]
 [7.9900e+02]
 [3.1620e+02]
 [6.9200e+01]
 [1.2460e+03]
 [4.8000e+00]
 [7.86

In [18]:
#Computing error
#Now that we know our point predictions, 
#we can compute the error involved with our predictions.

In [19]:
# True point totals for the players in the test split.
actual = test[y_column]
# Mean squared error: total squared residual divided by the number of predictions.
mse = ((predictions - actual) ** 2).sum() / len(predictions)
mse

pts    12000.141887
dtype: float64