# Task: Using the K-nearest neighbors algorithm to predict how many points NBA players scored in the 2013-2014 season.

In [None]:
# importing necessary libraries
import pandas as pd
import numpy as np
from scipy.spatial import distance
import random
from numpy.random import permutation
from sklearn.neighbors import KNeighborsRegressor
import math
import random
from numpy.random import permutation

In [None]:
#  Loading the data
df=pd.read_csv("nba_2013.csv")
print(df)

             player pos  age bref_team_id   g  gs    mp   fg   fga    fg.  \
0        Quincy Acy  SF   23          TOT  63   0   847   66   141  0.468   
1      Steven Adams   C   20          OKC  81  20  1197   93   185  0.503   
2       Jeff Adrien  PF   27          TOT  53  12   961  143   275  0.520   
3     Arron Afflalo  SG   28          ORL  73  73  2552  464  1011  0.459   
4     Alexis Ajinca   C   25          NOP  56  30   951  136   249  0.546   
..              ...  ..  ...          ...  ..  ..   ...  ...   ...    ...   
476     Tony Wroten  SG   20          PHI  72  16  1765  345   808  0.427   
477      Nick Young  SG   28          LAL  64   9  1810  387   889  0.435   
478  Thaddeus Young  PF   25          PHI  79  78  2718  582  1283  0.454   
479     Cody Zeller   C   21          CHA  82   3  1416  172   404  0.426   
480    Tyler Zeller   C   24          CLE  70   9  1049  156   290  0.538   

     ...  drb  trb  ast  stl  blk  tov   pf   pts     season  season_end  


In [None]:
# We want to get the all columns of the data
print(df.columns)

Index(['player', 'pos', 'age', 'bref_team_id', 'g', 'gs', 'mp', 'fg', 'fga',
       'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft',
       'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf',
       'pts', 'season', 'season_end'],
      dtype='object')


In [None]:
# Number of rows and columns
df.shape

(481, 31)

In [None]:
df.info

<bound method DataFrame.info of              player pos  age bref_team_id   g  gs    mp   fg   fga    fg.  \
0        Quincy Acy  SF   23          TOT  63   0   847   66   141  0.468   
1      Steven Adams   C   20          OKC  81  20  1197   93   185  0.503   
2       Jeff Adrien  PF   27          TOT  53  12   961  143   275  0.520   
3     Arron Afflalo  SG   28          ORL  73  73  2552  464  1011  0.459   
4     Alexis Ajinca   C   25          NOP  56  30   951  136   249  0.546   
..              ...  ..  ...          ...  ..  ..   ...  ...   ...    ...   
476     Tony Wroten  SG   20          PHI  72  16  1765  345   808  0.427   
477      Nick Young  SG   28          LAL  64   9  1810  387   889  0.435   
478  Thaddeus Young  PF   25          PHI  79  78  2718  582  1283  0.454   
479     Cody Zeller   C   21          CHA  82   3  1416  172   404  0.426   
480    Tyler Zeller   C   24          CLE  70   9  1049  156   290  0.538   

     ...  drb  trb  ast  stl  blk  tov   pf

In [None]:
# From the given data we want to replace all the NA values and inconsistent numbers.
new_df = df.replace([np.inf, -np.inf], np.NaN)

In [None]:
new_df.dropna(inplace=True)

In [None]:
new_df.head()

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,144,216,28,23,26,30,122,171,2013-2014,2013
3,Arron Afflalo,SG,28,ORL,73,73,2552,464,1011,0.459,...,230,262,248,35,3,146,136,1330,2013-2014,2013
4,Alexis Ajinca,C,25,NOP,56,30,951,136,249,0.546,...,183,277,40,23,46,63,187,328,2013-2014,2013
6,LaMarcus Aldridge,PF,28,POR,69,69,2498,652,1423,0.458,...,599,765,178,63,68,123,147,1603,2013-2014,2013
7,Lavoy Allen,PF,24,TOT,65,2,1072,134,300,0.447,...,192,311,71,24,33,44,126,303,2013-2014,2013


In [None]:
new_df_filter = new_df.isin([np.nan, np.inf, -np.inf])

In [None]:
new_df = new_df[~new_df_filter]

In [None]:
new_df.dropna(inplace=True)

In [None]:
# Selecting LeBron James from our given dataset as we want to find the most similar player to LeBron James. 
selected_player = new_df[new_df["player"] == "LeBron James"].iloc[0]

In [None]:
# distance_columns used to find the euclidean distance so for that we need only numeric columns
distance_columns = ['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts']

In [None]:
def euclidean_distance(row):
    """
    A simple euclidean distance function
    """
    inner_value = 0
    for k in distance_columns:
        inner_value += (row[k] - selected_player[k]) ** 2
    return math.sqrt(inner_value)

In [None]:
# Now we find the distance from each player in the dataset to lebron.
lebron_distance = new_df.apply(euclidean_distance, axis=1)

In [None]:
# Now we select only the numeric columns from our dataset
new_df_numeric = new_df[distance_columns]

In [None]:
# Now we normalize all the numeric columns so that all the values can come upto the same scale
new_df_normalized = (new_df_numeric - new_df_numeric.mean()) / new_df_numeric.std()

### Finding the Nearest neighbour

In [None]:
# Now we fill in NA values in nba_normalized
new_df_normalized.fillna(0, inplace=True)

# Find the normalized vector for lebron james.
lebron_normalized = new_df_normalized[new_df["player"] == "LeBron James"]

# Find the distance between lebron james and everyone else.
euclidean_distances = new_df_normalized.apply(lambda row: distance.euclidean(row, lebron_normalized), axis=1)

# Create a new dataframe with distances and sort them in a ascending order.
distance_frame = pd.DataFrame(data={"dist": euclidean_distances, "idx": euclidean_distances.index})
distance_frame.sort_values("dist", inplace=True)

# Now the lowest distance to lebron is lebron, so we want to find the second smallest distance 
# it will be most similar non-lebron player
second_smallest = distance_frame.iloc[1]["idx"]
most_similar_to_lebron = new_df.loc[int(second_smallest)]["player"]

## Generating train test splits

In [None]:
x = new_df.iloc[:,[2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]].values
y = new_df.iloc[:,28].values

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
x[:,0] = le.fit_transform(x[:,0])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=42)

In [None]:
x_columns = ['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf']
y_column = ['pts']

# Create the knn model.

# Import necessary modules
from sklearn.neighbors import KNeighborsClassifier
# Look at the seven closest neighbors.
knn = KNeighborsClassifier(n_neighbors=7)
 
knn.fit(x_train, y_train)

# Fit the model on the training data.

# Make point predictions on the test set using the fit model.
predicted = (knn.predict(x_test))

In [None]:
print(predicted)

[ 346  306  630  174  448   25 1264  181  338   48  908  454   29  618
    6   72   44  572  403   15   79  754  754  257  100  425 1119   14
  799  929  213  137  132  470 1068  645  760 1227 1106  760  587  200
   98  303 1457  298  799  298  546    7  257  810  916  338  630  258
  703  448  530 1118 1011 1106 1106   54   14    7  824  200  339  529
  499  200   40   14  132  495  761   15   54   84  461]


In [None]:
actual = y_test

In [None]:
print(actual)

[ 490  548  820  217  491   47 1737  202  520   18  961  527   26 1002
   26  102   47  844  564   11  115  939 1028  302  140  622 1614    0
  871 1107  248  144  171  638 1343  758  895 1372 1256  781  826  208
  183  401 1791  252  911  393  814   39  338  738 1096  419  721  261
  846  480  763 1328 1089 1248 1249   62    1   28 1144  202  436  770
  665  250   68    1  194  701  891   26   74   75  603]


In [None]:
Test_With_Predicted = pd.DataFrame({'Actual Points': y_test.tolist(), 'Predicted Points': (knn.predict(x_test)).tolist()})

Test_With_Predicted

Unnamed: 0,Actual Points,Predicted Points
0,490,346
1,548,306
2,820,630
3,217,174
4,491,448
...,...,...
76,891,761
77,26,15
78,74,54
79,75,84


## Computing the error using Mean Squared

In [None]:
mse = (((predicted - actual) ** 2).sum()) / len(predicted)

In [None]:
print(mse)

25240.913580246914
