#### Import the Libraries

## Problem Statement: Machine Learning 7

#### In this assignment, students will be using the K-nearest neighbors algorithm to predict how many points NBA players scored in the 2013-2014 season.

#### Import LIbraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

#### Read the input dataset

In [2]:
nba = pd.read_csv('nba_2013.csv')
nba.head()

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,144,216,28,23,26,30,122,171,2013-2014,2013
1,Steven Adams,C,20,OKC,81,20,1197,93,185,0.503,...,190,332,43,40,57,71,203,265,2013-2014,2013
2,Jeff Adrien,PF,27,TOT,53,12,961,143,275,0.52,...,204,306,38,24,36,39,108,362,2013-2014,2013
3,Arron Afflalo,SG,28,ORL,73,73,2552,464,1011,0.459,...,230,262,248,35,3,146,136,1330,2013-2014,2013
4,Alexis Ajinca,C,25,NOP,56,30,951,136,249,0.546,...,183,277,40,23,46,63,187,328,2013-2014,2013


#### Data Pre-processing

In [3]:
## Check the shape of the dataset
nba.shape

(481, 31)

In [4]:
## Check if dataset has null values
nba.isnull().sum().sum()

94

In [5]:
nba.describe()

Unnamed: 0,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,...,orb,drb,trb,ast,stl,blk,tov,pf,pts,season_end
count,481.0,481.0,481.0,481.0,481.0,481.0,479.0,481.0,481.0,414.0,...,481.0,481.0,481.0,481.0,481.0,481.0,481.0,481.0,481.0,481.0
mean,26.509356,53.253638,25.571726,1237.386694,192.881497,424.463617,0.436436,39.613306,110.130977,0.285111,...,55.810811,162.817048,218.627859,112.536383,39.280665,24.10395,71.862786,105.869023,516.582121,2013.0
std,4.198265,25.322711,29.658465,897.25884,171.832793,368.850833,0.098672,50.855639,132.751732,0.157633,...,62.101191,145.348116,200.356507,131.019557,34.78359,30.875381,62.70169,71.213627,470.422228,0.0
min,19.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2013.0
25%,23.0,32.0,0.0,388.0,47.0,110.0,0.4005,0.0,3.0,0.234355,...,12.0,43.0,55.0,20.0,9.0,4.0,21.0,44.0,115.0,2013.0
50%,26.0,61.0,10.0,1141.0,146.0,332.0,0.438,16.0,48.0,0.330976,...,35.0,135.0,168.0,65.0,32.0,14.0,58.0,104.0,401.0,2013.0
75%,29.0,76.0,54.0,2016.0,307.0,672.0,0.4795,68.0,193.0,0.375,...,73.0,230.0,310.0,152.0,60.0,32.0,108.0,158.0,821.0,2013.0
max,39.0,83.0,82.0,3122.0,849.0,1688.0,1.0,261.0,615.0,1.0,...,440.0,783.0,1114.0,721.0,191.0,219.0,295.0,273.0,2593.0,2013.0


In [6]:
nba.isnull().sum()

player           0
pos              0
age              0
bref_team_id     0
g                0
gs               0
mp               0
fg               0
fga              0
fg.              2
x3p              0
x3pa             0
x3p.            67
x2p              0
x2pa             0
x2p.             3
efg.             2
ft               0
fta              0
ft.             20
orb              0
drb              0
trb              0
ast              0
stl              0
blk              0
tov              0
pf               0
pts              0
season           0
season_end       0
dtype: int64

In [7]:
# bref_team_id is just an id, so dropping it
# Session and Session_end has correlated value, so dropping Session column 
nba.drop(['bref_team_id','season'],axis=1,inplace=True)

In [8]:
##Player Name is irrelavent, so ignoring the column and creating dependent and independent variables
X=nba.drop(labels=['pts','player'],axis=1)
y=nba.loc[:,['player']]

In [9]:
## Replace null values with mean
X=X.fillna(X.mean())

In [10]:
X.isnull().sum().sum()

0

In [11]:
X.head(2)

Unnamed: 0,pos,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,...,ft.,orb,drb,trb,ast,stl,blk,tov,pf,season_end
0,SF,23,63,0,847,66,141,0.468,4,15,...,0.66,72,144,216,28,23,26,30,122,2013
1,C,20,81,20,1197,93,185,0.503,0,0,...,0.581,142,190,332,43,40,57,71,203,2013


In [12]:
y.head(2)

Unnamed: 0,player
0,Quincy Acy
1,Steven Adams


In [13]:
# Check unique values of pos column
X['pos'].unique()

array(['SF', 'C', 'PF', 'SG', 'PG', 'G', 'F'], dtype=object)

In [14]:
#convert the categorial variable into dummies for column pos
X=pd.concat([X,pd.get_dummies(X['pos'],prefix='pos',drop_first=True)],axis=1)

In [15]:
X=X.drop('pos',axis=1)

In [16]:
X.head(2)

Unnamed: 0,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,...,blk,tov,pf,season_end,pos_F,pos_G,pos_PF,pos_PG,pos_SF,pos_SG
0,23,63,0,847,66,141,0.468,4,15,0.266667,...,26,30,122,2013,0,0,0,0,1,0
1,20,81,20,1197,93,185,0.503,0,0,0.285111,...,57,71,203,2013,0,0,0,0,0,0


In [17]:
X['season_end'].unique()

array([2013], dtype=int64)

In [18]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481 entries, 0 to 480
Data columns (total 32 columns):
age           481 non-null int64
g             481 non-null int64
gs            481 non-null int64
mp            481 non-null int64
fg            481 non-null int64
fga           481 non-null int64
fg.           481 non-null float64
x3p           481 non-null int64
x3pa          481 non-null int64
x3p.          481 non-null float64
x2p           481 non-null int64
x2pa          481 non-null int64
x2p.          481 non-null float64
efg.          481 non-null float64
ft            481 non-null int64
fta           481 non-null int64
ft.           481 non-null float64
orb           481 non-null int64
drb           481 non-null int64
trb           481 non-null int64
ast           481 non-null int64
stl           481 non-null int64
blk           481 non-null int64
tov           481 non-null int64
pf            481 non-null int64
season_end    481 non-null int64
pos_F         481 non-null 

#### Stadardize the variable

In [19]:
from sklearn.preprocessing import StandardScaler
scalar = StandardScaler()

In [20]:
scalar.fit(X)
scalar_features = scalar.transform(X)

  return self.partial_fit(X, y)
  


In [21]:
# Splitting into Test Train Dataset
X_train,X_test,y_train,y_test = train_test_split(scalar_features,X,test_size=0.3)

#### Apply KNN Algorithm

In [27]:
knnmodel = KNeighborsRegressor()
knnmodel.fit(X_train,y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='uniform')

In [28]:
pred=knnmodel.predict(X_test)

In [29]:
##validating with R square
r2_score(y_test,pred) 

0.850337213585548

#### Apply Hyper parameter Tuning to find the best parameters

In [30]:
#With Hyper Parameters Tuning
#importing modules
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
#making the instance
model = KNeighborsRegressor(n_jobs=-1)
#Hyper Parameters Set
params = {'n_neighbors':[1,3,5,7,9,11,13,15,17,19,21,23,25],
          'leaf_size':[1,2,3,5],
          'weights':['uniform', 'distance'],
          'n_jobs':[-1]}
#Making models with hyper parameters sets
model1 = GridSearchCV(model, param_grid=params, n_jobs=1)
#Learning
model1.fit(X_train,y_train)
#The best hyper parameters set
print("Best Hyper Parameters:\n",model1.best_params_)
#Prediction
prediction=model1.predict(X_test)
#importing the metrics module
from sklearn import metrics
#evaluation(Accuracy)
print("Accuracy:",r2_score(y_test,pred))



Best Hyper Parameters:
 {'leaf_size': 1, 'n_jobs': -1, 'n_neighbors': 5, 'weights': 'distance'}
Accuracy: 0.850337213585548


#### Model Evaluation

In [31]:
print('The Mean Sqaured Error is : ', mean_squared_error(y_test,pred))
print('The Mean Absolute Error is : ', mean_absolute_error(y_test,pred))
print('The Model accuracy/R2 Score using K Nearest Neighbor Algorithm is ', r2_score(y_test,pred))

The Mean Sqaured Error is :  2768.383411688563
The Mean Absolute Error is :  21.360784759671063
The Model accuracy/R2 Score using K Nearest Neighbor Algorithm is  0.850337213585548
