## Problem Statement

In this assignment, students will be using the K-nearest neighbors algorithm to predict
how many points NBA players scored in the 2013-2014 season.

## Import Libraries and Data

In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

# `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22
# (superseded by sklearn.impute.SimpleImputer). The import is guarded so this
# cell still runs on current releases; nothing in this notebook uses `Imputer`.
try:
    from sklearn.preprocessing import Imputer
except ImportError:
    Imputer = None

In [2]:
# Load the NBA player statistics CSV into the `nba` DataFrame.
# NOTE(review): this is an absolute, machine-specific path — confirm it exists
# before running the notebook on another machine.
csv_path = "D:/Data Science/Python/Assignment27/nba_2013.csv"
with open(csv_path) as csv_file:
    nba = pd.read_csv(csv_file)

## Exploratory Data Preparation

In [3]:
# Preview the first rows of the freshly loaded data.
nba.head()

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,144,216,28,23,26,30,122,171,2013-2014,2013
1,Steven Adams,C,20,OKC,81,20,1197,93,185,0.503,...,190,332,43,40,57,71,203,265,2013-2014,2013
2,Jeff Adrien,PF,27,TOT,53,12,961,143,275,0.52,...,204,306,38,24,36,39,108,362,2013-2014,2013
3,Arron Afflalo,SG,28,ORL,73,73,2552,464,1011,0.459,...,230,262,248,35,3,146,136,1330,2013-2014,2013
4,Alexis Ajinca,C,25,NOP,56,30,951,136,249,0.546,...,183,277,40,23,46,63,187,328,2013-2014,2013


In [4]:
# Drop the player-name column, which is not used as a feature below.
# Reassigning (rather than `inplace=True`) together with `errors="ignore"`
# keeps this cell idempotent: re-running it no longer raises a KeyError
# because "player" has already been removed.
nba = nba.drop(columns="player", errors="ignore")
nba.head()

Unnamed: 0,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,x3p,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
0,SF,23,TOT,63,0,847,66,141,0.468,4,...,144,216,28,23,26,30,122,171,2013-2014,2013
1,C,20,OKC,81,20,1197,93,185,0.503,0,...,190,332,43,40,57,71,203,265,2013-2014,2013
2,PF,27,TOT,53,12,961,143,275,0.52,0,...,204,306,38,24,36,39,108,362,2013-2014,2013
3,SG,28,ORL,73,73,2552,464,1011,0.459,128,...,230,262,248,35,3,146,136,1330,2013-2014,2013
4,C,25,NOP,56,30,951,136,249,0.546,0,...,183,277,40,23,46,63,187,328,2013-2014,2013


In [5]:
# Summarize column dtypes and non-null counts to spot columns with missing values.
nba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481 entries, 0 to 480
Data columns (total 30 columns):
pos             481 non-null object
age             481 non-null int64
bref_team_id    481 non-null object
g               481 non-null int64
gs              481 non-null int64
mp              481 non-null int64
fg              481 non-null int64
fga             481 non-null int64
fg.             479 non-null float64
x3p             481 non-null int64
x3pa            481 non-null int64
x3p.            414 non-null float64
x2p             481 non-null int64
x2pa            481 non-null int64
x2p.            478 non-null float64
efg.            479 non-null float64
ft              481 non-null int64
fta             481 non-null int64
ft.             461 non-null float64
orb             481 non-null int64
drb             481 non-null int64
trb             481 non-null int64
ast             481 non-null int64
stl             481 non-null int64
blk             481 non-null int64
tov     

In [6]:
# Numeric columns kept for modelling; the target 'pts' is the last entry.
rel_col = [
    'age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.',
    'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb',
    'ast', 'stl', 'blk', 'tov', 'pf', 'pts',
]

# Standardize every selected column: subtract its mean, divide by its
# standard deviation. Missing values stay missing and are imputed later.
# NOTE(review): the mean/std are computed over all rows, i.e. before the
# train/test split further down — confirm this is acceptable for the exercise.
nba_numeric = nba.loc[:, rel_col]
nba_normalized = nba_numeric.sub(nba_numeric.mean()).div(nba_numeric.std())

In [7]:
# Count missing values per column and show the six columns with the most.
missing_per_column = nba_normalized.isna().sum()
missing_per_column.sort_values(ascending=False).head(6)

x3p.    67
ft.     20
x2p.     3
fg.      2
efg.     2
x2pa     0
dtype: int64

In [8]:
# Split the normalized frame into predictors (X) and the target (y = 'pts').
X = nba_normalized.drop('pts', axis=1)
y = nba_normalized.loc[:, 'pts']

# Keep the predictor names so the column labels can be restored after
# imputation, which returns a bare array.
features = X.columns

In [9]:
# Replace missing feature values with the per-column median.
#
# `fit_transform` already returns the imputed data, so it is applied exactly
# once (the original ran a second, redundant `transform` on the result).
# The imputer returns a NumPy array, so the column labels and the original row
# index are re-attached explicitly; keeping the index guarantees that X stays
# aligned with `y`, which still carries that index.
# NOTE(review): the medians are learned from all rows, before the train/test
# split — confirm this is acceptable for the exercise.
imp = SimpleImputer(missing_values=np.nan, strategy='median')
X = pd.DataFrame(imp.fit_transform(X), columns=features, index=X.index)

In [10]:
# Preview the feature matrix after imputation.
X.head()

Unnamed: 0,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,...,fta,ft.,orb,drb,trb,ast,stl,blk,tov,pf
0,-0.835906,0.384886,-0.862207,-0.435088,-0.738401,-0.768505,0.319884,-0.700282,-0.716608,-0.117009,...,-0.515408,-0.389712,0.26069,-0.129462,-0.013116,-0.64522,-0.468056,0.06141,-0.66765,0.226515
1,-1.550487,1.095711,-0.187863,-0.045011,-0.581271,-0.649215,0.674593,-0.778936,-0.829601,0.290963,...,0.117019,-0.88295,1.387883,0.18702,0.565852,-0.530733,0.02068,1.065446,-0.01376,1.363938
2,0.116868,-0.010016,-0.4576,-0.308035,-0.290291,-0.405214,0.84688,-0.778936,-0.829601,0.290963,...,-0.012515,-0.520826,0.743773,0.28334,0.436083,-0.568895,-0.439307,0.385292,-0.524113,0.029924
3,0.355062,0.779789,1.599148,1.465144,1.577804,1.590172,0.228673,1.737992,1.430256,0.898007,...,1.640937,0.578033,-0.38342,0.462221,0.216475,1.033919,-0.123066,-0.68352,1.18238,0.423107
4,-0.359519,0.108454,0.149309,-0.31918,-0.331028,-0.475703,1.110379,-0.778936,-0.822068,-1.808704,...,-0.408733,0.709147,0.614951,0.138859,0.291341,-0.55363,-0.468056,0.709175,-0.141348,1.139262


In [11]:
# Hold out 30% of the rows as a test set; the fixed `random_state` makes the
# split reproducible across runs.
# (The original no-op lines `y_train = y_train` / `y_test = y_test` were removed.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

## Prepare Regression Model With K Nearest Neighbors

In [14]:
# Fit a distance-weighted KNN regressor for every k from 1 to 10 and print its
# score on the held-out test set (for scikit-learn regressors `score` is R^2).
# NOTE(review): choosing k from test-set scores makes the reported best score
# optimistic; consider cross-validation on the training split instead.
for K_value in range(1, 11):
    neigh = KNeighborsRegressor(n_neighbors=K_value, weights='distance', algorithm='auto')
    neigh.fit(X_train, y_train)
    score = neigh.score(X_test, y_test)
    print("For K-Value: {0}, Score is : {1}".format(K_value, score))

For K-Value: 1, Score is : 0.9407788122332351
For K-Value: 2, Score is : 0.9585267651179988
For K-Value: 3, Score is : 0.9600124854372749
For K-Value: 4, Score is : 0.9606393876644542
For K-Value: 5, Score is : 0.9658326882240938
For K-Value: 6, Score is : 0.9634693287727233
For K-Value: 7, Score is : 0.9606901068857302
For K-Value: 8, Score is : 0.9570759977907791
For K-Value: 9, Score is : 0.9532809497878728
For K-Value: 10, Score is : 0.9502986985194739


## Conclusion

We observe that `n_neighbors=5` gives the highest R² score on the test set (≈ 0.966), so we select this k-value for predictions.