In [36]:
import numpy as np
import pandas as pd

In [37]:
players_df = pd.read_csv('../input/unsupervised-ml/nba_2013.csv')
players_df = players_df.drop(['player', 'bref_team_id', 'season', 'season_end'],axis=1)

# Missing Values

In [38]:
players_df.isnull().any()

pos     False
age     False
g       False
gs      False
mp      False
fg      False
fga     False
fg.      True
x3p     False
x3pa    False
x3p.     True
x2p     False
x2pa    False
x2p.     True
efg.     True
ft      False
fta     False
ft.      True
orb     False
drb     False
trb     False
ast     False
stl     False
blk     False
tov     False
pf      False
pts     False
dtype: bool

In [39]:
null_val_cols = ['fg.','x3p.','x2p.','efg.','ft.']

# Going through null value columns and finding indexes of null values
null_val_df = players_df[players_df.isnull().any(axis=1)]
fg_null_df = null_val_df[['ft.']]
fg_null_df = fg_null_df[fg_null_df.isnull().any(axis=1)]
fg_null_df

Unnamed: 0,ft.
27,
90,
102,
109,
112,
134,
137,
166,
190,
219,


In [40]:
# Checking is null value is due to 0/0 div
null_col_sum = players_df[['ft','fta','ft.']]
null_col_sum.iloc[fg_null_df.index].sum(skipna=True)

ft     0.0
fta    0.0
ft.    0.0
dtype: float64

In [41]:
# Replacing null values with zero
for col in null_val_cols:
    players_df[col].fillna(0,inplace=True)

In [42]:
players_df.isnull().any() # all false now

pos     False
age     False
g       False
gs      False
mp      False
fg      False
fga     False
fg.     False
x3p     False
x3pa    False
x3p.    False
x2p     False
x2pa    False
x2p.    False
efg.    False
ft      False
fta     False
ft.     False
orb     False
drb     False
trb     False
ast     False
stl     False
blk     False
tov     False
pf      False
pts     False
dtype: bool

# Model

In [43]:
model_df = players_df #.drop(['fg.', 'x3p.', 'x2p.', 'efg.','ft.'], axis=1)

### Encoding

In [44]:
# Viewing elements of player position column
model_df.pos.value_counts()

SG    109
SF     99
PF     96
C      90
PG     85
G       1
F       1
Name: pos, dtype: int64

In [45]:
# encoding nominal data using one-hot encoding
import category_encoders as ce
data = model_df

#Create object for one-hot encoding
encoder = ce.OneHotEncoder(cols='pos',handle_unknown='return_nan',return_df=True,use_cat_names=True)

#Original Data
data

Unnamed: 0,pos,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,...,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts
0,SF,23,63,0,847,66,141,0.468,4,15,...,0.660,72,144,216,28,23,26,30,122,171
1,C,20,81,20,1197,93,185,0.503,0,0,...,0.581,142,190,332,43,40,57,71,203,265
2,PF,27,53,12,961,143,275,0.520,0,0,...,0.639,102,204,306,38,24,36,39,108,362
3,SG,28,73,73,2552,464,1011,0.459,128,300,...,0.815,32,230,262,248,35,3,146,136,1330
4,C,25,56,30,951,136,249,0.546,0,1,...,0.836,94,183,277,40,23,46,63,187,328
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
476,SG,20,72,16,1765,345,808,0.427,40,188,...,0.641,69,159,228,217,78,16,204,151,939
477,SG,28,64,9,1810,387,889,0.435,135,350,...,0.825,29,137,166,95,46,12,95,156,1144
478,PF,25,79,78,2718,582,1283,0.454,90,292,...,0.712,166,310,476,182,167,36,165,213,1417
479,C,21,82,3,1416,172,404,0.426,0,1,...,0.730,118,235,353,92,40,41,87,170,490


In [46]:
# Fit and transform Data
data_encoded = encoder.fit_transform(data)
data_encoded

Unnamed: 0,pos_SF,pos_C,pos_PF,pos_SG,pos_PG,pos_G,pos_F,age,g,gs,...,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,23,63,0,...,0.660,72,144,216,28,23,26,30,122,171
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,20,81,20,...,0.581,142,190,332,43,40,57,71,203,265
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,27,53,12,...,0.639,102,204,306,38,24,36,39,108,362
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,28,73,73,...,0.815,32,230,262,248,35,3,146,136,1330
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,25,56,30,...,0.836,94,183,277,40,23,46,63,187,328
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
476,0.0,0.0,0.0,1.0,0.0,0.0,0.0,20,72,16,...,0.641,69,159,228,217,78,16,204,151,939
477,0.0,0.0,0.0,1.0,0.0,0.0,0.0,28,64,9,...,0.825,29,137,166,95,46,12,95,156,1144
478,0.0,0.0,1.0,0.0,0.0,0.0,0.0,25,79,78,...,0.712,166,310,476,182,167,36,165,213,1417
479,0.0,1.0,0.0,0.0,0.0,0.0,0.0,21,82,3,...,0.730,118,235,353,92,40,41,87,170,490


### Normalize

In [47]:
data_encoded.columns

Index(['pos_SF', 'pos_C', 'pos_PF', 'pos_SG', 'pos_PG', 'pos_G', 'pos_F',
       'age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.',
       'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb',
       'ast', 'stl', 'blk', 'tov', 'pf', 'pts'],
      dtype='object')

In [48]:
# The columns that we will be making predictions with.
x = data_encoded[['pos_SF', 'pos_C', 'pos_PF', 'pos_SG', 'pos_PG', 'pos_G', 'pos_F',
       'age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.',
       'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb',
       'ast', 'stl', 'blk', 'tov', 'pf']]

# The column that we want to predict.
y = data_encoded['pts']

In [49]:
from sklearn import preprocessing
data_normalized = preprocessing.normalize(x)

### Train-test split

In [50]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

### KNN

In [51]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics

# check for increasing neighbour values to see which has the highest regression score

for k in range(10):
    k_value = k + 1
    knn = KNeighborsRegressor(n_neighbors = k_value)
    knn.fit(x_train, y_train) 
    y_pred = knn.predict(x_test)
    print ("Regression score is:",format(metrics.r2_score(y_test, y_pred),'.4f'), "for k_value:", k_value)

Regression score is: 0.9502 for k_value: 1
Regression score is: 0.9671 for k_value: 2
Regression score is: 0.9767 for k_value: 3
Regression score is: 0.9752 for k_value: 4
Regression score is: 0.9751 for k_value: 5
Regression score is: 0.9728 for k_value: 6
Regression score is: 0.9782 for k_value: 7
Regression score is: 0.9757 for k_value: 8
Regression score is: 0.9770 for k_value: 9
Regression score is: 0.9753 for k_value: 10
