In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz

In [4]:
#read from the csv file and return a Pandas DataFrame.
nba = pd.read_csv('NBAstats.csv')

# print the column names
original_headers = list(nba.columns.values)
print(original_headers)

#print the first three rows.
print(nba[0:3])

# "Position (pos)" is the class attribute we are predicting. 
class_column = 'Pos'

#The dataset contains attributes such as player name and team name. 
#We know that they are not useful for classification and thus do not 
#include them as features. 
# ORIGINAL DATASET
feature_columns = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', \
    '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', \
    'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PS/G']
# feature_columns = ['G', 'GS', 'MP', 'FG%', '3PA', \
#     '3P%', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', \
#     'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PS/G']
#Pandas DataFrame allows you to select columns. 
#We use column selection to split the data into features and class. 
nba_feature = nba[feature_columns]
nba_class = nba[class_column]

print(nba_feature[0:3])
print(list(nba_class[0:3]))

train_feature, test_feature, train_class, test_class = \
    train_test_split(nba_feature, nba_class, stratify=nba_class, \
    train_size=0.75, test_size=0.25)

training_accuracy = []
test_accuracy = []


['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PS/G']
         Player Pos  Age   Tm   G  GS    MP   FG  FGA    FG%  ...    FT%  ORB  \
0    Quincy Acy  PF   25  SAC  59  29  14.8  2.0  3.6  0.556  ...  0.735  1.1   
1  Jordan Adams  SG   21  MEM   2   0   7.5  1.0  3.0  0.333  ...  0.600  0.0   
2  Steven Adams   C   22  OKC  80  80  25.2  3.3  5.3  0.613  ...  0.582  2.7   

   DRB  TRB  AST  STL  BLK  TOV   PF  PS/G  
0  2.1  3.2  0.5  0.5  0.4  0.5  1.7   5.2  
1  1.0  1.0  1.5  1.5  0.0  1.0  1.0   3.5  
2  3.9  6.7  0.8  0.5  1.1  1.1  2.8   8.0  

[3 rows x 29 columns]
   Age   G  GS    MP   FG  FGA    FG%   3P  3PA    3P%  ...    FT%  ORB  DRB  \
0   25  59  29  14.8  2.0  3.6  0.556  0.3  0.8  0.388  ...  0.735  1.1  2.1   
1   21   2   0   7.5  1.0  3.0  0.333  0.0  0.5  0.000  ...  0.600  0.0  1.0   
2   22  80  80  25.2  3.3  5.3  0

In [5]:
randomF = RandomForestClassifier().fit(train_feature, train_class)
print("Training set score: {:.3f}".format(randomF.score(train_feature, train_class)))
print("Random Forest Tree Classifier Test set score: {:.3f}".format(randomF.score(test_feature, test_class)))

Training set score: 1.000
Random Forest Tree Classifier Test set score: 0.521


In [6]:
prediction = randomF.predict(test_feature)
print("Confusion matrix below:\n")
print(pd.crosstab(test_class,prediction, rownames=['Player position'], colnames=['Predicted player position'],margins = 'True' ))
print("\n")
scores = cross_val_score(randomF, nba_feature, nba_class, cv=10)
print("Cross-validation scores with 10-fold startified:\n{}".format(scores))
print("\nAverage cross-validation for Random Forest Classifier score: {:.2f}".format(scores.mean()))

Confusion matrix below:

Predicted player position   C  PF  PG  SF  SG  All
Player position                                   
C                          16   4   0   2   0   22
PF                          5  10   1   9   1   26
PG                          0   2  17   2   3   24
SF                          1   1   0   9  12   23
SG                          0   0  10   4  10   24
All                        22  17  28  26  26  119


Cross-validation scores with 10-fold startified:
[0.66666667 0.52083333 0.70833333 0.5625     0.58333333 0.5106383
 0.65957447 0.65957447 0.55319149 0.61702128]

Average cross-validation for Random Forest Classifier score: 0.60
