# Predicting the Player Efficiency Rating (PER) of Future NBA Point Guards (PGs)

## Background on PER

## Regression Solution

In [1]:
#Import Necessary Libraries
import numpy as np
import pandas as pd
from sklearn import linear_model 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

#Read .csv File into Pandas Dataframe
df = pd.read_csv("data.csv")

#Create Array of Player's Names
players = df['Player']

#Split into x and y Dataframes
# y is the regression target. The remaining label columns (used as targets
# elsewhere in this notebook), the player name and 'Conf' are removed so that
# only the feature columns are left for the model.
y = df['NBA PER']
df = df.drop(columns=['NBA PER', 'Player', 'Classification', 'Classification2', 'Conf'])
x = df

#Split Into Training & Testing Data Sets
x_Train, x_Test, y_Train, y_Test = train_test_split(x, y, test_size = .2, random_state = 42)

#Implement Regression Model
# Ridge is a regularized linear regressor (not a classifier); the name `clf`
# is kept because the 2020 prediction cell reuses it.
clf = linear_model.Ridge(alpha = .2)
clf.fit(x_Train,y_Train)

#Calculate Training/Testing Score
# For a regressor, score() is the R^2 coefficient of determination.
trainingScore = clf.score(x_Train,y_Train)
testingScore = clf.score(x_Test,y_Test)

#Calculate Error Metrics
# Use the unrounded predictions: rounding to one decimal is only for display
# and would otherwise add rounding noise to the reported errors.
rawPredictions = clf.predict(x_Test)
meanSquaredError = mean_squared_error(y_Test, rawPredictions)
meanAbsoluteError = mean_absolute_error(y_Test, rawPredictions)

#Display Results
y_Predict = rawPredictions.round(decimals=1)
# y_Test keeps the original row labels, so `index` looks up the matching name
# in `players`; predictions come back in the same row order as y_Test.
for (index, actual), predicted in zip(y_Test.items(), y_Predict):
    print("Actual PER of " + str(players[index]) + ": " + str(actual))
    print("Predicted PER of " + str(players[index]) + ": " + str(predicted))
    print("")

print("")
print("Training Score: " + str(trainingScore))
print("Testing Score: " + str(testingScore))
print("Mean Squared Error: " + str(meanSquaredError))
print("Mean Absolute Error: " + str(meanAbsoluteError))

Actual PER of Jamal Murray: 15.5
Predicted PER of Jamal Murray: 13.9

Actual PER of Marquis Teague: 4.9
Predicted PER of Marquis Teague: 7.8

Actual PER of Kay Felder: 9.3
Predicted PER of Kay Felder: 14.7

Actual PER of Javaris Crittenton: 10.6
Predicted PER of Javaris Crittenton: 12.9

Actual PER of Nick Calathes: 12.3
Predicted PER of Nick Calathes: 13.8

Actual PER of Josh Selby: 2.7
Predicted PER of Josh Selby: 10.4

Actual PER of Jordan Farmer: 13.3
Predicted PER of Jordan Farmer: 11.0

Actual PER of De'Aaron Fox: 16.3
Predicted PER of De'Aaron Fox: 8.7

Actual PER of Jalen Brunson: 13.6
Predicted PER of Jalen Brunson: 10.5

Actual PER of Ja Morant: 18.0
Predicted PER of Ja Morant: 20.1

Actual PER of Lester Hudson: 13.0
Predicted PER of Lester Hudson: 14.2

Actual PER of Toney Douglas: 13.2
Predicted PER of Toney Douglas: 11.9

Actual PER of DJ Augustin: 14.2
Predicted PER of DJ Augustin: 11.3

Actual PER of Dejounte Murray: 15.4
Predicted PER of Dejounte Murray: 8.7

Actual PER

## Prediction on PGs Being Drafted in 2020 (Regression)

In [2]:
#Import Necessary Libraries
import numpy as np
import pandas as pd
from sklearn import linear_model 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

#Read .csv File into Pandas Dataframe
df2 = pd.read_csv("data2.csv")

#Create Array of Player's Names
players = df2['Player']

#Keep Only the Feature Columns
# The same columns are removed as when the regression model was fitted, so the
# remaining columns line up with the features the model expects.
df2 = df2.drop(columns=['NBA PER', 'Player', 'Classification', 'Classification2', 'Conf'])

#Make Predictions
# NOTE: `clf` must still be the Ridge model fitted under "Regression Solution";
# run this cell before the classification cell rebinds `clf`.
predictions = clf.predict(df2)
predictions = predictions.round(decimals=1)

#Display Predictions
# Pair names and predictions by position. The loop variables are deliberately
# not called `x`, which is the notebook-level feature frame.
for name, predictedPER in zip(players, predictions):
    print("Predicted PER of " + str(name) + ": " + str(predictedPER))

Predicted PER of Tyrese Haliburton: 15.7
Predicted PER of Cole Anthony: 6.5
Predicted PER of Kira Lewis Jr.: 9.3
Predicted PER of Tyrell Terry: 10.5
Predicted PER of Nico Mannion: 9.0
Predicted PER of Tre Jones: 10.3
Predicted PER of Devon Dotson: 10.2
Predicted PER of Grant Riller: 11.2
Predicted PER of Cassius Winston: 11.0
Predicted PER of Malachi Flynn: 12.7
Predicted PER of Ashton Hagans: 9.6
Predicted PER of Payton Pritchard: 11.8
Predicted PER of Ayo Dosunmu: 6.3
Predicted PER of Markus Howard: 14.8
Predicted PER of Myles Powell: 9.4
Predicted PER of Colbey Ross: 13.3
Predicted PER of Jordan Ford: 8.2


## Classification Solution

In [3]:
#Import Necessary Libraries
import numpy as np
import pandas as pd
from sklearn import tree 
from sklearn.model_selection import train_test_split

#Load the Player Data
df = pd.read_csv("data.csv")

#Keep the Names for Display
players = df['Player']

#Separate the Target ('Classification2') From the Features
y = df['Classification2']
df = df.drop(columns=['NBA PER', 'Classification', 'Classification2', 'Player'])

#Replace 'Conf' With Integer Codes
# factorize numbers the values in order of first appearance; popping and
# re-adding the column leaves 'Conf' as the last feature column.
codes, uniques = pd.factorize(df.pop('Conf'))
df['Conf'] = codes
x = df

#Split Into Training & Testing Data Sets
x_Train, x_Test, y_Train, y_Test = train_test_split(x, y, test_size = .2, random_state = 42)

#Fit the Decision Tree
clf = tree.DecisionTreeClassifier(max_depth = 6, random_state = 0)
clf.fit(x_Train,y_Train)

#Score the Model
# Despite their names, these hold the value returned by score() (mean accuracy
# for a classifier), not an error rate.
trainingError = clf.score(x_Train,y_Train)
testingError = clf.score(x_Test,y_Test)

#Show Actual vs. Predicted Class for Every Test Player
y_Predict = clf.predict(x_Test)
for (index, actual), predicted in zip(y_Test.items(), y_Predict):
    name = str(players[index])
    print("Actual Classification of " + name + ": " + str(actual))
    print("Predicted Classification of " + name + ": " + str(predicted))
    print("")

print("")
print(trainingError)
print(testingError)

Actual Classification of Jamal Murray: 3
Predicted Classification of Jamal Murray: 4

Actual Classification of Marquis Teague: 5
Predicted Classification of Marquis Teague: 5

Actual Classification of Kay Felder: 5
Predicted Classification of Kay Felder: 4

Actual Classification of Javaris Crittenton: 5
Predicted Classification of Javaris Crittenton: 3

Actual Classification of Nick Calathes: 4
Predicted Classification of Nick Calathes: 3

Actual Classification of Josh Selby: 5
Predicted Classification of Josh Selby: 5

Actual Classification of Jordan Farmer: 4
Predicted Classification of Jordan Farmer: 5

Actual Classification of De'Aaron Fox: 3
Predicted Classification of De'Aaron Fox: 3

Actual Classification of Jalen Brunson: 4
Predicted Classification of Jalen Brunson: 4

Actual Classification of Ja Morant: 2
Predicted Classification of Ja Morant: 4

Actual Classification of Lester Hudson: 4
Predicted Classification of Lester Hudson: 4

Actual Classification of Toney Douglas: 4
Pr

## Prediction on PGs Being Drafted in 2020 (Classification)

In [4]:
#Import Necessary Libraries
import numpy as np
import pandas as pd
from sklearn import linear_model 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

#Read .csv File into Pandas Dataframe
df2 = pd.read_csv("data2.csv")

#Create Array of Player's Names
players = df2['Player']

#Keep Only the Feature Columns
df2 = df2.drop(columns=['NBA PER', 'Player', 'Classification', 'Classification2'])

#Encode 'Conf' With the Same Codes Used for Training
# The decision tree was fitted on codes produced by pd.factorize over the
# 'Conf' column of data.csv. Factorizing data2.csv on its own would number the
# conferences by their order of appearance in *this* file, so the same
# conference could get a different integer than at training time. Rebuild the
# training code table and look each value up in it instead. Values that do not
# occur in data.csv (and missing values) are encoded as -1.
_, uniques = pd.factorize(pd.read_csv("data.csv")['Conf'])
codes = uniques.get_indexer(df2.pop('Conf'))
df2['Conf'] = codes

#Make Predictions
# NOTE: `clf` must be the decision tree fitted under "Classification Solution".
# The predictions are class labels, so they are not rounded.
predictions = clf.predict(df2)

#Display Predictions
# Pair names and predictions by position. The loop variables are deliberately
# not called `x`, which is the notebook-level feature frame.
for name, predictedClass in zip(players, predictions):
    print("Predicted Classification of " + str(name) + ": " + str(predictedClass))

Predicted PER of Tyrese Haliburton: 4
Predicted PER of Cole Anthony: 5
Predicted PER of Kira Lewis Jr.: 5
Predicted PER of Tyrell Terry: 1
Predicted PER of Nico Mannion: 5
Predicted PER of Tre Jones: 5
Predicted PER of Devon Dotson: 1
Predicted PER of Grant Riller: 1
Predicted PER of Cassius Winston: 1
Predicted PER of Malachi Flynn: 1
Predicted PER of Ashton Hagans: 5
Predicted PER of Payton Pritchard: 5
Predicted PER of Ayo Dosunmu: 5
Predicted PER of Markus Howard: 1
Predicted PER of Myles Powell: 5
Predicted PER of Colbey Ross: 4
Predicted PER of Jordan Ford: 5


## Most Important Features in Determining the PER of NBA PGs

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import tree 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier

#Load the Player Data
df = pd.read_csv("data.csv")

#Separate the Target ('Classification') From the Features
y = df['Classification']
df = df.drop(columns=['NBA PER', 'Classification', 'Classification2', 'Player'])

#Replace 'Conf' With Integer Codes
# Popping and re-adding the column leaves 'Conf' as the last feature column,
# which is the order the ranking labels below rely on.
codes, uniques = pd.factorize(df.pop('Conf'))
df['Conf'] = codes
x = df

#Split Into Training & Testing Data Sets
# Only the training part is used for fitting the forest below.
x_Train, x_Test, y_Train, y_Test = train_test_split(x, y, test_size = .2, random_state = 42)

#Fit the Extra-Trees Ensemble
forest = ExtraTreesClassifier(n_estimators=250,random_state=0)
forest.fit(x_Train, y_Train)

#Collect the Importances, Their Spread Across Trees, and the Descending Order
importances = forest.feature_importances_
perTreeImportances = [estimator.feature_importances_ for estimator in forest.estimators_]
std = np.std(perTreeImportances, axis=0)
indices = np.argsort(importances)[::-1]

# Print the Feature Ranking
columnLabels = df.columns.values
print("Feature ranking:")
print("")

for rank, featureIndex in enumerate(indices, start=1):
    print("%d. %s (%f)" % (rank, columnLabels[featureIndex], importances[featureIndex]))

Feature ranking:

1. ORB (0.063742)
2. FT% (0.050517)
3. TRB (0.047168)
4. MP (0.046563)
5. FG% (0.042613)
6. FT (0.039498)
7. FGA (0.038442)
8. 2PA (0.038001)
9. FTA (0.037974)
10. 3PA (0.037602)
11. STL (0.037585)
12. 2P% (0.037436)
13. PTS (0.037435)
14. Conf (0.037080)
15. 3P (0.036864)
16. SOS (0.036829)
17. 3P% (0.035921)
18. GS (0.034887)
19. PF (0.034097)
20. DRB (0.033822)
21. G (0.033759)
22. AST (0.033506)
23. 2P (0.033279)
24. BLK (0.032827)
25. FG (0.031820)
26. TOV (0.030731)
