In [3]:
import pandas as pd

URL = 'https://www.basketball-reference.com/awards/all_league.html'
all_nba = pd.read_html(URL)[0]

In [4]:
#Drop null rows, filters between the years
all_nba = all_nba.dropna()
#Create a numeric variable for the closing year of the NBA season
all_nba['Season'] = all_nba['Season'].apply(lambda x: pd.to_numeric((x[0]+x[1]+x[-2]+x[-1])))

#Add a condition to fix the 1999-00 season
all_nba['Season'] = all_nba['Season'].replace([1900],2000)

In [5]:
#Filter data down to the year after the NBA-ABA merger and remove position designation
all_nba = all_nba[all_nba['Season']>=1977]
all_nba['Unnamed: 3'] = all_nba['Unnamed: 3'].apply(lambda x: x[0:-2])
all_nba['Unnamed: 4'] = all_nba['Unnamed: 4'].apply(lambda x: x[0:-2])
all_nba['Unnamed: 5'] = all_nba['Unnamed: 5'].apply(lambda x: x[0:-2])
all_nba['Unnamed: 6'] = all_nba['Unnamed: 6'].apply(lambda x: x[0:-2])
all_nba['Unnamed: 7'] = all_nba['Unnamed: 7'].apply(lambda x: x[0:-2])

In [6]:
#Restructure table of All-NBA players
all_nba_final = pd.melt(all_nba, id_vars='Season', value_vars = ['Unnamed: 3', 'Unnamed: 4','Unnamed: 5',
                                                 'Unnamed: 6','Unnamed: 7'],
       value_name='Player', var_name='Position')

In [7]:
all_nba_final['Is_All_NBA'] = 1
all_nba_final = all_nba_final.drop(columns='Position')

In [8]:
all_nba_final

Unnamed: 0,Season,Player,Is_All_NBA
0,2020,Anthony Davis,1
1,2020,Nikola Jokić,1
2,2020,Rudy Gobert,1
3,2019,Nikola Jokić,1
4,2019,Joel Embiid,1
...,...,...,...
595,1979,World B. Free,1
596,1978,David Thompson,1
597,1978,Paul Westphal,1
598,1977,Paul Westphal,1


In [9]:
URL2 = 'https://www.basketball-reference.com/leagues/NBA_2020_per_game.html'
players = pd.read_html(URL2)[0]

In [10]:
#Use a for loop to read in and clean stats for every year and combine into one table
for i in range(1980,2021):
    URL = 'https://www.basketball-reference.com/leagues/NBA_'+ str(i) + '_per_game.html'
    players = pd.read_html(URL)[0]
    players=players[players.Rk != 'Rk']
    players=players[players.Tm != 'TOT']
    players['Season'] = i
    players['G'] = players['G'].apply(lambda x: float(x))
    players = players[players.G >60]
    if (i == 1980):
        all_player_seasons = players
    else:
        all_player_seasons = pd.concat([all_player_seasons,players],axis=0)
    
    URL_advanced = 'https://www.basketball-reference.com/leagues/NBA_'+ str(i) + '_advanced.html'
    players_advanced = pd.read_html(URL_advanced)[0]
    players_advanced=players_advanced[players_advanced.Rk != 'Rk']
    players_advanced=players_advanced[players_advanced.Tm != 'TOT']
    players_advanced['Season'] = i
    players_advanced['G'] = players_advanced['G'].apply(lambda x: float(x))
    players_advanced = players_advanced[players_advanced.G>60]
    if (i == 1980):
        all_player_seasons_advanced = players_advanced
    else:
        all_player_seasons_advanced = pd.concat([all_player_seasons_advanced,players_advanced],axis=0)
    
    print(i)


1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020


In [77]:
#Select columns for advanced stats
all_player_seasons_advanced = all_player_seasons_advanced[['Player','Season','Age','PER','USG%','TS%','BPM','WS/48','VORP']]

In [78]:
#Select columns for traditional stats
all_player_seasons = all_player_seasons[['Player','Season','Age','FGA','eFG%','TRB','AST','STL','BLK','PTS']]

In [79]:
#Clean data, remove * from name that represented All-Star
all_player_seasons['Player'] = all_player_seasons['Player'].apply(lambda x: x.replace('*',''))
all_player_seasons_advanced['Player'] = all_player_seasons_advanced['Player'].apply(lambda x: x.replace('*',''))
all_player_seasons = all_player_seasons.reset_index()
all_player_seasons_advanced = all_player_seasons_advanced.reset_index()

In [80]:
#Merge the two seperate datasets with the All-NBA table and change the NA's to 0 to represent not making an All-NBA team
all_player_seasons_merged = all_player_seasons.merge(all_nba_final, on=['Player','Season'], how='left')
all_player_seasons_merged['Is_All_NBA'] = all_player_seasons_merged['Is_All_NBA'].fillna(value=0)

cols = all_player_seasons_merged.columns.drop('Player')
all_player_seasons_merged[cols] = all_player_seasons_merged[cols].apply(pd.to_numeric, errors='coerce')

In [81]:
all_player_seasons_advanced_merged = all_player_seasons_advanced.merge(all_nba_final, on=['Player','Season'], how='left')
all_player_seasons_advanced_merged['Is_All_NBA'] = all_player_seasons_advanced_merged['Is_All_NBA'].fillna(value=0)

cols = all_player_seasons_advanced_merged.columns.drop('Player')
all_player_seasons_advanced_merged[cols] = all_player_seasons_advanced_merged[cols].apply(pd.to_numeric, errors='coerce')

In [82]:
#Merge traditional and advanced statistics into one dataset
total = all_player_seasons_merged.merge(all_player_seasons_advanced_merged, on=['Player','Season','Age', 'Is_All_NBA'], 
                                        how='inner')
total = total.drop(columns=['index_x','index_y'])
data = total.pop('Is_All_NBA')
total['Is_All_NBA'] = data


In [83]:
from sklearn.model_selection import train_test_split
cols = total.columns.drop(['Player','Season','Is_All_NBA'])
X = total[cols]
Y = total['Is_All_NBA']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2, random_state=824)

In [84]:
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
x_train_scaled = min_max_scaler.fit_transform(X_train)
X_train_TOT = pd.DataFrame(x_train_scaled)


x_test_scaled = min_max_scaler.fit_transform(X_test)
X_test_TOT = pd.DataFrame(x_test_scaled)

In [85]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score, roc_curve, accuracy_score
from sklearn.naive_bayes import MultinomialNB
#Multinomial Naive Bayes
nb = MultinomialNB()
nb.fit(X_train_TOT, Y_train)
yhat = nb.predict(X_test_TOT)
yhat_train = nb.predict(X_train_TOT)
print(confusion_matrix(Y_test, yhat))

print("Train accuracy", accuracy_score(Y_train, yhat_train))
print("Test accuracy", accuracy_score(Y_test, yhat))
print("F1", f1_score(Y_test, yhat))
print("AUC", roc_auc_score(Y_test, yhat))

[[1568    0]
 [ 103    0]]
Train accuracy 0.9378928464531577
Test accuracy 0.93836026331538
F1 0.0
AUC 0.5


  'precision', 'predicted', average, warn_for)


In [86]:
from sklearn.tree import DecisionTreeClassifier
#Decision Tree
dt = DecisionTreeClassifier()
dt.fit(X_train, Y_train)
yhat2 = dt.predict(X_test)
yhat_train2 = dt.predict(X_train)
print(confusion_matrix(Y_test, yhat2))

print("Train accuracy", accuracy_score(Y_train, yhat_train2))
print("Test accuracy", accuracy_score(Y_test, yhat2))
print("F1", f1_score(Y_test, yhat2))
print("AUC", roc_auc_score(Y_test, yhat2))

[[1531   37]
 [  23   80]]
Train accuracy 1.0
Test accuracy 0.9640933572710951
F1 0.7272727272727274
AUC 0.8765510451753518


In [87]:
from sklearn.ensemble import RandomForestClassifier
#Random Forest
rf = RandomForestClassifier()
rf.fit(X_train, Y_train)
yhat3 = rf.predict(X_test)
yhat_train3 = rf.predict(X_train)

print(confusion_matrix(Y_test, yhat3))
print("Train accuracy", accuracy_score(Y_train, yhat_train3))
print("Test accuracy", accuracy_score(Y_test, yhat3))
print("F1", f1_score(Y_test, yhat3))
print("AUC", roc_auc_score(Y_test, yhat3))

[[1548   20]
 [  31   72]]
Train accuracy 0.9962586052080216
Test accuracy 0.9694793536804309
F1 0.7384615384615385
AUC 0.843137012086388




In [88]:
#Random Forest Modeling using only the advanced analytics
cols = all_player_seasons_advanced_merged.columns.drop(['Player','Season','Is_All_NBA'])
X = all_player_seasons_advanced_merged[cols]
Y = all_player_seasons_advanced_merged['Is_All_NBA']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2, random_state=824)

In [89]:
min_max_scaler = preprocessing.MinMaxScaler()
x_train_scaled = min_max_scaler.fit_transform(X_train)
X_train = pd.DataFrame(x_train_scaled)

x_test_scaled = min_max_scaler.fit_transform(X_test)
X_test = pd.DataFrame(x_test_scaled)

In [90]:
#Random Forest
rf = RandomForestClassifier()
rf.fit(X_train, Y_train)
yhat3 = rf.predict(X_test)
yhat_train3 = rf.predict(X_train)

print(confusion_matrix(Y_test, yhat3))
print("Train accuracy", accuracy_score(Y_train, yhat_train3))
print("Test accuracy", accuracy_score(Y_test, yhat3))
print("F1", f1_score(Y_test, yhat3))
print("AUC", roc_auc_score(Y_test, yhat3))

[[1550   18]
 [  36   67]]
Train accuracy 0.9976055073331338
Test accuracy 0.9676840215439856
F1 0.7127659574468085
AUC 0.8195029225282346




In [91]:
#Random Forest modeling using only tradiitonal statistics
cols = all_player_seasons_merged.columns.drop(['Player','Season','Is_All_NBA'])
X = all_player_seasons_merged[cols]
Y = all_player_seasons_merged['Is_All_NBA']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2, random_state=824)

In [92]:
min_max_scaler = preprocessing.MinMaxScaler()
x_train_scaled = min_max_scaler.fit_transform(X_train)
X_train = pd.DataFrame(x_train_scaled)

x_test_scaled = min_max_scaler.fit_transform(X_test)
X_test = pd.DataFrame(x_test_scaled)

In [93]:
#Random Forest
rf = RandomForestClassifier()
rf.fit(X_train, Y_train)
yhat3 = rf.predict(X_test)
yhat_train3 = rf.predict(X_train)

print(confusion_matrix(Y_test, yhat3))
print("Train accuracy", accuracy_score(Y_train, yhat_train3))
print("Test accuracy", accuracy_score(Y_test, yhat3))
print("F1", f1_score(Y_test, yhat3))
print("AUC", roc_auc_score(Y_test, yhat3))

[[1528   40]
 [  30   73]]
Train accuracy 0.9959592936246633
Test accuracy 0.9581089168162776
F1 0.6759259259259259
AUC 0.8416138299980187


