In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.style as style
style.use('fivethirtyeight')

In [3]:
# Load the raw seasonal statistics table; the rest of the notebook builds on `players`.
seasonal_stats_path = 'Data/NBA_player_seasonal.csv'
players = pd.read_csv(seasonal_stats_path)
# Show the first rows as the cell output.
players.head()

Unnamed: 0,Rk,Player,Season,Age,Tm,Lg,WS,G,GS,MP,...,BLK,TOV,PF,PTS,FG%,2P%,3P%,eFG%,FT%,TS%
0,1,Kareem Abdul-Jabbar\abdulka01,1971-72,24.0,MIL,NBA,25.4,81,,3583.0,...,,,235,2822,0.574,0.574,,0.574,0.689,0.603
1,2,Wilt Chamberlain\chambwi01,1963-64,27.0,SFW,NBA,25.0,80,,3689.0,...,,,182,2948,0.524,0.524,,0.524,0.531,0.537
2,3,George Mikan\mikange01,1950-51,26.0,MNL,NBA,23.4,68,,,...,,,308,1932,0.428,0.428,,0.428,0.803,0.509
3,4,Wilt Chamberlain\chambwi01,1961-62,25.0,PHW,NBA,23.1,80,,3882.0,...,,,123,4029,0.506,0.506,,0.506,0.613,0.536
4,5,Kareem Abdul-Jabbar\abdulka01,1970-71,23.0,MIL,NBA,22.3,82,,3288.0,...,,,264,2596,0.577,0.577,,0.577,0.69,0.606


In [4]:
# Order rows by player, then by season string, so each player's seasons are
# contiguous and in ascending order before seasons are numbered.
players = players.sort_values(by=['Player', 'Season'])

In [5]:
# Number each player's rows 1, 2, 3, ... in their current order. This relies on
# the frame already being sorted by Player and Season.
players['season_count'] = players.groupby('Player').cumcount() + 1
# Drop the unfinished 2020-21 season. The count is taken before the filter;
# 2020-21 is presumably the latest season in the data, so it sorts last within
# each player and removing it leaves the earlier counts unchanged.
# .copy() makes the filtered result an independent frame, so the column
# assignments in later cells do not trigger SettingWithCopyWarning.
players = players[players['Season'] != '2020-21'].copy()

In [6]:
# Per-game averages: each season total divided by games played (G).
# Keys are the new column names, values are the season-total columns they come from.
per_game_sources = {
    'ppg': 'PTS',
    'apg': 'AST',
    'rpg': 'TRB',
    'spg': 'STL',
    'bpg': 'BLK',
    'tpg': 'TOV',
    'ftpg': 'FT',
}
games_played = players['G']
for per_game_col, total_col in per_game_sources.items():
    players[per_game_col] = players[total_col] / games_played

In [7]:
# Player values have the form "Name\player_id": keep the split pieces, the
# display name (first piece) and the unique ID (second piece).
players['name_split'] = players['Player'].str.split('\\')
players['name'] = players['name_split'].map(lambda parts: parts[0])
players['unique_id'] = players['name_split'].map(lambda parts: parts[1])
# Season values have the form "1985-86": the part after the dash is the
# two-digit year in which awards are given, the part before it is the
# starting year of the season.
players['year_split'] = players['Season'].str.split('-')
players['award_year'] = players['year_split'].map(lambda parts: parts[1])
players['season_year'] = players['year_split'].map(lambda parts: parts[0]).astype(int)
# Merge key for the award tables: player name followed by the award year.
players['concat'] = players['name'] + players['award_year']
# Merge key for the advanced statistics: unique ID followed by the season string.
players['unique_id_match'] = players['unique_id'] + players['Season']
players.head()

Unnamed: 0,Rk,Player,Season,Age,Tm,Lg,WS,G,GS,MP,...,tpg,ftpg,name_split,name,unique_id,year_split,award_year,season_year,concat,unique_id_match
7583,7584,A.C. Green\greenac01,1985-86,22.0,LAL,NBA,3.3,82,1.0,1542.0,...,1.207317,1.243902,"[A.C. Green, greenac01]",A.C. Green,greenac01,"[1985, 86]",86,1985,A.C. Green86,greenac011985-86
1934,1935,A.C. Green\greenac01,1986-87,23.0,LAL,NBA,7.6,79,72.0,2240.0,...,1.291139,2.78481,"[A.C. Green, greenac01]",A.C. Green,greenac01,"[1986, 87]",87,1986,A.C. Green87,greenac011986-87
1735,1736,A.C. Green\greenac01,1987-88,24.0,LAL,NBA,7.9,82,64.0,2636.0,...,1.463415,3.573171,"[A.C. Green, greenac01]",A.C. Green,greenac01,"[1987, 88]",88,1987,A.C. Green88,greenac011987-88
1026,1027,A.C. Green\greenac01,1988-89,25.0,LAL,NBA,9.4,82,82.0,2510.0,...,1.45122,3.439024,"[A.C. Green, greenac01]",A.C. Green,greenac01,"[1988, 89]",89,1988,A.C. Green89,greenac011988-89
1853,1854,A.C. Green\greenac01,1989-90,26.0,LAL,NBA,7.7,82,82.0,2709.0,...,1.414634,3.390244,"[A.C. Green, greenac01]",A.C. Green,greenac01,"[1989, 90]",90,1989,A.C. Green90,greenac011989-90


In [8]:
def _load_award_flags(csv_path, flag_col):
    """Read one award CSV and return a lookup frame keyed by name + award year.

    Parameters
    ----------
    csv_path : str
        Path to an award CSV containing PLAYER and YEAR columns.
    flag_col : str
        Name of the indicator column to create; it is set to 1 on every row.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``concat`` (PLAYER followed by the YEAR string with its
        first two characters removed) and ``flag_col``, with repeated rows
        removed.
    """
    awards = pd.read_csv(csv_path)
    # Same layout as players['concat']: name followed by the two-digit award year.
    awards['concat'] = awards['PLAYER'] + awards['YEAR'].map(lambda year: str(year)[2:])
    awards[flag_col] = 1
    # One row per key, so the left merge below can never duplicate a
    # player-season row (which would distort later per-player sums).
    return awards[['concat', flag_col]].drop_duplicates()


# NOTE(review): the key is name + two-digit year, so two different players who
# share a name in the same season would both receive the flag - verify against
# the data if that matters.

# All-NBA first team: flag is 1 for award seasons, NaN otherwise after the left merge.
all_nba1 = _load_award_flags('Data/All_NBA_1.csv', 'all_nba_first')
players = players.merge(all_nba1, how='left', on='concat')

# All-NBA second team.
all_nba2 = _load_award_flags('Data/All_NBA_2.csv', 'all_nba_second')
players = players.merge(all_nba2, how='left', on='concat')

# All-NBA third team.
all_nba3 = _load_award_flags('Data/All_NBA_3.csv', 'all_nba_third')
players = players.merge(all_nba3, how='left', on='concat')
players.head()

Unnamed: 0,Rk,Player,Season,Age,Tm,Lg,WS,G,GS,MP,...,name,unique_id,year_split,award_year,season_year,concat,unique_id_match,all_nba_first,all_nba_second,all_nba_third
0,7584,A.C. Green\greenac01,1985-86,22.0,LAL,NBA,3.3,82,1.0,1542.0,...,A.C. Green,greenac01,"[1985, 86]",86,1985,A.C. Green86,greenac011985-86,,,
1,1935,A.C. Green\greenac01,1986-87,23.0,LAL,NBA,7.6,79,72.0,2240.0,...,A.C. Green,greenac01,"[1986, 87]",87,1986,A.C. Green87,greenac011986-87,,,
2,1736,A.C. Green\greenac01,1987-88,24.0,LAL,NBA,7.9,82,64.0,2636.0,...,A.C. Green,greenac01,"[1987, 88]",88,1987,A.C. Green88,greenac011987-88,,,
3,1027,A.C. Green\greenac01,1988-89,25.0,LAL,NBA,9.4,82,82.0,2510.0,...,A.C. Green,greenac01,"[1988, 89]",89,1988,A.C. Green89,greenac011988-89,,,
4,1854,A.C. Green\greenac01,1989-90,26.0,LAL,NBA,7.7,82,82.0,2709.0,...,A.C. Green,greenac01,"[1989, 90]",90,1989,A.C. Green90,greenac011989-90,,,


In [9]:
# All-Rookie first team: build the name + two-digit-award-year key and a flag set to 1.
rookie_1 = pd.read_csv('Data/All_Rookie_1.csv')
rookie_1['concat'] = rookie_1['PLAYER'] + rookie_1['YEAR'].map(lambda year: str(year)[2:])
rookie_1['all_rookie_1'] = 1
# Keep one row per key so the left merge cannot duplicate player-season rows.
rookie_1 = rookie_1[['concat', 'all_rookie_1']].drop_duplicates()
# Left merge: seasons without the award get NaN in all_rookie_1.
players = players.merge(rookie_1, how='left', on='concat')

# All-Rookie second team: same steps with its own flag column.
rookie_2 = pd.read_csv('Data/All_Rookie_2.csv')
rookie_2['concat'] = rookie_2['PLAYER'] + rookie_2['YEAR'].map(lambda year: str(year)[2:])
rookie_2['all_rookie_2'] = 1
rookie_2 = rookie_2[['concat', 'all_rookie_2']].drop_duplicates()
players = players.merge(rookie_2, how='left', on='concat')
players.head()

Unnamed: 0,Rk,Player,Season,Age,Tm,Lg,WS,G,GS,MP,...,year_split,award_year,season_year,concat,unique_id_match,all_nba_first,all_nba_second,all_nba_third,all_rookie_1,all_rookie_2
0,7584,A.C. Green\greenac01,1985-86,22.0,LAL,NBA,3.3,82,1.0,1542.0,...,"[1985, 86]",86,1985,A.C. Green86,greenac011985-86,,,,,
1,1935,A.C. Green\greenac01,1986-87,23.0,LAL,NBA,7.6,79,72.0,2240.0,...,"[1986, 87]",87,1986,A.C. Green87,greenac011986-87,,,,,
2,1736,A.C. Green\greenac01,1987-88,24.0,LAL,NBA,7.9,82,64.0,2636.0,...,"[1987, 88]",88,1987,A.C. Green88,greenac011987-88,,,,,
3,1027,A.C. Green\greenac01,1988-89,25.0,LAL,NBA,9.4,82,82.0,2510.0,...,"[1988, 89]",89,1988,A.C. Green89,greenac011988-89,,,,,
4,1854,A.C. Green\greenac01,1989-90,26.0,LAL,NBA,7.7,82,82.0,2709.0,...,"[1989, 90]",90,1989,A.C. Green90,greenac011989-90,,,,,


In [10]:
# All-Defensive first team: build the name + two-digit-award-year key and a flag set to 1.
All_D_1 = pd.read_csv('Data/All_D_1.csv')
All_D_1['concat'] = All_D_1['PLAYER'] + All_D_1['YEAR'].map(lambda year: str(year)[2:])
All_D_1['all_d_1'] = 1
# Keep one row per key so the left merge cannot duplicate player-season rows.
All_D_1 = All_D_1[['concat', 'all_d_1']].drop_duplicates()
# Left merge: seasons without the award get NaN in all_d_1.
players = players.merge(All_D_1, how='left', on='concat')

# All-Defensive second team: same steps with its own flag column.
All_D_2 = pd.read_csv('Data/All_D_2.csv')
All_D_2['concat'] = All_D_2['PLAYER'] + All_D_2['YEAR'].map(lambda year: str(year)[2:])
All_D_2['all_d_2'] = 1
All_D_2 = All_D_2[['concat', 'all_d_2']].drop_duplicates()
players = players.merge(All_D_2, how='left', on='concat')
players.head()

Unnamed: 0,Rk,Player,Season,Age,Tm,Lg,WS,G,GS,MP,...,season_year,concat,unique_id_match,all_nba_first,all_nba_second,all_nba_third,all_rookie_1,all_rookie_2,all_d_1,all_d_2
0,7584,A.C. Green\greenac01,1985-86,22.0,LAL,NBA,3.3,82,1.0,1542.0,...,1985,A.C. Green86,greenac011985-86,,,,,,,
1,1935,A.C. Green\greenac01,1986-87,23.0,LAL,NBA,7.6,79,72.0,2240.0,...,1986,A.C. Green87,greenac011986-87,,,,,,,
2,1736,A.C. Green\greenac01,1987-88,24.0,LAL,NBA,7.9,82,64.0,2636.0,...,1987,A.C. Green88,greenac011987-88,,,,,,,
3,1027,A.C. Green\greenac01,1988-89,25.0,LAL,NBA,9.4,82,82.0,2510.0,...,1988,A.C. Green89,greenac011988-89,,,,,,,1.0
4,1854,A.C. Green\greenac01,1989-90,26.0,LAL,NBA,7.7,82,82.0,2709.0,...,1989,A.C. Green90,greenac011989-90,,,,,,,


In [11]:
# Rookie of the Year: build the name + two-digit-award-year key and a flag set to 1.
roy = pd.read_csv('Data/ROY.csv')
roy['concat'] = roy['PLAYER'] + roy['YEAR'].map(lambda year: str(year)[2:])
roy['roy'] = 1
# Keep one row per key so the left merge cannot duplicate player-season rows.
roy = roy[['concat', 'roy']].drop_duplicates()
# Left merge into the players frame: seasons without the award get NaN in roy.
players = players.merge(roy, how='left', on='concat')
players.head()

Unnamed: 0,Rk,Player,Season,Age,Tm,Lg,WS,G,GS,MP,...,concat,unique_id_match,all_nba_first,all_nba_second,all_nba_third,all_rookie_1,all_rookie_2,all_d_1,all_d_2,roy
0,7584,A.C. Green\greenac01,1985-86,22.0,LAL,NBA,3.3,82,1.0,1542.0,...,A.C. Green86,greenac011985-86,,,,,,,,
1,1935,A.C. Green\greenac01,1986-87,23.0,LAL,NBA,7.6,79,72.0,2240.0,...,A.C. Green87,greenac011986-87,,,,,,,,
2,1736,A.C. Green\greenac01,1987-88,24.0,LAL,NBA,7.9,82,64.0,2636.0,...,A.C. Green88,greenac011987-88,,,,,,,,
3,1027,A.C. Green\greenac01,1988-89,25.0,LAL,NBA,9.4,82,82.0,2510.0,...,A.C. Green89,greenac011988-89,,,,,,,1.0,
4,1854,A.C. Green\greenac01,1989-90,26.0,LAL,NBA,7.7,82,82.0,2709.0,...,A.C. Green90,greenac011989-90,,,,,,,,


In [12]:
# Load the advanced statistics and key them the same way as
# players['unique_id_match']: the player ID (the text after the backslash in
# Player) followed by the Season string.
advanced_raw = pd.read_csv('Data/NBA_player_advanced.csv')
advanced_ids = advanced_raw['Player'].str.split('\\').map(lambda parts: parts[1])
# Advanced metrics carried over into the players frame.
advanced_cols = [
       'PER', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%',
       'TOV%', 'USG%', 'ORtg', 'DRtg', 'OWS', 'DWS', 'WS/48', 'OBPM', 'DBPM',
       'BPM', 'VORP']
players_advanced = advanced_raw[advanced_cols].assign(
    unique_id_match=advanced_ids + advanced_raw['Season'])
# Left merge keeps every player-season row; rows without a match get NaN.
players = players.merge(players_advanced, how='left', on='unique_id_match')
players.head()

Unnamed: 0,Rk,Player,Season,Age,Tm,Lg,WS,G,GS,MP,...,USG%,ORtg,DRtg,OWS,DWS,WS/48,OBPM,DBPM,BPM,VORP
0,7584,A.C. Green\greenac01,1985-86,22.0,LAL,NBA,3.3,82,1.0,1542.0,...,14.7,108.0,105.0,1.4,2.0,0.103,-1.6,0.1,-1.6,0.2
1,1935,A.C. Green\greenac01,1986-87,23.0,LAL,NBA,7.6,79,72.0,2240.0,...,14.7,122.0,105.0,4.3,3.3,0.163,0.9,0.1,1.0,1.7
2,1736,A.C. Green\greenac01,1987-88,24.0,LAL,NBA,7.9,82,64.0,2636.0,...,14.7,119.0,106.0,4.5,3.4,0.144,0.4,-0.1,0.3,1.5
3,1027,A.C. Green\greenac01,1988-89,25.0,LAL,NBA,9.4,82,82.0,2510.0,...,17.0,122.0,105.0,5.8,3.5,0.179,1.8,-0.2,1.6,2.3
4,1854,A.C. Green\greenac01,1989-90,26.0,LAL,NBA,7.7,82,82.0,2709.0,...,17.1,116.0,107.0,4.4,3.3,0.137,0.3,-0.9,-0.6,1.0


In [13]:
# Target variable: 1 if the player made any All-NBA team (first, second or
# third) during seasons 4 through 6, otherwise 0. Only players that have at
# least one row in that window appear in `targets`.
all_nba_cols = ['all_nba_first', 'all_nba_second', 'all_nba_third']
in_target_window = players['season_count'].between(4, 6)
# .loc + .copy() gives an independent frame, so adding a column below does not
# trigger SettingWithCopyWarning.
targets = players.loc[in_target_window, ['Player', 'season_count'] + all_nba_cols].copy()
# Row-wise sum skips NaN, i.e. a missing flag counts as 0.
targets['all_nba_target'] = targets[all_nba_cols].sum(axis=1)
# Total All-NBA selections per player across the window.
targets = targets.groupby('Player')['all_nba_target'].sum().reset_index()
targets['target'] = np.where(targets['all_nba_target'] >= 1, 1, 0)
targets = targets[['Player', 'target']]
targets.head()

Unnamed: 0,Player,target
0,A.C. Green\greenac01,0
1,A.J. Price\priceaj01,0
2,Aaron Brooks\brookaa01,0
3,Aaron Gordon\gordoaa01,0
4,Aaron Gray\grayaa01,0


In [14]:
# target is 0/1, so the sum is the number of players with an All-NBA selection in seasons 4-6.
targets['target'].sum()

163

In [15]:
# Number of player-season rows per two-digit award year.
players['award_year'].value_counts()

18    540
19    530
20    529
15    492
17    486
     ... 
58     99
57     99
61     93
56     92
59     92
Name: award_year, Length: 74, dtype: int64

In [16]:
# List every column now present on the merged players frame.
players.columns

Index(['Rk', 'Player', 'Season', 'Age', 'Tm', 'Lg', 'WS', 'G', 'GS', 'MP',
       'FG', 'FGA', '2P', '2PA', '3P', '3PA', 'FT', 'FTA', 'ORB', 'DRB', 'TRB',
       'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'FG%', '2P%', '3P%', 'eFG%',
       'FT%', 'TS%', 'season_count', 'ppg', 'apg', 'rpg', 'spg', 'bpg', 'tpg',
       'ftpg', 'name_split', 'name', 'unique_id', 'year_split', 'award_year',
       'season_year', 'concat', 'unique_id_match', 'all_nba_first',
       'all_nba_second', 'all_nba_third', 'all_rookie_1', 'all_rookie_2',
       'all_d_1', 'all_d_2', 'roy', 'PER', '3PAr', 'FTr', 'ORB%', 'DRB%',
       'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'ORtg', 'DRtg', 'OWS',
       'DWS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP'],
      dtype='object')

In [17]:
# Totals that are summed across a player's first three seasons.
first3_total_cols = ['G', 'GS', 'MP',
       'FG', 'FGA', '2P', '2PA', '3P', '3PA', 'FT', 'FTA', 'ORB', 'DRB', 'TRB',
       'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS','season_count', 'roy', 'all_rookie_1', 'all_rookie_2']
# Keep seasons 1-3 whose starting year is 1977 or later (season_year > 1976);
# per the original note, advanced statistics weren't tracked before 1977.
is_early_career = players['season_count'] < 4
is_tracked_era = players['season_year'] > 1976
players_first3 = players[is_early_career & is_tracked_era]
# Collapse to a single row per player by summing the totals.
players_total = players_first3.groupby(['Player'])[first3_total_cols].sum().reset_index()
# season_count sums to 6 only when seasons 1, 2 and 3 are all present (1 + 2 + 3),
# so this keeps players with three qualifying seasons.
players_total = players_total[players_total['season_count'].eq(6)]
players_total.head()

Unnamed: 0,Player,G,GS,MP,FG,FGA,2P,2PA,3P,3PA,...,AST,STL,BLK,TOV,PF,PTS,season_count,roy,all_rookie_1,all_rookie_2
0,A.C. Green\greenac01,243,137.0,6418.0,847,1615,846,1602,1.0,13.0,...,231,206.0,174.0,321.0,604,2310,6,0.0,0.0,0.0
3,A.J. Guyton\guytoaj01,80,14.0,1246.0,166,440,93,247,73.0,193.0,...,147,20.0,12.0,62.0,58,442,6,0.0,0.0,0.0
5,A.J. Price\priceaj01,150,3.0,2228.0,318,848,191,437,127.0,411.0,...,303,84.0,6.0,144.0,144,905,6,0.0,0.0,0.0
7,Aaron Brooks\brookaa01,213,117.0,5525.0,984,2339,626,1396,358.0,943.0,...,759,128.0,27.0,401.0,420,2762,6,0.0,0.0,0.0
8,Aaron Gordon\gordoaa01,205,117.0,4958.0,760,1652,628,1195,132.0,457.0,...,311,145.0,117.0,193.0,408,1981,6,0.0,0.0,0.0


### Merging Seasonal Statistics

In [18]:
# One frame per career season (first, second, third), each indexed by Player.
players_first, players_second, players_third = (
    players[players['season_count'] == season_no].set_index('Player')
    for season_no in (1, 2, 3)
)

In [19]:
# Keep the season-1 statistics and tag every column with a "_1" suffix so the
# columns stay distinguishable once the three seasons sit side by side.
players_first = players_first[['WS', 'G', 'GS', 'MP', 'FG', 'FGA',
       '2P', '2PA', '3P', '3PA', 'FT', 'FTA', 'ORB', 'DRB', 'TRB', 'AST',
       'STL', 'BLK', 'TOV', 'PF', 'PTS', 'FG%', '2P%', '3P%', 'eFG%', 'FT%',
       'TS%', 'ppg', 'apg', 'rpg', 'spg', 'bpg', 'tpg', 'ftpg','PER',
       '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%',
       'USG%', 'ORtg', 'DRtg', 'OWS', 'DWS', 'WS/48', 'OBPM', 'DBPM', 'BPM',
       'VORP']].add_suffix('_1')

In [20]:
# Inner join of the three-season totals with the season-1 columns. The season-1
# frame is indexed by Player, so that index is turned back into a column to join on.
players_t1 = players_total.merge(players_first.reset_index(), on='Player')

In [21]:
# Keep the season-2 statistics and tag every column with a "_2" suffix so the
# columns stay distinguishable once the three seasons sit side by side.
players_second = players_second[['WS', 'G', 'GS', 'MP', 'FG', 'FGA',
       '2P', '2PA', '3P', '3PA', 'FT', 'FTA', 'ORB', 'DRB', 'TRB', 'AST',
       'STL', 'BLK', 'TOV', 'PF', 'PTS', 'FG%', '2P%', '3P%', 'eFG%', 'FT%',
       'TS%', 'ppg', 'apg', 'rpg', 'spg', 'bpg', 'tpg', 'ftpg','PER',
       '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%',
       'USG%', 'ORtg', 'DRtg', 'OWS', 'DWS', 'WS/48', 'OBPM', 'DBPM', 'BPM',
       'VORP']].add_suffix('_2')

In [22]:
# Inner join adding the season-2 columns. The season-2 frame is indexed by
# Player, so that index is turned back into a column to join on.
players_t2 = players_t1.merge(players_second.reset_index(), on='Player')

In [23]:
# Keep the season-3 statistics and tag every column with a "_3" suffix so the
# columns stay distinguishable once the three seasons sit side by side.
players_third = players_third[['WS', 'G', 'GS', 'MP', 'FG', 'FGA',
       '2P', '2PA', '3P', '3PA', 'FT', 'FTA', 'ORB', 'DRB', 'TRB', 'AST',
       'STL', 'BLK', 'TOV', 'PF', 'PTS', 'FG%', '2P%', '3P%', 'eFG%', 'FT%',
       'TS%', 'ppg', 'apg', 'rpg', 'spg', 'bpg', 'tpg', 'ftpg','PER',
       '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%',
       'USG%', 'ORtg', 'DRtg', 'OWS', 'DWS', 'WS/48', 'OBPM', 'DBPM', 'BPM',
       'VORP']].add_suffix('_3')

In [24]:
# Inner join adding the season-3 columns. The season-3 frame is indexed by
# Player, so that index is turned back into a column to join on.
players_t3 = players_t2.merge(players_third.reset_index(), on='Player')

In [25]:
# Preview the per-player frame after the season-1, season-2 and season-3 columns were merged in.
players_t3.head()

Unnamed: 0,Player,G,GS,MP,FG,FGA,2P,2PA,3P,3PA,...,USG%_3,ORtg_3,DRtg_3,OWS_3,DWS_3,WS/48_3,OBPM_3,DBPM_3,BPM_3,VORP_3
0,A.C. Green\greenac01,243,137.0,6418.0,847,1615,846,1602,1.0,13.0,...,14.7,119.0,106.0,4.5,3.4,0.144,0.4,-0.1,0.3,1.5
1,A.J. Guyton\guytoaj01,80,14.0,1246.0,166,440,93,247,73.0,193.0,...,23.6,24.0,107.0,-0.1,0.0,-0.45,-14.1,-2.3,-16.4,0.0
2,A.J. Price\priceaj01,150,3.0,2228.0,318,848,191,437,127.0,411.0,...,17.7,100.0,106.0,0.2,0.5,0.063,-0.6,-0.5,-1.0,0.1
3,Aaron Brooks\brookaa01,213,117.0,5525.0,984,2339,626,1396,358.0,943.0,...,25.7,108.0,112.0,3.9,1.6,0.091,2.3,-1.6,0.7,1.9
4,Aaron Gordon\gordoaa01,205,117.0,4958.0,760,1652,628,1195,132.0,457.0,...,20.1,107.0,111.0,2.0,1.7,0.077,-0.3,-0.7,-1.0,0.6


In [26]:
# Season-over-season changes: for every statistic below, create
#   <prefix>_1-2 = season 2 value - season 1 value
#   <prefix>_2-3 = season 3 value - season 2 value
# Each pair is (output column prefix, per-season column stem).
season_over_season_stats = [
    ('G', 'G'),        # games played (fixed: was computed from GS, duplicating GS_1-2 / GS_2-3)
    ('GS', 'GS'),      # games started
    ('MP', 'MP'),      # minutes played
    ('PPG', 'ppg'),    # points per game
    ('APG', 'apg'),    # assists per game
    ('RPG', 'rpg'),    # rebounds per game
    ('SPG', 'spg'),    # steals per game
    ('BPG', 'bpg'),    # blocks per game
    ('WS', 'WS'),      # win shares
    ('OWS', 'OWS'),    # offensive win shares
    ('DWS', 'DWS'),    # defensive win shares
    ('PER', 'PER'),    # PER
    ('USG', 'USG%'),   # USG%
    ('VORP', 'VORP'),  # value over replacement player
]
for out_prefix, stat in season_over_season_stats:
    players_t3[out_prefix + '_1-2'] = players_t3[stat + '_2'] - players_t3[stat + '_1']
    players_t3[out_prefix + '_2-3'] = players_t3[stat + '_3'] - players_t3[stat + '_2']

In [27]:
# Preview again, now including the season-over-season difference columns.
players_t3.head()

Unnamed: 0,Player,G,GS,MP,FG,FGA,2P,2PA,3P,3PA,...,OWS_1-2,OWS_2-3,DWS_1-2,DWS_2-3,PER_1-2,PER_2-3,USG_1-2,USG_2-3,VORP_1-2,VORP_2-3
0,A.C. Green\greenac01,243,137.0,6418.0,847,1615,846,1602,1.0,13.0,...,2.9,0.2,1.3,0.1,3.9,-1.2,0.0,0.0,1.5,-0.2
1,A.J. Guyton\guytoaj01,80,14.0,1246.0,166,440,93,247,73.0,193.0,...,-0.4,-0.1,0.1,0.0,1.5,-20.8,5.7,1.4,0.4,-0.1
2,A.J. Price\priceaj01,150,3.0,2228.0,318,848,191,437,127.0,411.0,...,-0.8,0.6,-0.1,-0.2,-3.3,0.8,0.0,-5.0,-0.5,0.3
3,Aaron Brooks\brookaa01,213,117.0,5525.0,984,2339,626,1396,358.0,943.0,...,0.8,2.5,1.4,-0.6,-0.2,3.1,1.1,2.8,0.3,1.3
4,Aaron Gordon\gordoaa01,205,117.0,4958.0,760,1652,628,1195,132.0,457.0,...,2.9,-1.2,1.5,-0.5,5.6,-2.5,1.8,2.8,1.6,-0.8


In [28]:
# Attach the 0/1 target to each player's feature row. This is an inner join on
# Player, so only players present in both frames are kept.
df = players_t3.merge(targets, on='Player')
print(len(df))
df.head()

1544


Unnamed: 0,Player,G,GS,MP,FG,FGA,2P,2PA,3P,3PA,...,OWS_2-3,DWS_1-2,DWS_2-3,PER_1-2,PER_2-3,USG_1-2,USG_2-3,VORP_1-2,VORP_2-3,target
0,A.C. Green\greenac01,243,137.0,6418.0,847,1615,846,1602,1.0,13.0,...,0.2,1.3,0.1,3.9,-1.2,0.0,0.0,1.5,-0.2,0
1,A.J. Price\priceaj01,150,3.0,2228.0,318,848,191,437,127.0,411.0,...,0.6,-0.1,-0.2,-3.3,0.8,0.0,-5.0,-0.5,0.3,0
2,Aaron Brooks\brookaa01,213,117.0,5525.0,984,2339,626,1396,358.0,943.0,...,2.5,1.4,-0.6,-0.2,3.1,1.1,2.8,0.3,1.3,0
3,Aaron Gordon\gordoaa01,205,117.0,4958.0,760,1652,628,1195,132.0,457.0,...,-1.2,1.5,-0.5,5.6,-2.5,1.8,2.8,1.6,-0.8,0
4,Aaron Gray\grayaa01,149,19.0,1639.0,226,453,226,451,0.0,2.0,...,0.0,0.0,-0.4,-2.4,5.1,-7.5,1.9,0.0,0.3,0


In [29]:
# Draft data supplies a Qualified flag per player. Per the original note it is
# used to remove players drafted within the past 5 seasons, since they cannot
# yet have an All-NBA outcome for seasons 4-6.
# NOTE(review): how Qualified is derived is not visible here - confirm against the CSV.
draft = pd.read_csv('Data/NBA_Draft.csv')[['Player', 'Qualified']]

# Left merge keeps every row of df; players missing from the draft file get NaN.
final_df = df.merge(draft, how='left', on='Player')

# Keep only rows flagged as qualified (NaN and other values are dropped).
final_df = final_df[final_df['Qualified'].eq(1)]
final_df.head()

Unnamed: 0,Player,G,GS,MP,FG,FGA,2P,2PA,3P,3PA,...,DWS_1-2,DWS_2-3,PER_1-2,PER_2-3,USG_1-2,USG_2-3,VORP_1-2,VORP_2-3,target,Qualified
0,A.C. Green\greenac01,243,137.0,6418.0,847,1615,846,1602,1.0,13.0,...,1.3,0.1,3.9,-1.2,0.0,0.0,1.5,-0.2,0,1.0
1,A.J. Price\priceaj01,150,3.0,2228.0,318,848,191,437,127.0,411.0,...,-0.1,-0.2,-3.3,0.8,0.0,-5.0,-0.5,0.3,0,1.0
2,Aaron Brooks\brookaa01,213,117.0,5525.0,984,2339,626,1396,358.0,943.0,...,1.4,-0.6,-0.2,3.1,1.1,2.8,0.3,1.3,0,1.0
3,Aaron Gordon\gordoaa01,205,117.0,4958.0,760,1652,628,1195,132.0,457.0,...,1.5,-0.5,5.6,-2.5,1.8,2.8,1.6,-0.8,0,1.0
4,Aaron Gray\grayaa01,149,19.0,1639.0,226,453,226,451,0.0,2.0,...,0.0,-0.4,-2.4,5.1,-7.5,1.9,0.0,0.3,0,1.0


In [30]:
# Class balance of the target among the qualified players.
final_df['target'].value_counts()

0    1183
1      90
Name: target, dtype: int64

In [31]:
# Write out the modelling table and the enriched player-season table.
# Default to_csv settings are kept, so the row index is written as the first column.
for frame, out_path in (
    (final_df, 'Data/Final_NBA_Seasons1-3_1977_final.csv'),
    (players, 'Data/Final_NBA_Players_updated.csv'),
):
    frame.to_csv(out_path)