#Setup

##Google Drive

In [0]:
!pip install sportsreference
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import os

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)  

Collecting sportsreference
[?25l  Downloading https://files.pythonhosted.org/packages/4a/16/64f2181463018c00df5612cd3319a7cbf4403bd7b5c56ba8db1b9bf21a8d/sportsreference-0.4.7-py2.py3-none-any.whl (373kB)
[K     |▉                               | 10kB 22.0MB/s eta 0:00:01[K     |█▊                              | 20kB 6.2MB/s eta 0:00:01[K     |██▋                             | 30kB 8.8MB/s eta 0:00:01[K     |███▌                            | 40kB 5.7MB/s eta 0:00:01[K     |████▍                           | 51kB 7.0MB/s eta 0:00:01[K     |█████▎                          | 61kB 8.2MB/s eta 0:00:01[K     |██████▏                         | 71kB 9.3MB/s eta 0:00:01[K     |███████                         | 81kB 10.4MB/s eta 0:00:01[K     |███████▉                        | 92kB 11.3MB/s eta 0:00:01[K     |████████▊                       | 102kB 9.3MB/s eta 0:00:01[K     |█████████▋                      | 112kB 9.3MB/s eta 0:00:01[K     |██████████▌                     

##Import Packages

In [0]:
import pickle
import pandas as pd
import numpy as np
from datetime import datetime
from sportsreference.nba.teams import Teams
from sportsreference.nba.roster import Roster
from sportsreference.nba.roster import Player
from sportsreference.nba.boxscore import Boxscore
from sportsreference.nba.boxscore import Boxscores
import warnings
warnings.filterwarnings("ignore")

##Download files

In [0]:
# choose a local (colab) directory to store the data.
local_download_path = os.path.expanduser('~/data/pickle/')
try:
  os.makedirs(local_download_path)
except: pass

# 2. Auto-iterate using the query syntax
#    https://developers.google.com/drive/v2/web/search-parameters
file_list = drive.ListFile(
    {'q': "'1e8GS0L0xUXQDgiRorx__FQViQjHNza7c' in parents"}).GetList()

for f in file_list:
  # 3. Create & download by id.
  print('title: %s, id: %s' % (f['title'], f['id']))
  fname = os.path.join(local_download_path, f['title'])
  print('downloading to {}'.format(fname))
  f_ = drive.CreateFile({'id': f['id']})
  f_.GetContentFile(fname)

title: logistic_model.pkl, id: 1Z0rSoSuRYMgebMgVnGA0gk6tF_mZlRJw
downloading to /root/data/pickle/logistic_model.pkl
title: match_df_processed.pkl, id: 1Dfg_kOpUmeWTuYxTLrOctErJQL3iyvk1
downloading to /root/data/pickle/match_df_processed.pkl
title: all_players_data_all_season_processed.pkl, id: 1wuFPczmUxGikAqT-mKlu6uWhZXN9folv
downloading to /root/data/pickle/all_players_data_all_season_processed.pkl
title: all_players_data_all_season.pkl, id: 1Ngu5JnhvdPLE5VPo5LutFfZ2Q5uT9Lgg
downloading to /root/data/pickle/all_players_data_all_season.pkl
title: players_list.pkl, id: 17dLZoeKIEeAcS3ZffJf5kt_xEqUab6Gi
downloading to /root/data/pickle/players_list.pkl
title: games_df.p, id: 18opuKiNf9IM7lbh3xa9f5k7rGH056RTt
downloading to /root/data/pickle/games_df.p
title: Simulation_2019_20.xlsx, id: 1wUhiDOCQ1sYVab6GkbXzkBvXyBLbaCz4
downloading to /root/data/pickle/Simulation_2019_20.xlsx


#Load data

In [0]:
# retrieve data from pickle files

# all_players_data
all_players_data_file = '/root/data/pickle/all_players_data_all_season.pkl'
with open(all_players_data_file, 'rb') as f:
  all_players_data = pickle.load(f)

# retrieve match data from pickle files
match_df = pd.read_pickle('/root/data/pickle/games_df.p')

#Data Manipulation for players' stats

##Fix Variables

###Index

In [0]:
all_players_data = all_players_data.reset_index().set_index(['season', 'player_name'])

###Height

In [0]:
all_players_data['height'] = all_players_data['height'].apply(lambda x: int(x[0])*12 + int(x[2:]))

###Datatypes

In [0]:
string_cols = ['nationality',
              'player_id',
              'position',
              'team_abbreviation',
              ]
float_cols = [x for x in all_players_data.columns if x not in string_cols]
float_cols.append('player_id')

all_players_data_string = all_players_data[string_cols]
all_players_data_float = all_players_data[float_cols]
all_players_data_float = all_players_data_float.astype('float', errors='ignore')

##Missing value treatment

###Fill with mean for numeric columns

In [0]:
for item in all_players_data_float.columns:
  if item == 'player_id':
    continue
  mean_ = all_players_data_float[item].mean()
  all_players_data_float[item] = all_players_data_float[item].fillna(mean_)

# Join
all_players_data_string = all_players_data_string.reset_index()
all_players_data_float = all_players_data_float.reset_index().drop(columns = ['player_name'])
all_players_data = all_players_data_string.merge(all_players_data_float,
                                                 left_on = ['season', 'player_id'],
                                                 right_on = ['season', 'player_id'],
                                                 how = 'inner')

###Manually impute missing values for positions

In [0]:
replace = {'C.J. Watson':'PG', 'P.J. Hairston':'SF', 'Jarrod Uthoff':'PF', 'Baron Davis':'PG', 'Adonis Thomas':'SF', 'Samuel Dalembert':'C', 'Christian Eyenga':'SF', 'Carlos Delfino':'SF', 'Travis Leslie':'PG', 'Josh Childress':'SF', 'Patric Young':'PF', 'Paul Pierce':'SF', 'Henry Sims':'C', 'Ronald Roberts':'PF', 'Elton Brand':'C', 'Mike Dunleavy':'SF', 'Richard Solomon':'C', 'Malcolm Lee':'SG', 'Kevin Murphy':'SG', 'Justin Hamilton':'C', 'Phil Pressey':'PG', 'Cartier Martin':'SF', 'Andrea Bargnani':'PF', 'Monta Ellis':'SG', 'Daniel Ochefu':'PF', 'Mitch McGary':'PF', 'Leandro Barbosa':'SG', 'Kendrick Nunn':'PG', 'Chinanu Onuaku':'PF', 'A.J. Hammons':'C', 'Jon Brockman':'PF', 'Josh Smith':'PF', 'Ben Moore':'PF', 'Trey Thompkins':'PF', 'Nikola Peković':'C', 'Andray Blatche':'C', 'Jonathan Gibson':'PG', 'Quentin Richardson':'SG', 'Scotty Hopson':'SG', 'Brandon Rush':'SG', 'Alexis Ajinça':'C', 'Robbie Hummel':'SF', 'Dejounte Murray':'PG', 'Diante Garrett':'PG', 'Dionte Christmas':'SG', 'Cole Aldrich':'C', 'Jason Thompson':'C', 'Delonte West':'PG', 'Furkan Aldemir':'PF', 'Andre Roberson':'SF', 'Anthony Randolph':'PF', 'Jabari Bird':'SG', 'Chris Bosh':'PF', 'Tim Duncan':'PF', 'Grant Jerrett':'PF', 'Antonio Daniels':'PG', 'Andy Rautins':'SG', 'Larry Sanders':'C', 'Eric Maynor':'PG', 'Festus Ezeli':'C', 'Vítor Luiz Faverani':'PF', 'Tony Wroten':'PG', 'Lou Amundson':'PF', 'Danny Granger':'SF', 'Renaldo Balkman':'SF', 'D.J. Stephens':'SG', 'Cody Demps':'PG', 'Kelenna Azubuike':'SG', 'Tibor Pleiß':'C', 'Mirza Teletović':'PF', 'Al Harrington':'PF', 'Ivan Johnson':'PF', 'Isaiah Taylor':'PG', 'Chris Babb':'SG', 'Michael Porter Jr.':'SF', 'Joey Dorsey':'C', 'Charlie Bell':'SG', 'Erik Murphy':'PF', 'Sheldon Mac':'SG', 'Gerald Henderson':'SG', 'Spencer Hawes':'C', 'Michael Frazier':'SG', 'Zoran Dragić':'SG', 'Perry Jones':'PF', 'Keith Benson':'C', 'Timofey Mozgov':'C', 'JaJuan Johnson':'PF', 'C.J. Wilcox':'SG', 'Cliff Alexander':'C', 'Manu Ginóbili':'SG', 'Ray McCallum':'PG', 'Peyton Siva':'PG', 'Yi Jianlian':'PF', 'Dorell Wright':'PG', 'Jordan Adams':'SG', 'Tyler Honeycutt':'SF', 'Antonio McDyess':'PF', 'Zach Randolph':'PF', 'D.J. Kennedy':'SG', 'Jordan Farmar':'PG', 'Richard Hamilton':'SF', 'Hamed Haddadi':'C', 'Yakuba Ouattara':'SG', 'Ty Lawson':'PG', 'Kristaps Porziņģis':'PF', 'Brad Miller':'PF', 'Kevin Garnett':'PF', 'Rakeem Christmas':'PF', 'Luke Harangody':'PF', 'Andrew Nicholson':'PF', 'Charles Cooke':'SG', 'Kyle Singler':'SF', 'Quincy Miller':'SF', 'Gerald Wallace':'SF', 'K.J. McDaniels':'SG', 'Wenyen Gabriel':'PF', 'Beno Udrih':'PG', 'Nick Johnson':'PG', 'Steve Nash':'PG', 'Michael Gbinije':'SG', 'Tony Mitchell':'PF', 'Nemanja Nedović':'PG', 'Kevin Martin':'SG', 'Martell Webster':'SF', 'Raja Bell':'SG', 'Ömer Aşık':'C', 'Jarvis Varnado':'PF', 'Chris Wright':'PF', 'Axel Toupane':'SG', 'Dakari Johnson':'C', 'Elliot Williams':'SG', 'Gary Forbes':'SF', 'Kwame Brown':'C', 'Chris Duhon':'PG', 'Miroslav Raduljica':'PF', 'Matt Barnes':'SF', 'Brandon Roy':'SG', 'Arnett Moultrie':'PF', 'Michael Young':'SF', 'Mo Williams':'PG', 'Ronnie Price':'PG', 'Trevon Bluiett':'SG', 'Gilbert Arenas':'PG', 'Gal Mekel':'PG', 'Diamond Stone':'C', 'Tornike Shengelia':'PF', 'Travis Outlaw':'SF', 'Anderson Varejão':'C', 'Al Jefferson':'C', 'Marcus Camby':'PF', 'Isaiah Whitehead':'PG', 'Jamaal Tinsley':'PG', 'James Posey':'SF', 'Chris Johnson':'C', 'Caron Butler':'SF', 'Sasha Kaun':'C', 'Josh Akognon':'PG', 'Carl Landry':'PF', 'Deron Williams':'PG', 'Pablo Prigioni':'PG', 'Fab Melo':'C', 'Garret Siler':'C', 'Aaron Gray':'C', 'Georgios Papagiannis':'C', 'Denzel Valentine':'SG', 'Tim Ohlbrecht':'PF', 'Jordan Williams':'PF', 'Derrick Caracter':'PF', 'Jamaal Franklin':'SG', 'Carrick Felix':'SG', 'Darrell Arthur':'PF', 'Norvel Pelle': 'C'}

for i in all_players_data.loc[(all_players_data.position.isna()) | (all_players_data['position'] == '')]['player_name']:
  all_players_data.loc[all_players_data['player_name'] == i,'position'] = replace[i]

In [0]:
cols_to_drop = ['team_abbreviation', 
                'and_ones', 
                'blocking_fouls', 
                'lost_ball_turnovers', 
                'net_plus_minus', 
                'offensive_fouls', 
                'on_court_plus_minus', 
                'other_turnovers', 
                'passing_turnovers',
                'points_generated_by_assists', 
                'shooting_fouls', 
                'shooting_fouls_drawn', 
                'shots_blocked', 
                'take_fouls',
                'nationality']
all_players_data = all_players_data.drop(columns=cols_to_drop)

##Percentage games started 

In [0]:
all_players_data['perc_games_started'] = all_players_data['games_started'] / all_players_data['games_played']

##Fix multiple positions

In [0]:
all_players_data.loc[all_players_data['position'].apply(lambda x: '-' in x),'position'] = \
    all_players_data.loc[all_players_data['position'].apply(lambda x: '-' in x),'position'].apply(lambda x: x.split('-')[0])

##Create dummies for categorical variables

In [0]:
# Position
position = pd.get_dummies(all_players_data['position'], prefix='position',)
all_players_data = all_players_data.join(position)

##Save processed players data to pickle

In [0]:
# with open('all_players_data_all_season_processed.pkl', 'wb') as f:
#     pickle.dump(all_players_data, f)

# # save to drive
# link = 'https://drive.google.com/open?id=1e8GS0L0xUXQDgiRorx__FQViQjHNza7c'
# _, id = link.split("=")

# # get the folder id where you want to save your file
# file = drive.CreateFile({'parents':[{u'id': id}]})
# file.SetContentFile('all_players_data_all_season_processed.pkl')
# file.Upload() 

#Data Manipulation for matches

##Fix data type

###Date of Match

In [0]:
# all_players_data.loc[all_players_data['player_id']== match_df['home_players'][0][0]]
match_df['Date_of_Match'] = pd.to_datetime(match_df['Date_of_Match'])

##Extract season from date of match column

###Current season

In [0]:
match_df["yearp1"] = (match_df['Date_of_Match'].dt.year+1).astype(str)
match_df["year"] = (match_df['Date_of_Match'].dt.year).astype(str)
match_df["yearm1"] = (match_df['Date_of_Match'].dt.year-1).astype(str)
match_df["month"] = (match_df['Date_of_Match'].dt.month)

match_df.loc[match_df['month'] >= 10, 'curr_season'] = \
    match_df.loc[match_df['month'] >= 10, 'year'] + '-' + match_df.loc[match_df['month'] >= 10, 'yearp1'].apply(lambda x: x[2:])

match_df.loc[match_df['month'] < 10, 'curr_season'] = \
    match_df.loc[match_df['month'] < 10, 'yearm1'] + '-' + match_df.loc[match_df['month'] < 10, 'year'].apply(lambda x: x[2:])
    
match_df = match_df.drop(columns=['yearp1', 'year', 'yearm1', 'month'])

###Previous season

In [0]:
match_df['prev_season'] = match_df['curr_season'].apply(lambda x: str(int(x[:4])-1) + "-" + str(int(x[-2:])-1))

##Expand home and away players list into columns

In [0]:
home_colnames = ['home_player_' + str(i) for i in range(1,16)]
away_colnames = ['away_player_' + str(i) for i in range(1,16)]

home_players_df = pd.DataFrame(list(match_df['home_players']), columns=home_colnames)
away_players_df = pd.DataFrame(list(match_df['away_players']), columns=away_colnames)

match_df = match_df.join(home_players_df)
match_df = match_df.join(away_players_df)

##Reset index for all players data

In [0]:
all_players_data = all_players_data.reset_index()

In [0]:
match_df_copy = match_df.copy()
# match_df = match_df_copy

##Join player ratings, position and starting percentage for matches

In [0]:
player_types = ['home_player_', 'away_player_']
for player_type in player_types:
  for i in range(1,16):
    #Join ratings, position and starting percentage from all players data
    match_df = match_df.merge(all_players_data[['player_efficiency_rating', 'position', 'season', 'player_id', 'perc_games_started', 'height']], 
                              left_on = [player_type+str(i), 'prev_season'],
                              right_on=['player_id', 'season'], 
                              how='left')
    ##Fix position NaNs
    match_df_na_pos = match_df.loc[match_df['position'].isna(), :].drop(columns = ['position', 'season', 'player_id'])
    match_df_not_na_pos = match_df.loc[~match_df['position'].isna(), :]
    match_df_na_pos = match_df_na_pos.merge(all_players_data[['position', 'season', 'player_id']], 
                              left_on = [player_type+str(i), 'curr_season'],
                              right_on=['player_id', 'season'], 
                              how='left')
    match_df = pd.concat([match_df_na_pos, match_df_not_na_pos], ignore_index=True)

    ##Fix perc_games_started NaNs
    match_df_na_pct = match_df.loc[match_df['perc_games_started'].isna(), :].drop(columns = ['perc_games_started', 'season', 'player_id'])
    match_df_not_na_pct = match_df.loc[~match_df['perc_games_started'].isna(), :]
    match_df_na_pct = match_df_na_pct.merge(all_players_data[['perc_games_started', 'season', 'player_id']], 
                              left_on = [player_type+str(i), 'curr_season'],
                              right_on=['player_id', 'season'], 
                              how='left')
    match_df = pd.concat([match_df_na_pct, match_df_not_na_pct], ignore_index=True)

    ##Fix height NaNs
    match_df_na_ht = match_df.loc[match_df['height'].isna(), :].drop(columns = ['height', 'season', 'player_id'])
    match_df_not_na_ht = match_df.loc[~match_df['height'].isna(), :]
    match_df_na_ht = match_df_na_ht.merge(all_players_data[['height', 'season', 'player_id']], 
                              left_on = [player_type+str(i), 'curr_season'],
                              right_on=['player_id', 'season'], 
                              how='left')
    match_df = pd.concat([match_df_na_ht, match_df_not_na_ht], ignore_index=True)

    ##Rename columns
    match_df = match_df.rename(columns = {'player_efficiency_rating': player_type+str(i)+'_rating', 
                                          'position': player_type+str(i)+'_position', 
                                          'perc_games_started': player_type+str(i)+'_perc_games_started',
                                          'height': player_type+str(i)+'_height'})
    match_df = match_df.drop(columns=['player_id', 'season'])

##Fix rows with multiple occurance of same position in starting 5

###Define function for fixing

In [0]:
def sort_position(data):
  srt_data = [sorted(item, key=lambda x:x[1]) for item in data]
  pos = ['PG','SG','SF','PF','C']
  for k in range(0, len(srt_data)):
    for idx, position in zip([0,1,2,3,4], pos):
      srt_data[k][idx][1] = position
  return srt_data 

###Fix Home positions

####Split correct and incorrect position rows

In [0]:
positions_to_fix = ['home_player_1_position', 'home_player_2_position', 'home_player_3_position', 'home_player_4_position', 'home_player_5_position']
len_set = [len(set(x)) for x in match_df[positions_to_fix].values]
correct_positions_idx = np.where(np.array(len_set) == 5)
incorrect_positions_idx = np.where(np.array(len_set) != 5)
match_df_home_correct = match_df.iloc[correct_positions_idx]
match_df_home_incorrect = match_df.iloc[incorrect_positions_idx].reset_index(drop=True)

####Create list for fixing

In [0]:
home_height_vals = match_df_home_incorrect[['home_player_1_height', 
                                            'home_player_2_height', 
                                            'home_player_3_height', 
                                            'home_player_4_height', 
                                            'home_player_5_height']].values

position_labels = ['home_player_1_position', 
                  'home_player_2_position', 
                  'home_player_3_position', 
                  'home_player_4_position', 
                  'home_player_5_position']

pos_ht = []
for height in home_height_vals:
  pos_ht.append([[pos_lab, ht] for pos_lab, ht in zip(position_labels, height)])

####Sort and assign values to dataframe

In [0]:
player_pos_list = sort_position(pos_ht)
for idx, positions in zip(range(0,len(match_df_home_incorrect)), player_pos_list):
  for pos_list in positions:
    match_df_home_incorrect.loc[idx, pos_list[0]] = pos_list[1]

####Append correct and fixed positions dataframe

In [0]:
match_df = pd.concat([match_df_home_incorrect, match_df_home_correct], ignore_index=True)

###Fix Away positions

####Split correct and incorrect position rows

In [0]:
positions_to_fix = ['away_player_1_position', 'away_player_2_position', 'away_player_3_position', 'away_player_4_position', 'away_player_5_position']
len_set = [len(set(x)) for x in match_df[positions_to_fix].values]
correct_positions_idx = np.where(np.array(len_set) == 5)
incorrect_positions_idx = np.where(np.array(len_set) != 5)
match_df_away_correct = match_df.iloc[correct_positions_idx]
match_df_away_incorrect = match_df.iloc[incorrect_positions_idx].reset_index(drop=True)

####Create list for fixing

In [0]:
away_height_vals = match_df_away_incorrect[['away_player_1_height', 
                                            'away_player_2_height', 
                                            'away_player_3_height', 
                                            'away_player_4_height', 
                                            'away_player_5_height']].values

position_labels = ['away_player_1_position', 
                  'away_player_2_position', 
                  'away_player_3_position', 
                  'away_player_4_position', 
                  'away_player_5_position']

pos_ht = []
for height in away_height_vals:
  pos_ht.append([[pos_lab, ht] for pos_lab, ht in zip(position_labels, height)])

####Sort and assign values to dataframe

In [0]:
player_pos_list = sort_position(pos_ht)
for idx, positions in zip(range(0,len(match_df_away_incorrect)), player_pos_list):
  for pos_list in positions:
    match_df_away_incorrect.loc[idx, pos_list[0]] = pos_list[1]

####Append correct and fixed positions dataframe

In [0]:
match_df = pd.concat([match_df_away_incorrect, match_df_away_correct], ignore_index=True)

In [0]:
match_df_copy = match_df.copy(deep=True)
# match_df = match_df_copy

##Impute ratings for new players with 15

In [0]:
columns = ['home_player_' + str(i) for i in range(1,16)] + ['away_player_' + str(i) for i in range(1,16)]
for col in columns:
  match_df.loc[(~match_df[col].isna()) & (match_df[col + "_rating"].isna()), col + "_rating"] = 15

##Mean ratings for bench

In [0]:
match_df['bench_away_rating'] = match_df[["away_player_" + str(i) + "_rating" for i in range(6,16)]].apply(lambda x: np.mean(x), axis=1)
match_df['bench_home_rating'] = match_df[["home_player_" + str(i) + "_rating" for i in range(6,16)]].apply(lambda x: np.mean(x), axis=1)

##Ratings for each position in starting 5

In [0]:
# match_df_copy = match_df.copy()
match_df = match_df_copy

In [0]:
pos = ['PG','SG','SF','PF','C']
team_types = ['home', 'away']
for team_type in team_types:
  for position in pos:
    for i in range(len(match_df)):
      for j in range(1,6):
        if(match_df.loc[i, team_type + '_player_' + str(j) + '_position'] == position):
          match_df.loc[i, position + '_' + team_type + '_rating'] = match_df.loc[i, team_type +'_player_' + str(j) + '_rating']

##Match winner target variable

In [0]:
match_df['home_win_flag'] = np.where(match_df['home_abbr'] == match_df['winning_abbr'],1,0)

##Drop NAs

In [0]:
print(match_df.shape)
columns = ['home_player_' + str(i) + '_position' for i in range(1,6)] + ['away_player_' + str(i) + '_position' for i in range(1,6)]
match_df = match_df.dropna(subset = columns)
print(match_df.shape)

(11844, 179)
(11678, 179)


##Save to picle

In [0]:
with open('match_df_processed.pkl', 'wb') as f:
    pickle.dump(match_df, f)

# save to drive
link = 'https://drive.google.com/open?id=1e8GS0L0xUXQDgiRorx__FQViQjHNza7c'
_, id = link.split("=")

# get the folder id where you want to save your file
file = drive.CreateFile({'parents':[{u'id': id}]})
file.SetContentFile('match_df_processed.pkl')
file.Upload() 