In [43]:
import pandas as pd
import re
#import unidecode
import numpy as np
import matplotlib.pyplot as plt
import datetime
from google.colab import drive
from tqdm.notebook import tqdm
tqdm.pandas()
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [44]:
#Seasons in data
seasons = [2021,2020,2019,2018,2017,2016,2015]
#How many time steps to include in input
day_look_back = 30

In [45]:
#path for folder with pitch data
pitch_path = '/content/drive/MyDrive/data_490/pitch_data'
#path for folder with injury data
injury_path = '/content/drive/MyDrive/data_490/injury_data'

In [46]:
#Creates a list of the pitch data data frames
data_frames = []
for season in seasons:
    data_frames.append(pd.read_csv(f'{pitch_path}/pbp_{season}.csv', encoding = "ISO-8859-1"))

  exec(code_obj, self.user_global_ns, self.user_ns)


In [47]:
#list of pitch data data frames into one pitch data data frames and list deleted
pbp = pd.concat(data_frames)
del data_frames

In [48]:
#Changes column types to most suited type
pbp = pbp.convert_dtypes()

In [49]:
#Changes name format from Doe, John to John Doe
def name_format(player_name):
  return f'{player_name.split(",")[1].strip()} {player_name.split(",")[0]}'
pbp['player_name'] = pbp.player_name.apply(name_format)

In [50]:
#Drops rows that have null values or values that or values dont make sense in the pitch value
pbp = pbp.loc[ (~pbp.pitch_type.isin(['FO', 'PO', 'IN', 'CS', 'FA'])) & (~pbp.pitch_type.isnull())]
#Forkballs, Screwballs, and splitters are counted as changeups
pbp.pitch_type[pbp.pitch_type.isin(['FO','SC', 'FS'])] = 'CH'
#Knuckle-curves and ephesus are counted as curveballs
pbp.pitch_type[pbp.pitch_type.isin(['KC','EP'])] = 'CU'
#two seams are counted as sinkers
pbp.pitch_type[pbp.pitch_type == 'FT'] = 'SI'

In [51]:
def get_pitcher_team(row):
  if row.inning_topbot == 'Bot':
    return row.away_team
  else:
    return row.home_team

pbp['team'] = pbp.apply(get_pitcher_team, axis=1)

  import sys


In [52]:
pitch_types = pbp.pitch_type.unique()

In [53]:
attributes = ['player_name','game_date','team','release_speed', 'release_pos_x', 'release_pos_y', 
       'release_pos_z','release_extension' ,'spin_axis', 'release_spin_rate', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z']

In [84]:
pitch_types_dfs = []
for pitch in pitch_types:
  pitch_types_dfs.append(pbp[pbp.pitch_type == pitch][attributes].groupby(['player_name','game_date','team']).mean().reset_index())

In [85]:
for i, pitch_df in enumerate(pitch_types_dfs):
  pitch_df['count'] = pbp[pbp.pitch_type == pitch_types[i]][attributes].groupby(['player_name','game_date','team']).release_speed.count().values

In [86]:
pitch_types_dfs[-1].columns = pitch_types_dfs[-1].columns[:3].to_list() + [column + pitch_types[-1] for column in pitch_types_dfs[-1].columns.to_list()[3:]]

In [87]:
df = pitch_types_dfs[0]
for i in range(1,len(pitch_types_dfs)):
  df = df.merge(pitch_types_dfs[i], on=['player_name', 'game_date','team'], how='outer',suffixes=[pitch_types[i-1], pitch_types[i]])

In [90]:
#Creates a list of injury dfs where where each entry in list is a season injury df
injury_dfs = []
for i in range(len(seasons)):
    injury_dfs.append(pd.read_csv(f'{injury_path}/injury_{seasons[i]}.csv'))
    #reneames column containing player name to player_name
    injury_dfs[i] = injury_dfs[i].rename(columns={injury_dfs[i].columns[0]:'player_name','Dates':f'Dates_{seasons[i]}'})
    #changes column name of column containing injury dates to Dates_season
    injury_dfs[i] = injury_dfs[i][['player_name',f'Dates_{seasons[i]}']]
    injury_dfs[i].dropna(subset=['player_name',f'Dates_{seasons[i]}'],inplace=True)
    #formats player name
    injury_dfs[i]['player_name'] = injury_dfs[i].apply(lambda x: str(x.player_name).split(",")[0],axis=1)

In [91]:
injury_dfs[0]

Unnamed: 0,player_name,Dates_2021
0,Sean Guenther,Array \n10/3 - 10/3
1,Clayton Kershaw,Array \n10/2 - 10/3\n7/7 - 9/13
2,Josh Rogers,Array \n10/2 - 10/3
3,Joe Smith,Array \n10/2 - 10/3\n6/9 - 7/7
4,John Gant,Array \n10/2 - 10/3\n9/14 - 9/25
...,...,...
484,Oliver Drake,Array \n4/1 - 10/3
485,Jace Fry,Array \n4/1 - 6/26
486,Jose Soriano,Array \n4/1 - 10/3
487,Edwar Colina,Array \n4/1 - 10/3


In [92]:
#merges all injurys df into one injury df with each year column containing all injuries of that year
injury_df = injury_dfs[0].merge(injury_dfs[1],on=['player_name'], how='outer')
for i in range(2,len(injury_dfs),1):
    injury_df = injury_df.merge(injury_dfs[i],on=['player_name'], how='outer')
injury_df

Unnamed: 0,player_name,Dates_2021,Dates_2020,Dates_2019,Dates_2018,Dates_2017,Dates_2016,Dates_2015
0,Sean Guenther,Array \n10/3 - 10/3,,,,,,
1,Clayton Kershaw,Array \n10/2 - 10/3\n7/7 - 9/13,Array \n7/23 - 8/2,Array \n3/28 - 4/15,Array \n6/1 - 6/23\n5/6 - 5/31,Array \n7/24 - 9/1,Array \n6/27 - 9/9,
2,Josh Rogers,Array \n10/2 - 10/3,,Array \n6/26 - 9/29,,,,
3,Joe Smith,Array \n10/2 - 10/3\n6/9 - 7/7,,Array \n3/28 - 7/12,Array \n6/10 - 7/3,Array \n6/19 - 7/22,Array \n8/17 - 9/1\n6/5 - 7/1,
4,John Gant,Array \n10/2 - 10/3\n9/14 - 9/25,Array \n9/25 - 9/27,,,Array \n4/2 - 5/16,Array \n6/28 - 8/21,
...,...,...,...,...,...,...,...,...
1334,Kyuji Fujikawa,,,,,,,Array \n4/1 - 5/14
1335,Vic Black,,,,,,,Array \n3/27 - 6/7
1336,Erik Cordier,,,,,,,Array \n3/27 - 5/17
1337,Tsuyoshi Wada,,,,,,,Array \n3/27 - 5/18


In [93]:
#formats injury dates as yyyy-mm-dd
def get_injury_dates(df):
    injured_dates = []
    for season in seasons:
        for injury in str(df[f'Dates_{season}']).split("\n")[1:]:
            injury = injury.split(" ")[0]
            injury = f'{season}-{injury.replace("/","-")}'
            if injury[-2] == "-":
                injury = injury[:-1] + injury[-1:]
            injured_dates.append(injury)
    for i in range(1,len(injured_dates)+1,1):
        df["injury_" + str(i)] = injured_dates[-i]
    return df

In [94]:
#creates columns for each injury for each player
injury_df = injury_df.apply(get_injury_dates,axis=1).drop(columns=['Dates_' + str(season) for season in seasons])
injury_df

Unnamed: 0,injury_1,injury_10,injury_2,injury_3,injury_4,injury_5,injury_6,injury_7,injury_8,injury_9,player_name
0,2021-10-3,,,,,,,,,,Sean Guenther
1,2016-6-27,,2017-7-24,2018-5-6,2018-6-1,2019-3-28,2020-7-23,2021-7-7,2021-10-2,,Clayton Kershaw
2,2019-6-26,,2021-10-2,,,,,,,,Josh Rogers
3,2016-6-5,,2016-8-17,2017-6-19,2018-6-10,2019-3-28,2021-6-9,2021-10-2,,,Joe Smith
4,2016-6-28,,2017-4-2,2020-9-25,2021-9-14,2021-10-2,,,,,John Gant
...,...,...,...,...,...,...,...,...,...,...,...
1334,2015-4-1,,,,,,,,,,Kyuji Fujikawa
1335,2015-3-27,,,,,,,,,,Vic Black
1336,2015-3-27,,,,,,,,,,Erik Cordier
1337,2015-3-27,,,,,,,,,,Tsuyoshi Wada


In [95]:
#merges df with injuries
df = df.merge(injury_df, how='left', on='player_name')

In [96]:
df.game_date = pd.to_datetime(df.game_date)

In [97]:
df

Unnamed: 0,player_name,game_date,team,release_speedFF,release_pos_xFF,release_pos_yFF,release_pos_zFF,release_extensionFF,spin_axisFF,release_spin_rateFF,...,injury_1,injury_10,injury_2,injury_3,injury_4,injury_5,injury_6,injury_7,injury_8,injury_9
0,A.J. Achter,2015-08-07,MIN,90.825,-1.6,-1.6,6.0275,6.65,,2380.625,...,,,,,,,,,,
1,A.J. Achter,2015-08-08,MIN,89.844444,-1.855556,-1.855556,6.021111,6.588889,,2422.444444,...,,,,,,,,,,
2,A.J. Achter,2015-08-12,MIN,90.811111,-1.725556,-1.725556,6.075556,6.566667,,2277.888889,...,,,,,,,,,,
3,A.J. Achter,2015-08-14,MIN,91.045455,-1.774545,-1.774545,6.02,6.654545,,2339.454545,...,,,,,,,,,,
4,A.J. Achter,2015-08-18,MIN,92.075,-1.67,-1.67,6.1275,6.775,,2456.75,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132269,Zach Duke,2016-05-09,CWS,,,,,,,,...,2017-4-2,,2019-5-20,,,,,,,
132270,Zack Godley,2016-09-17,ARI,,,,,,,,...,2020-9-7,,2021-4-28,,,,,,,
132271,Erick Aybar,2017-04-18,SD,,,,,,,,...,,,,,,,,,,
132272,Steven Wright,2018-09-16,BOS,,,,,,,,...,2015-8-15,,2015-8-28,2016-8-8,2017-5-2,2018-6-26,2019-7-14,,,


In [98]:
#Changes all injuries columns to datetime
for injury_column in injury_df.columns[:-1]:
    df[injury_column] = pd.to_datetime(df[injury_column])

In [99]:
#creates columns for time between injury and last appereance for each injury for each player
for injury_column in injury_df.columns[:-1]:
    df[f'between_{injury_column}_and_last_appearance'] = df[injury_column]- df.game_date

  This is separate from the ipykernel package so we can avoid doing imports until


In [100]:
#Creates column injured. 1 if it is the players last appereance before being add to the injured list. 0 otherwise. 
injured_indexes = []
df['injured'] = 0
for i in range(1,len(injury_df.columns[:-1])+1):
    injured_indexes.append(df[(~df['between_injury_1_and_last_appearance'.replace('1',str(i))].isna()) & (df['injury_' + str(i)] >= df.game_date)].groupby(['player_name','injury_1'])['between_injury_1_and_last_appearance'.replace('1',str(i))].idxmin().values)
for injured_index in injured_indexes:
    for value in injured_index:
        df.loc[value, 'injured'] = 1

  This is separate from the ipykernel package so we can avoid doing imports until


In [101]:
df.to_csv('/content/drive/MyDrive/data_490/processed_data/pandas_30_day')

In [None]:
features = []
for attribute in attributes[2:]:
  for pitch in pitch_types:
    features.append(attribute + pitch)
for pitch in pitch_types:
  features.append('count'+pitch)
features.append('injured')

In [None]:
y = df.injured

In [None]:
#X Holds model inputs in shape (samples, features, timesteps) 
X = np.full((len(df), len(features), 30),-100, dtype='float')

In [None]:
#Same amount of rows as original data frame and holds every date from game_date to game_date - lookback length
game_dates = pd.DataFrame()
for i in range(1,day_look_back+1):
  #creates column game_date-i for i in range(1,31) to hold date for game_date - i
  game_dates[f'game_date-{i}'] = df.game_date - datetime.timedelta(days=i)
game_dates['player_name'] = df.player_name

In [None]:
game_dates

In [None]:
for day in range(day_look_back):
  #Gets indexes of game_dates where there is a player_name and game_date - i match in df
  game_indexes = game_dates.index[game_dates.set_index(['player_name',f'game_date-{day+1}']).index.isin(df.set_index(['player_name','game_date']).index)]
  #Gets indexes of df where there is a player_name and game_date - i match in game_dates
  feature_indexes = df.index[df.set_index(['player_name','game_date']).index.isin(game_dates.set_index(['player_name',f'game_date-{day+1}']).index)]
  #Gets features
  feature_values  = df.iloc[feature_indexes][features]
  feature_values = feature_values.fillna(-1)
  #adds features to X
  X[game_indexes,:,day] = feature_values

In [None]:
X[X == -100] = -1

In [None]:
del pbp
del pitch_types
del injury_df
del injury_dfs

In [None]:
df = df[['player_name','game_date','injured']]
df['time_series'] = np.zeros(len(df)).astype(object)

In [None]:
pos_encode = np.expand_dims([i for i in range(1,31)], -1)

In [None]:
for index, row in df.iterrows():
  df['time_series'].iloc[index] = np.hstack((np.transpose(X[index,:,:]), pos_encode))

In [None]:
#Saved as pickle becuase csv saves the nested data frames as strings
df.to_pickle('/content/drive/MyDrive/data_490/processed_data/thirty_day_timestep_df')