<a href="https://colab.research.google.com/github/ryancloude/Statcast-TIme-Series-Pitcher-Injury/blob/main/thirty_day_timestep_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re
#import unidecode
import numpy as np
import matplotlib.pyplot as plt
import datetime
from google.colab import drive
from tqdm.notebook import tqdm
tqdm.pandas()
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
#Seasons in data
seasons = [2021,2020,2019,2018,2017,2016,2015]
#How many time steps to include in input
day_look_back = 30

In [3]:
#path for folder with pitch data
pitch_path = '/content/drive/MyDrive/data_490/pitch_data'
#path for folder with injury data
injury_path = '/content/drive/MyDrive/data_490/injury_data'

In [4]:
#Creates a list of the pitch data data frames
data_frames = []
for season in seasons:
    data_frames.append(pd.read_csv(f'{pitch_path}/pbp_{season}.csv', encoding = "ISO-8859-1"))

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
#list of pitch data data frames into one pitch data data frames and list deleted
pbp = pd.concat(data_frames)
del data_frames

In [6]:
#Changes column types to most suited type
pbp = pbp.convert_dtypes()

In [7]:
#Changes name format from Doe, John to John Doe
def name_format(player_name):
  return f'{player_name.split(",")[1].strip()} {player_name.split(",")[0]}'
pbp['player_name'] = pbp.player_name.apply(name_format)

In [8]:
#Drops rows that have null values or values that or values dont make sense in the pitch value
pbp = pbp.loc[ (~pbp.pitch_type.isin(['FO', 'PO', 'IN', 'CS', 'FA'])) & (~pbp.pitch_type.isnull())]
#Forkballs, Screwballs, and splitters are counted as changeups
pbp.pitch_type[pbp.pitch_type.isin(['FO','SC', 'FS'])] = 'CH'
#Knuckle-curves and ephesus are counted as curveballs
pbp.pitch_type[pbp.pitch_type.isin(['KC','EP'])] = 'CU'
#two seams are counted as sinkers
pbp.pitch_type[pbp.pitch_type == 'FT'] = 'SI'

In [9]:
pitch_types = pbp.pitch_type.unique()

In [10]:
attributes = ['player_name', 'game_date','release_speed', 'release_pos_x', 'release_pos_y', 
       'release_pos_z','release_extension' ,'spin_axis', 'release_spin_rate', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z']

In [11]:
pitch_types_dfs = []
for pitch in pitch_types:
  pitch_types_dfs.append(pbp[pbp.pitch_type == pitch][attributes].groupby(['player_name','game_date']).mean().reset_index())

In [12]:
for i, pitch_df in enumerate(pitch_types_dfs):
  pitch_df['count'] = pbp[pbp.pitch_type == pitch_types[i]][attributes].groupby(['player_name','game_date']).release_speed.count().values

In [13]:
pitch_types_dfs[-1].columns = pitch_types_dfs[-1].columns[:2].to_list() + [column + pitch_types[-1] for column in pitch_types_dfs[-1].columns.to_list()[2:]]

In [14]:
df = pitch_types_dfs[0]
for i in range(1,len(pitch_types_dfs)):
  df = df.merge(pitch_types_dfs[i], on=['player_name', 'game_date'], how='outer',suffixes=[pitch_types[i-1], pitch_types[i]])

In [15]:
#Creates a list of injury dfs where where each entry in list is a season injury df
injury_dfs = []
for i in range(len(seasons)):
    injury_dfs.append(pd.read_csv(f'{injury_path}/injury_{seasons[i]}.csv'))
    #reneames column containing player name to player_name
    injury_dfs[i] = injury_dfs[i].rename(columns={injury_dfs[i].columns[0]:'player_name','Dates':f'Dates_{seasons[i]}'})
    #changes column name of column containing injury dates to Dates_season
    injury_dfs[i] = injury_dfs[i][['player_name',f'Dates_{seasons[i]}']]
    injury_dfs[i].dropna(subset=['player_name',f'Dates_{seasons[i]}'],inplace=True)
    #formats player name
    injury_dfs[i]['player_name'] = injury_dfs[i].apply(lambda x: str(x.player_name).split(",")[0],axis=1)

In [16]:
injury_dfs[0]

Unnamed: 0,player_name,Dates_2021
0,Sean Guenther,Array \n10/3 - 10/3
1,Clayton Kershaw,Array \n10/2 - 10/3\n7/7 - 9/13
2,Josh Rogers,Array \n10/2 - 10/3
3,Joe Smith,Array \n10/2 - 10/3\n6/9 - 7/7
4,John Gant,Array \n10/2 - 10/3\n9/14 - 9/25
...,...,...
484,Oliver Drake,Array \n4/1 - 10/3
485,Jace Fry,Array \n4/1 - 6/26
486,Jose Soriano,Array \n4/1 - 10/3
487,Edwar Colina,Array \n4/1 - 10/3


In [17]:
#merges all injurys df into one injury df with each year column containing all injuries of that year
injury_df = injury_dfs[0].merge(injury_dfs[1],on=['player_name'], how='outer')
for i in range(2,len(injury_dfs),1):
    injury_df = injury_df.merge(injury_dfs[i],on=['player_name'], how='outer')
injury_df

Unnamed: 0,player_name,Dates_2021,Dates_2020,Dates_2019,Dates_2018,Dates_2017,Dates_2016,Dates_2015
0,Sean Guenther,Array \n10/3 - 10/3,,,,,,
1,Clayton Kershaw,Array \n10/2 - 10/3\n7/7 - 9/13,Array \n7/23 - 8/2,Array \n3/28 - 4/15,Array \n6/1 - 6/23\n5/6 - 5/31,Array \n7/24 - 9/1,Array \n6/27 - 9/9,
2,Josh Rogers,Array \n10/2 - 10/3,,Array \n6/26 - 9/29,,,,
3,Joe Smith,Array \n10/2 - 10/3\n6/9 - 7/7,,Array \n3/28 - 7/12,Array \n6/10 - 7/3,Array \n6/19 - 7/22,Array \n8/17 - 9/1\n6/5 - 7/1,
4,John Gant,Array \n10/2 - 10/3\n9/14 - 9/25,Array \n9/25 - 9/27,,,Array \n4/2 - 5/16,Array \n6/28 - 8/21,
...,...,...,...,...,...,...,...,...
1334,Kyuji Fujikawa,,,,,,,Array \n4/1 - 5/14
1335,Vic Black,,,,,,,Array \n3/27 - 6/7
1336,Erik Cordier,,,,,,,Array \n3/27 - 5/17
1337,Tsuyoshi Wada,,,,,,,Array \n3/27 - 5/18


In [18]:
#formats injury dates as yyyy-mm-dd
def get_injury_dates(df):
    injured_dates = []
    for season in seasons:
        for injury in str(df[f'Dates_{season}']).split("\n")[1:]:
            injury = injury.split(" ")[0]
            injury = f'{season}-{injury.replace("/","-")}'
            if injury[-2] == "-":
                injury = injury[:-1] + injury[-1:]
            injured_dates.append(injury)
    for i in range(1,len(injured_dates)+1,1):
        df["injury_" + str(i)] = injured_dates[-i]
    return df

In [19]:
#creates columns for each injury for each player
injury_df = injury_df.apply(get_injury_dates,axis=1).drop(columns=['Dates_' + str(season) for season in seasons])
injury_df

Unnamed: 0,injury_1,injury_10,injury_2,injury_3,injury_4,injury_5,injury_6,injury_7,injury_8,injury_9,player_name
0,2021-10-3,,,,,,,,,,Sean Guenther
1,2016-6-27,,2017-7-24,2018-5-6,2018-6-1,2019-3-28,2020-7-23,2021-7-7,2021-10-2,,Clayton Kershaw
2,2019-6-26,,2021-10-2,,,,,,,,Josh Rogers
3,2016-6-5,,2016-8-17,2017-6-19,2018-6-10,2019-3-28,2021-6-9,2021-10-2,,,Joe Smith
4,2016-6-28,,2017-4-2,2020-9-25,2021-9-14,2021-10-2,,,,,John Gant
...,...,...,...,...,...,...,...,...,...,...,...
1334,2015-4-1,,,,,,,,,,Kyuji Fujikawa
1335,2015-3-27,,,,,,,,,,Vic Black
1336,2015-3-27,,,,,,,,,,Erik Cordier
1337,2015-3-27,,,,,,,,,,Tsuyoshi Wada


In [20]:
#merges df with injuries
df = df.merge(injury_df, how='left', on='player_name')

In [21]:
df.game_date = pd.to_datetime(df.game_date)

In [22]:
df

Unnamed: 0,player_name,game_date,release_speedFF,release_pos_xFF,release_pos_yFF,release_pos_zFF,release_extensionFF,spin_axisFF,release_spin_rateFF,pfx_xFF,...,injury_1,injury_10,injury_2,injury_3,injury_4,injury_5,injury_6,injury_7,injury_8,injury_9
0,A.J. Achter,2015-08-07,90.825,-1.6,-1.6,6.0275,6.65,,2380.625,-0.24875,...,,,,,,,,,,
1,A.J. Achter,2015-08-08,89.844444,-1.855556,-1.855556,6.021111,6.588889,,2422.444444,-0.404444,...,,,,,,,,,,
2,A.J. Achter,2015-08-12,90.811111,-1.725556,-1.725556,6.075556,6.566667,,2277.888889,-0.192222,...,,,,,,,,,,
3,A.J. Achter,2015-08-14,91.045455,-1.774545,-1.774545,6.02,6.654545,,2339.454545,-0.222727,...,,,,,,,,,,
4,A.J. Achter,2015-08-18,92.075,-1.67,-1.67,6.1275,6.775,,2456.75,-0.505,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132252,Zach Duke,2016-05-09,,,,,,,,,...,2017-4-2,,2019-5-20,,,,,,,
132253,Zack Godley,2016-09-17,,,,,,,,,...,2020-9-7,,2021-4-28,,,,,,,
132254,Erick Aybar,2017-04-18,,,,,,,,,...,,,,,,,,,,
132255,Steven Wright,2018-09-16,,,,,,,,,...,2015-8-15,,2015-8-28,2016-8-8,2017-5-2,2018-6-26,2019-7-14,,,


In [23]:
#Changes all injuries columns to datetime
for injury_column in injury_df.columns[:-1]:
    df[injury_column] = pd.to_datetime(df[injury_column])

In [24]:
#creates columns for time between injury and last appereance for each injury for each player
for injury_column in injury_df.columns[:-1]:
    df[f'between_{injury_column}_and_last_appearance'] = df[injury_column]- df.game_date

  This is separate from the ipykernel package so we can avoid doing imports until


In [25]:
#Creates column injured. 1 if it is the players last appereance before being add to the injured list. 0 otherwise. 
injured_indexes = []
df['injured'] = 0
for i in range(1,len(injury_df.columns[:-1])+1):
    injured_indexes.append(df[(~df['between_injury_1_and_last_appearance'.replace('1',str(i))].isna()) & (df['injury_' + str(i)] >= df.game_date)].groupby(['player_name','injury_1'])['between_injury_1_and_last_appearance'.replace('1',str(i))].idxmin().values)
for injured_index in injured_indexes:
    for value in injured_index:
        df.loc[value, 'injured'] = 1

  This is separate from the ipykernel package so we can avoid doing imports until


In [26]:
features = []
for attribute in attributes[2:]:
  for pitch in pitch_types:
    features.append(attribute + pitch)
for pitch in pitch_types:
  features.append('count'+pitch)
features.append('injured')

In [27]:
y = df.injured

In [28]:
#.to_csv('/content/drive/MyDrive/data_490/processed_data/thirty_day_features',index=False)

In [29]:
#X Holds model inputs in shape (samples, features, timesteps) 
X = np.full((len(df), len(features), 30),-100, dtype='float')

In [30]:
#Same amount of rows as original data frame and holds every date from game_date to game_date - lookback length
game_dates = pd.DataFrame()
for i in range(1,day_look_back+1):
  #creates column game_date-i for i in range(1,31) to hold date for game_date - i
  game_dates[f'game_date-{i}'] = df.game_date - datetime.timedelta(days=i)
game_dates['player_name'] = df.player_name

In [31]:
game_dates

Unnamed: 0,game_date-1,game_date-2,game_date-3,game_date-4,game_date-5,game_date-6,game_date-7,game_date-8,game_date-9,game_date-10,...,game_date-22,game_date-23,game_date-24,game_date-25,game_date-26,game_date-27,game_date-28,game_date-29,game_date-30,player_name
0,2015-08-06,2015-08-05,2015-08-04,2015-08-03,2015-08-02,2015-08-01,2015-07-31,2015-07-30,2015-07-29,2015-07-28,...,2015-07-16,2015-07-15,2015-07-14,2015-07-13,2015-07-12,2015-07-11,2015-07-10,2015-07-09,2015-07-08,A.J. Achter
1,2015-08-07,2015-08-06,2015-08-05,2015-08-04,2015-08-03,2015-08-02,2015-08-01,2015-07-31,2015-07-30,2015-07-29,...,2015-07-17,2015-07-16,2015-07-15,2015-07-14,2015-07-13,2015-07-12,2015-07-11,2015-07-10,2015-07-09,A.J. Achter
2,2015-08-11,2015-08-10,2015-08-09,2015-08-08,2015-08-07,2015-08-06,2015-08-05,2015-08-04,2015-08-03,2015-08-02,...,2015-07-21,2015-07-20,2015-07-19,2015-07-18,2015-07-17,2015-07-16,2015-07-15,2015-07-14,2015-07-13,A.J. Achter
3,2015-08-13,2015-08-12,2015-08-11,2015-08-10,2015-08-09,2015-08-08,2015-08-07,2015-08-06,2015-08-05,2015-08-04,...,2015-07-23,2015-07-22,2015-07-21,2015-07-20,2015-07-19,2015-07-18,2015-07-17,2015-07-16,2015-07-15,A.J. Achter
4,2015-08-17,2015-08-16,2015-08-15,2015-08-14,2015-08-13,2015-08-12,2015-08-11,2015-08-10,2015-08-09,2015-08-08,...,2015-07-27,2015-07-26,2015-07-25,2015-07-24,2015-07-23,2015-07-22,2015-07-21,2015-07-20,2015-07-19,A.J. Achter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132252,2016-05-08,2016-05-07,2016-05-06,2016-05-05,2016-05-04,2016-05-03,2016-05-02,2016-05-01,2016-04-30,2016-04-29,...,2016-04-17,2016-04-16,2016-04-15,2016-04-14,2016-04-13,2016-04-12,2016-04-11,2016-04-10,2016-04-09,Zach Duke
132253,2016-09-16,2016-09-15,2016-09-14,2016-09-13,2016-09-12,2016-09-11,2016-09-10,2016-09-09,2016-09-08,2016-09-07,...,2016-08-26,2016-08-25,2016-08-24,2016-08-23,2016-08-22,2016-08-21,2016-08-20,2016-08-19,2016-08-18,Zack Godley
132254,2017-04-17,2017-04-16,2017-04-15,2017-04-14,2017-04-13,2017-04-12,2017-04-11,2017-04-10,2017-04-09,2017-04-08,...,2017-03-27,2017-03-26,2017-03-25,2017-03-24,2017-03-23,2017-03-22,2017-03-21,2017-03-20,2017-03-19,Erick Aybar
132255,2018-09-15,2018-09-14,2018-09-13,2018-09-12,2018-09-11,2018-09-10,2018-09-09,2018-09-08,2018-09-07,2018-09-06,...,2018-08-25,2018-08-24,2018-08-23,2018-08-22,2018-08-21,2018-08-20,2018-08-19,2018-08-18,2018-08-17,Steven Wright


In [32]:
for day in range(day_look_back):
  #Gets indexes of game_dates where there is a player_name and game_date - i match in df
  game_indexes = game_dates.index[game_dates.set_index(['player_name',f'game_date-{day+1}']).index.isin(df.set_index(['player_name','game_date']).index)]
  #Gets indexes of df where there is a player_name and game_date - i match in game_dates
  feature_indexes = df.index[df.set_index(['player_name','game_date']).index.isin(game_dates.set_index(['player_name',f'game_date-{day+1}']).index)]
  #Gets features
  feature_values  = df.iloc[feature_indexes][features]
  #Numpy wont accept NAType, -100 will be changed back to Nan
  feature_values = feature_values.fillna(-100)
  #adds features to X
  X[game_indexes,:,day] = feature_values

In [33]:
del pbp
del pitch_types
del injury_df
del injury_dfs

In [34]:
X[X == -100] = np.nan

In [35]:
df = df[['player_name','game_date','injured']]
df['time_series'] = np.zeros(len(df)).astype(object)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [36]:
for index, row in df.iterrows():
  df['time_series'].iloc[index] = pd.DataFrame(X[index,:,:])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [37]:
df.to_csv('/content/drive/MyDrive/data_490/processed_data/thirty_day_timestep_df.csv')