This notebook extracts per-player features from the event data for each game date, separately for 'PITCH' and 'HIT' events.

My other notebook contributions:

Please upvote if you like them!

* https://www.kaggle.com/debojit23/mlb-sabermetrics-batting-and-pitching-stats
* https://www.kaggle.com/debojit23/baseball-field-structure-matplotlib

In [None]:
import numpy as np

# Basic declarations.

# Aliases that let the JSON literals null/true/false evaluate as Python values.
null = np.nan
true = True
false = False

# Ball and air properties; each dict key names the unit/variant of the stored value.
ball_weight = {'lb': 0.32, 'oz': 5.125}
ball_rad = {'ft': 0.12, 'inch': 1.45}
air_density = {'mass': 0.0023, 'weight': 0.075}

# Full-precision pi: the former 3.14 literal skewed every value derived from it by ~0.05%.
pi = np.pi
drag_coeff = 0.3  # coefficient used by the drag_force feature
gravity = 32.1740  # ft/s^2

#https://www.grc.nasa.gov/www/k-12/airplane/balldrag.html

def reduce_mem_usage(df, allow_float16=True):
    """Downcast the columns of ``df`` to smaller dtypes to reduce memory usage.

    The frame is modified in place and also returned. Memory usage before and
    after is printed.

    Per column:
      * object columns are converted to ``category``;
      * signed / unsigned integer columns are cast to the narrowest integer type
        of the same signedness whose range contains the column's min and max;
      * float columns are cast to float16 / float32 when min and max lie strictly
        inside that type's range, otherwise float64;
      * everything else (bool, datetime, timedelta, complex and pandas extension
        dtypes such as category or nullable integers) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink (mutated in place).
    allow_float16 : bool, default True
        When False, float columns are never cast below float32. float16 keeps
        only about 3 significant decimal digits, so disable it when precision
        matters.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # pandas extension dtypes cannot be range-checked / cast like numpy ones.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind == 'O':
            df[col] = df[col].astype('category')
            continue

        # Only true integers and floats are downcast; bool/datetime/etc. used to
        # fall into the float branch and were corrupted or raised.
        if col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No break (e.g. empty column, min/max are NaN): dtype is kept as is.
        else:
            for candidate in float_types:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


In [None]:
import warnings 
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import mean_absolute_error
from datetime import timedelta
from functools import reduce
from tqdm import tqdm
import lightgbm as lgbm
import mlb
import gc
import tensorflow as tf
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
pd.set_option('display.max_columns', 500)

In [None]:
# NOTE(review): read_pickle executes arbitrary code embedded in the file; only load trusted datasets.
players = pd.read_pickle('../input/mlb-pdef-train-dataset/playerTwitterFollowers_train.pkl')

# Column-oriented dict of the two columns. The full 'list' spelling is used because
# the abbreviated orient='l' is rejected by pandas >= 2.0.
zz = players[['playerId', 'playerName']].to_dict(orient='list')

# playerId -> playerName lookup.
player_map = dict(zip(zz['playerId'], zz['playerName']))
players.head(2)


In [None]:
# Competition training table; its `events` column is parsed further down.
train = pd.read_csv(
    '../input/mlb-player-digital-engagement-forecasting/train.csv'
)
train.head()

In [None]:
# Pitching events: column lists driving the per-pitcher aggregations.

# Indicator columns that are summed per pitcher.
sum_cols_pitch = [
    'is_bad_pitch',
    'is_pitchplayed_beforehomeplate',
]

# Columns summarised per pitcher with mean / std / min / max.
mean_std_cols_pitch = [
    'pfxX',
    'pfxZ',
    'net_pitch_acceleration',
    'net_pitchlaunch_velocity',
    'approxhorz_pitch_distance',
    'is_pitchplayed_beforehomeplate',
    'drag_force',
    'magnus_force',
    'startSpeed',
    'endSpeed',
    'nastyFactor',
    'breakAngle',
    'spinRate',
    'spinDirection',
    'breakY',
]


def get_eventpitching(temp_event) -> pd.DataFrame:
    """Build the pitching feature rows for one events table.

    Parameters
    ----------
    temp_event : pandas.DataFrame
        Events of a single date. The code reads ``pitcherId``, ``gameDate``, ``aX``
        and every column consumed by ``get_pitchkinematics``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``pitcherId`` (exposed as ``playerId``) with ``date``
        set to the first ``gameDate`` value, followed by the columns returned by
        ``get_meanstd_stats`` and ``get_sum_stats``. Pitchers without any tracked
        pitch keep NaN in the stat columns (left merge).

    Raises
    ------
    IndexError
        If ``temp_event`` has no rows (there is no first ``gameDate``).
    """
    pitchers = pd.DataFrame({'playerId': temp_event['pitcherId'].unique()})
    pitchers['date'] = temp_event['gameDate'].iloc[0]

    # Only rows carrying an acceleration reading are treated as pitches; remaining
    # gaps are zero-filled before the kinematic features are derived.
    pitchevents = temp_event[temp_event['aX'].notnull()].fillna(0)
    pitchevents = get_pitchkinematics(pitchevents)

    per_pitcher_stats = (
        get_meanstd_stats(pitchevents, mean_std_cols=mean_std_cols_pitch),
        get_sum_stats(pitchevents),
    )
    for stats in per_pitcher_stats:
        pitchers = (
            pitchers
            .merge(stats, left_on='playerId', right_on='pitcherId', how='left')
            .drop(columns=['pitcherId'])
        )

    # Returned directly: the former append onto an empty frame was a no-op and
    # DataFrame.append no longer exists in pandas >= 2.0.
    return pitchers

def get_singleevent(event) -> pd.DataFrame:
    """Parse one JSON-formatted ``events`` string into a DataFrame.

    The text is evaluated as a Python expression with the JSON literals
    ``null`` / ``true`` / ``false`` mapped to ``np.nan`` / ``True`` / ``False``.
    The mapping is passed explicitly, so the function does not depend on
    module-level aliases being defined.

    Parameters
    ----------
    event : str
        Text of one ``events`` cell.

    Returns
    -------
    pandas.DataFrame
        ``pd.DataFrame`` built from the evaluated object.
    """
    json_literals = {'null': np.nan, 'true': True, 'false': False}
    # SECURITY NOTE: eval() is kept so parsing behaves exactly as before, but it is
    # still eval. Builtins are stripped to narrow what the text can reach; do not
    # feed untrusted input here.
    records = eval(event, {'__builtins__': {}}, json_literals)
    return pd.DataFrame(records)

def get_meanstd_stats(pitchevents, mean_std_cols, group_col='pitcherId') -> pd.DataFrame:
    """Per-group mean / std / min / max of the requested columns.

    NOTE: a later cell of this notebook defines another ``get_meanstd_stats``
    (keyed on ``hitterId``) which replaces this one once that cell has run.

    Parameters
    ----------
    pitchevents : pandas.DataFrame
        Event rows containing ``group_col`` and every column in ``mean_std_cols``.
    mean_std_cols : iterable of str
        Columns to summarise.
    group_col : str, default 'pitcherId'
        Column identifying the player to group by.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``group_col`` value with columns
        ``<col>_mean``, ``<col>_std``, ``<col>_min``, ``<col>_max`` (in that order)
        for each requested column. Missing statistics (e.g. the std of a single
        observation) are reported as 0.
    """
    # A list (not a set) keeps the output column order deterministic between runs.
    stat_names = ['mean', 'std', 'min', 'max']

    all_stats = pd.DataFrame({group_col: pitchevents[group_col].unique()})
    grouped = pitchevents.groupby(group_col)
    for col in mean_std_cols:
        temp = grouped[col].agg(stat_names).reset_index().fillna(0)
        temp = temp.rename(columns={name: f'{col}_{name}' for name in stat_names})
        all_stats = all_stats.merge(temp, on=group_col)
    return all_stats
def get_sum_stats(pitchevents, cols=None, group_col='pitcherId') -> pd.DataFrame:
    """Per-group sums of indicator columns.

    NOTE: a later cell of this notebook defines another ``get_sum_stats``
    (keyed on ``hitterId``) which replaces this one once that cell has run.

    Parameters
    ----------
    pitchevents : pandas.DataFrame
        Event rows containing ``group_col`` and the columns to sum.
    cols : iterable of str, optional
        Columns to sum; defaults to the module-level ``sum_cols_pitch``.
    group_col : str, default 'pitcherId'
        Column identifying the player to group by.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``group_col`` value with a ``<col>_sum`` column for
        each summed column.
    """
    if cols is None:
        cols = sum_cols_pitch

    all_stats = pd.DataFrame({group_col: pitchevents[group_col].unique()})
    grouped = pitchevents.groupby(group_col)
    for col in cols:
        temp = grouped[col].sum().rename(f'{col}_sum').reset_index().fillna(0)
        all_stats = all_stats.merge(temp, on=group_col)
    return all_stats

def get_pitchkinematics(pitchevents) -> pd.DataFrame:
    """Add derived pitch features to ``pitchevents`` (in place) and return it.

    Columns added:
      * ``net_pitch_acceleration``    - magnitude of (aX, aY, aZ)
      * ``net_pitchlaunch_velocity``  - magnitude of (vX0, vY0, vZ0)
      * ``approxhorz_pitch_distance`` - distance between (x0, y0) and (x, y)
      * ``is_bad_pitch``              - 1 where pZ < 0, else 0
      * ``is_pitchplayed_beforehomeplate`` - 1 where pX < 0, else 0
      * ``drag_force``   - 0.5 * drag_coeff * pi * air_density['mass'] * ball_rad['ft']**2 * v**2
      * ``magnus_force`` - 0.5 * pi * air_density['mass'] * ball_rad['ft']**3 * spinRate * v
    where ``v`` is ``net_pitchlaunch_velocity``.
    """
    accel_sq = (pitchevents.aX.values)**2 + (pitchevents.aY.values)**2 + (pitchevents.aZ.values)**2
    pitchevents['net_pitch_acceleration'] = np.sqrt(accel_sq)

    velocity_sq = (pitchevents.vX0.values)**2 + (pitchevents.vY0.values)**2 + (pitchevents.vZ0.values)**2
    launch_velocity = np.sqrt(velocity_sq)
    pitchevents['net_pitchlaunch_velocity'] = launch_velocity

    delta_x = pitchevents.x.values - pitchevents.x0.values
    delta_y = pitchevents.y.values - pitchevents.y0.values
    pitchevents['approxhorz_pitch_distance'] = np.sqrt(delta_x**2 + delta_y**2)

    # Sign-based indicator flags stored as plain 0/1 integers.
    pitchevents['is_bad_pitch'] = [int(val < 0) for val in pitchevents.pZ.values]
    pitchevents['is_pitchplayed_beforehomeplate'] = [int(val < 0) for val in pitchevents.pX.values]

    # Constant factors are multiplied first, in the same left-to-right order as the formulas above.
    drag_factor = 0.5 * drag_coeff * pi * air_density['mass'] * (ball_rad['ft']**2)
    pitchevents['drag_force'] = drag_factor * (launch_velocity**2)

    magnus_factor = 0.5 * pi * air_density['mass'] * (ball_rad['ft']**3)
    pitchevents['magnus_force'] = magnus_factor * (pitchevents['spinRate'].values) * launch_velocity
    return pitchevents
# Build pitching features for every date that has event data.
# Each date yields one frame; they are collected and concatenated once. Previously
# the result was overwritten on every iteration, so only the last date survived.
pitch_frames = []
for events in tqdm(train['events']):
    if str(events) == 'nan':  # no events recorded for this date
        continue
    temp_event = get_singleevent(events)
    pitch_frames.append(get_eventpitching(temp_event))

all_events_pitch = pd.concat(pitch_frames, ignore_index=True)
all_events_pitch.sample(5)

In [None]:
# Conversion factors used by the batted-ball kinematics in this cell:
#   degrees -> radians : multiply by 0.0174533
#   MPH     -> ft/s    : multiply by 1.467
#   g = 32.1740 ft/s^2

# Columns summarised per hitter with mean / std / min / max.
mean_std_cols_hit = [
    'launchSpeed',
    'launchAngle',
    'max_heightreached',
    'flighttime',
]

def get_eventhit(temp_event) -> pd.DataFrame:
    """Build the batted-ball feature rows for one events table.

    Parameters
    ----------
    temp_event : pandas.DataFrame
        Events of a single date. The code reads ``hitterId``, ``gameDate``,
        ``launchSpeed`` and the columns consumed by ``get_hitkinematics``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct non-null ``hitterId`` (exposed as ``playerId``) with
        ``date`` set to the first ``gameDate`` value and the columns returned by
        ``get_meanstd_stats``. Hitters without a row carrying ``launchSpeed`` get 0
        in every stat column.

    Raises
    ------
    IndexError
        If ``temp_event`` has no rows (there is no first ``gameDate``).
    """
    # The stats are keyed on hitterId, so the player list must come from hitterId
    # as well. It used to be built from pitcherId, which dropped the hitters and
    # left almost every row without a match.
    hitters = pd.DataFrame({'playerId': temp_event['hitterId'].dropna().unique()})
    hitters['date'] = temp_event['gameDate'].iloc[0]

    hitevents = temp_event[temp_event['launchSpeed'].notnull()].fillna(0)
    hitevents = get_hitkinematics(hitevents)

    stats = get_meanstd_stats(hitevents, mean_std_cols=mean_std_cols_hit)
    hitters = (
        hitters
        .merge(stats, left_on='playerId', right_on='hitterId', how='left')
        .drop(columns=['hitterId'])
        .fillna(0)
    )

    # Returned directly: DataFrame.append no longer exists in pandas >= 2.0.
    return hitters

def get_meanstd_stats(hitevents, mean_std_cols, group_col='hitterId') -> pd.DataFrame:
    """Per-group mean / std / min / max of the requested columns.

    NOTE: this definition replaces the ``pitcherId``-keyed ``get_meanstd_stats``
    defined in an earlier cell of this notebook.

    Parameters
    ----------
    hitevents : pandas.DataFrame
        Event rows containing ``group_col`` and every column in ``mean_std_cols``.
    mean_std_cols : iterable of str
        Columns to summarise.
    group_col : str, default 'hitterId'
        Column identifying the player to group by.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``group_col`` value with columns
        ``<col>_mean``, ``<col>_std``, ``<col>_min``, ``<col>_max`` (in that order)
        for each requested column. Missing statistics (e.g. the std of a single
        observation) are reported as 0.
    """
    # A list (not a set) keeps the output column order deterministic between runs.
    stat_names = ['mean', 'std', 'min', 'max']

    all_stats = pd.DataFrame({group_col: hitevents[group_col].unique()})
    grouped = hitevents.groupby(group_col)
    for col in mean_std_cols:
        temp = grouped[col].agg(stat_names).reset_index().fillna(0)
        temp = temp.rename(columns={name: f'{col}_{name}' for name in stat_names})
        all_stats = all_stats.merge(temp, on=group_col)
    return all_stats

def get_hitkinematics(hitevents) -> pd.DataFrame:
    """Add ``max_heightreached`` and ``flighttime`` to ``hitevents`` (in place) and return it.

    With ``vy = sin(launchAngle * 0.0174533) * (launchSpeed * 1.467)``:
      * ``max_heightreached = vy**2 / (2 * gravity)``
      * ``flighttime        = 2 * vy / gravity``
    """
    angle_rad = hitevents['launchAngle'].values * 0.0174533   # degrees -> radians
    speed_fps = hitevents['launchSpeed'].values * 1.467       # MPH -> ft/s
    vertical_speed = np.sin(angle_rad) * speed_fps

    hitevents['max_heightreached'] = (vertical_speed ** 2) / (2 * gravity)
    hitevents['flighttime'] = 2 * vertical_speed / gravity
    return hitevents


# Build batted-ball features for every date that has event data.
# Each date yields one frame; they are collected and concatenated once. Previously
# the result was overwritten on every iteration, so only the last date survived.
hit_frames = []
for events in tqdm(train['events']):
    if str(events) == 'nan':  # no events recorded for this date
        continue
    temp_event = get_singleevent(events)
    hit_frames.append(get_eventhit(temp_event))

all_events_hit = pd.concat(hit_frames, ignore_index=True)
all_events_hit.sample(5)

In [None]:
def get_eventsum(temp_event) -> pd.DataFrame:
    """Build the per-hitter indicator-sum rows for one events table.

    Parameters
    ----------
    temp_event : pandas.DataFrame
        Events of a single date. The code reads ``hitterId``, ``gameDate`` and the
        columns summed by ``get_sum_stats``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct non-null ``hitterId`` (exposed as ``playerId``) with
        ``date`` set to the first ``gameDate`` value and the ``*_sum`` columns
        returned by ``get_sum_stats``; unmatched values are filled with 0.

    Raises
    ------
    IndexError
        If ``temp_event`` has no rows (there is no first ``gameDate``).
    """
    # The sums are keyed on hitterId, so the player list must come from hitterId
    # as well (it used to be built from pitcherId, which dropped the hitters).
    hitters = pd.DataFrame({'playerId': temp_event['hitterId'].dropna().unique()})
    hitters['date'] = temp_event['gameDate'].iloc[0]

    sums = get_sum_stats(temp_event)
    hitters = (
        hitters
        .merge(sums, left_on='playerId', right_on='hitterId', how='left')
        .drop(columns=['hitterId'])
        .fillna(0)
    )

    # Returned directly: DataFrame.append no longer exists in pandas >= 2.0.
    return hitters

# Event indicator columns that are summed per hitter.
sum_cols = [
    'isGB',
    'isLD',
    'isFB',
    'isPU',
    'isPaOver',
]

def get_sum_stats(temp_event, cols=None, group_col='hitterId') -> pd.DataFrame:
    """Per-group sums of indicator columns.

    NOTE: this definition replaces the ``pitcherId``-keyed ``get_sum_stats``
    defined in an earlier cell of this notebook.

    Parameters
    ----------
    temp_event : pandas.DataFrame
        Event rows containing ``group_col`` and the columns to sum.
    cols : iterable of str, optional
        Columns to sum; defaults to the module-level ``sum_cols``.
    group_col : str, default 'hitterId'
        Column identifying the player to group by.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``group_col`` value with a ``<col>_sum`` column for
        each summed column.
    """
    if cols is None:
        cols = sum_cols

    all_stats = pd.DataFrame({group_col: temp_event[group_col].unique()})
    grouped = temp_event.groupby(group_col)
    for col in cols:
        temp = grouped[col].sum().rename(f'{col}_sum').reset_index().fillna(0)
        all_stats = all_stats.merge(temp, on=group_col)
    return all_stats

# Build per-hitter indicator sums for every date that has event data.
# Each date yields one frame; they are collected and concatenated once. Previously
# the result was overwritten on every iteration, so only the last date survived.
sum_frames = []
for events in tqdm(train['events']):
    if str(events) == 'nan':  # no events recorded for this date
        continue
    temp_event = get_singleevent(events).fillna(0)
    sum_frames.append(get_eventsum(temp_event))

all_events_sum = pd.concat(sum_frames, ignore_index=True)
all_events_sum.sample(5)

In [None]:
# Event columns cross-tabulated against pitcherId (one count column per distinct value).
pivot_cols_pitch = [
    'breakLength',
    'pitchNumber',
    'menOnBase',
    'call',
]
def get_eventcrosstab(temp_event) -> pd.DataFrame:
    """Build the per-pitcher value-count rows for one events table.

    Parameters
    ----------
    temp_event : pandas.DataFrame
        Events of a single date. The code reads ``pitcherId``, ``gameDate`` and
        the columns cross-tabulated by ``get_crosstab_stats``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``pitcherId`` (exposed as ``playerId``) with ``date``
        set to the first ``gameDate`` value and the count columns returned by
        ``get_crosstab_stats``; unmatched values are filled with 0.

    Raises
    ------
    IndexError
        If ``temp_event`` has no rows (there is no first ``gameDate``).
    """
    pitchers = pd.DataFrame({'playerId': temp_event['pitcherId'].unique()})
    pitchers['date'] = temp_event['gameDate'].iloc[0]

    counts = get_crosstab_stats(temp_event)
    pitchers = (
        pitchers
        .merge(counts, left_on='playerId', right_on='pitcherId', how='left')
        .drop(columns=['pitcherId'])
        .fillna(0)
    )

    # Returned directly: DataFrame.append no longer exists in pandas >= 2.0.
    return pitchers
def get_crosstab_stats(temp_event, pivot_cols=None) -> pd.DataFrame:
    """Count, per pitcher, how often each distinct value of the pivot columns occurs.

    Parameters
    ----------
    temp_event : pandas.DataFrame
        Event rows containing ``pitcherId`` and the pivot columns.
    pivot_cols : iterable of str, optional
        Columns to cross-tabulate against ``pitcherId``; defaults to the
        module-level ``pivot_cols_pitch``.

    Returns
    -------
    pandas.DataFrame
        One row per ``pitcherId`` with a ``<col>_<value>`` count column for every
        distinct value seen in each pivot column.
    """
    if pivot_cols is None:
        pivot_cols = pivot_cols_pitch

    all_stats = pd.DataFrame({'pitcherId': temp_event['pitcherId'].unique()})
    for col in pivot_cols:
        counts = pd.crosstab(temp_event['pitcherId'], temp_event[col])
        counts.columns = [f'{col}_' + str(val) for val in counts.columns]
        # Turn the pitcherId index into a column so the merge key is explicit.
        all_stats = all_stats.merge(counts.reset_index(), on='pitcherId')
    return all_stats
# Build per-pitcher value counts for every date that has event data.
# Each date yields one frame; they are collected and concatenated once. Previously
# the result was overwritten on every iteration, so only the last date survived.
crosstab_frames = []
for events in tqdm(train['events']):
    if str(events) == 'nan':  # no events recorded for this date
        continue
    temp_event = get_singleevent(events).fillna(0)
    crosstab_frames.append(get_eventcrosstab(temp_event))

# Dates do not share the same set of count columns; a value never seen on a date
# is a count of 0 for that date.
all_events_crosstab = pd.concat(crosstab_frames, ignore_index=True).fillna(0)
all_events_crosstab.sample(5)

In [None]:
standings = pd.read_pickle(
    '../input/mlb-pdef-train-dataset/standings_train.pkl'
)

# Missing values in the flag columns are replaced by False.
for cols in ('divisionChamp', 'divisionLeader', 'wildCardLeader'):
    standings[cols] = standings[cols].fillna(False)


In [None]:
# Distinct team ids, and the standings rows of the team at position 10 of that list
# (presumably kept in `zz` for ad-hoc inspection).
all_teamids = standings['teamId'].unique()
zz = standings[standings['teamId'] == all_teamids[10]]

In [None]:
# Standings columns expanded into one indicator column per distinct value.
crosstab_cols = [
    'streakCode',
    'divisionRank',
    'leagueRank',
    'wildCardRank',
    'divisionChamp',
    'divisionLeader',
    'wildCardLeader',
]

# Standings columns copied through unchanged. The positional slice appends every
# column from position 20 up to (excluding) the last two, so it depends on the
# column order of the loaded table.
normal_cols = [
    'leagueGamesBack',
    'sportGamesBack',
    'divisionGamesBack',
    'wins',
    'losses',
    'pct',
    'runsAllowed',
    'runsScored',
] + list(standings.columns[20:-2])
def get_single_team(zz, crosstab_cols) -> pd.DataFrame:
    """Expand categorical standings columns of one team into per-date count columns.

    Parameters
    ----------
    zz : pandas.DataFrame
        Standings rows of a single team; must contain ``teamId``, ``gameDate`` and
        every column named in ``crosstab_cols``.
    crosstab_cols : iterable of str
        Columns to cross-tabulate against ``gameDate``.

    Returns
    -------
    pandas.DataFrame
        ``teamId`` and ``gameDate`` followed by ``<col>_<value>`` columns. For
        ``wildCardRank`` and ``leagueRank`` only ranks 1-5 are kept; if any of
        those five columns is missing for this team, that feature is skipped
        entirely (best effort).
    """
    top_ranks = np.arange(1, 6)
    temp_feat = zz[['teamId', 'gameDate']].copy()

    for cols in crosstab_cols:
        temp = pd.crosstab(zz.gameDate, zz[cols])
        temp.columns = [f'{cols}_{val}' for val in temp.columns]

        if cols in ('wildCardRank', 'leagueRank'):
            wanted = [f'{cols}_{val}' for val in top_ranks]
            try:
                temp = temp[wanted]
            except KeyError:
                # Not all of ranks 1-5 occur for this team: skip the feature.
                # Only KeyError is caught so unrelated failures are not hidden.
                continue

        temp_feat = temp_feat.merge(temp, on='gameDate', how='left')
    return temp_feat

In [None]:
# Feature columns produced for the first team; used as the column template of the
# combined standings table.
zz = standings[standings['teamId'] == all_teamids[0]].reset_index(drop=True)
temp_feat = get_single_team(zz, crosstab_cols)
feat_cols = temp_feat.columns

def get_standing_feats(standings, crosstab_cols, normal_cols) -> pd.DataFrame:
    """Build the standings feature table for every team.

    For each distinct ``teamId`` the categorical columns are expanded with
    ``get_single_team`` and the ``normal_cols`` are copied alongside; the per-team
    frames are then stacked.

    Parameters
    ----------
    standings : pandas.DataFrame
        Standings table with ``teamId``, ``gameDate``, ``crosstab_cols`` and
        ``normal_cols``.
    crosstab_cols : iterable of str
        Columns forwarded to ``get_single_team``.
    normal_cols : iterable of str
        Columns copied through unchanged.

    Returns
    -------
    pandas.DataFrame
        Stacked per-team features. The module-level ``feat_cols`` come first (a
        column no team produced is all-NaN), followed by any other columns in
        order of first appearance. Each team keeps its own 0..n-1 index.
    """
    team_frames = []
    for ids in tqdm(standings.teamId.unique()):
        zz = standings[standings.teamId == ids].reset_index(drop=True)
        temp_feat = get_single_team(zz, crosstab_cols)
        for cols in normal_cols:
            temp_feat[cols] = zz[cols]
        team_frames.append(temp_feat)

    if not team_frames:
        return pd.DataFrame(columns=feat_cols)

    # One concat instead of DataFrame.append inside the loop (removed in
    # pandas >= 2.0 and quadratic in the number of teams).
    standing_data = pd.concat(team_frames)

    # Keep the template columns first, as when appending onto an empty
    # DataFrame(columns=feat_cols).
    leading = list(feat_cols)
    ordered = leading + [col for col in standing_data.columns if col not in leading]
    return standing_data.reindex(columns=ordered)

In [None]:

# Assemble the standings features for all teams and display the result.
standing_data = get_standing_feats(standings, crosstab_cols, normal_cols)
standing_data

In [None]:
# Feature columns that were not produced for a team are NaN after stacking; treat them as 0.
standing_data = standing_data.fillna(0)

# index=False is the documented way to omit the row index (index=None only worked
# because None is falsy).
standing_data.to_csv('standing_data.csv', index=False)

In [None]:
# NaN count of the first 49 columns.
standing_data.isna().sum().iloc[:49]