## Events vs Target
In this notebook, I use an event-level meta-model to analyze how player engagement relates to the play-by-play information recorded in `events`.

The event-level meta-model was used not only for this EDA but also as a source of features in the [3rd place solution](https://www.kaggle.com/c/mlb-player-digital-engagement-forecasting/discussion/256620), where it improved the score by about 0.001.

In [None]:
import gc
import json

import pandas as pd
import numpy as np
import shap
import lightgbm as lgb
import matplotlib.pyplot as plt

from tqdm import tqdm
%matplotlib inline

In [None]:
%%capture
!pip install nyaggle

In [None]:
from nyaggle.validation import TimeSeriesSplit

In [None]:
# Read only the date key and the two JSON-encoded columns that are unpacked below.
df = pd.read_csv(
    '../input/mlb-player-digital-engagement-forecasting/train.csv',
    usecols=['date', 'nextDayPlayerEngagement', 'events'],
)

In [None]:
def load_subdata(df, col):
    """Flatten a column of JSON-encoded record lists into a single DataFrame.

    Each cell of ``df[col]`` is decoded with ``json.loads`` and is expected to
    yield a list of dicts. Every dict is tagged with the ``date`` value of the
    row it came from (stored under the key ``dailyDataDate``) and all dicts are
    stacked into one frame.

    Args:
        df: Frame with a ``date`` column and the JSON column ``col``.
        col: Name of the column holding the JSON strings.

    Returns:
        A ``pd.DataFrame`` with one row per decoded dict.

    Raises:
        KeyError / AttributeError: if ``col`` or ``date`` is not a column of
            ``df``. These are caller errors and are no longer swallowed.

    Cells that are missing (e.g. NaN) or do not decode to a list of dicts are
    skipped on a best-effort basis.
    """
    records = []
    # total= lets tqdm show a real progress bar; iterrows() alone has no length.
    for _, row in tqdm(df.iterrows(), total=len(df)):
        try:
            loaded = json.loads(row[col])
            for item in loaded:
                item['dailyDataDate'] = row.date
        except (TypeError, ValueError):
            # TypeError: the cell is not a string (NaN/None) or the decoded value
            # is not a list of dicts. ValueError: malformed JSON
            # (json.JSONDecodeError is a ValueError subclass).
            continue
        records.extend(loaded)

    return pd.DataFrame(records)

# Unpack the two JSON columns into flat, one-row-per-record frames.
engagement = load_subdata(df, 'nextDayPlayerEngagement')
events = load_subdata(df, 'events')

# Downcast float64 columns to float32, one column at a time, to save memory.
for c in events.columns:
    if events[c].dtype == np.float64:
        events[c] = events[c].astype(np.float32)

# The raw frame is not used again once both sub-tables have been built.
del df
gc.collect()

In [None]:
def prep_events(events: pd.DataFrame, sort_by_date: bool = True):
    """Reshape raw play-by-play events into one row per (event, player role).

    Every input row is emitted twice: once attributed to the pitcher
    (``asPitcher == 1``) and once to the hitter (``asPitcher == 0``). The
    player's id and team id are exposed as ``playerId`` / ``teamId``. The
    columns listed in ``drop_cols`` are removed, ``scoreDiff`` is added as
    ``homeScore - awayScore``, and the string-valued columns named in ``cats``
    are replaced by the integer codes defined there.

    Args:
        events: Raw events frame. It must contain every column named in
            ``drop_cols`` and ``cats`` plus ``dailyDataDate``, ``inning``,
            ``homeScore`` and ``awayScore``. The input is not modified.
        sort_by_date: When True (default) the stacked rows are sorted by
            ``dailyDataDate``, ``gamePk`` and ``inning`` and ``dailyDataDate``
            is parsed as a ``%Y%m%d`` date. When False the rows stay in
            concatenation order (all pitcher rows, then all hitter rows) and
            ``dailyDataDate`` is left unconverted.

    Returns:
        A new DataFrame with twice as many rows as ``events``. Category values
        that are not listed in ``cats`` become NaN (``Series.map`` behaviour).

    Raises:
        KeyError: if a required column is missing from ``events``.
    """
    # Each role view takes its ids from the input frame directly, so the hitter
    # view does not depend on the pitcher view having been built first.
    as_pitcher = events.assign(
        asPitcher=1,
        playerId=events['pitcherId'],
        teamId=events['pitcherTeamId'],
    )
    as_hitter = events.assign(
        asPitcher=0,
        playerId=events['hitterId'],
        teamId=events['hitterTeamId'],
    )

    events_stacked = pd.concat([as_pitcher, as_hitter])
    if sort_by_date:
        events_stacked = events_stacked.sort_values(by=['dailyDataDate', 'gamePk', 'inning'])
    events_stacked = events_stacked.reset_index(drop=True)
    if sort_by_date:
        # Date parsing is tied to sort_by_date, exactly as in the original code.
        # NOTE(review): callers that pass sort_by_date=False get unparsed dates -
        # confirm that this is intended.
        events_stacked['dailyDataDate'] = pd.to_datetime(events_stacked['dailyDataDate'], format='%Y%m%d')

    drop_cols = [
        'gameDate', 'season', 'playId', 'pitcherTeam', 'hitterTeam', 'pitcherName', 'pitcherHand', 'hitterName',
        'batSide', 'atBatDesc', 'gameTimeUTC',
        'description', 'halfInning', 'hitterTeamId', 'pitcherTeamId', 'gamePk', 'pitcherId', 'hitterId'
    ]

    events_stacked = events_stacked.drop(columns=drop_cols)

    events_stacked['scoreDiff'] = events_stacked['homeScore'] - events_stacked['awayScore']

    # String value -> integer code, per categorical column.
    cats = {
        'menOnBase': {
            'Empty': 1,
            'Men_On': 2,
            'RISP': 3,
            'Loaded': 4
        },
        'gameType': {
            'R': 1,
            'D': 2,
            'L': 3,
            'F': 4,
            'W': 5
        },
        'atBatEvent': {
            'Strikeout': 1,
            'Groundout': 2,
            'Single': 3,
            'Walk': 4,
            'Flyout': 5,
            'Lineout': 6,
            'Pop Out': 7,
            'Double': 8,
            'Home Run': 9,
            'Forceout': 10,
            'Grounded Into DP': 11,
            'Hit By Pitch': 12,
            'Field Error': 13,
            'Sac Fly': 14,
            'Intent Walk': 15,
            'Triple': 16,
            'Double Play': 17,
            'Sac Bunt': 18,
            'Fielders Choice Out': 19,
            'Fielders Choice': 20,
            'Strikeout Double Play': 21,
            'Caught Stealing 2B': 22,
            'Bunt Groundout': 23,
            'Catcher Interference': 24,
            'Bunt Pop Out': 25,
            'Batter Interference': 26,
            'Runner Out': 27,
            'Pickoff Caught Stealing 2B': 28,
            'Fan Interference': 29,
            'Pickoff 1B': 30,
            'Caught Stealing 3B': 31,
            'Caught Stealing Home': 32,
            'Pickoff 2B': 33,
            'Sac Fly Double Play': 34,
            'Bunt Lineout': 35,
            'Wild Pitch': 36,
            'Pickoff Caught Stealing Home': 37,
            'Triple Play': 38,
            'Pickoff Caught Stealing 3B': 39,
            'Pickoff 3B': 40,
            'Game Advisory': 41,
            'Stolen Base 2B': 42,
            'Sac Bunt Double Play': 43,
            'Runner Double Play': 44,
            'Passed Ball': 45,
            'Pickoff Error 1B': 46,
            'Balk': 47
        },
        'event': {
            'Strikeout': 1,
            'Groundout': 2,
            'Single': 3,
            'Game Advisory': 4,
            'Flyout': 5,
            'Pitching Substitution': 6,
            'Walk': 7,
            'Lineout': 8,
            'Pop Out': 9,
            'Double': 10,
            'Home Run': 11,
            'Offensive Substitution': 12,
            'Defensive Switch': 13,
            'Forceout': 14,
            'Grounded Into DP': 15,
            'Defensive Sub': 16,
            'Hit By Pitch': 17,
            'Stolen Base 2B': 18,
            'Wild Pitch': 19,
            'Field Error': 20,
            'Sac Fly': 21,
            'Intent Walk': 22,
            'Triple': 23,
            'Sac Bunt': 24,
            'Caught Stealing 2B': 25,
            'Double Play': 26,
            'Passed Ball': 27,
            'Injury': 28,
            'Fielders Choice Out': 29,
            'Fielders Choice': 30,
            'Stolen Base 3B': 31,
            'Defensive Indiff': 32,
            'Bunt Groundout': 33,
            'Ejection': 34,
            'Balk': 35,
            'Strikeout Double Play': 36,
            'Runner Placed On Base': 37,
            'Pickoff Error 1B': 38,
            'Bunt Pop Out': 39,
            'Pitch Challenge': 40,
            'Runner Out': 41,
            'Pickoff 1B': 42,
            'Pickoff Caught Stealing 2B': 43,
            'Caught Stealing 3B': 44,
            'Catcher Interference': 45,
            'Error': 46,
            'Umpire Substitution': 47,
            'Batter Interference': 48,
            'Pickoff Error 2B': 49,
            'Pitcher Switch': 50,
            'Fan Interference': 51,
            'Pickoff 2B': 52,
            'Caught Stealing Home': 53,
            'Stolen Base Home': 54,
            'Sac Fly Double Play': 55,
            'Bunt Lineout': 56,
            'Pickoff Caught Stealing 3B': 57,
            'Pickoff 3B': 58,
            'Pickoff Caught Stealing Home': 59,
            'Other Advance': 60,
            'Pickoff Error 3B': 61,
            'Triple Play': 62,
            'Sac Bunt Double Play': 63},
        'pitchType': {'FF': 1,
                      'SL': 2,
                      'CH': 3,
                      'SI': 4,
                      'CU': 5,
                      'FT': 6,
                      'FC': 7,
                      'KC': 8,
                      'FS': 9,
                      'KN': 10,
                      'EP': 11,
                      'CS': 12,
                      'FO': 13,
                      'PO': 14,
                      'SC': 15,
                      'FA': 16,
                      'AB': 17},
        'call': {'B': 1,
                 'F': 2,
                 'C': 3,
                 'X': 4,
                 'S': 5,
                 'D': 6,
                 '*B': 7,
                 'E': 8,
                 'T': 9,
                 'W': 10,
                 'V': 11,
                 'H': 12,
                 'L': 13,
                 'M': 14,
                 'P': 15,
                 'O': 16,
                 'Q': 17,
                 'R': 18},
        'type': {
            'pitch': 1,
            'action': 2
        }
    }

    for column, codes in cats.items():
        events_stacked[column] = events_stacked[column].map(codes)

    return events_stacked


In [None]:
# Parse the engagement dates so they match the datetime key that prep_events() produces.
engagement['dailyDataDate'] = pd.to_datetime(engagement['dailyDataDate'], format='%Y%m%d')

event_df = prep_events(events)

# Attach the targets to every event row, then keep only the rows that have a target.
event_df = event_df.merge(engagement, how='left', on=['dailyDataDate', 'playerId'])
event_df = event_df[event_df['target1'].notnull()]

del engagement
gc.collect()

## Hitter Events vs Target

In [None]:
# LightGBM settings; the pitcher section further down reuses this dict.
params = {
    'objective': 'mae',
    'metrics': 'mae',
    'num_leaves': 256,
    'max_depth': 16,
    'n_estimators': 100,
}

# Rows in which the player took part in the event as the hitter.
hitter_df = event_df[event_df['asPitcher'] == 0]

# Features are every column except the targets and the key/identifier columns.
X = hitter_df.drop(
    columns=['target1', 'target2', 'target3', 'target4',
             'dailyDataDate', 'playerId', 'teamId', 'engagementMetricsDate'],
).astype(np.float32)
y = hitter_df['target1']

model = lgb.LGBMRegressor(**params)
model.fit(
    X, y,
    categorical_feature=['atBatEvent', 'event', 'menOnBase', 'gameType', 'pitchType', 'call'],
)

del hitter_df

In [None]:
# Explain the hitter model on a subsample of its training features.
# random_state makes the SHAP plots reproducible across runs, and the min() cap
# keeps DataFrame.sample from raising when X has fewer than 50000 rows.
explainer = shap.Explainer(model)
shap_values_hitter = explainer(X.sample(n=min(50000, len(X)), random_state=0))

In [None]:
# Beeswarm summary of the SHAP values computed for the hitter model.
shap.plots.beeswarm(shap_values_hitter)

In [None]:
# SHAP value of the `inning` feature against its value (hitter model).
shap.plots.scatter(shap_values_hitter[:, "inning"])

Hitters who appear in extra innings tend to get higher engagement.

In [None]:
# SHAP value of `scoreDiff` (home score minus away score) against its value (hitter model).
shap.plots.scatter(shap_values_hitter[:, "scoreDiff"])

In [None]:
# SHAP value of the `rbi` feature against its value (hitter model).
shap.plots.scatter(shap_values_hitter[:, "rbi"])

In [None]:
# Tick positions are the integer gameType codes assigned in prep_events()
# (R=1, D=2, L=3, F=4, W=5); position 0 is unused.
shap.plots.scatter(shap_values_hitter[:, "gameType"], show=False)
plt.xlim(0, 6)
plt.xticks(
    np.arange(6),
    ['', 'Regular Season', 'Division Series', 'League Championship Series', 'Wild Card Game', 'World Series'],
    rotation='vertical',
)
plt.show()

In [None]:
# Tick labels for atBatEvent codes 0-20, listed in the order of the integer codes
# assigned in prep_events(); index 0 is an unused placeholder.
# Code 20 is 'Fielders Choice'. 'Strikeout Double Play' is code 21, which lies
# outside the plotted range, so it must not be used as the label for tick 20.
bat_events = ['',
              'Strikeout',
              'Groundout',
              'Single',
              'Walk',
              'Flyout',
              'Lineout',
              'Pop Out',
              'Double',
              'Home Run',
              'Forceout',
              'Grounded Into DP',
              'Hit By Pitch',
              'Field Error',
              'Sac Fly',
              'Intent Walk',
              'Triple',
              'Double Play',
              'Sac Bunt',
              'Fielders Choice Out',
              'Fielders Choice']

shap.plots.scatter(shap_values_hitter[:, "atBatEvent"], color=shap_values_hitter, show=False)
plt.xlim(0, 20)
plt.xticks(np.arange(len(bat_events)), bat_events, rotation='vertical')
plt.show()

Obviously, home runs are related to hitters' engagement. Interestingly, triples also seem to boost engagement.

## Pitcher Events vs Target

In [None]:
# Rows in which the player took part in the event as the pitcher.
pitcher_df = event_df[event_df['asPitcher'] == 1]

# Features are every column except the targets and the key/identifier columns.
X = pitcher_df.drop(
    columns=['target1', 'target2', 'target3', 'target4',
             'dailyDataDate', 'playerId', 'teamId', 'engagementMetricsDate'],
).astype(np.float32)
y = pitcher_df['target1']

# `params` is the LightGBM settings dict defined in the hitter section.
model = lgb.LGBMRegressor(**params)
model.fit(
    X, y,
    categorical_feature=['atBatEvent', 'event', 'menOnBase', 'gameType', 'pitchType', 'call'],
)

del pitcher_df

In [None]:
# Explain the pitcher model on a subsample of its training features.
# random_state makes the SHAP plots reproducible across runs, and the min() cap
# keeps DataFrame.sample from raising when X has fewer than 50000 rows.
explainer = shap.Explainer(model)
shap_values_pitcher = explainer(X.sample(n=min(50000, len(X)), random_state=0))

In [None]:
# Beeswarm summary of the SHAP values computed for the pitcher model.
shap.plots.beeswarm(shap_values_pitcher)

In [None]:
# SHAP value of the `nastyFactor` feature against its value (pitcher model).
shap.plots.scatter(shap_values_pitcher[:, "nastyFactor"])

The relationship between `nastyFactor` and engagement seems to be weak.

In [None]:
# SHAP value of `scoreDiff` (home score minus away score) against its value (pitcher model).
shap.plots.scatter(shap_values_pitcher[:, "scoreDiff"])

The relationship between score difference and target shows that hitters who appear when the score difference is zero tend to get higher engagement, while the opposite is true for pitchers.

In [None]:
# Tick positions are the integer gameType codes assigned in prep_events()
# (R=1, D=2, L=3, F=4, W=5); position 0 is unused.
shap.plots.scatter(shap_values_pitcher[:, "gameType"], show=False)
plt.xlim(0, 6)
plt.xticks(
    np.arange(6),
    ['', 'Regular Season', 'Division Series', 'League Championship Series', 'Wild Card Game', 'World Series'],
    rotation='vertical',
)
plt.show()

In [None]:
# Tick labels for atBatEvent codes 0-20, listed in the order of the integer codes
# assigned in prep_events(); index 0 is an unused placeholder.
# Code 20 is 'Fielders Choice'. 'Strikeout Double Play' is code 21, which lies
# outside the plotted range, so it must not be used as the label for tick 20.
bat_events = ['',
              'Strikeout',
              'Groundout',
              'Single',
              'Walk',
              'Flyout',
              'Lineout',
              'Pop Out',
              'Double',
              'Home Run',
              'Forceout',
              'Grounded Into DP',
              'Hit By Pitch',
              'Field Error',
              'Sac Fly',
              'Intent Walk',
              'Triple',
              'Double Play',
              'Sac Bunt',
              'Fielders Choice Out',
              'Fielders Choice']

shap.plots.scatter(shap_values_pitcher[:, "atBatEvent"], color=shap_values_pitcher, show=False)
plt.xlim(0, 20)
plt.xticks(np.arange(len(bat_events)), bat_events, rotation='vertical')
plt.show()

In [None]:
# Tick labels in the order of the integer pitchType codes assigned in
# prep_events(); index 0 is an unused placeholder.
pitch_type = [
    '',
    'FF', 'SL', 'CH', 'SI', 'CU', 'FT', 'FC', 'KC', 'FS',
    'KN', 'EP', 'CS', 'FO', 'PO', 'SC', 'FA', 'AB',
]

shap.plots.scatter(shap_values_pitcher[:, "pitchType"], color=shap_values_pitcher, show=False)
plt.xlim(0, 19)
plt.xticks(np.arange(18), pitch_type, rotation='vertical')
plt.show()

In [None]:
# SHAP value of the `inning` feature against its value (pitcher model).
shap.plots.scatter(shap_values_pitcher[:, "inning"])

In [None]:
# SHAP value of the `x0` feature against its value (pitcher model).
shap.plots.scatter(shap_values_pitcher[:, "x0"])

In [None]:
# SHAP value of the `zone` feature against its value (pitcher model).
shap.plots.scatter(shap_values_pitcher[:, "zone"])

In [None]:
# SHAP value of `nastyFactor` against its value (pitcher model).
# NOTE(review): this repeats the nastyFactor plot shown earlier in this section.
shap.plots.scatter(shap_values_pitcher[:, "nastyFactor"])

In [None]:
# SHAP value of the `startSpeed` feature against its value (pitcher model).
shap.plots.scatter(shap_values_pitcher[:, "startSpeed"])