In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import os
import itertools
import pandas as pd
import datatable as dt
import numpy as np
import mlb
import pickle as pkl
from tqdm import tqdm
from itertools import product
import lightgbm as lgb
from fuzzywuzzy import fuzz
import re
import numba as nb
from numba import njit
import xgboost as xgb

@njit
def nb_cumsum(arr):
    """Return ``arr.cumsum()``, compiled in numba nopython mode via ``@njit``.

    NOTE(review): presumably ``arr`` is a NumPy array; confirm against callers.
    """
    return arr.cumsum()

@njit
def nb_sum(arr):
    """Return ``arr.sum()``, compiled in numba nopython mode via ``@njit``.

    NOTE(review): presumably ``arr`` is a NumPy array; confirm against callers.
    """
    return arr.sum()

def reduce_mem_usage(df, verbose=True):
    """Recast the numeric columns of ``df`` in place based on their value range.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are touched.  For each such column the min and max are compared (strictly)
    against the limits of a "probe" type, and the first probe that contains the
    data selects the dtype that is assigned.

    NOTE(review): every tier assigns one width *above* the probed type (data
    that fits int8 is stored as int16, float16-range data as float32, ...).
    Presumably this is deliberate headroom -- confirm before changing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; its columns are reassigned in place.
    verbose : bool
        When true, print the resulting memory usage and the percent reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    eligible = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # (type whose limits are probed, dtype that is actually assigned)
    int_tiers = ((np.int8, np.int16),
                 (np.int16, np.int32),
                 (np.int32, np.int64),
                 (np.int64, np.int64))
    float_tiers = ((np.float16, np.float32),
                   (np.float32, np.float64))

    mem_before = df.memory_usage().sum() / 1024**2

    for column in df.columns:
        dtype = df[column].dtypes
        if dtype not in eligible:
            continue

        lowest = df[column].min()
        highest = df[column].max()

        if str(dtype)[:3] == 'int':
            # Integers that fit no tier (values on a limit) are left untouched.
            for probe, assigned in int_tiers:
                limits = np.iinfo(probe)
                if lowest > limits.min and highest < limits.max:
                    df[column] = df[column].astype(assigned)
                    break
        else:
            # Floats always get recast; float64 is the fallback.
            assigned = np.float64
            for probe, candidate in float_tiers:
                limits = np.finfo(probe)
                if lowest > limits.min and highest < limits.max:
                    assigned = candidate
                    break
            df[column] = df[column].astype(assigned)

    mem_after = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(mem_after, 100 * (mem_before - mem_after) / mem_before))
    return df


def unnest(data, name):
    """Expand a column of per-day JSON payloads into one flat DataFrame.

    Rows of ``data`` whose ``name`` value is missing are dropped.  Each
    remaining value is parsed with ``pd.read_json``, tagged with that row's
    ``date`` in a ``dailyDataDate`` column, and all daily frames are
    concatenated with ``dailyDataDate`` moved to the front.  The result is
    passed through ``reduce_mem_usage`` (non-verbose).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``date`` column and the column ``name``.
    name : str
        Column holding the nested JSON strings.

    Returns
    -------
    pandas.DataFrame
        The unnested table, or an empty DataFrame when there is nothing to
        unnest or when anything goes wrong (best effort: the error is printed,
        not raised).
    """
    # Local import so the block does not depend on the file's import section.
    from io import StringIO

    try:
        nested = data[['date', name]]
        nested = nested[~pd.isna(nested[name])].reset_index(drop=True)

        daily_frames = []
        for day, payload in zip(nested['date'], nested[name]):
            # Wrap the literal in StringIO: passing raw JSON text to read_json
            # is deprecated (pandas >= 2.1) in favour of file-like objects.
            daily_df = pd.read_json(StringIO(payload))
            daily_df['dailyDataDate'] = day
            # append is O(1); rebuilding the list per row was quadratic.
            daily_frames.append(daily_df)

        if not daily_frames:
            return pd.DataFrame()

        # Set and reset index to move 'dailyDataDate' to front of df
        unnested_table = (pd.concat(daily_frames, ignore_index=True)
                          .set_index('dailyDataDate')
                          .reset_index())
        return reduce_mem_usage(unnested_table, False)
    except Exception as e:
        # Deliberate best-effort: callers expect an empty frame on failure.
        print(e)
        print(f'unnest failed for {name}. returning empty dataframe')
        return pd.DataFrame()


def get_unnested_data_dict(data, daily_data_nested_df_names):
    """Unnest every listed column of ``data`` and return them keyed by column name.

    Each value is whatever ``unnest(data, column)`` returns for that column.
    """
    return {column: unnest(data, column) for column in daily_data_nested_df_names}

def get_unnested_data(data, colnames):
    """Return a generator yielding ``unnest(data, column)`` for each column.

    Nothing is unnested until the generator is consumed, so it can be
    unpacked directly into one variable per column name.
    """
    lazy_tables = (unnest(data, column) for column in colnames)
    return lazy_tables


## Find win expectancy and volatility given inning, out, base, run situation.

## no. of runs that score with HR in diff. base situations, keyed by the
## base-state code (1-8) that getVol looks up.
## NOTE(review): presumably 1 = bases empty ... 8 = bases loaded -- confirm
## the encoding against the code that builds 'base_state'.
baseHr = dict(zip(range(1, 9), (1, 2, 2, 3, 2, 3, 3, 4)))
    
## Run-distribution coefficients consumed by getRunExp.
## Outer key: base-state digit followed by outs digit (e.g. '60'); the
## situation '10' is absent because getRunExp fills it from getRunsInn.
## Inner keys: 'm' and 'b' are the slope and intercept of the linear formula
## (m * runs-per-inning + b) that getRunExp uses as the zero-run entry; keys
## 1-10 are the weights getRunExp multiplies by (1 - that zero-run entry).
tangoRunExp = {'60': {1: 0.51400000000000001, 2: 0.19400000000000001, 3: 0.14999999999999999, 4: 0.076999999999999999, 5: 0.036999999999999998, 6: 0.017000000000000001, 7: 0.0060000000000000001, 8: 0.0030000000000000001, 9: 0.001, 10: 0.001, 'm': -0.216, 'b': 0.247}, '61': {1: 0.59599999999999997, 2: 0.17599999999999999, 3: 0.13200000000000001, 4: 0.057000000000000002, 5: 0.024, 6: 0.0089999999999999993, 7: 0.0040000000000000001, 8: 0.001, 9: 0.0, 10: 0.0, 'm': -0.11600000000000001, 'b': 0.40600000000000003}, '62': {1: 0.55900000000000005, 2: 0.20599999999999999, 3: 0.158, 4: 0.051999999999999998, 5: 0.017000000000000001, 6: 0.0050000000000000001, 7: 0.002, 8: 0.001, 9: 0.0, 10: 0.0, 'm': -0.19900000000000001, 'b': 0.82799999999999996}, '82': {1: 0.27300000000000002, 2: 0.35499999999999998, 3: 0.17000000000000001, 4: 0.13800000000000001, 5: 0.041000000000000002, 6: 0.014999999999999999, 7: 0.0050000000000000001, 8: 0.001, 9: 0.0, 10: 0.0, 'm': -0.20899999999999999, 'b': 0.78900000000000003}, '80': {1: 0.311, 2: 0.247, 3: 0.17000000000000001, 4: 0.14399999999999999, 5: 0.070999999999999994, 6: 0.031, 7: 0.012999999999999999, 8: 0.0080000000000000002, 9: 0.0030000000000000001, 10: 0.002, 'm': -0.127, 'b': 0.193}, '81': {1: 0.39700000000000002, 2: 0.24399999999999999, 3: 0.151, 4: 0.123, 5: 0.050999999999999997, 6: 0.021000000000000001, 7: 0.0080000000000000002, 8: 0.0030000000000000001, 9: 0.001, 10: 0.0, 'm': -0.14199999999999999, 'b': 0.40200000000000002}, '20': {1: 0.42399999999999999, 2: 0.29899999999999999, 3: 0.14999999999999999, 4: 0.071999999999999995, 5: 0.032000000000000001, 6: 0.012999999999999999, 7: 0.0050000000000000001, 8: 0.002, 9: 0.001, 10: 0.0, 'm': -0.27800000000000002, 'b': 0.71599999999999997}, '21': {1: 0.44400000000000001, 2: 0.32600000000000001, 3: 0.13700000000000001, 4: 0.056000000000000001, 5: 0.021999999999999999, 6: 0.0089999999999999993, 7: 0.0030000000000000001, 8: 0.001, 9: 0.0, 10: 0.0, 'm': -0.26800000000000002, 'b': 
0.86299999999999999}, '22': {1: 0.45300000000000001, 2: 0.374, 3: 0.11600000000000001, 4: 0.039, 5: 0.012, 6: 0.0050000000000000001, 7: 0.001, 8: 0.0, 9: 0.0, 10: 0.0, 'm': -0.19400000000000001, 'b': 0.97399999999999998}, '42': {1: 0.49399999999999999, 2: 0.23699999999999999, 3: 0.18099999999999999, 4: 0.059999999999999998, 5: 0.017999999999999999, 6: 0.0070000000000000001, 7: 0.002, 8: 0.001, 9: 0.0, 10: 0.0, 'm': -0.14399999999999999, 'b': 0.84099999999999997}, '40': {1: 0.36199999999999999, 2: 0.25600000000000001, 3: 0.19400000000000001, 4: 0.104, 5: 0.048000000000000001, 6: 0.02, 7: 0.0089999999999999993, 8: 0.0040000000000000001, 9: 0.002, 10: 0.001, 'm': -0.16700000000000001, 'b': 0.45000000000000001}, '41': {1: 0.40100000000000002, 2: 0.25800000000000001, 3: 0.20300000000000001, 4: 0.083000000000000004, 5: 0.034000000000000002, 6: 0.012999999999999999, 7: 0.0050000000000000001, 8: 0.002, 9: 0.001, 10: 0.0, 'm': -0.17399999999999999, 'b': 0.66900000000000004}, '72': {1: 0.185, 2: 0.54800000000000004, 3: 0.16900000000000001, 4: 0.067000000000000004, 5: 0.023, 6: 0.0060000000000000001, 7: 0.002, 8: 0.0, 9: 0.0, 10: 0.0, 'm': -0.095000000000000001, 'b': 0.78500000000000003}, '71': {1: 0.41299999999999998, 2: 0.32800000000000001, 3: 0.13800000000000001, 4: 0.072999999999999995, 5: 0.029000000000000001, 6: 0.010999999999999999, 7: 0.0050000000000000001, 8: 0.002, 9: 0.001, 10: 0.0, 'm': -0.311, 'b': 0.47799999999999998}, '70': {1: 0.315, 2: 0.35599999999999998, 3: 0.16800000000000001, 4: 0.085999999999999993, 5: 0.043999999999999997, 6: 0.017999999999999999, 7: 0.0070000000000000001, 8: 0.0040000000000000001, 9: 0.002, 10: 0.0, 'm': -0.22900000000000001, 'b': 0.26100000000000001}, '11': {1: 0.59999999999999998, 2: 0.24299999999999999, 3: 0.097000000000000003, 4: 0.036999999999999998, 5: 0.014, 6: 0.0060000000000000001, 7: 0.002, 8: 0.001, 9: 0.0, 10: 0.0, 'm': -0.29599999999999999, 'b': 0.98799999999999999}, '12': {1: 0.67300000000000004, 2: 0.222, 3: 
0.070999999999999994, 4: 0.023, 5: 0.0070000000000000001, 6: 0.002, 7: 0.001, 8: 0.0, 9: 0.0, 10: 0.0, 'm': -0.16300000000000001, 'b': 1.014}, '32': {1: 0.68600000000000005, 2: 0.20399999999999999, 3: 0.072999999999999995, 4: 0.025000000000000001, 5: 0.0080000000000000002, 6: 0.002, 7: 0.001, 8: 0.0, 9: 0.0, 10: 0.0, 'm': -0.107, 'b': 0.83199999999999996}, '31': {1: 0.59399999999999997, 2: 0.23400000000000001, 3: 0.104, 4: 0.042000000000000003, 5: 0.017000000000000001, 6: 0.0060000000000000001, 7: 0.0030000000000000001, 8: 0.001, 9: 0.0, 10: 0.0, 'm': -0.191, 'b': 0.69299999999999995}, '30': {1: 0.56599999999999995, 2: 0.22600000000000001, 3: 0.114, 4: 0.052999999999999999, 5: 0.023, 6: 0.01, 7: 0.0040000000000000001, 8: 0.002, 9: 0.001, 10: 0.0, 'm': -0.35799999999999998, 'b': 0.55900000000000005}, '51': {1: 0.73699999999999999, 2: 0.152, 3: 0.067000000000000004, 4: 0.027, 5: 0.010999999999999999, 6: 0.0040000000000000001, 7: 0.001, 8: 0.001, 9: 0.0, 10: 0.0, 'm': -0.27000000000000002, 'b': 0.47699999999999998}, '50': {1: 0.65400000000000003, 2: 0.185, 3: 0.088999999999999996, 4: 0.041000000000000002, 5: 0.017999999999999999, 6: 0.0080000000000000002, 7: 0.0030000000000000001, 8: 0.001, 9: 0.001, 10: 0.0, 'm': -0.37, 'b': 0.35499999999999998}, '52': {1: 0.73199999999999998, 2: 0.17699999999999999, 3: 0.059999999999999998, 4: 0.021000000000000001, 5: 0.0070000000000000001, 6: 0.002, 7: 0.001, 8: 0.0, 9: 0.0, 10: 0.0, 'm': -0.047, 'b': 0.76300000000000001}}

def getRunsInn(rpinn):
    """Build the runs-per-inning table for a given scoring rate.

    Returns a dict keyed by the integers 0-10 whose values are computed from
    ``rpinn`` and the constant 0.761 with the closed-form expressions below.
    getRunExp stores this table under the situation key '10'.
    """
    scaled = rpinn*.761
    denom = scaled + 1
    numer = rpinn*(0.761**2)
    ratio_base = scaled - 0.761 + 1

    runsinn = {0: 1/denom,
               1: numer/(denom**2)}
    runsinn.update({n: (numer*(ratio_base**(n-1)))/(denom**(n+1))
                    for n in range(2, 11)})
    return runsinn


def getRunExp(rpinn, runsinn):
    """Build the per-situation run tables used by getWinexp.

    The result maps a situation key (base-state digit 1-8 followed by outs
    digit 0-2) to a dict keyed by 0-10.  Key '10' is ``runsinn`` itself; for
    every other situation, entry 0 is ``m*rpinn + b`` from ``tangoRunExp`` and
    entries 1-10 are the tangoRunExp weights scaled by ``1 - entry 0``.
    """
    runExp = {'10': runsinn}
    for outs in range(3):
        for base in range(1, 9):
            sitkey = f'{base}{outs}'
            if sitkey == '10':
                # already supplied by runsinn
                continue
            coeffs = tangoRunExp[sitkey]
            table = {0: ((coeffs['m']*rpinn) + coeffs['b'])}
            for runs in range(1, 11):
                table[runs] = ((1 - table[0])*coeffs[runs])
            runExp[sitkey] = table
    return runExp

def getInnWinexp(runExp):
    """Tabulate the home team's win chance at the start of every half inning.

    Works backwards from a terminal table '101' (0.5 when tied, 0 when behind,
    1 when ahead) through innings 9..1.  Only ``runExp['10']`` (keys 0-10) is
    read.

    Returns a dict keyed by inning number followed by half ('1' or '2'),
    e.g. '92', plus '101'; each value maps a run difference in -25..25 to the
    win chance.
    """
    ## Chance of home team winning with zero
    ## outs at the beg. of each inning
    terminal = {0: 0.5}
    for behind in range(-25, 0):
        terminal[behind] = 0
    for ahead in range(1, 26):
        terminal[ahead] = 1
    innWinexp = {'101': terminal}

    for inning in range(9, 0, -1):
        for half in (2, 1):
            bottom = half == 2
            # Bottom half is followed by the next inning's top; top by its own bottom.
            following = innWinexp[str(inning + 1) + '1' if bottom else str(inning) + '2']
            current = {}
            innWinexp[str(inning) + str(half)] = current

            for diff in range(-25, 26):
                if bottom and inning == 9 and diff > 0:
                    # Home team already ahead entering the bottom of the ninth.
                    current[diff] = 1
                    continue

                total = 0
                for runs in range(0, 11):
                    if bottom:
                        # runs are added to the difference
                        after = diff + runs
                        outcome = 1 if after > 25 else following[after]
                    else:
                        # runs are subtracted from the difference
                        after = diff - runs
                        outcome = 0 if after < -25 else following[after]
                    total += runExp['10'][runs]*outcome
                current[diff] = total
    return innWinexp


def getWinexp(innWinexp, runExp, inn, half, base, outs, rdiff):
    """Look up or compute the win chance for a game situation.

    Parameters
    ----------
    innWinexp : dict
        Start-of-half-inning table as built by getInnWinexp
        (key = inning + half, value = {run difference: win chance}).
    runExp : dict
        Per-situation run tables as built by getRunExp
        (key = base-state digit + outs digit, value = {runs 0-10: weight}).
    inn : int
        Inning; values above 9 are treated as 9.
    half : int
        1 or 2.  In half 1 runs are subtracted from ``rdiff``, otherwise added.
    base : int
        Base-state code (first digit of the runExp key).
    outs : int
        Outs; values above 2 are treated as 2.
    rdiff : int
        Run difference; states beyond +/-25 are treated as decided.

    Returns
    -------
    float
        The table value when the situation is the start of a half inning
        (base-state 1, 0 outs); otherwise the runExp-weighted average of the
        next half inning's table over 0-10 runs scored.
    """
    if inn > 9:
        inn = 9
    if outs > 2:
        outs = 2
    innkey = str(inn) + str(half)
    sitkey = str(base) + str(outs)

    # Renamed from `next` so the builtin is not shadowed.
    if half == 2:
        next_key = str(inn+1) + '1'
    else:
        next_key = str(inn) + '2'

    if sitkey == '10':  ## beginning of half inning
        if rdiff > 25:
            rdiff = 25
        elif rdiff < -25:
            rdiff = -25
        return innWinexp[innkey][rdiff]

    Winexp = 0
    if half == 1:
        for i in range(10, -1, -1):
            after = rdiff - i
            if after < -25:
                iw = 0
            elif after > 25:
                iw = 1
            else:
                iw = innWinexp[next_key][after]
            Winexp += runExp[sitkey][i]*iw
    else:
        for i in range(0, 11):
            # Runs are added here, so both bounds must be tested on rdiff+i.
            # The lower bound used to test rdiff-i, which forced iw = 0 for
            # states that are still inside the table (e.g. rdiff=-20, i=6).
            after = rdiff + i
            if after < -25:
                iw = 0
            elif after > 25:
                iw = 1
            else:
                iw = innWinexp[next_key][after]
            Winexp += runExp[sitkey][i]*iw
    return Winexp

def getVol(innWinexp, runExp, inn, half, base, outs, rdiff):
    """Volatility of a situation: win-chance gap between a home run and a strikeout.

    Evaluates getWinexp for the state after a strikeout and for the state
    after a home run, and returns the absolute difference divided by 0.133.

    NOTE(review): 0.133 is presumably a normalising constant (an average
    swing) -- confirm its origin.
    """
    ## changes if strikeout:
    if outs == 2:
        # Third out: bases and outs reset and the half inning flips.
        if half == 1:
            strikeout_state = (inn, 2, 1, 0)
        else:
            strikeout_state = (inn + 1, 1, 1, 0)
    else:
        strikeout_state = (inn, half, base, outs + 1)
    win_after_k = getWinexp(innWinexp, runExp, *strikeout_state, rdiff)

    ## changes if homerun
    runs_in = baseHr[base]
    rdiff_after_hr = rdiff - runs_in if half == 1 else rdiff + runs_in
    # Bases are cleared (base-state 1); outs are unchanged.
    win_after_hr = getWinexp(innWinexp, runExp, inn, half, 1, outs, rdiff_after_hr)

    return (abs(win_after_hr - win_after_k))/0.133

def rpgToInnWinexp(rpg):
    """Build both win-expectancy tables from a runs-per-game figure.

    ``rpg`` is converted to runs per inning (divided by 9) and fed through
    getRunsInn, getRunExp and getInnWinexp.

    Returns the tuple ``(innWinexp, runExp)``.
    """
    runs_per_inning = float(rpg)/9 ## r/inn
    runExp = getRunExp(runs_per_inning, getRunsInn(runs_per_inning))
    return getInnWinexp(runExp), runExp

def winnexp_feature(x):
    """Row-wise wrapper around getWinexp using the module-level tables.

    Reads 'inning', 'halfInning_index', 'base_state', 'outs_beg' and
    'run_diff' from ``x`` and passes them, in that order, together with the
    global ``innWinexp`` and ``runExp``.
    """
    situation = (x['inning'],
                 x['halfInning_index'],
                 x['base_state'],
                 x['outs_beg'],
                 x['run_diff'])
    return getWinexp(innWinexp, runExp, *situation)

##################################################################################################
## Functions for extracting and matching ejected player names and getting their playerId
##################################################################################################
# Need to map names to the players.csv or playerBoxScores playerIds
def find_closest_playerName(playerName, players):
    """Return the entry of ``players['playerName']`` that best matches ``playerName``.

    Every candidate is scored with ``fuzz.WRatio``; the first candidate that
    attains the highest score is returned.

    The scores are kept in a local Series.  Previously they were written to a
    ``fuzz_score`` column on the caller's DataFrame, which mutated a shared
    table on every call (and would leak the scratch column into later merges).

    Raises
    ------
    IndexError
        If ``players`` has no rows.
    """
    candidates = players['playerName']
    scores = pd.Series([fuzz.WRatio(playerName, candidate) for candidate in candidates],
                       dtype='float64')
    # Positional boolean mask, so a non-unique or non-default index is harmless.
    is_best = (scores == scores.max()).to_numpy()
    best_match = candidates[is_best].iloc[0]

    return best_match

def find_playerId(x, players, rosters_players):
    """Resolve the playerId for the name (and team/date) carried by ``x``.

    If exactly one row of ``players`` has ``x['playerName']``, its playerId is
    returned.  Otherwise the id is taken from the first row of
    ``rosters_players`` (the rosters df merged with the players df on
    playerId) that matches ``x`` on dailyDataDate, teamId and playerName.

    Raises IndexError when the roster lookup finds no row.
    """
    same_name = players.loc[players['playerName'] == x['playerName']]
    if len(same_name) == 1:
        return same_name['playerId'].iloc[0]

    # Ambiguous (or unknown) name: use the daily roster to find the player on
    # the matching team.
    on_date = rosters_players['dailyDataDate'] == x['dailyDataDate']
    on_team = rosters_players['teamId'] == x['teamId']
    has_name = rosters_players['playerName'] == x['playerName']
    return rosters_players.loc[on_date & on_team & has_name, 'playerId'].iloc[0]

##################################################################################################

# Set up win expectancy variables
# rpg is the runs-per-game input to rpgToInnWinexp.  The two resulting tables
# are module-level globals: winnexp_feature reads them when calling getWinexp.
rpg = 4.5
innWinexp, runExp = rpgToInnWinexp(rpg)

def game_score_james(x):
    '''
    Game Score formula (function name suggests the Bill James version)
    #     • Start with 50 points
    #     • Add 1 point for each out recorded (or 3 points per inning)
    #     • Add 2 points for each inning completed after the fourth
    #     • Add 1 additional point for every strikeout
    #     • Remove 2 points for each hit allowed
    #     • Remove 4 points for each earned run allowed
    #     • Remove 2 points for each unearned run allowed
    #     • Remove 1 point for each walk allowed (hit batters are counted as walks)

    x is any mapping of the pitching stat names used below to numbers: a
    dict, a DataFrame row, or a whole DataFrame (the arithmetic is
    element-wise, so a DataFrame yields a Series of scores).
    '''
    # Only *completed* innings beyond the fourth earn the bonus, and an outing
    # shorter than four innings must not be penalised.  The previous
    # 2*(inningsPitched - 4) subtracted points for short outings and gave
    # credit for fractional innings.
    innings_completed = np.floor(x['inningsPitched'])
    bonus_innings = np.maximum(innings_completed - 4, 0)

    score = 50
    score += x['outsPitching']
    score += 2*bonus_innings
    score += x['strikeOutsPitching']
    score -= 2*x['hitsPitching']
    score -= 4*x['earnedRuns']
    score -= 2*(x['runsPitching'] - x['earnedRuns'])
    score -= (x['baseOnBallsPitching']+x['hitByPitchPitching'])
    return score

    
def game_score_tango(x):
    '''
    Game Score formula (updated by Tom Tango)
    # • Start with 40 points
    # • Add 2 points for each out recorded (or 6 points per inning)
    # • Add 1 additional point for every strikeout
    # • Remove 2 points for each walk allowed (hit batters are counted as walks)
    # • Remove 2 points for each hit allowed
    # • Remove 3 points for each run allowed (earned or unearned)
    # • Remove 6 additional points for each home run allowed

    x is any mapping of the pitching stat names used below to numbers.
    '''
    # Single left-to-right expression: same operations, in the same order, as
    # accumulating term by term.
    return (40
            + 2*x['outsPitching']
            + x['strikeOutsPitching']
            - 2*(x['baseOnBallsPitching']+x['hitByPitchPitching'])
            - 2*x['hitsPitching']
            - 3*x['runsPitching']
            - 6*x['homeRunsPitching'])


In [None]:
# Pitching (and trailing fielding) stat column names, in a fixed order.
pitching_features = [
    # appearances and decisions
    'gamesPlayedPitching', 'gamesStartedPitching', 'completeGamesPitching',
    'shutoutsPitching', 'winsPitching', 'lossesPitching',
    # batted-ball outs
    'flyOutsPitching', 'airOutsPitching', 'groundOutsPitching',
    # results allowed
    'runsPitching', 'doublesPitching', 'triplesPitching', 'homeRunsPitching',
    'strikeOutsPitching', 'baseOnBallsPitching', 'intentionalWalksPitching',
    'hitsPitching', 'hitByPitchPitching', 'atBatsPitching',
    'caughtStealingPitching', 'stolenBasesPitching',
    # workload
    'inningsPitched', 'saveOpportunities', 'earnedRuns', 'battersFaced',
    'outsPitching', 'pitchesThrown', 'balls', 'strikes',
    # miscellany
    'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching', 'rbiPitching',
    'gamesFinishedPitching', 'inheritedRunners', 'inheritedRunnersScored',
    'catchersInterferencePitching', 'sacBuntsPitching', 'sacFliesPitching',
    'saves', 'holds', 'blownSaves',
    # fielding
    'assists', 'putOuts', 'errors', 'chances',
]

In [None]:
# Training table: read with datatable, then converted to pandas.
train = dt.fread("../input/mlb-player-digital-engagement-forecasting/train_updated.csv").to_pandas()

# With `test` set, the cut-off is the latest date present in train;
# otherwise it is fixed at 20210430.
test = True
last_date = train['date'].max() if test else 20210430

# Nested JSON columns expanded into flat tables.
eng = unnest(train, 'nextDayPlayerEngagement')
p_box_scores_og = unnest(train, 'playerBoxScores')

# Static reference tables.
teams = pd.read_csv("../input/mlb-player-digital-engagement-forecasting/teams.csv")
players = pd.read_csv("../input/mlb-player-digital-engagement-forecasting/players.csv")
awards_history = pd.read_csv("../input/mlb-player-digital-engagement-forecasting/awards.csv")
##################################################################################################
## Regex's for extracting and matching ejected player names and getting their playerId
##################################################################################################
# Short team names, plus "Diamondbacks" added by hand.
team_names = [*teams['teamName'].unique(), "Diamondbacks"]
team_regex = re.compile('|'.join(re.escape(label) for label in team_names))
# Full names are listed before short names, so they are tried first in the alternation.
team_full_names = [*teams['name'].unique(), *teams['teamName'].unique()]
team_full_regex = re.compile('|'.join(re.escape(label) for label in team_full_names))
# Staff titles in the spellings and capitalisations matched by coaching_regex.
# The order fixes which alternative is tried first at a given position.
coaching_names = [
    "Assistant Hitting Coach", "Manager", "Bench Coach", "Interim Manager",
    "Hitting Coach", "First Base Coach", "Pitching Coach", "bench caoch",
    "assistant hitting coach", "Third Base Coach", "catching coach",
    "field coordinator", "first base coach", "hitting coach",
    "major league coach", "manager", "pitching coach", "third base coach",
    "bench coach",
]
coaching_regex = re.compile('|'.join(re.escape(title) for title in coaching_names))

# Lower-case fielding position names matched by pos_regex.
positions = [
    'pitcher', 'catcher', 'first baseman', 'second baseman', 'third baseman',
    'shortstop', 'left fielder', 'center fielder', 'right fielder',
    'designated hitter',
]
pos_regex = re.compile('|'.join(re.escape(position) for position in positions))
##################################################################################################
# Column names handed to the unnest helpers.
colnames = [
    'games', 'rosters', 'playerBoxScores', 'teamBoxScores', 'transactions',
    'standings', 'awards', 'events',
    'playerTwitterFollowers', 'teamTwitterFollowers',
]


# Box-score stats tracked historically, split by role.
hitter_history_feats = [
    'hits', 'doubles', 'triples', 'homeRuns', 'rbi', 'totalBases',
    'plateAppearances', 'strikeOuts', 'baseOnBalls', 'hitByPitch',
    'atBats', 'sacFlies',
]
pitcher_history_feats = [
    'gamesPlayedPitching', 'gamesStartedPitching', 'inningsPitched',
    'pitchesThrown', 'winsPitching', 'runsPitching', 'homeRunsPitching',
    'strikeOutsPitching', 'earnedRuns', 'blownSaves', 'holds',
]
fielder_history_feats = ['errors']

# Award codes that are kept.
keep_awards = [
    'NLPOW', 'ALPOW', 'NLROM', 'ALROM', 'NLPOM', 'ALPOM',
    'NLRRELMON', 'ALRRELMON', 'ALPITOM', 'NLPITOM', 'MLBPLAYOW',
]
keep_annual_awards = [
    'ALMVP', 'NLMVP', 'ALCY', 'NLCY', 'ALROY', 'NLROY',
    'ALPG', 'NLPG', 'ALSS', 'NLSS', 'ALGG', 'NLGG',
]

##################################################################################################
## Mappings
##################################################################################################
# teamName -> team id; 'Diamondbacks' is pinned to 109 explicitly.
team_mapping = {**teams.set_index('teamName')['id'].to_dict(), 'Diamondbacks': 109}

# One row per distinct (playerId, playerName) pair seen in the box scores.
player_mapping = p_box_scores_og.loc[:, ['playerId', 'playerName']].drop_duplicates()

##################################################################################################

# Selected before the helper 'value' column is added to `players`.
pitchers = players.loc[players['primaryPositionName'] == "Pitcher"]

# One indicator column per birthCountry, one row per playerId.
players['value'] = 1
player_country_dummies = players.pivot_table(
    values='value',
    index=['playerId'],
    columns=['birthCountry'],
    aggfunc='sum',
    fill_value=0,
).reset_index()
player_country_dummies.columns = player_country_dummies.columns.str.replace(' ', '_')

def quantile_20(x):
    """Return the 20th percentile of ``x`` (``x.quantile(.20)``)."""
    return x.quantile(.20)


def quantile_80(x):
    """Return the 80th percentile of ``x`` (``x.quantile(.80)``)."""
    return x.quantile(.80)

# agg_list = ['median','var', quantile_20, quantile_80]
# player_aggs = eng.groupby('playerId')[['target1','target2','target3','target4']].agg({'target1': agg_list,
#                                                                                         'target2': agg_list,
#                                                                                         'target3': agg_list,
#                                                                                         'target4': agg_list}).round(6)
# player_aggs.columns = ["_".join(x) for x in player_aggs.columns.ravel()]
# player_aggs = player_aggs.reset_index()


# player_medians = eng.groupby('playerId')[['target1','target2','target3','target4']].median().round(6).reset_index()
# player_medians = player_medians.rename({'target1': 'target1_p_median',
#                                         'target2': 'target2_p_median',
#                                         'target3': 'target3_p_median',
#                                         'target4': 'target4_p_median'}, axis=1)

# player_variances = eng.groupby('playerId')[['target1','target2','target3','target4']].var().round(6).reset_index()
# player_variances = player_variances.rename({'target1': 'target1_p_var',
#                                         'target2': 'target2_p_var',
#                                         'target3': 'target3_p_var',
#                                         'target4': 'target4_p_var'}, axis=1)



# game_day_player_means = eng.merge(p_box_scores_og[['dailyDataDate','playerId', 'gamePk']], how='left')
# game_day_player_means['game_played'] = game_day_player_means['gamePk'].notnull().astype(int)
# off_day_player_means = game_day_player_means[game_day_player_means['game_played']==0].groupby(['playerId'])[['target1','target2','target3','target4']].mean().round(6).reset_index()
# off_day_player_means = off_day_player_means.rename({'target1': 'target1_p_mean_off_day',
#                                         'target2': 'target2_p_mean_off_day',
#                                         'target3': 'target3_p_mean_off_day',
#                                         'target4': 'target4_p_mean_off_day'}, axis=1)

# game_day_player_means = game_day_player_means[game_day_player_means['game_played']==1].groupby(['playerId'])[['target1','target2','target3','target4']].mean().round(6).reset_index()
# game_day_player_means = game_day_player_means.rename({'target1': 'target1_p_mean_game_day',
#                                         'target2': 'target2_p_mean_game_day',
#                                         'target3': 'target3_p_mean_game_day',
#                                         'target4': 'target4_p_mean_game_day'}, axis=1)

# game_day_player_vars = eng.merge(p_box_scores_og[['dailyDataDate','playerId']])
# game_day_player_vars = game_day_player_vars.groupby('playerId')[['target1','target2','target3','target4']].var().round(6).reset_index()
# game_day_player_vars = game_day_player_vars.rename({'target1': 'target1_p_var_game_day',
#                                         'target2': 'target2_p_var_game_day',
#                                         'target3': 'target3_p_var_game_day',
#                                         'target4': 'target4_p_var_game_day'}, axis=1)

In [None]:
# Month of each engagement row as YYYYMM.
eng['ddd_month'] = np.floor(eng.dailyDataDate/100).astype(int)

# NOTE(review): unique() keeps order of appearance; the windows below assume
# the months come out in ascending order -- confirm eng is date-ordered.
months = eng.ddd_month.unique()
## add one extra month to grab entire data set's TEs
# The extra month must be a valid YYYYMM label: after December roll over to
# January of the next year instead of producing e.g. 202013.
_last_month = months[months.size-1]
if _last_month % 100 < 12:
    _next_month = _last_month + 1
else:
    _next_month = (_last_month // 100 + 1) * 100 + 1
months = np.append(months, _next_month)

MAX_MONTH = months[months.size-1]

_TARGET_COLS = ['target1', 'target2', 'target3', 'target4']


def _player_target_stat(frame, stat, month, prefix=''):
    """Per-player ``stat`` ('median', 'var' or 'mean') of the four targets in ``frame``.

    Columns are renamed to ``<prefix>targetN_p_<stat>`` and a ``ddd_month``
    column holding ``month`` is added, so the table can be joined on
    (playerId, ddd_month).
    """
    grouped = frame.groupby('playerId')[_TARGET_COLS]
    table = getattr(grouped, stat)().round(6).reset_index()
    table = table.rename({col: f'{prefix}{col}_p_{stat}' for col in _TARGET_COLS}, axis=1)
    table['ddd_month'] = month
    return table


ttl_player_medians = []
ttl_player_variances = []
ttl_player_means = []
ttl_roll12_player_medians = []
ttl_roll12_player_variances = []
ttl_roll12_player_means = []
for i in range(months.size-1):
    # Statistics labelled `month` only use rows from strictly earlier months.
    month=months[i+1]
    roll_month = months[0]
    if i >11:
        # NOTE(review): [months[i-12], month) spans 13 calendar months, not 12.
        # Kept as-is so these features stay consistent with the game-day
        # aggregations that use the same window; confirm whether
        # months[i-11] was intended.
        roll_month = months[i-12]

    #print(str(month) + ' roll:' + str(roll_month))
    # Filter once per month instead of once per statistic.
    _history = eng[eng.ddd_month<month]
    _recent = _history[_history.ddd_month >=roll_month]

    player_medians = _player_target_stat(_history, 'median', month)
    roll12_player_medians = _player_target_stat(_recent, 'median', month, 'roll12_')

    player_variances = _player_target_stat(_history, 'var', month)
    roll12_player_variances = _player_target_stat(_recent, 'var', month, 'roll12_')

    player_means = _player_target_stat(_history, 'mean', month)
    roll12_player_means = _player_target_stat(_recent, 'mean', month, 'roll12_')

    ttl_player_medians.append(player_medians)
    ttl_player_variances.append(player_variances)
    ttl_player_means.append(player_means)
    ttl_roll12_player_medians.append(roll12_player_medians)
    ttl_roll12_player_variances.append(roll12_player_variances)
    ttl_roll12_player_means.append(roll12_player_means)


dt_player_medians = pd.concat(ttl_player_medians)
dt_player_variances = pd.concat(ttl_player_variances)
dt_player_means = pd.concat(ttl_player_means)
dt_roll12_player_medians = pd.concat(ttl_roll12_player_medians)
dt_roll12_player_variances = pd.concat(ttl_roll12_player_variances)
dt_roll12_player_means = pd.concat(ttl_roll12_player_means)

# Left-join everything onto the medians table, in a fixed column order.
dt_player_aggregations = dt_player_medians
for _extra in (dt_player_variances,
               dt_player_means,
               dt_roll12_player_medians,
               dt_roll12_player_variances,
               dt_roll12_player_means):
    dt_player_aggregations = dt_player_aggregations.merge(_extra,how="left",on=['playerId','ddd_month'])

In [None]:
# Number of non-null gamePk box-score rows per player and day.
played_game = (
    p_box_scores_og
    .groupby(['playerId', 'dailyDataDate'])['gamePk']
    .count()
    .reset_index(name='played_game')
)

# Engagement rows flagged 1 when the player has a box-score row that day,
# else 0 (counts above one are clipped to 1).
eng_box = eng.merge(played_game, how='left', on=['dailyDataDate', 'playerId'])
_played_flag = eng_box['played_game'].fillna(0).clip(upper=1)
eng_box['played_game'] = _played_flag

ttl_player_gameday_medians = []
ttl_player_gameday_variances = []
ttl_player_gameday_means = []
ttl_roll12_player_gameday_medians = []
# Per-month accumulators for the rolling ("roll12") variance and mean frames.
ttl_roll12_player_gameday_variances = []
ttl_roll12_player_gameday_means = []

_GAMEDAY_TARGETS = ['target1', 'target2', 'target3', 'target4']
_GAMEDAY_KEYS = ['playerId', 'played_game']


def _gameday_stat(frame, stat, month, prefix=''):
    """Reduce the four targets per (playerId, played_game) with `stat`.

    `stat` names a pandas GroupBy reduction ('median', 'var' or 'mean').
    The result is rounded to 6 decimals, its target columns are renamed to
    f'{prefix}targetN_p_gameday_{stat}' (e.g. 'roll12_target1_p_gameday_var')
    and every row is stamped with `month` in a `ddd_month` column.
    """
    grouped = frame.groupby(_GAMEDAY_KEYS)[_GAMEDAY_TARGETS]
    out = getattr(grouped, stat)().round(6).reset_index()
    out = out.rename(columns={t: f'{prefix}{t}_p_gameday_{stat}' for t in _GAMEDAY_TARGETS})
    out['ddd_month'] = month
    return out


# For each month (from the second one on) build features from strictly earlier
# months only, so a row never sees targets from its own month.
for i in range(months.size - 1):
    month = months[i + 1]
    # NOTE(review): for i > 11 the window months[i-12]..months[i] covers 13
    # distinct month values, not 12 -- confirm this is intended. Left as-is.
    roll_month = months[i - 12] if i > 11 else months[0]

    before_month = eng_box.ddd_month < month
    history = eng_box[before_month]
    recent = eng_box[before_month & (eng_box.ddd_month >= roll_month)]

    # All-history statistics.
    player_gameday_medians = _gameday_stat(history, 'median', month)
    player_gameday_variances = _gameday_stat(history, 'var', month)
    player_gameday_means = _gameday_stat(history, 'mean', month)

    # Same statistics restricted to the rolling window.
    roll12_player_gameday_medians = _gameday_stat(recent, 'median', month, prefix='roll12_')
    roll12_player_gameday_variances = _gameday_stat(recent, 'var', month, prefix='roll12_')
    roll12_player_gameday_means = _gameday_stat(recent, 'mean', month, prefix='roll12_')

    ttl_player_gameday_medians.append(player_gameday_medians)
    ttl_player_gameday_variances.append(player_gameday_variances)
    ttl_player_gameday_means.append(player_gameday_means)
    ttl_roll12_player_gameday_medians.append(roll12_player_gameday_medians)
    ttl_roll12_player_gameday_variances.append(roll12_player_gameday_variances)
    ttl_roll12_player_gameday_means.append(roll12_player_gameday_means)


# Stack the per-month frames, one long frame per statistic.
dt_player_gameday_medians = pd.concat(ttl_player_gameday_medians)
dt_player_gameday_variances = pd.concat(ttl_player_gameday_variances)
dt_player_gameday_means = pd.concat(ttl_player_gameday_means)
dt_roll12_player_gameday_medians = pd.concat(ttl_roll12_player_gameday_medians)
dt_roll12_player_gameday_variances = pd.concat(ttl_roll12_player_gameday_variances)
dt_roll12_player_gameday_means = pd.concat(ttl_roll12_player_gameday_means)

# Left-join everything onto the medians frame; the join order fixes the
# column order of the combined frame.
_gameday_merge_keys = ['playerId', 'ddd_month', 'played_game']
dt_player_game_aggregations = dt_player_gameday_medians
for _gameday_part in (dt_player_gameday_variances,
                      dt_player_gameday_means,
                      dt_roll12_player_gameday_medians,
                      dt_roll12_player_gameday_variances,
                      dt_roll12_player_gameday_means):
    dt_player_game_aggregations = dt_player_game_aggregations.merge(_gameday_part, how="left", on=_gameday_merge_keys)

In [None]:
# Per-position target statistics and position frequencies.
# Engagement rows are joined to the box-score position of each player's game;
# the merge has no explicit `on`, so pandas joins on the columns both frames share.
position_target_agg = eng.merge(p_box_scores_og[['dailyDataDate','playerId','gamePk','gameTimeUTC','positionType']], how='left')

# Rows where a player appears more than once on the same date (e.g. doubleheaders),
# ordered by start time; everything after the first game of each
# (date, player) pair is marked for removal.
dh_games = position_target_agg[position_target_agg[['dailyDataDate','playerId']].duplicated(keep=False)].sort_values('gameTimeUTC')[['dailyDataDate','playerId','gamePk']].reset_index(drop=True)
dh_last_game = dh_games[dh_games[['dailyDataDate','playerId']].duplicated(keep='first')] #games to remove
# NOTE(review): the two isin() tests are independent, so a row is dropped when its
# playerId and its gamePk each occur *anywhere* in dh_last_game, not only as a
# matching pair -- confirm this cannot remove a legitimate first game.
position_target_agg = position_target_agg[~(position_target_agg['playerId'].isin(dh_last_game['playerId']) & position_target_agg['gamePk'].isin(dh_last_game['gamePk']))]

# Share of rows per position type (missing positions bucketed under -999).
# This is the only definition of position_freq: an earlier value computed
# straight from p_box_scores_og was overwritten here before ever being read.
position_freq = position_target_agg['positionType'].fillna(-999).value_counts(normalize=True).to_dict()

_position_targets = ['target1', 'target2', 'target3', 'target4']
position_target_agg = (
    position_target_agg
    .groupby('positionType')[_position_targets]
    .agg({target: ['median', 'var'] for target in _position_targets})
    .round(6)
)
# Flatten the (target, stat) column MultiIndex to 'targetN_stat_position'.
# to_flat_index() yields the same tuples as the deprecated columns.ravel().
position_target_agg.columns = ["_".join(x + ('position',)) for x in position_target_agg.columns.to_flat_index()]
position_target_agg = position_target_agg.reset_index()

In [None]:
# Unnest the final training day and record which players satisfied the
# "position player pitching" condition on it.
last_day = train[train['date']==last_date]
eng, games, rosters, p_box_scores, t_box_scores, transactions, standings, awards, events, p_twitter, t_twitter = get_unnested_data(last_day, ['nextDayPlayerEngagement'] + colnames)
# eng_lag = eng.copy()
# eng_lag = eng[['playerId','target1','target2','target3','target4']].copy()
# eng_lag = eng_lag.rename({'target1': 'target1_lag',
#                 'target2': 'target2_lag',
#                 'target3': 'target3_lag',
#                 'target4': 'target4_lag'}, axis=1)
try:
    if not p_box_scores.empty:
        t_tmp = eng.merge(p_box_scores[['dailyDataDate','playerId','positionCode','pitchesThrown']], how='left', on=['dailyDataDate', 'playerId'])
        # Flag rows with positionCode > 1 that still threw at least one pitch.
        t_tmp['position_player_pitching'] = ((t_tmp['positionCode']>1) & (t_tmp['pitchesThrown']>0)).astype(int)
        prior_day_pos_player_pitching = t_tmp.loc[t_tmp['position_player_pitching']==1, ['playerId','position_player_pitching']].fillna(0)
    else:
        # No box scores for the last day: nobody is flagged.
        eng['position_player_pitching'] = 0
        prior_day_pos_player_pitching = eng[['playerId','position_player_pitching']]
except Exception as e:
    # Best effort: on any failure fall back to "nobody flagged" rather than abort.
    print(e)
    eng['position_player_pitching'] = 0
    prior_day_pos_player_pitching = eng[['playerId','position_player_pitching']]

# The fallback branches used to bind only the misspelled name below, leaving
# prior_day_pos_player_pitching undefined on those paths. Both names are now
# always bound; the misspelled one is kept as an alias for any code that reads it.
prior_data_pos_player_pitching = prior_day_pos_player_pitching

In [None]:
# Instead of using a specific date, find the two latest dates with twitter data available.
# The dates are de-duplicated and sorted explicitly so that "last" and
# "second last" do not depend on the row order of `train` or on each date
# appearing exactly once.
twitter_dates = train.loc[train['playerTwitterFollowers'].notnull(), 'date'].dropna().drop_duplicates().sort_values()
last_twitter_date = twitter_dates.iloc[-1]
second_last_twitter_date = twitter_dates.iloc[-2]
last_twitter_update = train[train['date']==last_twitter_date]
second_last_twitter_update = train[train['date']==second_last_twitter_date]

# Only the player-level frame (first returned item) is kept from each call.
p_twitter,_ = get_unnested_data(last_twitter_update, ['playerTwitterFollowers', 'teamTwitterFollowers'])
p_twitter_recent = p_twitter.copy()
p_twitter_second_last,_ = get_unnested_data(second_last_twitter_update, ['playerTwitterFollowers', 'teamTwitterFollowers'])

# Follower change between the two snapshots. Indexing both frames by playerId
# makes the subtraction align per player; a player missing from either
# snapshot gets NaN.
p_twitter_recent = p_twitter_recent.set_index("playerId")
p_twitter_second_last = p_twitter_second_last.set_index("playerId")
p_twitter_delta = (p_twitter_recent['numberOfFollowers'] - p_twitter_second_last['numberOfFollowers']).reset_index().rename(columns={'numberOfFollowers': 'numberOfFollowers_delta'})
# p_twitter_recent goes back to a plain column layout; p_twitter_second_last
# stays indexed by playerId.
p_twitter_recent = p_twitter_recent.reset_index()

In [None]:
# games = unnest(train, 'games')
# schedule_21 = pd.read_csv("../input/mlbdata/schedule_2021.csv")
# schedule_21['gameDate'] = pd.to_datetime(schedule_21['gameDate'])
# games['gameDate'] = pd.to_datetime(games['gameDate'])
# games = games.sort_values('gameDate')
# schedule = pd.concat([games[['dailyDataDate', 'homeId', 'gameDate']].rename({'homeId': 'teamId'}, axis=1),
#                       games[['dailyDataDate', 'awayId', 'gameDate']].rename({'awayId': 'teamId'}, axis=1)])
# schedule = schedule[schedule['dailyDataDate']<20210401]
# schedule = pd.concat([schedule, schedule_21[['dailyDataDate','teamId','gameDate']]])
# schedule['gameDate'] = pd.to_datetime(schedule['gameDate'])

# all_dates = pd.DataFrame(list(itertools.product(pd.date_range(start="2018-01-01", end="2021-12-31"), schedule['teamId'].unique())), columns=['gameDate', 'teamId'])
# all_dates = all_dates.merge(schedule, how='outer', on=['gameDate','teamId'])
# all_dates = all_dates.sort_values(['teamId','gameDate']).drop_duplicates()
# all_dates['dailyDataDate_lead'] = all_dates.groupby('teamId')['dailyDataDate'].shift(1)
# all_dates = all_dates[all_dates['dailyDataDate_lead'].notnull()].reset_index(drop=True)
# all_dates['nextDayGame'] = 1

In [None]:
# Historical award counts per player: {playerId: {awardId: count}}, restricted
# to the awards listed in keep_awards + keep_annual_awards.
# (A previous groupby().size().to_dict() assignment to awards_dict, keyed by
# (playerId, awardId) tuples, was discarded immediately and has been removed.)
_kept_awards = awards_history.loc[awards_history['awardId'].isin(keep_awards + keep_annual_awards), ['awardId','playerId']]
awards_dict_tmp = {n: grp.to_dict('list') for n, grp in _kept_awards.groupby('playerId')}

awards_dict = {}
for k, v in awards_dict_tmp.items():
    # np.unique returns the distinct award ids alongside how often each occurs.
    award_ids, award_counts = np.unique(v['awardId'], return_counts=True)
    awards_dict[k] = dict(zip(award_ids, award_counts))

# Running per-player stat histories: {playerId: {feature: [values...]}}.
# Values are appended in the order the training rows are iterated.
hitter_history_dict = {}
fielder_history_dict = {}
pitcher_history_dict = {}


def _extend_history(history, daily_frame, feats):
    """Fold one day's rows into a running per-player history dict.

    `daily_frame` must contain a `playerId` column. It is split per player
    into {column: list} dicts; a player seen for the first time gets that
    dict as their history, otherwise the lists named in `feats` are extended
    in place. Returns the day's {playerId: {column: list}} mapping.
    """
    daily = {n: grp.to_dict('list') for n, grp in daily_frame.groupby('playerId')}
    for player_id, stats in daily.items():
        if player_id not in history:
            history[player_id] = stats
        else:
            for feat in feats:
                history[player_id][feat].extend(stats[feat])
    return daily


for i, data in tqdm(train[train['date']<=last_date].iterrows()):
    # Reset per day: if unnesting fails below, `awards` must not still hold the
    # previous day's frame, otherwise those awards would be counted twice.
    awards = pd.DataFrame()
    try:
        data = data.to_frame().T
        daily_data_date = data['date'].iloc[0]
        season = int(str(daily_data_date)[:4])
        p_box_scores, games, rosters, awards = get_unnested_data(data, ['playerBoxScores', 'games', 'rosters', 'awards'])

        # Carry the last known rosters forward over days without roster data.
        if rosters.empty:
            rosters = prior_day_rosters

        prior_day_rosters = rosters.copy()

        if not games.empty:
            games_filtered = games.loc[games['gameType'].isin(["R", "F","D","L","W","C","P"]) & ~games['detailedGameState'].isin(["Postponed"])]
            if not games_filtered.empty:
                # One schedule row per (game, team): home and away sides stacked.
                schedule_day = pd.concat([games_filtered[['dailyDataDate', 'gamePk','homeId', 'gameDate', 'gameTimeUTC', 'homeWinner']].rename({'homeId': 'teamId', 'homeWinner': 'winner'}, axis=1),
                                  games_filtered[['dailyDataDate', 'gamePk','awayId', 'gameDate', 'gameTimeUTC','awayWinner']].rename({'awayId': 'teamId', 'awayWinner': 'winner'}, axis=1)])

                schedule_day = schedule_day.sort_values('gameTimeUTC')

                if not schedule_day.empty and not p_box_scores.empty:
                    game_rosters = schedule_day.merge(rosters, how='left', on=['gameDate','teamId'])
                    game_rosters = game_rosters[game_rosters['playerId'].notnull()] #missing roster for Nationals 20200910
                    game_rosters['playerId'] = game_rosters['playerId'].astype(int)
                    p_box_scores = p_box_scores.sort_values("gameTimeUTC")
                    p_box_scores['gameDate'] = pd.to_datetime(p_box_scores['gameDate'])
                    p_box_scores['season'] = p_box_scores['gameDate'].dt.year
                    player_history_daily = game_rosters.merge(p_box_scores, how='left', on=['gamePk', 'playerId'])
                    player_history_daily['gameTimeUTC_y'] = player_history_daily['gameTimeUTC_y'].fillna(player_history_daily['gameTimeUTC_x'])
                    # NOTE: dailyDataDate==2020918 gamePk==631122 Start time of 2020-09-18T03:33:00Z is not accurate; that would imply the game started the day before at ~11:30PM local time
                    player_history_daily = player_history_daily.sort_values(['playerId','gameTimeUTC_y']) # SORT BY gameTimeUTC from p_box_scores. `gameTimeUTC` is not accurate from the `games` data
                    player_history_daily[hitter_history_feats] = player_history_daily[hitter_history_feats].fillna(0)

                    # For hitters, only use games they played in: no zero rows are
                    # added for hitters missing from the daily box scores.
                    hitter_history_tmp = _extend_history(
                        hitter_history_dict,
                        player_history_daily[hitter_history_feats + ['season', 'playerId']],
                        hitter_history_feats + ['season'],
                    )
                    fielder_history_tmp = _extend_history(
                        fielder_history_dict,
                        player_history_daily[fielder_history_feats + ['season', 'playerId']],
                        fielder_history_feats + ['season'],
                    )
                    pitcher_history_tmp = _extend_history(
                        pitcher_history_dict,
                        p_box_scores.loc[p_box_scores['positionName']=='Pitcher', pitcher_history_feats + ['season', 'playerId']],
                        pitcher_history_feats + ['season'],
                    )
                    # Pitchers need off days filled in because it's important to account for rest/off days:
                    # append a 0 row (stamped with the current season) for every known
                    # pitcher who isn't in today's box scores.
                    for k, v in pitcher_history_dict.items():
                        if k not in pitcher_history_tmp:
                            for feat in pitcher_history_feats + ['season']:
                                v[feat].append(season if feat=='season' else 0.0)
    except Exception as e:
        # If fails, just move on to the next day
        print(f"history dicts loop failed: {e}")

    try:
        if not awards.empty:

            awards_filtered = awards[awards['awardId'].isin(keep_awards + keep_annual_awards)].reset_index(drop=True)

            # Update awards counts
            awards_dict_tmp = {n: grp.to_dict('list') for n, grp in awards_filtered[['awardId','playerId']].groupby('playerId')}
            for k, v in awards_dict_tmp.items():
                try:
                    player_awards = awards_dict.setdefault(k, {})
                    counts = np.unique(v['awardId'], return_counts=True)
                    for feat, value in zip(counts[0], counts[1]):
                        player_awards[feat] = player_awards.get(feat, 0) + value
                except Exception as e:
                    # If fails, move on to the next player, but leave a trace
                    print(f"awards update failed for player {k}: {e}")
    except Exception as e:
        # If fails, don't worry about updating dict
        print(e)

In [None]:
# Build the historical schedule (one row per game and team) and derive each
# team's result history and current streak from it.
games_og = unnest(train, 'games')

# Keep the listed game types and drop postponed games.
_is_counted_type = games_og['gameType'].isin(["R", "F","D","L","W","C","P"])
_is_postponed = games_og['detailedGameState'].isin(["Postponed"])
_schedule_rows = _is_counted_type & ~_is_postponed

_home_side = games_og.loc[_schedule_rows, ['dailyDataDate', 'gamePk','homeId', 'gameDate', 'gameTimeUTC', 'homeWinner']]
_home_side = _home_side.rename(columns={'homeId': 'teamId', 'homeWinner': 'winner'})
_away_side = games_og.loc[_schedule_rows, ['dailyDataDate', 'gamePk','awayId', 'gameDate', 'gameTimeUTC','awayWinner']]
_away_side = _away_side.rename(columns={'awayId': 'teamId', 'awayWinner': 'winner'})

schedule_og = pd.concat([_home_side, _away_side])
schedule_og = schedule_og.sort_values('gameTimeUTC')
schedule_og = schedule_og[schedule_og['dailyDataDate']<=last_date]
schedule_og['gameDate'] = pd.to_datetime(schedule_og['gameDate'])

# Per-team list of `winner` values in start-time order. The history starts
# empty and groupby keys are unique, so it is seeded directly with the same
# list objects.
team_win_dict = schedule_og.groupby("teamId")['winner'].apply(list).to_dict()
team_win_history = dict(team_win_dict)

# Streak = number of most recent results after the last entry equal to 0,
# or the full history length when no entry equals 0.
win_streaks = {}
for _team_id, _results in team_win_history.items():
    if 0 in _results:
        win_streaks[_team_id] = _results[::-1].index(0)
    else:
        win_streaks[_team_id] = len(_results)

In [None]:
# Load the pre-trained boosters, one per target (target1..target4) for each model family.
def _load_lgb(path):
    """Return a LightGBM Booster loaded from the model file at `path`."""
    return lgb.Booster(model_file=path)


def _load_xgb(path):
    """Return an XGBoost Booster created empty and then loaded from `path`."""
    booster = xgb.Booster()
    booster.load_model(path)
    return booster


# LightGBM "v30" models.
lgb_target1 = _load_lgb("../input/d/brandenkmurray/mlbmodels/lgb_target1_v30_full.txt")
lgb_target2 = _load_lgb("../input/d/brandenkmurray/mlbmodels/lgb_target2_v30_full.txt")
lgb_target3 = _load_lgb("../input/d/brandenkmurray/mlbmodels/lgb_target3_v30_full.txt")
lgb_target4 = _load_lgb("../input/d/brandenkmurray/mlbmodels/lgb_target4_v30_full.txt")

# LightGBM "dblsqrt" models.
lgb_dblsqrt_target1 = _load_lgb("../input/d/brandenkmurray/mlbmodels/lgb_target1_dblsqrt_full.txt")
lgb_dblsqrt_target2 = _load_lgb("../input/d/brandenkmurray/mlbmodels/lgb_target2_dblsqrt_full.txt")
lgb_dblsqrt_target3 = _load_lgb("../input/d/brandenkmurray/mlbmodels/lgb_target3_dblsqrt_full.txt")
lgb_dblsqrt_target4 = _load_lgb("../input/d/brandenkmurray/mlbmodels/lgb_target4_dblsqrt_full.txt")

# LightGBM "v30 bfa" models.
lgb_bfa_target1 = _load_lgb("../input/d/brandenkmurray/mlbmodels/lgb_target1_v30_bfa_full.txt")
lgb_bfa_target2 = _load_lgb("../input/d/brandenkmurray/mlbmodels/lgb_target2_v30_bfa_full.txt")
lgb_bfa_target3 = _load_lgb("../input/d/brandenkmurray/mlbmodels/lgb_target3_v30_bfa_full.txt")
lgb_bfa_target4 = _load_lgb("../input/d/brandenkmurray/mlbmodels/lgb_target4_v30_bfa_full.txt")

# XGBoost "v30" models.
xgb_target1 = _load_xgb("../input/d/brandenkmurray/mlbmodels/xgb_target1_v30_full.txt")
xgb_target2 = _load_xgb("../input/d/brandenkmurray/mlbmodels/xgb_target2_v30_full.txt")
xgb_target3 = _load_xgb("../input/d/brandenkmurray/mlbmodels/xgb_target3_v30_full.txt")
xgb_target4 = _load_xgb("../input/d/brandenkmurray/mlbmodels/xgb_target4_v30_full.txt")

# LightGBM "dubs_tripsX" models.
lgb_john_target1 = _load_lgb("../input/mlb-models-and-files/lgb_target1_dubs_tripsX_all.txt")
lgb_john_target2 = _load_lgb("../input/mlb-models-and-files/lgb_target2_dubs_tripsX_all.txt")
lgb_john_target3 = _load_lgb("../input/mlb-models-and-files/lgb_target3_dubs_tripsX_all.txt")
lgb_john_target4 = _load_lgb("../input/mlb-models-and-files/lgb_target4_dubs_tripsX_all.txt")

# LightGBM "dart" models.
lgb_dart_target1 = _load_lgb("../input/dart-model/lgb_target1_dubs_trips_dart_full_data.txt")
lgb_dart_target2 = _load_lgb("../input/dart-model/lgb_target2_dubs_trips_dart_full_data.txt")
lgb_dart_target3 = _load_lgb("../input/dart-model/lgb_target3_dubs_trips_dart_full_data.txt")
lgb_dart_target4 = _load_lgb("../input/dart-model/lgb_target4_dubs_trips_dart_full_data.txt")

In [None]:
yesterday = pd.DataFrame()
t  = []
sub_list = []
env = mlb.make_env() # initialize the environment
iter_test = env.iter_test() # iterator which loops over each date in test set
for i, (data, sub) in enumerate(iter_test):
# for i, (i2, data) in enumerate(train[(train['date']>=20210501) & (train['date']<=20210731)].iloc[1:].iterrows()):

    ### REMOVE below
#     data = data.to_frame().T
#     sub = unnest(data, 'nextDayPlayerEngagement')
#     sub = sub.rename(columns={'target1': 'target1_act', 'target2': 'target2_act', 'target3': 'target3_act', 'target4': 'target4_act'})
#     sub['date_playerId'] = pd.to_datetime(sub['engagementMetricsDate']).dt.strftime("%Y%m%d") + "_" + sub['playerId'].astype(str)
    ### REMOVE above
    ### UNCOMMENT BELOW
    sub = sub.reset_index()
    sub = sub.rename({'date': 'dailyDataDate'}, axis=1)
    sub['playerId'] = sub['date_playerId'].apply(lambda x: int(x.split("_")[1]))
    
    data = data.reset_index()
    data = data.rename({'index': 'date'}, axis=1)
    ### UNCOMMENT ABOVE
    try:
        season = int(str(data['date'].iloc[0])[:4])
    except:
        season = 2021.0

    try:
        games, rosters, p_box_scores, t_box_scores, transactions, standings, awards, events, p_twitter, t_twitter = get_unnested_data(data, colnames)

        eng_shape = sub.shape
        t_tmp = sub.copy()

        if rosters.empty:
            rosters = prior_day_rosters

        prior_day_rosters = rosters.copy()

        if not p_twitter.empty:
            # Get twitter follower delta if not the first month
            if not p_twitter_recent.empty:
                p_twitter = p_twitter.set_index("playerId")
                p_twitter_recent = p_twitter_recent.set_index("playerId")
                p_twitter_delta = (p_twitter['numberOfFollowers'] - p_twitter_recent['numberOfFollowers']).reset_index().rename(columns={'numberOfFollowers': 'numberOfFollowers_delta'})
                p_twitter = p_twitter.reset_index()
            p_twitter_recent = p_twitter


        if not games.empty:
            schedule_daily = pd.concat([games.loc[games['gameType'].isin(["R", "F","D","L","W","C","P"]) & ~games['detailedGameState'].isin(["Postponed"]),['dailyDataDate', 'gamePk','homeId', 'gameDate', 'gameTimeUTC', 'homeWinner']].rename({'homeId': 'teamId', 'homeWinner': 'winner'}, axis=1),
                              games.loc[games['gameType'].isin(["R", "F","D","L","W","C","P"]) & ~games['detailedGameState'].isin(["Postponed"]), ['dailyDataDate', 'gamePk','awayId', 'gameDate', 'gameTimeUTC','awayWinner']].rename({'awayId': 'teamId', 'awayWinner': 'winner'}, axis=1)])

            schedule_daily = schedule_daily.sort_values('gameTimeUTC')
            team_win_dict = schedule_daily.groupby("teamId")['winner'].apply(list).to_dict()
            for k,v in team_win_dict.items():
                if not k in team_win_history:
                    team_win_history[k] = v
                else:
                    team_win_history[k].extend(v)

            win_streaks = {k: v[::-1].index(0) if 0 in v else len(v) for k, v in team_win_history.items()}

            if not schedule_daily.empty and not p_box_scores.empty:  
                game_rosters = schedule_daily.merge(rosters, how='left', on=['gameDate','teamId'])
                game_rosters = game_rosters[game_rosters['playerId'].notnull()] #missing roster for Nationals 20200910
                game_rosters['playerId'] = game_rosters['playerId'].astype(int)
                p_box_scores = p_box_scores.sort_values("gameTimeUTC")
                p_box_scores['gameDate'] = pd.to_datetime(p_box_scores['gameDate'])
                p_box_scores['season'] = p_box_scores['gameDate'].dt.year
                player_history_daily = game_rosters.merge(p_box_scores, how='left', on=['gamePk', 'playerId']) 
                player_history_daily['gameTimeUTC_y'] = player_history_daily['gameTimeUTC_y'].fillna(player_history_daily['gameTimeUTC_x'])
                # NOTE: dailyDataDate==2020918 gamePk==631122 Start time of 2020-09-18T03:33:00Z is not accurate; that would imply the game started the day before at ~11:30PM local time
                player_history_daily = player_history_daily.sort_values(['playerId','gameTimeUTC_y']) # SORT BY gameTimeUTC from p_box_scores. `gameTimeUTC` is not accurate from the `games` data
                player_history_daily[hitter_history_feats] = player_history_daily[hitter_history_feats].fillna(0)


                hitter_history_tmp = {n: grp.to_dict('list') for n, grp in player_history_daily[hitter_history_feats + ['season', 'playerId']].groupby('playerId')}
                for k,v in hitter_history_tmp.items():
                    if not k in hitter_history_dict:
                        hitter_history_dict[k] = v
                    else:
                        for feat in hitter_history_feats + ['season']:
                            hitter_history_dict[k][feat].extend(v[feat])
                # For hitters, only use games they played in. Pitchers need off days filled in because it's important to account for rest/off days
                # Fill in days with 0 if hitter isn't in daily box scores
                # for k,v in hitter_history_dict.items():
                #     if not k in hitter_history_tmp:
                #         for feat in hitter_history_feats + ['season']:
                #             hitter_history_dict[k][feat].append(season if feat=='season' else 0.0)
                fielder_history_tmp = {n: grp.to_dict('list') for n, grp in player_history_daily[fielder_history_feats + ['season', 'playerId']].groupby('playerId')}
                for k,v in fielder_history_tmp.items():
                    if not k in fielder_history_dict:
                        fielder_history_dict[k] = v
                    else:
                        for feat in fielder_history_feats + ['season']:
                            fielder_history_dict[k][feat].extend(v[feat])

                pitcher_history_tmp = {n: grp.to_dict('list') for n, grp in p_box_scores.loc[p_box_scores['positionName']=='Pitcher', pitcher_history_feats + ['season', 'playerId']].groupby('playerId')}
                for k,v in pitcher_history_tmp.items():
                    if not k in pitcher_history_dict:
                        pitcher_history_dict[k] = v
                    else:
                        for feat in pitcher_history_feats + ['season']:
                            pitcher_history_dict[k][feat].extend(v[feat])
                # Fill in days with 0 if pitcher isn't in daily box scores
                for k,v in pitcher_history_dict.items():
                    if not k in pitcher_history_tmp:
                        for feat in pitcher_history_feats + ['season']:
                            pitcher_history_dict[k][feat].append(season if feat=='season' else 0.0)

        days_of_history = list(range(2,21)) #[2,3,4,5,7,10,20] #also could be games_of_history depending how its used
        max_days_of_history = np.max(days_of_history)
        hitting_history_features = {}
        pitching_history_features = {}
        fielding_history_features = {}

        for k, v in hitter_history_dict.items():
            # only need to include players in the current eng
            hitting_history_features[k] = {} 
            hitting_history_features[k]['hit_streak'] =  v['hits'][::-1].index(0) if 0 in v['hits'] else len(v['hits'])       
            for feat in hitter_history_feats:            
                d = hitter_history_dict[k][feat]
                hitting_history_features[k][f'{feat}_season'] = sum([f for seas, f in zip(hitter_history_dict[k]['season'], d) if seas==season])
                if feat not in ['sacFlies','atBats']:
                    d_padded = np.zeros(max_days_of_history)
                    d_padded[:np.minimum(max_days_of_history, len(d))] = d[-np.minimum(max_days_of_history, len(d)):][::-1]
                    d_cumsum = nb_cumsum(d_padded)
                    for day in days_of_history:
                        hitting_history_features[k][f'{feat}_last{day}'] = d_cumsum[day-1]
        #                 hitting_history_features[k][f'{feat}_{day-1}_games_ago'] = d_padded[day-1]

        hitting_history_df = pd.DataFrame.from_dict(hitting_history_features, orient='index').reset_index().rename({'index': 'playerId'}, axis=1)
        if 'homeRuns_season' in hitting_history_df.columns:
            hitting_history_df['homeRuns_rank'] = hitting_history_df['homeRuns_season'].rank(method='min', ascending=False)
            hitting_history_df['BA'] = hitting_history_df['hits_season'] / hitting_history_df['atBats_season']
            hitting_history_df['OBP'] = hitting_history_df[['hits_season','baseOnBalls_season', 'hitByPitch_season']].sum(axis=1) / hitting_history_df[['atBats_season','baseOnBalls_season', 'hitByPitch_season', 'sacFlies_season']].sum(axis=1)
            hitting_history_df['SLG'] = ((hitting_history_df['hits_season'] - hitting_history_df[['doubles_season','triples_season','homeRuns_season']].sum(axis=1)) + 2*hitting_history_df['doubles_season'] + 3*hitting_history_df['triples_season'] + 4*hitting_history_df['homeRuns_season'])/ hitting_history_df['atBats_season']

        for k, v in fielder_history_dict.items():
            # only need to include players in the current eng
            fielding_history_features[k] = {} 
            for feat in fielder_history_feats:    
                d = fielder_history_dict[k][feat]
                d_padded = np.zeros(max_days_of_history)
                d_padded[:np.minimum(max_days_of_history, len(d))] = d[-np.minimum(max_days_of_history, len(d)):][::-1]
                # d_padded = np.pad(d[-days_of_history:], (np.maximum(0, days_of_history-len(d)+1), 0))[::-1]
                d_cumsum = nb_cumsum(d_padded)
                for day in days_of_history:
    #                 fielding_history_features[k][f'{feat}_last{day}'] = d_cumsum[day-1]
                    fielding_history_features[k][f'{feat}_{day-1}_games_ago'] = d_padded[day-1]

        fielding_history_df = pd.DataFrame.from_dict(fielding_history_features, orient='index').reset_index().rename({'index': 'playerId'}, axis=1)

        for k, v in pitcher_history_dict.items():
            # only need to include players in the current eng
            pitching_history_features[k] = {}            
            season_starts = [starts  for seas, starts in zip(pitcher_history_dict[k]['season'], pitcher_history_dict[k]['gamesStartedPitching']) if seas==season]
            season_played = [played  for seas, played in zip(pitcher_history_dict[k]['season'], pitcher_history_dict[k]['gamesPlayedPitching']) if seas==season]
            pitching_history_features[k]['season_starts_to_date'] = sum(season_starts)
            pitching_history_features[k]['days_since_last_start'] = season_starts[::-1].index(1.0) if 1 in season_starts else len(season_starts)
            pitching_history_features[k]['days_since_last_played'] = season_played[::-1].index(1.0) if 1 in season_played else len(season_played)
            for feat in ['gamesPlayedPitching', 'gamesStartedPitching','inningsPitched', 'pitchesThrown', 'winsPitching', 'runsPitching', 'homeRunsPitching', 'strikeOutsPitching','earnedRuns', 'blownSaves', 'holds']:    
                d = pitcher_history_dict[k][feat]
                pitching_history_features[k][f'{feat}_season'] = sum([f for seas, f in zip(pitcher_history_dict[k]['season'], d) if seas==season])
                d_padded = np.pad(d, (np.maximum(0, max_days_of_history-len(d)), 0))[::-1]
                d_cumsum = nb_cumsum(d_padded)
                for day in days_of_history:
                    pitching_history_features[k][f'{feat}_last{day}'] = d_cumsum[day-1]
                    pitching_history_features[k][f'{feat}_{day-1}_games_ago'] = d_padded[day-1]

        pitching_history_df = pd.DataFrame.from_dict(pitching_history_features, orient='index').reset_index().rename({'index': 'playerId'}, axis=1)


        if not p_twitter.empty:
            p_twitter_recent = p_twitter

        # How to handle doubleheaders? Taking stats from first game for now
        if not p_box_scores.empty and not t_box_scores.empty:
            t_tmp = t_tmp.merge(p_box_scores, how='left', on=['dailyDataDate', 'playerId'])
            dh_games = t_tmp[t_tmp[['dailyDataDate','playerId']].duplicated(keep=False)].sort_values('gameTimeUTC')[['dailyDataDate','playerId','gamePk']].reset_index(drop=True)
            dh_last_game = dh_games[dh_games[['dailyDataDate','playerId']].duplicated(keep='first')] #games to remove
            t_tmp = t_tmp[~(t_tmp['playerId'].isin(dh_last_game['playerId']) & t_tmp['gamePk'].isin(dh_last_game['gamePk']))]

            t_tmp['game_score_james'] = game_score_james(t_tmp)
            t_tmp['game_score_tango'] = game_score_tango(t_tmp)
            t_tmp['position_player_pitching'] = ((t_tmp['positionCode']>1) & (t_tmp['pitchesThrown']>0)).astype(int)
            t_tmp['pitcher_hit_home_run'] = ((t_tmp['positionCode']==1) & (t_tmp['homeRuns'] > 0)).astype(int)
    #         t_tmp['pos_player_pitched_prior_day'] = 0
    #         if not prior_day_pos_player_pitching.empty:
    #             t_tmp['pos_player_pitched_prior_day'] = t_tmp['playerId'].map(dict(zip(prior_day_pos_player_pitching.playerId, prior_day_pos_player_pitching.position_player_pitching)))

            t_tmp['no_hitter'] = ((t_tmp['inningsPitched']>=9) & (t_tmp['hitsPitching']==0)).astype(int)
            t_tmp['no_hitter_league'] = t_tmp['no_hitter'].max()

            t_tmp['position_player_pitching_league'] = t_tmp['position_player_pitching'].max()
            t_tmp['game_hour'] = (pd.to_datetime(t_tmp['gameTimeUTC']) + pd.Timedelta(hours=-5)).dt.hour

            t_tmp = t_tmp.merge(t_box_scores, how='left', on=['gamePk', 'teamId'], suffixes=['','_team_box_score'])
            t_tmp['positionType_freq'] = t_tmp['positionType'].fillna(-999).map(position_freq)
            if 'positionType' in t_tmp.columns:
                t_tmp = t_tmp.merge(position_target_agg, how='left', on='positionType')

            if t_tmp.shape[0]!=eng_shape[0]:
                print("t_tmp length does not match engagement frame length, check for duplicated data")
                t_tmp = t_tmp[~t_tmp[['playerId']].duplicated()]
        else:
            if 'teamId' not in t_tmp.columns and not rosters.empty:
                t_tmp = t_tmp.merge(rosters[['playerId','teamId']], how='left', on='playerId')
            else:
                t_tmp['teamId'] = np.nan
            if t_tmp.shape[0]!=eng_shape[0]:
                print("teamId: t_tmp length does not match engagement frame length, check for duplicated data")
                t_tmp = t_tmp[~t_tmp[['playerId']].duplicated()]

        # Did player have a walk-off hit/home run?
        if not events.empty:
            # Order plays by inning, half-inning, at-bat and event so the last row per game is
            # its final play. halfInning sorts descending so 'top' precedes 'bottom'.
            events = events.sort_values(['inning','halfInning', 'atBatIndex', 'eventId'], ascending=[True, False, True, True])
            last_play = events.groupby('gamePk').tail(1)
            # filter out top of inning because one game was ended after the top of the inning
            is_walk_off = (last_play['halfInning']=='bottom') & (last_play['rbi']>0)
            # Explicit copy: the frame is renamed and mutated below, and doing that on a
            # slice of a slice triggers pandas' chained-assignment warning.
            walk_offs = last_play.loc[is_walk_off, ['dailyDataDate','hitterId', 'pitcherId','rbi', 'event']].copy()
            walk_offs.columns = ['dailyDataDate','hitterId', 'pitcherId','walk_off_rbi', 'walk_off_hr']
            # The event text becomes a 0/1 flag: was the final play a home run?
            walk_offs['walk_off_hr'] = walk_offs['walk_off_hr'].isin(['Home Run']).astype(int)

            # Attach once from the hitter's side and once from the pitcher's side
            # (the pitcher's columns get the "_pitcher" suffix).
            hitter_walk_offs = walk_offs[['dailyDataDate','hitterId', 'walk_off_hr', 'walk_off_rbi']].rename({'hitterId': 'playerId'}, axis=1)
            pitcher_walk_offs = walk_offs[['dailyDataDate','pitcherId', 'walk_off_hr', 'walk_off_rbi']].rename({'pitcherId': 'playerId'}, axis=1)
            t_tmp = t_tmp.merge(hitter_walk_offs, how='left', on=['dailyDataDate','playerId'])
            t_tmp = t_tmp.merge(pitcher_walk_offs, how='left', on=['dailyDataDate','playerId'], suffixes=["","_pitcher"])
            walk_off_cols = ['walk_off_rbi', 'walk_off_hr', 'walk_off_hr_pitcher','walk_off_rbi_pitcher']
            t_tmp[walk_off_cols] = t_tmp[walk_off_cols].fillna(0)
            # Same value on every row: the largest walk-off RBI count of the day.
            t_tmp['walk_off_league'] = t_tmp['walk_off_rbi'].max()

            # Longest / hardest-hit home run per hitter; the home-run filter is evaluated once.
            home_runs = events[events['event']=='Home Run']
            hr_dist = home_runs.groupby('hitterId')['totalDistance'].max().reset_index()
            hr_launchSpeed = home_runs.groupby('hitterId')['launchSpeed'].max().reset_index()
            t_tmp = t_tmp.merge(hr_dist.rename({'hitterId': 'playerId'}, axis=1), how='left', on='playerId')
            t_tmp = t_tmp.merge(hr_launchSpeed.rename({'hitterId': 'playerId'}, axis=1), how='left', on='playerId')

            # How long did a starting pitcher go without a hit? (Did they start picking up potential no-hitter hype?)
            starters = events[events['isStarter']==1].reset_index(drop=True)
            starters['hit'] = starters['event'].isin(['Single','Double', 'Triple', 'Home Run']).astype(int)
            # Running hit count per pitcher, in the current row order of `events`.
            starters['hits_cumsum'] = starters.groupby('pitcherId')['hit'].cumsum()
            # Rows with hits_cumsum == 1 begin at the pitcher's first hit; groupby().first()
            # then takes the first non-null inning/outs per pitcher from those rows.
            starters_first_hit_inning = starters[starters['hits_cumsum']==1].groupby('pitcherId').first()[['inning','outs']].reset_index()
            # Encode outs as a decimal digit of the inning (e.g. inning 5 with 2 outs -> 5.2).
            starters_first_hit_inning['inning'] = starters_first_hit_inning['inning'] + starters_first_hit_inning['outs']/10
            starters_first_hit_inning = starters_first_hit_inning.rename({'inning': 'pitcher_first_hit_inning'}, axis=1)
            # left_on/right_on use different key names, so the right key column `pitcherId`
            # is carried into t_tmp as well.
            t_tmp = t_tmp.merge(starters_first_hit_inning[['pitcherId', 'pitcher_first_hit_inning']], how='left', left_on='playerId', right_on='pitcherId')

            # Same computation for the first plate appearance with any runner on base
            # (menOnBase is neither None nor "Empty").
            starters_first_mob_inning = starters[~starters['menOnBase'].isin([None,"Empty"])]
            starters_first_mob_inning = starters_first_mob_inning.groupby('pitcherId').first()[['inning','outs']].reset_index()
            starters_first_mob_inning['inning'] = starters_first_mob_inning['inning'] + starters_first_mob_inning['outs']/10
            starters_first_mob_inning = starters_first_mob_inning.rename({'inning': 'pitcher_first_mob_inning'}, axis=1)
            # NOTE(review): t_tmp already holds a `pitcherId` column from the merge above, so this
            # merge likely yields suffixed pitcherId_x / pitcherId_y columns -- confirm whether
            # downstream feature lists rely on those names before cleaning this up.
            t_tmp = t_tmp.merge(starters_first_mob_inning[['pitcherId', 'pitcher_first_mob_inning']], how='left', left_on='playerId', right_on='pitcherId')

            # Pitch features
            # Per-pitcher mean/median/min/max of nastyFactor over rows of type 'pitch',
            # renamed to nastyFactor_<stat> and keyed by playerId.
            nastyFactor_features = events[events['type']=='pitch'].groupby("pitcherId")['nastyFactor'].agg(['mean','median','min','max']).reset_index().rename(columns={f: f'nastyFactor_{f}' for f in ['mean','median','max','min']}).rename(columns={'pitcherId': 'playerId'})
            t_tmp = t_tmp.merge(nastyFactor_features, how='left', on='playerId')
            # Calculate player Win Probability Added
            # The final play of each game is compared against a win expectancy of 1 (home team
            # ahead) or 0 (otherwise), so the last delta is measured against the actual result.

            # Accumulator indexed by player id; each game's contributions are added in below.
            player_wpa = pd.Series(dtype=float)
            for gamePk, game in events.groupby('gamePk'):
                game = game.reset_index(drop=True)
                game['run_diff'] = game['homeScore'] - game['awayScore']
                game['halfInning_index'] = game['halfInning'].map({'top':1, 'bottom' :2})
                # Numeric base-state code; unmapped/None rows inherit the previous row's state,
                # and leading gaps default to 1 ('Empty').
                game['base_state'] = game['menOnBase'].map({None: np.nan, 'Empty': 1, 'Men_On': 2, 'RISP': 3, 'Loaded': 8})  
                game['base_state'] = game['base_state'].ffill().fillna(1).astype(int)
                # presumably `outs` is recorded after the play, hence the -1 (floored at 0) -- TODO confirm
                game['outs_beg'] = np.maximum(game['outs'] - 1, 0)
                # winnexp_feature is defined elsewhere; the sign handling below treats its result
                # as the home team's win expectancy -- TODO confirm against the helper
                game['win_exp'] = game.apply(winnexp_feature, axis=1)
                # Despite the name this is the NEXT row's win expectancy (shift(-1)).
                game['win_exp_lag'] = game['win_exp'].shift(-1)
                # The last row has no successor: use the final result (1 if home leads, else 0).
                game.loc[game.shape[0]-1, 'win_exp_lag'] = 1 if game.loc[game.shape[0]-1, 'homeScore']>game.loc[game.shape[0]-1, 'awayScore'] else 0
                game['win_exp_delta'] = game['win_exp_lag'] - game['win_exp']
                # Increases in the top of the inning are assigned to the pitcher
                # Increases in the bottom of the inning are assigned to the hitter
                # NOTE(review): only positive deltas are attributed in either half; plays that
                # lower win_exp are not credited to anyone -- confirm this is intended.
                pitcher_wpa_top = game.loc[(game['halfInning']=='top') & (game['win_exp_delta']>0),['pitcherId','win_exp_delta']].groupby('pitcherId')['win_exp_delta'].sum()
                hitter_wpa_top = game.loc[(game['halfInning']=='top') & (game['win_exp_delta']>0),['hitterId','win_exp_delta']].groupby('hitterId')['win_exp_delta'].sum()
                # The opposing hitter is debited the same amount the pitcher gained.
                hitter_wpa_top = -hitter_wpa_top

                pitcher_wpa_bot = game.loc[(game['halfInning']=='bottom') & (game['win_exp_delta']>0),['pitcherId','win_exp_delta']].groupby('pitcherId')['win_exp_delta'].sum()
                hitter_wpa_bot = game.loc[(game['halfInning']=='bottom') & (game['win_exp_delta']>0),['hitterId','win_exp_delta']].groupby('hitterId')['win_exp_delta'].sum()
                # The opposing pitcher is debited the same amount the hitter gained.
                pitcher_wpa_bot = -pitcher_wpa_bot

                # fill_value=0 keeps players that appear on only one side of each addition.
                player_wpa = player_wpa.add(pitcher_wpa_top, fill_value=0)
                player_wpa = player_wpa.add(hitter_wpa_top, fill_value=0)
                player_wpa = player_wpa.add(pitcher_wpa_bot, fill_value=0)
                player_wpa = player_wpa.add(hitter_wpa_bot, fill_value=0)

            # Series -> two-column frame (playerId, wpa).
            player_wpa = player_wpa.reset_index()
            player_wpa = player_wpa.rename({"index": "playerId", 0: "wpa"}, axis=1)

            t_tmp = t_tmp.merge(player_wpa, how='left', on='playerId')
            # Day-level maximum (same value on every row) and rank, where 1 is the highest WPA.
            t_tmp['wpa_daily_max'] = t_tmp['wpa'].max()
            t_tmp['wpa_rank'] = t_tmp['wpa'].rank(method='min', ascending=False)

            # get ejections
            ejections = events.loc[events['event']=="Ejection", ['dailyDataDate','description']].reset_index(drop=True)
            if not ejections.empty:
                # Keep only the text before " ejected by" (the person who was ejected).
                ejections['description'] = [x.split(" ejected by")[0] for x in ejections['description']]
                # Get team; needed for coach_ejected feature
                ejections['teamName'] = [team_regex.findall(x)[0] if team_regex.findall(x) else None for x in ejections['description']] # else None to account for names not spelled in a way that matches the regex
                ejections['teamId'] = ejections['teamName'].map(team_mapping)            
                # A description matching coaching_regex counts as a coach ejection; everything
                # else is treated as a player ejection.
                ejections['coach_ejected'] = [1 if coaching_regex.search(x) else 0 for x in ejections['description']]            
                ejections['player_ejected'] = 1 - ejections['coach_ejected']            
                # Get player name
                # Collapse whitespace, then strip team, coaching-title and position matches in turn.
                ejections['playerName'] = [team_full_regex.sub("", ' '.join(x.split())) for x in ejections['description']]
                ejections['playerName'] = [coaching_regex.sub("", ' '.join(x.split())) for x in ejections['playerName']]
                ejections['playerName'] = [pos_regex.sub("", ' '.join(x.split())).strip() for x in ejections['playerName']]
                # If there is no match for a player use fuzzywuzzy to find the closest match
                # (find_closest_playerName / find_playerId are defined elsewhere in this file)
                ejections.loc[(ejections['player_ejected']==1), 'playerName'] = ejections.loc[(ejections['player_ejected']==1), 'playerName'].apply(lambda x: find_closest_playerName(x, players))
                # NOTE(review): when every ejection is a coach this assigns from an empty selection;
                # confirm a 'playerId' column is still created, since the merge below requires it.
                ejections.loc[(ejections['player_ejected']==1), 'playerId'] = ejections.loc[(ejections['player_ejected']==1)].apply(lambda x: find_playerId(x, players, rosters), axis=1)

                # Coach ejections are counted per team and attached to every player on that team.
                t_tmp = t_tmp.merge(ejections.groupby('teamId')['coach_ejected'].sum().reset_index(), how='left', on='teamId')
                t_tmp['coach_ejected'] = t_tmp['coach_ejected'].fillna(0)
                # Player ejections attach to the individual player only.
                t_tmp = t_tmp.merge(ejections.loc[ejections['player_ejected']==1,['playerId','player_ejected']], how='left', on='playerId')
                t_tmp['player_ejected'] = t_tmp['player_ejected'].fillna(0)
            else:
                # No ejection events today.
                t_tmp['coach_ejected'] = 0
                t_tmp['player_ejected'] = 0

        # Roster status codes as per-player indicator columns.
        roster_status_codes = ['A', 'BRV', 'D10', 'D60', 'D7', 'DEC','FME', 'PL', 'RES', 'RM', 'SU']
        if rosters.empty:
            t_tmp[roster_status_codes] = 0
            t_tmp['nextDayGame'] = 0 # There should be a better way to handle this. Don't want to miss this just because rosters is missing 
        else:
            # One indicator column per status code seen today, summed per (date, player).
            status_flags = pd.get_dummies(rosters['statusCode'])
            roster_dummies = pd.concat([rosters[['dailyDataDate','playerId']], status_flags], axis=1)
            roster_dummies = roster_dummies.groupby(["dailyDataDate", "playerId"]).sum().reset_index()
            # Codes that did not occur today still need a (zero) column.
            absent_codes = [code for code in roster_status_codes if code not in roster_dummies.columns]
            for code in absent_codes:
                roster_dummies[code] = 0
            t_tmp = t_tmp.merge(roster_dummies, how='left', on=['dailyDataDate','playerId'])

        # Transaction type codes as per-player indicator columns.
        transaction_type_codes = ['ASG', 'CLW', 'CU', 'DES', 'DFA', 'NUM','OPT', 'OUT', 'REL', 'RET', 'RTN', 'SC', 'SE', 'SFA', 'SGN', 'TR']
        if transactions.empty:
            t_tmp[transaction_type_codes] = 0
        else:
            # One indicator column per type code seen today, summed per (date, player).
            type_flags = pd.get_dummies(transactions['typeCode'])
            transactions_dummies = pd.concat([transactions[['dailyDataDate','playerId']], type_flags], axis=1)
            transactions_dummies = transactions_dummies.groupby(["dailyDataDate", "playerId"]).sum().reset_index()
            # Codes that did not occur today still need a (zero) column.
            absent_codes = [code for code in transaction_type_codes if code not in transactions_dummies.columns]
            for code in absent_codes:
                transactions_dummies[code] = 0
            t_tmp = t_tmp.merge(transactions_dummies, how='left', on=['dailyDataDate','playerId'])

        # Awards: update the running per-player award counts, then build same-day
        # indicator columns for the awards listed in keep_awards.
        if awards.empty:
            t_tmp[keep_awards] = 0
        else:
            tracked_award_ids = keep_awards + keep_annual_awards
            awards_filtered = awards[awards['awardId'].isin(tracked_award_ids)].reset_index(drop=True)

            # Update awards counts
            awards_dict_tmp = {n: grp.to_dict('list') for n, grp in awards_filtered[['awardId','playerId']].groupby('playerId')}
            for player_id, award_lists in awards_dict_tmp.items():
                player_counts = awards_dict.setdefault(player_id, {})
                award_ids, award_counts = np.unique(award_lists['awardId'], return_counts=True)
                for award_id, n_won in zip(award_ids, award_counts):
                    player_counts[award_id] = player_counts.get(award_id, 0) + n_won

            # Same-day indicators use keep_awards only (the annual awards are excluded here).
            awards_filtered = awards[awards['awardId'].isin(keep_awards)].reset_index(drop=True)
            if awards_filtered.empty:
                t_tmp[keep_awards] = 0
            else:
                award_flags = pd.get_dummies(awards_filtered['awardId'])
                awards_dummies = pd.concat([awards_filtered[['dailyDataDate','playerId']], award_flags], axis=1)
                awards_dummies = awards_dummies.groupby(["dailyDataDate", "playerId"]).sum().reset_index()
                # Awards nobody won today still need a (zero) column.
                absent_awards = [award for award in keep_awards if award not in awards_dummies.columns]
                for award in absent_awards:
                    awards_dummies[award] = 0
                t_tmp = t_tmp.merge(awards_dummies, how='left', on=['dailyDataDate','playerId'])

        # The merges above must not multiply rows; if they did, keep each player's first row.
        if len(t_tmp) != eng_shape[0]:
            print("awards: t_tmp length does not match engagement frame length, check for duplicated data")
            t_tmp = t_tmp[~t_tmp.duplicated(subset=['playerId'])]

        # Running award counts per player, one "<awardId>_career" column per award.
        awards_df = pd.DataFrame.from_dict(awards_dict, orient='index').fillna(0)
        awards_df.columns = ['{}_career'.format(award_id) for award_id in awards_df.columns]
        career_awards = awards_df.reset_index().rename(columns={'index': 'playerId'})
        t_tmp = t_tmp.merge(career_awards, how='left', on='playerId')
        # Players without any tracked award get zeros instead of NaN.
        career_cols = awards_df.columns
        t_tmp[career_cols] = t_tmp[career_cols].fillna(0)

        # Team standings: coerce everything to numeric and attach to each player via teamId.
        if not standings.empty:
            # Cells that are exactly "-" become 0.0 before the numeric conversion.
            standings = standings.replace("-",0.0)
            # NOTE: despite the name, this selects every non-float column, not only object columns.
            object_cols = standings.select_dtypes(exclude=['float']).columns
            # Values that cannot be parsed as numbers are coerced to NaN.
            standings[object_cols] = standings[object_cols].apply(pd.to_numeric, downcast='float', errors='coerce')
            bool_cols  = standings.select_dtypes(include=['boolean']).columns
            standings[bool_cols] = standings[bool_cols].astype(int)        
            # Column names that clash with t_tmp get the "_team_standings" suffix on the standings side.
            t_tmp = t_tmp.merge(standings, how='left', on=['teamId'], suffixes=['','_team_standings'])
            t_tmp['team_games_played'] = t_tmp['wins'] + t_tmp['losses']
            # The merge must not multiply rows; if it did, keep each player's first row.
            if t_tmp.shape[0]!=eng_shape[0]:
                print("standings: t_tmp length does not match engagement frame length, check for duplicated data")
                t_tmp = t_tmp[~t_tmp[['playerId']].duplicated()]


        # Team win streak, looked up by teamId.
        if len(win_streaks) > 0:
            t_tmp['team_win_streak'] = t_tmp['teamId'].map(win_streaks)

        # Hitting history, plus a rank on season home runs (most home runs ranks first).
        if not hitting_history_df.empty:
            t_tmp = t_tmp.merge(hitting_history_df, how='left', on='playerId')
            t_tmp['hr_rank'] = t_tmp['homeRuns_season'].rank(ascending=False)
            if len(t_tmp) != eng_shape[0]:
                print("hitting_history_df: t_tmp length does not match engagement frame length, check for duplicated data")
                t_tmp = t_tmp[~t_tmp.duplicated(subset=['playerId'])]

        # Pitching history, plus ERA and an ERA rank.
        if not pitching_history_df.empty:
            t_tmp = t_tmp.merge(pitching_history_df, how='left', on='playerId')
            # Calculate ERA
            # there are no more standings after season end so team_games_played is no longer known
            if 'team_games_played' in t_tmp.columns:
                t_tmp['era'] = 9 * (t_tmp['earnedRuns_season'] / t_tmp['inningsPitched_season'])
                # Only pitchers with at least one inning per team game are ranked;
                # everyone else gets NaN through index alignment.
                has_enough_innings = t_tmp['inningsPitched_season'] >= t_tmp['team_games_played']
                t_tmp['era_rank'] = t_tmp.loc[has_enough_innings, 'era'].rank(method='min')
            if len(t_tmp) != eng_shape[0]:
                print("pitching_history_df: t_tmp length does not match engagement frame length, check for duplicated data")
                t_tmp = t_tmp[~t_tmp.duplicated(subset=['playerId'])]

        # Fielding history.
        if not fielding_history_df.empty:
            t_tmp = t_tmp.merge(fielding_history_df, how='left', on='playerId')
            if len(t_tmp) != eng_shape[0]:
                print("fielding_history_df: t_tmp length does not match engagement frame length, check for duplicated data")
                t_tmp = t_tmp[~t_tmp.duplicated(subset=['playerId'])]


        # NOTE(review): player_countries is not read again in the visible remainder of this
        # loop body; presumably it documents the columns of player_country_dummies -- confirm
        # before removing.
        player_countries = ['Aruba', 'Australia', 'Bahamas', 'Brazil', 'Canada',
           'China', 'Colombia', 'Cuba', 'Curacao', 'Dominican Republic', 'Germany',
           'Honduras', 'Japan', 'Lithuania', 'Mexico', 'Netherlands', 'Nicaragua',
           'Northern Ireland', 'Panama', 'Peru', 'Puerto Rico', 'Saudi Arabia',
           'South Africa', 'South Korea', 'Taiwan', 'U.S. Virgin Islands', 'USA',
           'Venezuela']
        # player_country_dummies is built elsewhere; it is joined per player here.
        t_tmp = t_tmp.merge(player_country_dummies, how='left', on='playerId')

        # Add games features
    #     if not games.empty and not p_box_scores.empty:
    #         games['dayNight'] = games['dayNight'].map({'day': 0, 'night': 1})
    #         games['homeWinner'] = games['homeWinner'].fillna(-1).astype(float)
    #         t_tmp = t_tmp.merge(games[['gamePk', 'dayNight','homeWinPct','awayWinPct','homeScore','awayScore','homeWinner']], how='left', on='gamePk')
        # Add Twitter features
        # p_twitter_recent is only (re)assigned when a non-empty p_twitter arrives, so this
        # can be a follower count carried over from an earlier iteration.
        if not p_twitter_recent.empty:
            t_tmp = t_tmp.merge(p_twitter_recent[['playerId','numberOfFollowers']], how='left', on=['playerId'])

        # p_twitter_delta is built elsewhere; all of its columns are joined per player.
        if not p_twitter_delta.empty:
            t_tmp = t_tmp.merge(p_twitter_delta, how='left', on=['playerId'])

        ### TRAILING AGGREGATION MERGES
        # YYYYMMDD -> YYYYMM, capped at MAX_MONTH, is the key into the precomputed aggregations.
        month_key = np.floor(t_tmp.dailyDataDate/100).clip(upper=MAX_MONTH)
        t_tmp['ddd_month'] = month_key.astype(int)
        t_tmp = t_tmp.merge(dt_player_aggregations,how="left",on=['playerId','ddd_month'])

        # 1.0 when the player has a box-score game today, else 0.0 (also when gamePk is absent).
        t_tmp['played_game'] = t_tmp['gamePk'].notnull().astype(float) if 'gamePk' in t_tmp.columns else 0.0

        t_tmp = t_tmp.merge(dt_player_game_aggregations,how="left",on=['playerId','ddd_month','played_game'])

        # t_tmp = t_tmp.merge(recent_player_means, how='left', on='playerId')
        # Calendar features: "data" values come from dailyDataDate, "eng"/dayofweek values
        # from the date prefix of date_playerId.
        data_date_text = t_tmp['dailyDataDate'].astype(str)
        eng_date = pd.to_datetime(t_tmp['date_playerId'].str.split("_", expand=True)[0])
        t_tmp['monthday'] = data_date_text.str[4:].astype(int)
        t_tmp['dayofweek'] = eng_date.dt.dayofweek
        t_tmp['data_dayofmonth'] = data_date_text.str[6:].astype(int)
        t_tmp['eng_dayofmonth'] = eng_date.dt.day

        # Fill season values with 2021
        t_tmp['season'] = season
        
#         t.append(t_tmp)


        # v30 LightGBM models: all four targets share the feature list of lgb_target1.
        use_cols = lgb_target1.feature_name()
        # Add any missing columns so that it does not crash
        missing_cols = [col for col in use_cols if col not in t_tmp.columns]
        for col in missing_cols:
            print(f'lgb_target1 missing: {col}')
            t_tmp[col] = np.nan

        # Predictions are clipped to the 0-100 target range.
        v30_boosters = (lgb_target1, lgb_target2, lgb_target3, lgb_target4)
        for target_no, booster in enumerate(v30_boosters, start=1):
            sub[f'target{target_no}_v30'] = np.clip(booster.predict(t_tmp[use_cols]), 0, 100)
        
        # "bfa" LightGBM models: all four targets share the feature list of lgb_bfa_target1.
        use_cols = lgb_bfa_target1.feature_name()
        # Add any missing columns so that it does not crash
        missing_cols = [col for col in use_cols if col not in t_tmp.columns]
        for col in missing_cols:
            print(f'lgb_bfa_target1 missing: {col}')
            t_tmp[col] = np.nan

        # Predictions are clipped to the 0-100 target range.
        bfa_boosters = (lgb_bfa_target1, lgb_bfa_target2, lgb_bfa_target3, lgb_bfa_target4)
        for target_no, booster in enumerate(bfa_boosters, start=1):
            sub[f'target{target_no}_v30_bfa'] = np.clip(booster.predict(t_tmp[use_cols]), 0, 100)

        # "dblsqrt" LightGBM models: all four targets share the feature list of lgb_dblsqrt_target1.
        use_cols = lgb_dblsqrt_target1.feature_name()
        # Add any missing columns so that it does not crash
        missing_cols = [col for col in use_cols if col not in t_tmp.columns]
        for col in missing_cols:
            print(f'lgb_dblsqrt_target1 missing: {col}')
            t_tmp[col] = np.nan

        # Raw predictions are raised to the 4th power (`**2**2` is right-associative,
        # i.e. x ** (2 ** 2)) and then clipped to the 0-100 target range.
        dblsqrt_boosters = (lgb_dblsqrt_target1, lgb_dblsqrt_target2, lgb_dblsqrt_target3, lgb_dblsqrt_target4)
        for target_no, booster in enumerate(dblsqrt_boosters, start=1):
            sub[f'target{target_no}_v30_dblsqrt'] = np.clip(booster.predict(t_tmp[use_cols]) ** 4, 0, 100)

        # DART LightGBM models: all four targets share the feature list of lgb_dart_target1.
        dart_use_cols = lgb_dart_target1.feature_name()
        # Add any missing columns so that it does not crash
        missing_cols = [col for col in dart_use_cols if col not in t_tmp.columns]
        for col in missing_cols:
            print(f'lgb_dart_target1 missing: {col}')
            t_tmp[col] = np.nan

        # Predictions are clipped to the 0-100 target range.
        dart_boosters = (lgb_dart_target1, lgb_dart_target2, lgb_dart_target3, lgb_dart_target4)
        for target_no, booster in enumerate(dart_boosters, start=1):
            sub[f'target{target_no}_dart'] = np.clip(booster.predict(t_tmp[dart_use_cols]), 0, 100)

        with open("../input/d/brandenkmurray/mlbmodels/xgb_v30_use_cols.txt") as f:
            xgb_use_cols = [x.rstrip() for x in f.readlines()]
#         xgb_use_cols = xgb_target1.feature_names
        missing_cols = [col for col in xgb_use_cols if col not in t_tmp.columns]
        missing_cols_filled = [x.replace(" ", "_") for x in missing_cols]
        t_tmp = t_tmp.rename(columns={k: v for k,v in zip(missing_cols_filled, missing_cols)})
        for col in missing_cols:
            print(f"{col} is missing for XGB model. Adding and filling with NaN")
            t_tmp[col] = np.nan

        sub['target1_xgb'] = np.clip(xgb_target1.predict(xgb.DMatrix(t_tmp[xgb_use_cols].fillna(-99999).replace(np.inf, -99999)))**2**2, 0, 100)
        sub['target2_xgb'] = np.clip(xgb_target2.predict(xgb.DMatrix(t_tmp[xgb_use_cols].fillna(-99999).replace(np.inf, -99999)))**2**2, 0, 100)
        sub['target3_xgb'] = np.clip(xgb_target3.predict(xgb.DMatrix(t_tmp[xgb_use_cols].fillna(-99999).replace(np.inf, -99999)))**2**2, 0, 100)
        sub['target4_xgb'] = np.clip(xgb_target4.predict(xgb.DMatrix(t_tmp[xgb_use_cols].fillna(-99999).replace(np.inf, -99999)))**2**2, 0, 100)



        # Columns removed before scoring John's models.
        correlates = ['hitBatsmen',
         'no_hitter',
         'home_team_box_score',
         'hitBatsmen_team_box_score',
         'season_team_standings',
         'sportGamesBack',
         'nlWins',
         'nlLosses',
         'errors_1_games_ago']

        # errors='ignore': on days without box scores or standings some of these columns
        # never exist. A KeyError here would discard every model prediction made above and
        # drop the whole day into the median fallback.
        t_tmp = t_tmp.drop(columns=correlates, errors='ignore')
        use_cols = lgb_john_target1.feature_name()
        # Add any missing columns so that it does not crash
        missing_cols = [col for col in use_cols if col not in t_tmp.columns]
        for col in missing_cols:
            print(col + " missing for John's model")
            t_tmp[col] = np.nan

        # Predictions are clipped to the 0-100 target range.
        for target_no, booster in enumerate((lgb_john_target1, lgb_john_target2, lgb_john_target3, lgb_john_target4), start=1):
            sub[f'target{target_no}_john'] = np.clip(booster.predict(t_tmp[use_cols]), 0, 100)

        # Final blend per target. Weights sum to 1:
        #   0.4 John + 0.1 * (0.2 bfa + 0.1 v30 + 0.7 dblsqrt) + 0.1 XGB + 0.4 DART
        # The arithmetic order matches the original expression term for term.
        for target_no in range(1, 5):
            tgt = f'target{target_no}'
            v30_mix = sub[f'{tgt}_v30_bfa']*0.2 + sub[f'{tgt}_v30']*0.1 + sub[f'{tgt}_v30_dblsqrt']*0.7
            sub[tgt] = (sub[f'{tgt}_john']*0.4) + (v30_mix*0.1) + (sub[f'{tgt}_xgb']*0.1) + (sub[f'{tgt}_dart']*0.4)
    
    except Exception as e:
        # If all else fails try to use player means
        print(f'Main loop failed: {e}')
        # Overall medians: the last-resort prediction, and the filler for players that have
        # no row in the rolling aggregations.
        overall_medians = {'target1': 0.001046,
                           'target2': 0.521472,
                           'target3': 0.001735,
                           'target4': 0.226034}
        rolling_median_cols = ['roll12_target1_p_gameday_median', 'roll12_target2_p_gameday_median',
                               'roll12_target3_p_gameday_median', 'roll12_target4_p_gameday_median']
        try:
            print("Using player rolling12 means")
            sub['ddd_month'] = np.floor(sub.dailyDataDate/100).clip(upper=MAX_MONTH).astype(int)
            sub = sub.drop(['target1','target2','target3','target4'], axis=1)
            # NOTE(review): the main path merges dt_player_game_aggregations on played_game as
            # well. If it holds several rows per (playerId, ddd_month), this merge multiplies
            # rows and the final playerId de-dup keeps whichever comes first -- confirm.
            sub = sub.merge(dt_player_game_aggregations[['playerId','ddd_month'] + rolling_median_cols],how="left",on=['playerId','ddd_month'])
            sub = sub.rename(dict(zip(rolling_median_cols, ['target1','target2','target3','target4'])), axis=1)
            # A left merge leaves NaN for players without an aggregation row; never submit
            # NaN predictions -- fall back to the overall medians for those rows.
            sub = sub.fillna(value=overall_medians)
        except Exception as fallback_error:
            print(fallback_error)
            # If player medians fail, use overall medians
            print("Player medians failed. Using overall medians")
            for tgt, median_value in overall_medians.items():
                sub[tgt] = median_value
        
    # Do a final check to ensure there are no duplicate players that will cause a scoring error
    sub = sub[~sub[['playerId']].duplicated()]
    
#     sub_list.append(sub)
    env.predict(sub[['date_playerId','target1','target2','target3','target4']])

#     eng_lag = sub[['playerId','target1','target2','target3','target4']].copy()
#     eng_lag = eng_lag.rename({'target1': 'target1_lag',
#                     'target2': 'target2_lag',
#                     'target3': 'target3_lag',
#                     'target4': 'target4_lag'}, axis=1)

In [None]:
# sub_all = pd.concat(sub_list)
# include_players = players[players['playerForTestSetAndFuturePreds']==1]['playerId'].tolist()
# sub_all = sub_all[sub_all['playerId'].isin(include_players)]

# print("v30")
# may_mae_list = []
# june_mae_list = []
# for target in ['target1', 'target2', 'target3', 'target4']:
#     may_mae_list.append(np.mean(np.abs(sub_all.loc[(sub_all['dailyDataDate']>=20210501) & (sub_all['dailyDataDate']<=20210531), target + '_v30'] - sub_all.loc[(sub_all['dailyDataDate']>=20210501) & (sub_all['dailyDataDate']<=20210531), target + '_act'])))
#     june_mae_list.append(np.mean(np.abs(sub_all.loc[(sub_all['dailyDataDate']>=20210601) & (sub_all['dailyDataDate']<=20210630), target + '_v30'] - sub_all.loc[(sub_all['dailyDataDate']>=20210601) & (sub_all['dailyDataDate']<=20210630), target + '_act'])))
# # print(may_mae_list)
# print(f"May MAE: {np.mean(may_mae_list)}")
# print(f"June MAE: {np.mean(june_mae_list)}")

# print("v30_bfa")
# may_mae_list = []
# june_mae_list = []
# for target in ['target1', 'target2', 'target3', 'target4']:
#     may_mae_list.append(np.mean(np.abs(sub_all.loc[(sub_all['dailyDataDate']>=20210501) & (sub_all['dailyDataDate']<=20210531), target + '_v30_bfa'] - sub_all.loc[(sub_all['dailyDataDate']>=20210501) & (sub_all['dailyDataDate']<=20210531), target + '_act'])))
#     june_mae_list.append(np.mean(np.abs(sub_all.loc[(sub_all['dailyDataDate']>=20210601) & (sub_all['dailyDataDate']<=20210630), target + '_v30_bfa'] - sub_all.loc[(sub_all['dailyDataDate']>=20210601) & (sub_all['dailyDataDate']<=20210630), target + '_act'])))
# # print(may_mae_list)
# print(f"May MAE: {np.mean(may_mae_list)}")
# print(f"June MAE: {np.mean(june_mae_list)}")

# print("v30_dblsqrt")
# may_mae_list = []
# june_mae_list = []
# for target in ['target1', 'target2', 'target3', 'target4']:
#     may_mae_list.append(np.mean(np.abs(sub_all.loc[(sub_all['dailyDataDate']>=20210501) & (sub_all['dailyDataDate']<=20210531), target + '_v30_dblsqrt'] - sub_all.loc[(sub_all['dailyDataDate']>=20210501) & (sub_all['dailyDataDate']<=20210531), target + '_act'])))
#     june_mae_list.append(np.mean(np.abs(sub_all.loc[(sub_all['dailyDataDate']>=20210601) & (sub_all['dailyDataDate']<=20210630), target + '_v30_dblsqrt'] - sub_all.loc[(sub_all['dailyDataDate']>=20210601) & (sub_all['dailyDataDate']<=20210630), target + '_act'])))
# # print(may_mae_list)
# print(f"May MAE: {np.mean(may_mae_list)}")
# print(f"June MAE: {np.mean(june_mae_list)}")

# print("john")
# may_mae_list = []
# june_mae_list = []
# for target in ['target1', 'target2', 'target3', 'target4']:
#     may_mae_list.append(np.mean(np.abs(sub_all.loc[(sub_all['dailyDataDate']>=20210501) & (sub_all['dailyDataDate']<=20210531), target + '_john'] - sub_all.loc[(sub_all['dailyDataDate']>=20210501) & (sub_all['dailyDataDate']<=20210531), target + '_act'])))
#     june_mae_list.append(np.mean(np.abs(sub_all.loc[(sub_all['dailyDataDate']>=20210601) & (sub_all['dailyDataDate']<=20210630), target + '_john'] - sub_all.loc[(sub_all['dailyDataDate']>=20210601) & (sub_all['dailyDataDate']<=20210630), target + '_act'])))
# # print(may_mae_list)
# print(f"May MAE: {np.mean(may_mae_list)}")
# print(f"June MAE: {np.mean(june_mae_list)}")

# print("xgb")
# may_mae_list = []
# june_mae_list = []
# for target in ['target1', 'target2', 'target3', 'target4']:
#     may_mae_list.append(np.mean(np.abs(sub_all.loc[(sub_all['dailyDataDate']>=20210501) & (sub_all['dailyDataDate']<=20210531), target + '_xgb'] - sub_all.loc[(sub_all['dailyDataDate']>=20210501) & (sub_all['dailyDataDate']<=20210531), target + '_act'])))
#     june_mae_list.append(np.mean(np.abs(sub_all.loc[(sub_all['dailyDataDate']>=20210601) & (sub_all['dailyDataDate']<=20210630), target + '_xgb'] - sub_all.loc[(sub_all['dailyDataDate']>=20210601) & (sub_all['dailyDataDate']<=20210630), target + '_act'])))
# # print(may_mae_list)
# print(f"May MAE: {np.mean(may_mae_list)}")
# print(f"June MAE: {np.mean(june_mae_list)}")

# print("dart")
# may_mae_list = []
# june_mae_list = []
# for target in ['target1', 'target2', 'target3', 'target4']:
#     may_mae_list.append(np.mean(np.abs(sub_all.loc[(sub_all['dailyDataDate']>=20210501) & (sub_all['dailyDataDate']<=20210531), target + '_dart'] - sub_all.loc[(sub_all['dailyDataDate']>=20210501) & (sub_all['dailyDataDate']<=20210531), target + '_act'])))
#     june_mae_list.append(np.mean(np.abs(sub_all.loc[(sub_all['dailyDataDate']>=20210601) & (sub_all['dailyDataDate']<=20210630), target + '_dart'] - sub_all.loc[(sub_all['dailyDataDate']>=20210601) & (sub_all['dailyDataDate']<=20210630), target + '_act'])))
# # print(may_mae_list)
# print(f"May MAE: {np.mean(may_mae_list)}")
# print(f"June MAE: {np.mean(june_mae_list)}")

# print("blend")
# may_mae_list = []
# june_mae_list = []
# for target in ['target1', 'target2', 'target3', 'target4']:
#     may_mae_list.append(np.mean(np.abs(sub_all.loc[(sub_all['dailyDataDate']>=20210501) & (sub_all['dailyDataDate']<=20210531), target] - sub_all.loc[(sub_all['dailyDataDate']>=20210501) & (sub_all['dailyDataDate']<=20210531), target + '_act'])))
#     june_mae_list.append(np.mean(np.abs(sub_all.loc[(sub_all['dailyDataDate']>=20210601) & (sub_all['dailyDataDate']<=20210630), target] - sub_all.loc[(sub_all['dailyDataDate']>=20210601) & (sub_all['dailyDataDate']<=20210630), target + '_act'])))
# # print(may_mae_list)
# print(f"May MAE: {np.mean(may_mae_list)}")
# print(f"June MAE: {np.mean(june_mae_list)}")

In [None]:
# t_df = pd.concat(t)
# t_df.to_csv("./train_features.csv", index=False)