#### Credit to @columbia2131: I started from his notebook, then added an external dataset with descriptive statistics of the targets for each player, and also added unique params for each target model.

## About Dataset

## Training

In [None]:
import warnings 
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import mean_absolute_error
from datetime import timedelta
from functools import reduce
from tqdm import tqdm
import lightgbm as lgbm
import mlb
import gc

In [None]:
# Input locations: the competition files and the pickled training tables.
BASE_DIR = Path('../input/mlb-player-digital-engagement-forecasting')
TRAIN_DIR = Path('../input/mlb-pdef-train-dataset')

players = pd.read_csv(BASE_DIR.joinpath('players.csv'))

rosters = pd.read_pickle(TRAIN_DIR.joinpath('rosters_train.pkl'))
targets = pd.read_pickle(TRAIN_DIR.joinpath('nextDayPlayerEngagement_train.pkl'))
followers = pd.read_pickle(TRAIN_DIR.joinpath('playerTwitterFollowers_train.pkl'))
# Team follower counts, with the count column renamed to 'teamFollowers'.
team_followers = pd.read_pickle(
    TRAIN_DIR.joinpath('teamTwitterFollowers_train.pkl')
).rename(columns={'numberOfFollowers': 'teamFollowers'})

# playerId -> playerName lookup built from the players table.
players_id_map = dict(
    zip(players['playerId'].to_numpy(), players['playerName'].to_numpy())
)

In [None]:
# Column subsets selected from each raw table.
targets_cols = ['playerId'] + ['target{}'.format(i) for i in range(1, 5)] + ['date']
players_cols = ['playerId', 'primaryPositionName']
roster_cols = ['playerId', 'teamId', 'status', 'date']
followers_cols = ['playerId', 'numberOfFollowers', 'date']
teamfollowers_cols = ['teamId', 'teamFollowers', 'date']

# Box-score statistic names shared by scores_cols and feature_cols.
_BOXSCORE_STATS = [
    # batting
    'battingOrder', 'gamesPlayedBatting', 'flyOuts', 'groundOuts',
    'runsScored', 'doubles', 'triples', 'homeRuns', 'strikeOuts',
    'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch', 'atBats',
    'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
    'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
    'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
    'pickoffs',
    # pitching
    'gamesPlayedPitching', 'gamesStartedPitching', 'completeGamesPitching',
    'shutoutsPitching', 'winsPitching', 'lossesPitching', 'flyOutsPitching',
    'airOutsPitching', 'groundOutsPitching', 'runsPitching',
    'doublesPitching', 'triplesPitching', 'homeRunsPitching',
    'strikeOutsPitching', 'baseOnBallsPitching', 'intentionalWalksPitching',
    'hitsPitching', 'hitByPitchPitching', 'atBatsPitching',
    'caughtStealingPitching', 'stolenBasesPitching', 'inningsPitched',
    'saveOpportunities', 'earnedRuns', 'battersFaced', 'outsPitching',
    'pitchesThrown', 'balls', 'strikes', 'hitBatsmen', 'balks',
    'wildPitches', 'pickoffsPitching', 'rbiPitching',
    'gamesFinishedPitching', 'inheritedRunners', 'inheritedRunnersScored',
    'catchersInterferencePitching', 'sacBuntsPitching', 'sacFliesPitching',
    'saves', 'holds', 'blownSaves',
    # fielding
    'assists', 'putOuts', 'errors', 'chances',
]

scores_cols = ['playerId', *_BOXSCORE_STATS, 'date']

# Label-encoded identifiers, the box-score stats, then six descriptive
# statistics for each of the four targets.
feature_cols = [
    'label_playerId', 'label_primaryPositionName', 'label_teamId',
    'label_status',
    *_BOXSCORE_STATS,
    *(
        'target{}_{}'.format(target_no, stat)
        for target_no in range(1, 5)
        for stat in ('mean', 'median', 'std', 'min', 'max', 'prob')
    ),
]  # 'numberOfFollowers','teamFollowers'

In [None]:
# External tables: per-player target statistics and the FanGraphs
# leaderboard file that is used as the wOBA weight table.
player_target_stats = pd.read_csv(
    "../input/player-target-stats/player_target_stats.csv"
)
wOBA_weights = pd.read_csv('../input/mlb-img/FanGraphs Leaderboard.csv')

# Column names of the target-statistics table.
data_names = list(player_target_stats.columns)
#data_names

In [None]:
# Box-score table that feeds the Sabermetrics class; preview two rows.
boxscores_df = pd.read_csv('../input/mlb-img/boxscores_df.csv')
boxscores_df.head(n=2)

In [None]:
class Sabermetrics:
    """Derive per-player batting and pitching sabermetric features from box scores.

    Parameters
    ----------
    df : DataFrame
        Box-score rows. The methods read ``playerId``, ``playerName``,
        ``year`` and the raw statistic columns named in each method.
    method : str
        pandas aggregation name (e.g. ``'sum'`` or ``'mean'``) applied to most
        statistics. A few columns are always summed or counted (see methods).
    wOBA_weights : DataFrame
        One row per ``Season`` with the linear weights ``wBB``, ``wHBP``,
        ``w1B``, ``w2B``, ``w3B`` and ``wHR``.
    grpby_elem : str
        Column to group by. It must also exist in the output frame, which only
        carries ``playerId`` and ``playerName``.
    season : int
        Stored on the instance; the calculations take the season as a method
        argument instead.
    mode : str, optional
        ``'all'`` aggregates over every row of ``df`` instead of one season.
    """

    # Seasons iterated by the __generate_*_feats__ methods.
    _SEASONS = (2018, 2019, 2020, 2021)
    # Added to denominators so empty stat lines divide to 0 instead of NaN/inf.
    _EPSILON = 1e-10

    # (output column, source column[, aggregation]) in output order. Entries
    # without an explicit aggregation use ``self.method``.
    _PITCHING_COLUMNS = (
        ('tot_walks_pitch', 'baseOnBallsPitching'),
        ('tot_battersfaced', 'battersFaced'),
        ('tot_balks', 'balks'),
        ('tot_blownsaves', 'blownSaves'),
        ('tot_shutout_pitch', 'shutoutsPitching'),
        ('tot_wins_pitch', 'winsPitching'),
        ('tot_loss_pitch', 'lossesPitching'),
        ('tot_flyout_pitch', 'flyOutsPitching'),
        ('tot_airout_pitch', 'airOutsPitching'),
        ('tot_groundout_pitch', 'groundOutsPitching'),
        ('tot_strikeout_pitch', 'strikeOutsPitching'),
        ('tot_runs_pitch', 'runsPitching'),
        ('tot_doubles_pitch', 'doublesPitching'),
        ('tot_triples_pitch', 'triplesPitching'),
        ('tot_homeruns_pitch', 'homeRunsPitching'),
        ('tot_intentionalwalks_pitch', 'intentionalWalksPitching'),
        ('tot_unintentionalwalks_pitch', 'unintentional_walks_pitch'),
        ('tot_hits_pitch', 'hitsPitching'),
        ('tot_HBP_pitch', 'hitByPitchPitching'),
        ('tot_atbats_pitch', 'atBatsPitching'),
        ('tot_caughtsteal_pitch', 'caughtStealingPitching'),
        ('tot_stolenbases_pitch', 'stolenBasesPitching'),
        ('tot_innings_pitch', 'inningsPitched'),
        ('tot_saveopportunities', 'saveOpportunities'),
        ('tot_earnedruns', 'earnedRuns'),
        ('tot_outs_pitch', 'outsPitching'),
        ('tot_pitchesthrown', 'pitchesThrown'),
        ('tot_pitchesthrown_cnt', 'balls', 'count'),
        # Fixed: this column previously aggregated earnedRuns by mistake.
        ('tot_balls_pitch', 'balls'),
        ('tot_strikes_pitch', 'strikes'),
        ('tot_hitbatsman', 'hitBatsmen'),
        ('tot_wildpitches', 'wildPitches'),
        ('tot_rbi_pitch', 'rbiPitching'),
        ('games_finished_pitch', 'gamesFinishedPitching'),
        ('tot_inherited_runner', 'inheritedRunners'),
        ('tot_inherited_runner_scored', 'inheritedRunnersScored'),
        ('tot_catcherinterference_pitch', 'catchersInterferencePitching'),
        ('tot_sacflies_pitch', 'sacFliesPitching'),
        ('tot_sacbunts_pitch', 'sacBuntsPitching'),
    )

    def __init__(self,df,method,wOBA_weights,grpby_elem,season,mode=None):
        self.df=df
        self.method=method
        self.wOBA_weights=wOBA_weights
        self.grpby_elem=grpby_elem
        self.season=season
        self.mode=mode

    # ------------------------------------------------------------------ helpers

    def _season_rows(self, season):
        """Return the rows to aggregate: all rows in 'all' mode, else one season."""
        if self.mode == 'all':
            return self.df
        return self.df[self.df.year == season]

    def _player_frame(self):
        """Start an output frame with one row per unique player id plus the name."""
        frame = pd.DataFrame({'playerId': self.df.playerId.unique()})
        names = dict(zip(self.df.playerId.values, self.df.playerName.values))
        frame['playerName'] = frame['playerId'].map(names)
        return frame

    def _aggregator(self, out, rows, key):
        """Return ``agg(column, how=None)`` mapping a grouped stat onto ``out``'s rows."""
        grouped = rows.groupby(key)  # built once instead of once per column

        def agg(column, how=None):
            return out[key].map(grouped[column].agg(how or self.method))

        return agg

    def _weights_for(self, season):
        """Return the wOBA weight row for ``season``; raise ValueError if absent."""
        rows = self.wOBA_weights[self.wOBA_weights.Season == season]
        if rows.empty:
            raise ValueError('no wOBA weights available for season {}'.format(season))
        return rows.iloc[0]

    def _stack_seasons(self, compute):
        """Run ``compute`` per season, stack the frames and add the merge key."""
        in_all_mode = self.mode == 'all'
        # In 'all' mode every row is aggregated once; the result is labelled 2021.
        seasons = self._SEASONS[:1] if in_all_mode else self._SEASONS
        frames = []
        for yr in seasons:
            frame = compute(season=yr)
            frame['season'] = 2021 if in_all_mode else yr
            frames.append(frame)

        stacked = pd.concat(frames).fillna(0)
        # '<playerId>_<season>' key used to join batting and pitching features.
        stacked['merger'] = [
            str(pid) + '_' + str(yr)
            for pid, yr in zip(stacked['playerId'], stacked['season'])
        ]
        return stacked

    # ------------------------------------------------------------------ batting

    def __get_battingstats__(self,season):
        """Return one row per player with batting aggregates and derived ratios.

        ``season`` selects the rows (ignored in 'all' mode) and the wOBA weight
        row. Players without rows in the selection get NaN. Unless
        ``self.method`` is 'mean', columns whose name contains 'tot' or 'net'
        are finally divided by ``games_played``.

        Raises ValueError when ``wOBA_weights`` has no row for ``season``.
        """
        eps = self._EPSILON
        rows = self._season_rows(season)
        weights = self._weights_for(season)
        bat = self._player_frame()
        agg = self._aggregator(bat, rows, self.grpby_elem)

        bat['games_played'] = agg('gamesPlayedBatting')
        bat['num_plate_appearances'] = agg('plateAppearances')
        bat['total_homeruns_scored'] = agg('homeRuns')
        bat['net_stolen_bases'] = agg('stolenBases')
        bat['net_caught_stealing'] = agg('caughtStealing')
        bat['walk_percent'] = agg('baseOnBalls', 'sum') / (bat['num_plate_appearances'] + eps)
        bat['tot_groundouts'] = agg('groundOuts')
        bat['tot_flyouts'] = agg('flyOuts')
        bat['tot_strikeouts'] = agg('strikeOuts', 'sum')
        bat['strikeout_percent'] = bat['tot_strikeouts'] / (bat['num_plate_appearances'] + eps)
        bat['tot_singles'] = agg('singles')
        bat['tot_doubles'] = agg('doubles')
        bat['tot_triples'] = agg('triples')
        bat['tot_hits'] = agg('hits', 'sum')
        bat['tot_hitsbypitch'] = agg('hitByPitch')
        bat['tot_attbat'] = agg('atBats', 'sum')
        bat['tot_sacflies'] = agg('sacFlies')
        bat['tot_sacbunts'] = agg('sacBunts')
        bat['tot_baseonball'] = agg('baseOnBalls')
        bat['tot_intentionalwalks'] = agg('intentionalWalks')
        bat['tot_unintentionalwalks'] = agg('unintentional_walks')
        bat['batting_average'] = bat['tot_hits'] / (bat['tot_attbat'] + eps)
        bat['tot_GIDP'] = agg('groundIntoDoublePlay')

        # BABIP (Batting Average on Balls in Play): (H-HR)/(AB-K-HR+SF).
        balls_in_play = (bat['tot_attbat'] + bat['tot_sacflies']
                         - bat['tot_strikeouts'] - bat['total_homeruns_scored'])
        bat['BABIP'] = (bat['tot_hits'] - bat['total_homeruns_scored']) / (balls_in_play + eps)

        # OBP (On Base Percentage): (H+BB+HBP)/(AB+BB+HBP+SF).
        times_on_base = bat['tot_hits'] + bat['tot_baseonball'] + bat['tot_hitsbypitch']
        obp_chances = (bat['tot_attbat'] + bat['tot_sacflies']
                       + bat['tot_baseonball'] + bat['tot_hitsbypitch'])
        bat['OBP'] = times_on_base / (obp_chances + eps)

        # wOBA = (wBB*uBB + wHBP*HBP + w1B*1B + w2B*2B + w3B*3B + wHR*HR)
        #        / (AB + BB - IBB + SF + HBP), with the season's weights.
        weighted = (weights['wBB'] * bat['tot_unintentionalwalks']
                    + weights['wHBP'] * bat['tot_hitsbypitch']
                    + weights['w1B'] * bat['tot_singles']
                    + weights['w2B'] * bat['tot_doubles']
                    + weights['w3B'] * bat['tot_triples']
                    + weights['wHR'] * bat['total_homeruns_scored'])
        woba_chances = (bat['tot_attbat'] + bat['tot_sacflies'] + bat['tot_baseonball']
                        - bat['tot_intentionalwalks'] + bat['tot_hitsbypitch'])
        bat['wOBA'] = (weighted / (woba_chances + eps)).clip(0, 1)

        # Total bases = 1B + 2*2B + 3*3B + 4*HR (the 4x HR weight was missing).
        bat['net_TB'] = (bat['tot_singles'] + 2 * bat['tot_doubles']
                         + 3 * bat['tot_triples'] + 4 * bat['total_homeruns_scored'])

        # SLG (slugging average): TB/AB.
        bat['SLG'] = (bat['net_TB'] / (bat['tot_attbat'] + eps)).clip(0, 3)

        # Base runs components:
        # A = H + BB + HBP - HR - .5 * IBB
        # B = (1.4 * TB - .6 * H - 3 * HR + .1 * (BB + HBP - IBB) + .9 * (SB - CS - GIDP)) * 1.1
        # C = AB - H + CS + GIDP
        bat['A_bsr'] = (bat['tot_hits'] + bat['tot_baseonball'] + bat['tot_hitsbypitch']
                        - bat['total_homeruns_scored'] - 0.5 * bat['tot_intentionalwalks'])
        bat['B_bsr'] = 1.1 * (
            1.4 * bat['net_TB'] - 0.6 * bat['tot_hits'] - 3 * bat['total_homeruns_scored']
            + 0.1 * (bat['tot_baseonball'] + bat['tot_hitsbypitch'] - bat['tot_intentionalwalks'])
            + 0.9 * (bat['net_stolen_bases'] - bat['net_caught_stealing'] - bat['tot_GIDP'])
        )
        bat['C_bsr'] = (bat['tot_attbat'] + bat['net_caught_stealing']
                        - bat['tot_hits'] + bat['tot_GIDP'])
        # BsR = A*B/(B+C) + D with D = HR (HR was previously multiplied in).
        bat['bsr'] = (bat['A_bsr'] * bat['B_bsr'] / (bat['B_bsr'] + bat['C_bsr'] + eps)
                      + bat['total_homeruns_scored'])

        # wRAA = ((wOBA - .320) / 1.25) * (AB + BB + HBP + SF + SH)
        bat['wRAA'] = ((bat['wOBA'] - 0.32) / 1.25 * (
            bat['tot_attbat'] + bat['tot_baseonball'] + bat['tot_hitsbypitch']
            + bat['tot_sacflies'] + bat['tot_sacbunts']
        )).clip(-20, 40)

        # ISO = SLG - AVG. The column name 'IS0' (digit zero) is kept unchanged
        # because files or models built from this output may reference it.
        bat['IS0'] = bat['SLG'] - bat['batting_average']

        if self.method != 'mean':
            # Convert aggregates to per-game rates. There is no guard for
            # games_played == 0, which yields inf/NaN.
            per_game = [c for c in bat.columns if 'tot' in c or 'net' in c]
            bat[per_game] = bat[per_game].div(bat['games_played'], axis=0)

        return bat

    # ----------------------------------------------------------------- pitching

    def __get_pitchingstats__(self,season):
        """Return one row per player with pitching aggregates and derived ratios.

        ``season`` selects the rows (ignored in 'all' mode). Games played and
        started are always summed per ``playerId``; the remaining raw columns
        follow ``_PITCHING_COLUMNS``. Unless ``self.method`` is 'mean', columns
        whose name contains 'tot' are finally divided by ``games_played_pitch``.
        """
        eps = self._EPSILON
        rows = self._season_rows(season)
        pit = self._player_frame()

        by_player = rows.groupby('playerId')
        pit['games_played_pitch'] = pit['playerId'].map(by_player['gamesPlayedPitching'].sum())
        pit['games_started_pitch'] = pit['playerId'].map(by_player['gamesStartedPitching'].sum())
        pit['game_nonstarting_pitcher'] = pit['games_played_pitch'] - pit['games_started_pitch']

        agg = self._aggregator(pit, rows, self.grpby_elem)
        for spec in self._PITCHING_COLUMNS:
            pit[spec[0]] = agg(*spec[1:])

        hits = pit['tot_hits_pitch']
        walks = pit['tot_walks_pitch']
        hbp = pit['tot_HBP_pitch']
        homers = pit['tot_homeruns_pitch']
        strikeouts = pit['tot_strikeout_pitch']
        innings = pit['tot_innings_pitch']

        pit['GO_AO_ratio'] = (pit['tot_groundout_pitch'] / (pit['tot_airout_pitch'] + eps)).clip(0, 10)
        pit['IP_GS_ratio'] = (innings / (pit['games_started_pitch'] + eps)).clip(0, 20)
        pit['K_BB_ratio'] = (strikeouts / (walks + eps)).clip(0, 20)
        pit['PC_ST_ratio'] = (pit['tot_pitchesthrown'] / (strikeouts + eps)).clip(0, 50)

        # ERA (earned run average).
        pit['ERA'] = (9 * pit['tot_earnedruns'] / (innings + eps)).clip(0, 20)

        # PTB (pitcher's total bases) = 0.89*(1.255*(H-HR)+4*HR)+0.56*(BB+HBP-IBB)
        pit['PTB'] = (0.89 * ((hits - homers) * 1.255 + 4 * homers)
                      + 0.56 * (walks + hbp - pit['tot_intentionalwalks_pitch'])).clip(0, 20)

        # ERC (Component ERA) = 9*((H+BB+HBP)*PTB / (BFP*IP)) - 0.56.
        # Hits were previously counted twice in place of HBP.
        pit['CERA'] = (9 * ((hits + walks + hbp) * pit['PTB']
                            / (pit['tot_battersfaced'] * innings + eps)) - 0.56).clip(0, 8)

        # DICE (Defense-Independent Component ERA) = 3.00 + (13HR+3(BB+HBP)-2K)/IP
        pit['DICE'] = (3 + (13 * homers + 3 * (walks + hbp) - 2 * strikeouts)
                       / (innings + eps)).clip(0, 8)

        # BAA (batting average against) = H/(BF-BB-HBP-SH-SF-CINT)
        at_bats_against = (pit['tot_battersfaced'] - walks - hbp - pit['tot_sacflies_pitch']
                           - pit['tot_sacbunts_pitch'] - pit['tot_catcherinterference_pitch'])
        pit['BAA'] = hits / (at_bats_against + eps)

        # Power/finesse ratio: (K + BB) / IP.
        pit['power_finesse'] = ((strikeouts + walks) / (innings + eps)).clip(0, 5)

        pit['WHIP'] = ((hits + walks) / (innings + eps)).clip(1, 2)

        if self.method != 'mean':
            # Convert aggregates to per-game rates (no guard for zero games).
            per_game = [c for c in pit.columns if 'tot' in c]
            pit[per_game] = pit[per_game].div(pit['games_played_pitch'], axis=0)

        return pit

    # --------------------------------------------------------------- generators

    def __generate_batting_feats__(self):
        """Return batting features for every season, stacked, NaN-filled with 0.

        Adds a ``season`` column and a ``merger`` key ('<playerId>_<season>').
        In 'all' mode a single block over all rows is returned, labelled 2021.
        Uses the instance's own ``wOBA_weights`` rather than a global.
        """
        return self._stack_seasons(self.__get_battingstats__)


    def __generate_pitching_feats__(self):
        """Return pitching features for every season, stacked, NaN-filled with 0.

        Adds a ``season`` column and a ``merger`` key ('<playerId>_<season>').
        In 'all' mode a single block over all rows is returned, labelled 2021.
        """
        return self._stack_seasons(self.__get_pitchingstats__)



In [None]:
pd.set_option('display.max_columns', 500)

# Per-season features: batting and pitching are computed separately and
# joined on the '<playerId>_<season>' key.
obj = Sabermetrics(
    df=boxscores_df,
    method='sum',
    wOBA_weights=wOBA_weights,
    grpby_elem='playerId',
    season=2018,
)
sabermetrics_batting = obj.__generate_batting_feats__()
sabermetrics_pitching = obj.__generate_pitching_feats__()

# Duplicated identifier columns produced by the merge, plus a raw row count.
cols2_drop = [
    'playerId_x', 'playerId_y', 'playerName_x', 'season_x', 'season_y',
    'playerName_y', 'tot_pitchesthrown_cnt',
]
sabermetrics = (
    sabermetrics_batting
    .merge(sabermetrics_pitching, how='left', on='merger')
    .drop(columns=cols2_drop)
    .fillna(0)
)

sabermetrics.head()


In [None]:
# Persist the per-season features without the row index. `index` is a
# boolean flag, so pass False explicitly instead of relying on None being falsy.
sabermetrics.to_csv('sabermetrics.csv', index=False)


In [None]:
# Same pipeline in 'all' mode: one block aggregated over every row of the
# box scores instead of one block per season.
obj = Sabermetrics(
    df=boxscores_df,
    method='sum',
    wOBA_weights=wOBA_weights,
    grpby_elem='playerId',
    season=2018,
    mode='all',
)
sabermetrics_batting = obj.__generate_batting_feats__()
sabermetrics_pitching = obj.__generate_pitching_feats__()

# Duplicated identifier columns produced by the merge, plus a raw row count.
cols2_drop = [
    'playerId_x', 'playerId_y', 'playerName_x', 'season_x', 'season_y',
    'playerName_y', 'tot_pitchesthrown_cnt',
]
sabermetrics_all = (
    sabermetrics_batting
    .merge(sabermetrics_pitching, how='left', on='merger')
    .drop(columns=cols2_drop)
    .fillna(0)
)

sabermetrics_all

In [None]:
# Start from the targets and attach each player's primary position.
train = pd.merge(targets[targets_cols], players[players_cols], how='left', on='playerId')

# Integer-encode the position in order of first appearance.
position2num = {
    position: code
    for code, position in enumerate(train['primaryPositionName'].unique())
}
train['primaryPositionName'] = train['primaryPositionName'].map(position2num)

# '<playerId>_<year>' key matching the 'merger' column of the sabermetrics table;
# the year is the first four characters of the date's string form.
train['year'] = train['date'].astype(str).str[:4]
train['merger'] = train['playerId'].astype(str) + '_' + train['year']
train = train.merge(sabermetrics, how='left', on='merger')
# Inner join: rows for players missing from player_target_stats are dropped.
train = train.merge(player_target_stats, how='inner', on='playerId')


#train = train.merge(rosters[roster_cols], on=['playerId', 'date'], how='left')
#train = train.merge(scores[scores_cols], on=['playerId', 'date'], how='left')



# Integer-encode player ids in order of first appearance.
player2num = {
    player_id: code
    for code, player_id in enumerate(train['playerId'].unique())
}
#teamid2num = {c: i for i, c in enumerate(train['teamId'].unique())}
#status2num = {c: i for i, c in enumerate(train['status'].unique())}
train['playerId'] = train['playerId'].map(player2num)
#train['teamId'] = train['teamId'].map(teamid2num)
#train['status'] = train['status'].map(status2num)


train.head(2)

In [None]:
# Everything except identifiers, targets and merge helpers is a model feature.
feature_cols = train.columns.drop(
    ['playerId', 'target1', 'target2', 'target3', 'target4', 'date', 'merger', 'year']
)
# Missing feature values are filled with 0.
train[feature_cols] = train[feature_cols].fillna(0)
feature_cols

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the smallest fitting dtype.

    Columns whose dtype is int16/32/64 or float16/32/64 are cast to the first
    of int8/16/32/64 (or float16/32, else float64) whose range strictly
    contains the column's min and max. Other columns are left untouched.
    Note that float16 has limited precision, so float values may be rounded.

    Parameters
    ----------
    df : DataFrame
        Frame to shrink; it is modified in place.
    verbose : bool
        When true (the default) print the resulting memory usage and the
        percentage saved. Previously this flag was ignored.

    Returns
    -------
    DataFrame
        The same ``df`` object.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Integers that fit no candidate (values at the int64 limits)
            # keep their current dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Fall back to float64 when neither narrower type fits; this also
            # covers all-NaN columns, whose min/max comparisons are False.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
from sklearn.model_selection import train_test_split,GroupKFold

#X_train, X_valid, y_train, y_valid = train_test_split(train[feature_cols], train[['target1', 'target2', 'target3', 'target4']], test_size=0.25, random_state=421)
# Hold-out split on the date column: rows with date < 20210401 are used for
# fitting, all remaining rows for validation.
_index = (train['date'] < 20210401)

train_X = train[feature_cols]
train_y = train[['target1', 'target2', 'target3', 'target4']]

x_train, x_valid = (
    train_X.loc[rows].reset_index(drop=True) for rows in (_index, ~_index)
)
y_train, y_valid = (
    train_y.loc[rows].reset_index(drop=True) for rows in (_index, ~_index)
)

# The network input is 3-D, so give every row a middle axis of length 1:
# (n_rows, n_features) -> (n_rows, 1, n_features).
x_train = x_train.to_numpy().reshape((x_train.shape[0], 1, x_train.shape[1]))
x_valid = x_valid.to_numpy().reshape((x_valid.shape[0], 1, x_valid.shape[1]))

# The intermediate frames are no longer needed; release them before training.
del train_X, train_y
gc.collect()

In [None]:
"""def fit_lgbm(x_train, y_train, x_valid, y_valid, params: dict=None, verbose=100):
    oof_pred = np.zeros(len(y_valid), dtype=np.float32)
    model = lgbm.LGBMRegressor(**params)
    model.fit(x_train, y_train, 
        eval_set=[(x_valid, y_valid)],  
        early_stopping_rounds=verbose, 
        verbose=verbose)
    oof_pred = model.predict(x_valid)
    score = mean_absolute_error(oof_pred, y_valid)
    print('mae:', score)
    return oof_pred, model, score


# training lightgbm
params1 = {
 'objective':'mae',
 'reg_alpha': 0.1,
 'reg_lambda': 0.1, 
 'n_estimators': 2000,
 'learning_rate': 0.01,
 'random_state': 42,
 "num_leaves": 64
}

params2 = {
 'objective':'mae',
 'reg_alpha': 0.1,
 'reg_lambda': 0.1, 
 'n_estimators': 100,
 'learning_rate': 0.01,
 'random_state': 42,
 "num_leaves": 25
}

params4 = {
 'objective':'mae',
 'reg_alpha': 0.1,
 'reg_lambda': 0.1, 
 'n_estimators': 2000,
 'learning_rate': 0.01,
 'random_state': 42,
 "num_leaves": 64
}


params = {
 'objective':'mae',
 'reg_alpha': 0.1,
 'reg_lambda': 0.1, 
 'n_estimators': 2000,
 'learning_rate': 0.01,
 'random_state': 42,
 "num_leaves": 64
}


oof1, model1, score1 = fit_lgbm(
    x_train, y_train['target1'],
    x_valid, y_valid['target1'],
    params1
)

oof2, model2, score2 = fit_lgbm(
    x_train, y_train['target2'],
    x_valid, y_valid['target2'],
    params2
)

oof3, model3, score3 = fit_lgbm(
    x_train, y_train['target3'],
    x_valid, y_valid['target3'],
    params
)

oof4, model4, score4 = fit_lgbm(
    x_train, y_train['target4'],
    x_valid, y_valid['target4'],
    params4
)

score = (score1+score2+score3+score4) / 4
print(f'score: {score}')"""

In [None]:
"""import matplotlib.pyplot as plt
import seaborn as sns

feature_imp = pd.DataFrame(sorted(zip(model1.feature_importances_,x_train[feature_cols])), columns=['Value','Feature'])

plt.figure(figsize=(15, 10))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False).head(40))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.show()
#plt.savefig('lgbm_importances-01.png')

print (feature_imp['Feature'].tolist())

feature_imp.tail(20)"""

In [None]:
import tensorflow as tf
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

class Attention(tf.keras.layers.Layer):
    """Attention pooling that collapses axis 1 of its input into a weighted sum.

    Each position along axis 1 is scored with ``tanh(x . W + b)``, the scores
    are soft-maxed, and the input is summed over axis 1 using those weights.
    The weight shapes assume a 3-D input laid out as (batch, steps, features).
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the scoring weight (features, 1) and the bias (steps, 1)."""
        n_steps = input_shape[1]
        n_features = input_shape[-1]
        self.W = self.add_weight(name="att_weight", shape=(n_features, 1), initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(n_steps, 1), initializer="zeros")
        super().build(input_shape)

    def call(self, x):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        K = tf.keras.backend
        # One scalar score per position; drop the trailing singleton axis.
        scores = K.squeeze(K.tanh(K.dot(x, self.W) + self.b), axis=-1)
        # Normalise the scores and restore the axis so they broadcast over x.
        weights = K.expand_dims(K.softmax(scores), axis=-1)
        return K.sum(x * weights, axis=1)

    def compute_output_shape(self, input_shape):
        """Axis 1 is reduced away: (batch, steps, features) -> (batch, features)."""
        batch_dim, feature_dim = input_shape[0], input_shape[-1]
        return (batch_dim, feature_dim)

    def get_config(self):
        """The layer has no constructor arguments of its own to serialise."""
        return super().get_config()

In [None]:


num_columns = len(feature_cols)
num_labels = y_train.shape[-1]


def create_mlp1(attention):
    """Build and compile the LSTM + attention regressor for the four targets.

    The input shape is taken from the module-level ``x_train`` array, i.e.
    ``(x_train.shape[1], x_train.shape[2])``. The graph is:
    BatchNormalization -> LSTM(128, return_sequences=True) -> ``attention`` ->
    ReLU -> Flatten -> Dense(64) + BatchNormalization + ReLU + Dropout(0.1) ->
    Dense(4) with a linear activation.

    Args:
        attention: A layer instance applied to the LSTM sequence output; it
            must reduce the sequence to one vector per sample.

    Returns:
        A compiled ``tf.keras`` model trained with mean absolute error.
    """
    inp = tf.keras.layers.Input(shape=(x_train.shape[1], x_train.shape[2]))
    x = tf.keras.layers.BatchNormalization()(inp)
    # The shape comes from the Input tensor, so the layer needs no input_shape.
    x = tf.keras.layers.LSTM(128, return_sequences=True)(x)
    # NOTE(review): x_train is reshaped to a single time step earlier in the
    # notebook; with one step the softmax weight is 1, so the attention layer
    # passes the LSTM output through unchanged.
    att_out = attention(x)

    x = tf.keras.layers.Activation(tf.keras.activations.relu, name='inp_activation')(att_out)
    x = tf.keras.layers.Flatten()(x)

    # Only the first entry of each list is used (a single hidden block); the
    # remaining entries are kept as ready-made settings for deeper variants.
    hidden_unit = [64, 90, 150, 100]
    drop_dense = [0.1, 0.1, 0.1, 0.15]
    for i in range(1):
        x = tf.keras.layers.Dense(hidden_unit[i], name=f"Dense_layer_{i}")(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Activation(tf.keras.activations.relu)(x)
        x = tf.keras.layers.Dropout(drop_dense[i])(x)

    x = tf.keras.layers.Dense(4)(x)
    out = tf.keras.layers.Activation('linear')(x)

    model = tf.keras.models.Model(inputs=inp, outputs=out)
    model.compile(
        optimizer=tf.keras.optimizers.SGD(learning_rate=0.0001, decay=1e-6,
                                          momentum=0.9, nesterov=True),
        # The model has one output tensor, so it takes exactly one loss. Keras
        # documents a list of losses only for multi-output models.
        loss=tf.keras.losses.MeanAbsoluteError(),
    )
    return model


model = create_mlp1(attention=Attention())
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()


In [None]:

# Start from a freshly initialised model for the actual training run.
model = create_mlp1(attention=Attention())

# Stop once the validation loss has not improved for 5 consecutive epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=5)

# The model is compiled without additional metrics, so 'val_loss' is the only
# validation quantity available to callbacks; the previous key
# 'val_MeanAbsoluteError' is never logged.
# NOTE(review): rlr is deliberately still not passed to fit() so the training
# schedule stays as it was; add it to `callbacks` to enable LR reduction.
rlr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=0,
                        min_delta=1e-4, mode='min')

model.fit(x_train, y_train, validation_data=(x_valid, y_valid), epochs=100,
          batch_size=4096, callbacks=[es])

In [None]:
# Predict the hold-out rows and display the shape of the result
# (one row per validation sample, one column per output unit).
pred = model.predict(x_valid)
pred.shape

## Inference

In [None]:
players_cols = ['playerId', 'primaryPositionName']
rosters_cols = ['playerId', 'teamId', 'status','date']

# Aliases so that JSON literals inside the raw payload resolve when it is
# passed to eval() below.
null = np.nan
true = True
false = False

env = mlb.make_env() # initialize the environment
iter_test = env.iter_test() # iterator which loops over each date in test set

for (test_df, sample_prediction_df) in iter_test: # make predictions here

    sample_prediction_df = sample_prediction_df.reset_index(drop=True)

    # Create the dataset: 'date_playerId' is split on '_' into date and player id.
    sample_prediction_df['playerId'] = sample_prediction_df['date_playerId']\
                                        .map(lambda x: int(x.split('_')[1]))
    sample_prediction_df['date'] = sample_prediction_df['date_playerId']\
                                        .map(lambda x: int(x.split('_')[0]))

    # Box scores for the day; an all-NaN placeholder frame is built when the
    # payload is missing.
    # NOTE(review): eval() executes whatever text arrives in the payload;
    # json.loads would parse it without executing anything. test_scores is
    # also not merged into `test` below, so this work does not reach the model.
    raw_scores = test_df['playerBoxScores'].iloc[0]
    if pd.notna(raw_scores):
        test_scores = pd.DataFrame(eval(raw_scores))
    else:
        test_scores = pd.DataFrame({'playerId': sample_prediction_df['playerId']})
        for col in scores.columns:
            if col == 'playerId': continue
            test_scores[col] = np.nan
    test_scores = test_scores.groupby('playerId').sum().reset_index()

    test = sample_prediction_df[['playerId']].copy()
    test['date'] = sample_prediction_df['date']
    test = test.merge(players[players_cols], on='playerId', how='left')
    test['primaryPositionName'] = test['primaryPositionName'].map(position2num)
    test['year'] = [str(val)[:4] for val in test.date.values]
    test['merger'] = [str(val1)+'_'+val2 for val1,val2 in zip(test.playerId.values,test.year.values)]
    test = test.merge(sabermetrics_all, on='merger', how='left')
    # A left join keeps every requested player, so the prediction rows stay
    # aligned with sample_prediction_df. An inner join drops players that have
    # no target statistics and makes the column assignments below fail.
    # (Assumes player_target_stats holds at most one row per playerId.)
    test = test.merge(player_target_stats, how='left', on='playerId')

    # test = test.merge(rosters[roster_cols], on=['playerId', 'date'], how='left')

    test['playerId'] = test['playerId'].map(player2num)
    #test['teamId'] = test['teamId'].map(teamid2num)
    #test['status'] = test['status'].map(status2num)

    # Training replaces missing feature values with 0 before fitting; do the
    # same here so NaNs do not propagate through the network (they would end
    # up as 0 predictions after the final fillna).
    test_X = test[feature_cols].fillna(0)
    test_X = test_X.values.reshape(test_X.shape[0], 1, test_X.shape[1])

    # predict
    pred = model.predict(test_X)

    # merge submission: output column k holds target(k+1), clipped to [0, 100]
    for k, target in enumerate(['target1', 'target2', 'target3', 'target4']):
        sample_prediction_df[target] = np.clip(pred[:, k], 0, 100)
    sample_prediction_df = sample_prediction_df.fillna(0.)
    sample_prediction_df = sample_prediction_df.drop(columns=['playerId', 'date'])

    env.predict(sample_prediction_df)

In [None]:
"""train_X = train[feature_cols]
train_y = train[['target1', 'target2', 'target3', 'target4']]

_index = (train['date'] < 20210401)
x_train = train_X.loc[_index].reset_index(drop=True)
y_train = train_y.loc[_index].reset_index(drop=True)
x_valid = train_X.loc[~_index].reset_index(drop=True)
y_valid = train_y.loc[~_index].reset_index(drop=True)

del train_X,train_y
gc.collect()"""

In [None]:
"""def fit_lgbm(x_train, y_train, x_valid, y_valid, params: dict=None, verbose=100):
    oof_pred = np.zeros(len(y_valid), dtype=np.float32)
    model = lgbm.LGBMRegressor(**params)
    model.fit(x_train, y_train, 
        eval_set=[(x_valid, y_valid)],  
        early_stopping_rounds=verbose, 
        verbose=verbose)
    oof_pred = model.predict(x_valid)
    score = mean_absolute_error(oof_pred, y_valid)
    print('mae:', score)
    return oof_pred, model, score


# training lightgbm
params1 = {
 'objective':'mae',
 'reg_alpha': 0.1,
 'reg_lambda': 0.1, 
 'n_estimators': 3000,
 'learning_rate': 0.1,
 'random_state': 42,
 "num_leaves": 100
}

params2 = {
 'objective':'mae',
 'reg_alpha': 0.1,
 'reg_lambda': 0.1, 
 'n_estimators': 3000,
 'learning_rate': 0.1,
 'random_state': 42,
 "num_leaves": 22
}

params4 = {
 'objective':'mae',
 'reg_alpha': 0.1,
 'reg_lambda': 0.1, 
 'n_estimators': 3000,
 'learning_rate': 0.1,
 'random_state': 42,
 "num_leaves": 100
}


params = {
 'objective':'mae',
 'reg_alpha': 0.1,
 'reg_lambda': 0.1, 
 'n_estimators': 3000,
 'learning_rate': 0.1,
 'random_state': 42,
 "num_leaves": 100
}


oof1, model1, score1 = fit_lgbm(
    x_train, y_train['target1'],
    x_valid, y_valid['target1'],
    params1
)

oof2, model2, score2 = fit_lgbm(
    x_train, y_train['target2'],
    x_valid, y_valid['target2'],
    params2
)

oof3, model3, score3 = fit_lgbm(
    x_train, y_train['target3'],
    x_valid, y_valid['target3'],
    params
)

oof4, model4, score4 = fit_lgbm(
    x_train, y_train['target4'],
    x_valid, y_valid['target4'],
    params4
)

score = (score1+score2+score3+score4) / 4
print(f'score: {score}')"""

In [None]:
"""players_cols = ['playerId', 'primaryPositionName']
rosters_cols = ['playerId', 'teamId', 'status','date']

null = np.nan
true = True
false = False

env = mlb.make_env() # initialize the environment
iter_test = env.iter_test() # iterator which loops over each date in test set

for (test_df, sample_prediction_df) in iter_test: # make predictions here
    
    sample_prediction_df = sample_prediction_df.reset_index(drop=True)
    
    # creat dataset
    sample_prediction_df['playerId'] = sample_prediction_df['date_playerId']\
                                        .map(lambda x: int(x.split('_')[1]))
    sample_prediction_df['date'] = sample_prediction_df['date_playerId']\
                                        .map(lambda x: int(x.split('_')[0]))
    
    # Dealing with missing values
    if test_df['rosters'].iloc[0] == test_df['rosters'].iloc[0]:
        test_rosters = pd.DataFrame(eval(test_df['rosters'].iloc[0]))
    else:
        test_rosters = pd.DataFrame({'playerId': sample_prediction_df['playerId']})
        for col in rosters.columns:
            if col == 'playerId': continue
            test_rosters[col] = np.nan
            
    if test_df['playerBoxScores'].iloc[0] == test_df['playerBoxScores'].iloc[0]:
        test_scores = pd.DataFrame(eval(test_df['playerBoxScores'].iloc[0]))
    else:
        test_scores = pd.DataFrame({'playerId': sample_prediction_df['playerId']})
        for col in scores.columns:
            if col == 'playerId': continue
            test_scores[col] = np.nan
    test_scores = test_scores.groupby('playerId').sum().reset_index()
    test = sample_prediction_df[['playerId']].copy()
    test['date']=sample_prediction_df['date']
    test = test.merge(players[players_cols], on='playerId', how='left')
    test['primaryPositionName'] = test['primaryPositionName'].map(position2num)
    test['year']=[str(val)[:4] for val in test.date.values]
    test['merger']=[str(val1)+'_'+val2 for val1,val2 in zip(test.playerId.values,test.year.values)]
    test=test.merge(sabermetrics_all, on='merger',how='left')
    test = test.merge(player_target_stats, how='inner', left_on=["playerId"],right_on=["playerId"])

    
    
    test = test.merge(rosters[roster_cols], on=['playerId', 'date'], how='left')
  
    test['playerId'] = test['playerId'].map(player2num)
    test['teamId'] = test['teamId'].map(teamid2num)
    test['status'] = test['status'].map(status2num)

    test_X = test[feature_cols]
    
    # predict
    pred1 = model1.predict(test_X)
    pred2 = model2.predict(test_X)
    pred3 = model3.predict(test_X)
    pred4 = model4.predict(test_X)
    
    # merge submission
    sample_prediction_df['target1'] = np.clip(pred1, 0, 100)
    sample_prediction_df['target2'] = np.clip(pred2, 0, 100)
    sample_prediction_df['target3'] = np.clip(pred3, 0, 100)
    sample_prediction_df['target4'] = np.clip(pred4, 0, 100)
    sample_prediction_df = sample_prediction_df.fillna(0.)
    del sample_prediction_df['playerId'],sample_prediction_df['date']
    
    env.predict(sample_prediction_df)"""