In [None]:
import numpy as np
import pandas as pd
import os
import pickle
import gc

from sklearn.metrics import mean_absolute_error
import lightgbm as lgbm

pd.set_option('display.max_columns', 100)

In [None]:
# Use a community dataset that already provides the contents of train as csv / pickle files.
base_dir = '../input/mlb-player-digital-engagement-forecasting/'
train_dir = '../input/mlb-pdef-train-dataset/'

In [None]:
# Predicting target1 through target4 is the goal of the competition.
target_df = pd.read_pickle(train_dir + 'nextDayPlayerEngagement_train.pkl')
target_df.head()

In [None]:
# Print the column dtypes and non-null counts of the target table.
target_df.info()

In [None]:
# Count the distinct values in each column.
# Original author's note: data for 2061 players, 1216 records each
# (presumably one per date -- confirm against the output of this cell).
target_df.nunique()

In [None]:
# Features other than the target encoding follow this notebook:
# https://www.kaggle.com/mlconsult/1-38-lb-lightgbm-with-target-statistics

players = pd.read_csv(base_dir + 'players.csv')
rosters = pd.read_pickle(train_dir + 'rosters_train.pkl')
scores = pd.read_pickle(train_dir + 'playerBoxScores_train.pkl')
# Collapse the box scores to a single row per (playerId, date) by summing.
# numeric_only=True is spelled out on purpose: older pandas dropped non-numeric
# columns from a groupby sum by default, while pandas 2.x tries to sum them
# (string concatenation or a TypeError). Being explicit keeps the result the
# same on every version.
scores = scores.groupby(['playerId', 'date']).sum(numeric_only=True).reset_index()

In [None]:
# Pick the columns that look useful from each source table.
targets_cols = ['playerId', 'target1', 'target2', 'target3', 'target4', 'date']
players_cols = ['playerId', 'primaryPositionName']
rosters_cols = ['playerId', 'teamId', 'status', 'date']

# Box-score columns: the join keys (playerId first, date last) wrap the
# statistic columns, which are listed in the order they are fed to the model.
scores_cols = [
    'playerId',
    'battingOrder', 'gamesPlayedBatting', 'flyOuts', 'groundOuts',
    'runsScored', 'doubles', 'triples', 'homeRuns', 'strikeOuts',
    'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch', 'atBats',
    'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
    'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
    'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference', 'pickoffs',
    'gamesPlayedPitching', 'gamesStartedPitching', 'completeGamesPitching',
    'shutoutsPitching', 'winsPitching', 'lossesPitching', 'flyOutsPitching',
    'airOutsPitching', 'groundOutsPitching', 'runsPitching',
    'doublesPitching', 'triplesPitching', 'homeRunsPitching',
    'strikeOutsPitching', 'baseOnBallsPitching', 'intentionalWalksPitching',
    'hitsPitching', 'hitByPitchPitching', 'atBatsPitching',
    'caughtStealingPitching', 'stolenBasesPitching', 'inningsPitched',
    'saveOpportunities', 'earnedRuns', 'battersFaced', 'outsPitching',
    'pitchesThrown', 'balls', 'strikes', 'hitBatsmen', 'balks',
    'wildPitches', 'pickoffsPitching', 'rbiPitching',
    'gamesFinishedPitching', 'inheritedRunners', 'inheritedRunnersScored',
    'catchersInterferencePitching', 'sacBuntsPitching', 'sacFliesPitching',
    'saves', 'holds', 'blownSaves',
    'assists', 'putOuts', 'errors', 'chances',
    'date',
]

# Flat names for the per-player target statistics: playerId followed by
# target<N>_<stat>, target-major with the statistics in a fixed order.
target_stat_cols = ['playerId'] + [
    f'target{target_no}_{stat}'
    for target_no in (1, 2, 3, 4)
    for stat in ('mean', 'median', 'std', 'max', 'min')
]

# Model inputs: the four label-encoded categoricals, every box-score
# statistic (scores_cols without its two join keys) and every target
# statistic (target_stat_cols without playerId).
feature_cols = [
    'label_playerId', 'label_primaryPositionName', 'label_teamId', 'label_status',
    *scores_cols[1:-1],
    *target_stat_cols[1:],
]

In [None]:
# Join the targets with the player, roster and box-score tables. Every join
# is a left join, so target rows without a match keep NaN in the joined columns.
df = (
    target_df[targets_cols]
    .merge(players[players_cols], on=['playerId'], how='left')
    .merge(rosters[rosters_cols], on=['playerId', 'date'], how='left')
    .merge(scores[scores_cols], on=['playerId', 'date'], how='left')
)

# Label encoding: each distinct value gets an integer code in order of first
# appearance. The dictionaries keep their names because they are pickled later.
player2num = {value: code for code, value in enumerate(df['playerId'].unique())}
df['label_playerId'] = df['playerId'].map(player2num)

position2num = {value: code for code, value in enumerate(df['primaryPositionName'].unique())}
df['label_primaryPositionName'] = df['primaryPositionName'].map(position2num)

teamid2num = {value: code for code, value in enumerate(df['teamId'].unique())}
df['label_teamId'] = df['teamId'].map(teamid2num)

status2num = {value: code for code, value in enumerate(df['status'].unique())}
df['label_status'] = df['status'].map(status2num)

# The source tables are no longer needed once merged; release their memory.
del rosters, scores
gc.collect()

df.head()

In [None]:
# Write each label-encoding dictionary to its own pickle file.
for pkl_name, encoding in (
    ('player2num.pkl', player2num),
    ('position2num.pkl', position2num),
    ('teamid2num.pkl', teamid2num),
    ('status2num.pkl', status2num),
):
    with open(pkl_name, 'wb') as f:
        pickle.dump(encoding, f)

# The dictionaries are on disk now, so drop the in-memory copies. The loop
# variables are deleted too so that no extra reference keeps the last one alive.
del player2num, position2num, teamid2num, status2num, pkl_name, encoding

In [None]:
# Keep only the April-September rows so that the model is a season-only model.
print('before:', df.shape)
# Characters 4-5 of the date rendered as text are taken as the month number.
df['month'] = df['date'].astype('str').str[4:6].astype('int')
df = df.loc[df['month'].between(4, 9)]
print('after:', df.shape)

In [None]:
# Build the data set used for target encoding: per-player mean / median / std /
# max / min of each target.
# Original author's note: only the most recent data is used; this probably
# changes what the feature means, but it gave the best accuracy.
# NOTE(review): the rows selected here (date >= 20210331) appear to be the same
# rows that later form the validation split (date >= 20210401), so the validation
# targets feed their own features and the validation MAE is likely optimistic.
# Confirm this is intended.
target_stat_df = (
    df.loc[df['date'] >= 20210331, ['playerId', 'target1', 'target2', 'target3', 'target4']]
    .groupby('playerId')
    .agg(['mean', 'median', 'std', 'max', 'min'])
)

In [None]:
# Display the aggregated per-player statistics (playerId is the index at this point).
target_stat_df

In [None]:
# Move playerId out of the index and back into a regular column.
target_stat_df = target_stat_df.reset_index()
# Replace the column labels by position with the flat names. This relies on
# target_stat_cols listing the statistics in the same order as the aggregation
# that produced this frame.
# NOTE(review): re-running this cell on its own is not safe -- a second
# reset_index() would presumably add an extra 'index' column and the
# assignment below would then fail on the length mismatch.
target_stat_df.columns = target_stat_cols
target_stat_df.head()

In [None]:
# Persist the per-player target statistics (presumably for reuse at inference time -- confirm).
target_stat_df.to_pickle('target_stat_df.pkl')

In [None]:
# Attach the per-player target statistics to every row of that player
# (left join: players without statistics get NaN in these columns).
df = df.merge(target_stat_df[target_stat_cols], how='left', on='playerId')

# The statistics now live in df; drop the separate frame and collect garbage.
del target_stat_df
gc.collect()

df.head()

In [None]:
# Separate the model inputs from the four targets.
df_X, df_y = df[feature_cols], df[['target1', 'target2', 'target3', 'target4']]

# Time-based hold-out: rows dated before 2021-04-01 are used for training,
# all remaining rows for validation. Indices are reset on every part.
_index = df['date'] < 20210401
x_train, y_train = (part.loc[_index].reset_index(drop=True) for part in (df_X, df_y))
x_valid, y_valid = (part.loc[~_index].reset_index(drop=True) for part in (df_X, df_y))

In [None]:
# Print the shapes of the feature and target frames of both splits as a sanity check.
print('training data shape:' , x_train.shape, y_train.shape)
print('validation data shape:' , x_valid.shape, y_valid.shape)

In [None]:
import lightgbm as lgbm
def fit_lgbm(x_train, y_train, x_valid, y_valid, params: dict=None, verbose=100,
             early_stopping_rounds: int=None):
    """Fit one LightGBM regressor with early stopping and score it by MAE.

    Parameters
    ----------
    x_train, y_train
        Training features and the single target to fit.
    x_valid, y_valid
        Validation features and target. They are used as the evaluation set
        for early stopping and for the reported score.
    params : dict, optional
        Keyword arguments for ``lgbm.LGBMRegressor``. ``None`` (the default)
        means "use the LightGBM defaults".
    verbose : int
        Evaluation log period in boosting rounds. For backward compatibility
        it is also used as the early-stopping patience when
        ``early_stopping_rounds`` is not given.
    early_stopping_rounds : int, optional
        Early-stopping patience. Defaults to ``verbose``, which is what this
        function has always done.

    Returns
    -------
    tuple
        ``(oof_pred, model, score)``: the predictions for ``x_valid``, the
        fitted model, and the mean absolute error on the validation set.
    """
    # `params` defaults to None, and `**None` would raise a TypeError.
    model_params = dict(params) if params else {}
    if early_stopping_rounds is None:
        early_stopping_rounds = verbose

    model = lgbm.LGBMRegressor(**model_params)

    fit_kwargs = {'eval_set': [(x_valid, y_valid)]}
    if hasattr(lgbm, 'log_evaluation'):
        # Newer LightGBM releases: the `early_stopping_rounds=` / `verbose=`
        # arguments of fit() were deprecated and later removed; the same
        # behaviour is requested through callbacks instead.
        callbacks = []
        if early_stopping_rounds and early_stopping_rounds > 0:
            callbacks.append(
                lgbm.early_stopping(early_stopping_rounds, verbose=bool(verbose))
            )
        if verbose and verbose > 0:
            callbacks.append(lgbm.log_evaluation(period=int(verbose)))
        fit_kwargs['callbacks'] = callbacks
    else:
        # Older LightGBM releases without log_evaluation(): keep the original call.
        fit_kwargs['early_stopping_rounds'] = early_stopping_rounds
        fit_kwargs['verbose'] = verbose
    model.fit(x_train, y_train, **fit_kwargs)

    oof_pred = model.predict(x_valid)
    # scikit-learn's argument order is (y_true, y_pred); MAE is symmetric, so
    # the value is unchanged.
    score = mean_absolute_error(y_valid, oof_pred)
    print('mae:', score)
    return oof_pred, model, score


# Train LightGBM: one model per target, all sharing the same hyper-parameters.
# n_estimators is only an upper bound; fit_lgbm trains with early stopping.
params = {
    'objective': 'mae',
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'n_estimators': 100000,
    'learning_rate': 0.1,
    'random_state': 42,
}

# The generator is consumed in order, so the models are fitted target1 -> target4.
(
    (oof1, model1, score1),
    (oof2, model2, score2),
    (oof3, model3, score3),
    (oof4, model4, score4),
) = (
    fit_lgbm(x_train, y_train[target_name], x_valid, y_valid[target_name], params)
    for target_name in ('target1', 'target2', 'target3', 'target4')
)

# Overall score: the plain average of the four validation MAEs.
score = (score1 + score2 + score3 + score4) / 4
print(f'score: {score}')

In [None]:
# Write each fitted model to its own pickle file.
for pkl_name, fitted in (
    ('model1.pkl', model1),
    ('model2.pkl', model2),
    ('model3.pkl', model3),
    ('model4.pkl', model4),
):
    with open(pkl_name, 'wb') as f:
        pickle.dump(fitted, f)

# Remove only the loop helpers; model1..model4 stay available.
del pkl_name, fitted