In [None]:
import numpy as np
import pandas as pd
import os
import pickle
import gc
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from catboost import Pool
from sklearn.metrics import mean_absolute_error

# Show up to 100 columns when a DataFrame is displayed, so wide frames are not truncated
pd.set_option('display.max_columns', 100)

In [None]:
# A helpful public dataset already provides the contents of train as csv / pickle, so use it
# base_dir: official competition input; train_dir: the pre-extracted training tables
base_dir = '../input/mlb-player-digital-engagement-forecasting/'
# train_dir = '../input/mlb-pdef-train-dataset/'
train_dir = '../input/mlb-updated-dataset/'

In [None]:
# Predicting these target1-target4 values is the goal of the competition
# target_df = pd.read_pickle(train_dir + 'nextDayPlayerEngagement_train.pkl')
target_df = pd.read_csv(train_dir + 'nextDayPlayerEngagement_train.csv')
# Preview the first rows of the target table
target_df.head()

In [None]:
# Features other than the target encoding follow this notebook:
# https://www.kaggle.com/mlconsult/1-38-lb-lightgbm-with-target-statistics

players = pd.read_csv(base_dir + 'players.csv')
# rosters = pd.read_pickle(train_dir + 'rosters_train.pkl')
rosters = pd.read_csv(train_dir + 'rosters_train.csv')
# scores = pd.read_pickle(train_dir + 'playerBoxScores_train.pkl')
scores = pd.read_csv(train_dir + 'playerBoxScores_train.csv')
# Collapse the box scores to one row per (playerId, date) by summing the rows of that day.
# numeric_only=True is spelled out because the default changed between pandas versions:
# without it, newer pandas also tries to "sum" text columns instead of dropping them.
scores = scores.groupby(['playerId', 'date']).sum(numeric_only=True).reset_index()

In [None]:
# Pick the columns that look useful, one list per source table
targets_cols = ['playerId'] + [f'target{n}' for n in range(1, 5)] + ['date']
players_cols = [
    'playerId',
    'primaryPositionName',
    'birthCity',
    'birthStateProvince',
    'birthCountry',
    'playerForTestSetAndFuturePreds',
]
rosters_cols = ['playerId', 'teamId', 'status', 'date']

# Box-score columns: the two join keys (playerId first, date last) wrap the stat columns
scores_cols = [
    'playerId',
    'battingOrder', 'gamesPlayedBatting', 'flyOuts', 'groundOuts',
    'runsScored', 'doubles', 'triples', 'homeRuns',
    'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits',
    'hitByPitch', 'atBats', 'caughtStealing', 'stolenBases',
    'groundIntoDoublePlay', 'groundIntoTriplePlay', 'plateAppearances', 'totalBases',
    'rbi', 'leftOnBase', 'sacBunts', 'sacFlies',
    'catchersInterference', 'pickoffs',
    'gamesPlayedPitching', 'gamesStartedPitching', 'completeGamesPitching', 'shutoutsPitching',
    'winsPitching', 'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
    'groundOutsPitching', 'runsPitching', 'doublesPitching', 'triplesPitching',
    'homeRunsPitching', 'strikeOutsPitching', 'baseOnBallsPitching', 'intentionalWalksPitching',
    'hitsPitching', 'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
    'stolenBasesPitching', 'inningsPitched', 'saveOpportunities', 'earnedRuns',
    'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
    'strikes', 'hitBatsmen', 'balks', 'wildPitches',
    'pickoffsPitching', 'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
    'inheritedRunnersScored', 'catchersInterferencePitching', 'sacBuntsPitching', 'sacFliesPitching',
    'saves', 'holds', 'blownSaves',
    'assists', 'putOuts', 'errors', 'chances',
    'date',
]

# Flat names for the per-player target statistics: playerId, then five statistics per target
target_stat_cols = ['playerId'] + [
    f'{target}_{stat}'
    for target in ('target1', 'target2', 'target3', 'target4')
    for stat in ('mean', 'median', 'std', 'max', 'min')
]

# Model inputs: label-encoded ids, every box-score stat (keys stripped),
# every target statistic (playerId stripped), then the raw categorical columns
feature_cols = (
    ['label_playerId', 'label_primaryPositionName', 'label_teamId', 'label_status']
    + scores_cols[1:-1]
    + target_stat_cols[1:]
    + ['status', 'primaryPositionName', 'birthCity', 'birthStateProvince', 'birthCountry']
)

# Columns handed to CatBoost as categorical features
category_cols = [
    'primaryPositionName',
    'birthCity',
    'birthStateProvince',
    'birthCountry',
    'gamesPlayedBatting',
    'status',
    'gamesPlayedPitching',
    'gamesStartedPitching',
    'completeGamesPitching',
    'shutoutsPitching',
    'winsPitching',
    'lossesPitching',
    'saveOpportunities',
    'saves',
    'holds',
    'blownSaves',
]

In [None]:
# Join: start from the targets and left-join player info, roster status and daily box scores
df = (
    target_df[targets_cols]
    .merge(players[players_cols], how='left', on=['playerId'])
    .merge(rosters[rosters_cols], how='left', on=['playerId', 'date'])
    .merge(scores[scores_cols], how='left', on=['playerId', 'date'])
)

# Label encoding: number each distinct value in order of first appearance,
# then add the encoded column next to the raw one
player2num = {value: code for code, value in enumerate(df['playerId'].unique())}
df['label_playerId'] = df['playerId'].map(player2num)

position2num = {value: code for code, value in enumerate(df['primaryPositionName'].unique())}
df['label_primaryPositionName'] = df['primaryPositionName'].map(position2num)

teamid2num = {value: code for code, value in enumerate(df['teamId'].unique())}
df['label_teamId'] = df['teamId'].map(teamid2num)

status2num = {value: code for code, value in enumerate(df['status'].unique())}
df['label_status'] = df['status'].map(status2num)

# The source tables are fully merged into df; free their memory
del rosters, scores
gc.collect()

df.head()

In [None]:
# Persist each label-encoding dictionary as a pickle file in the working directory
# (presumably so the same encodings can be reloaded at inference time — confirm)
with open('player2num.pkl', 'wb') as f:
    pickle.dump(player2num, f)

with open('position2num.pkl', 'wb') as f:
    pickle.dump(position2num, f)

with open('teamid2num.pkl', 'wb') as f:
    pickle.dump(teamid2num, f)

with open('status2num.pkl', 'wb') as f:
    pickle.dump(status2num, f)
    
# The mappings are saved on disk; drop the in-memory copies
del player2num, position2num, teamid2num, status2num

In [None]:
# Build an in-season-only model: keep April through September rows,
# restricted to players flagged for the test set / future predictions
print('before:', df.shape)
# date is a YYYYMMDD value, so characters 4-5 of its string form are the month
df['month'] = df['date'].astype(str).str.slice(4, 6).astype(int)
df = df[df['month'].between(4, 9) & (df['playerForTestSetAndFuturePreds'] == True)]
print('after:', df.shape)

In [None]:
# Build the dataset used for target encoding
# It is built from the most recent data only; the author notes that this changes what the
# feature means, but that it gave the best accuracy
# NOTE(review): this window (date >= 20210331) overlaps the rows later held out for
# validation (date >= 20210401), so validation scores include target information — confirm
target_stat_df = df.loc[df['date']>=20210331, ['playerId', 'target1', 'target2', 'target3', 'target4']]
# Per player: mean, median, std, max and min of each of the four targets
target_stat_df = target_stat_df.groupby('playerId').agg(['mean', 'median', 'std', 'max', 'min'])

In [None]:
# Turn playerId back into a column, then replace the two-level (target, statistic) column
# labels with the flat names; this relies on target_stat_cols listing them in the same order
target_stat_df = target_stat_df.reset_index()
target_stat_df.columns = target_stat_cols

In [None]:
# Save the per-player target statistics (presumably for reuse at inference time — confirm)
target_stat_df.to_pickle('target_stat_df.pkl')

In [None]:
# Attach the per-player target statistics to every row of that player;
# players without statistics get NaN in these columns (left join)
df = df.merge(target_stat_df[target_stat_cols], how='left', on='playerId')

# The statistics are now part of df; free the separate frame
del target_stat_df
gc.collect()

df.head()

In [None]:
# Inspect dtypes and non-null counts of the categorical columns before NaN filling
df[category_cols].info()

In [None]:
# Preview the categorical columns before NaN filling
df[category_cols].head()

In [None]:
# Prepare the categorical columns for CatBoost by filling NaN.
# NOTE(review): the original comment also mentioned converting to strings, but no string
# cast is performed; the text columns simply receive the integer 0 as a placeholder.
for _col in ['primaryPositionName', 'birthCity', 'birthStateProvince', 'birthCountry', 'status']:
    df[_col] = df[_col].fillna(0)

# Count-like columns used as categories: fill NaN with 0 and cast to int
# (the left joins leave them as floats with NaN)
for _col in ['gamesPlayedBatting',
             'gamesPlayedPitching', 'gamesStartedPitching',
             'completeGamesPitching', 'shutoutsPitching',
             'winsPitching', 'lossesPitching',
             'saveOpportunities',
             'saves', 'holds', 'blownSaves']:
    df[_col] = df[_col].fillna(0).astype(int)

# Do not leave the loop variable behind in the notebook namespace
del _col

In [None]:
# Re-check dtypes and non-null counts of the categorical columns after NaN filling
df[category_cols].info()

In [None]:
# Preview the categorical columns after NaN filling
df[category_cols].head()

In [None]:
# Model inputs and the four targets
df_X = df[feature_cols]
df_y = df[['target1', 'target2', 'target3', 'target4']]

# Time-based hold-out: rows dated before 2021-04-01 train the model, the rest validate it
_index = df['date'].lt(20210401)
x_train, y_train = (part.loc[_index].reset_index(drop=True) for part in (df_X, df_y))
x_valid, y_valid = (part.loc[~_index].reset_index(drop=True) for part in (df_X, df_y))

print('training data shape:', x_train.shape, y_train.shape)
print('validation data shape:', x_valid.shape, y_valid.shape)

In [None]:
from catboost import CatBoostRegressor
from catboost import Pool

def fit_cat(x_train, y_train, x_valid, y_valid, params: dict=None, verbose=100):
    """Train one CatBoost regressor and score it on the validation split.

    Args:
        x_train, y_train: training features and a single target column.
        x_valid, y_valid: validation features and the matching target column;
            also used as the eval set from which the best iteration is chosen.
        params: optional CatBoostRegressor keyword overrides merged on top of the
            defaults below. Use the same key names as the defaults (e.g.
            ``n_estimators`` rather than a synonym), since CatBoost is expected
            to reject synonyms given together.
        verbose: CatBoost logging period passed to the model.

    Returns:
        (oof_pred, model, score): validation predictions, the fitted model and
        the validation mean absolute error.

    Reads the module-level ``category_cols`` for the categorical feature names.
    """
    model_params = dict(
        n_estimators=2000,
        learning_rate=0.05,
        loss_function='MAE',
        eval_metric='MAE',
        max_bin=50,
        subsample=0.9,
        colsample_bylevel=0.5,
        verbose=verbose,
    )
    if params:
        model_params.update(params)

    model = CatBoostRegressor(**model_params)
    # early_stopping_rounds is left disabled, as in the original; use_best_model keeps
    # the iteration with the best eval-set MAE
    model.fit(x_train, y_train, cat_features=category_cols,
              use_best_model=True,
              eval_set=(x_valid, y_valid),
              plot=True)
    oof_pred = model.predict(x_valid)
    score = mean_absolute_error(y_valid, oof_pred)
    print('mae:', score)
    return oof_pred, model, score

# LightGBM-style settings left over from another model.
# NOTE(review): several keys (e.g. 'subsample_freq', 'feature_fraction', 'boosting_type': 'gbdt')
# are not valid for CatBoost. fit_cat previously ignored this dict entirely, so it is no longer
# passed in; it stays defined only in case other cells reference `params`.
params = {
'boosting_type': 'gbdt',
'objective':'mae',
'subsample': 0.5,
'subsample_freq': 1,
'learning_rate': 0.03,
'num_leaves': 2**11-1,
'min_data_in_leaf': 2**12-1,
'feature_fraction': 0.5,
'max_bin': 100,
'n_estimators': 2500,
'boost_from_average': False,
"random_seed":42,
    }

# One CatBoost model per target, all trained with fit_cat's default settings
(
    (oof1, model1_cat, score1),
    (oof2, model2_cat, score2),
    (oof3, model3_cat, score3),
    (oof4, model4_cat, score4),
) = [
    fit_cat(x_train, y_train[target], x_valid, y_valid[target])
    for target in ('target1', 'target2', 'target3', 'target4')
]

# Overall score: the plain average of the four per-target MAEs
score = (score1+score2+score3+score4) / 4
print(f'score: {score}')

In [None]:
from catboost import Pool
import sklearn.metrics
def objective(trial):
    """Optuna objective: validation MAE of a CatBoost model for target1.

    Samples one set of CatBoost hyperparameters from ``trial``, fits on the
    module-level training split and returns the mean absolute error on the
    validation split (lower is better).
    """
    train_pool = Pool(x_train, y_train['target1'], cat_features=category_cols)
    test_pool = Pool(x_valid, y_valid['target1'], cat_features=category_cols)

    # Search space
    # NOTE(review): no eval set is passed to fit() below, so the overfitting-detector
    # settings (od_type, od_wait) presumably have no effect — confirm against the CatBoost docs
    params = {
        'iterations' : trial.suggest_int('iterations', 50, 300),
        'depth' : trial.suggest_int('depth', 4, 10),
        'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'random_strength' :trial.suggest_int('random_strength', 0, 100),
        'bagging_temperature' :trial.suggest_loguniform('bagging_temperature', 0.01, 100.00),
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
        'od_wait' :trial.suggest_int('od_wait', 10, 50)
    }

    # Train with the MAE objective so the tuned loss matches both the returned metric and
    # the loss fit_cat trains with; verbose=False keeps the trials from flooding the
    # notebook with per-iteration logs
    model = CatBoostRegressor(loss_function='MAE', verbose=False, **params)
    model.fit(train_pool)

    # Score on the validation split
    preds = model.predict(test_pool)
    return mean_absolute_error(y_valid['target1'], preds)
    

In [None]:
import optuna
# Minimise the validation MAE returned by objective(); the sampler is seeded so that
# re-running the notebook explores the same sequence of trials
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)
print(study.best_trial)