In [None]:
import numpy as np
import pandas as pd
import os
import pickle
import gc
import lightgbm as lgb

# Allow up to 100 columns to be shown when a DataFrame is displayed.
pd.options.display.max_columns = 100

In [None]:
# Input directories. Both end with '/' because file names are appended to them
# with plain string concatenation in the loading cell.
# mydata_dir: label-encoding maps, pickled models and the target statistics table.
mydata_dir = '../input/mlb-takahashi-targetenc-seasons/'
# train_dir: pickled training tables (rosters, playerBoxScores).
train_dir = '../input/mlb-pdef-train-dataset/'

In [None]:
def _load_pickle(path):
    """Return the object unpickled from the file at ``path``.

    NOTE(security): ``pickle.load`` can execute arbitrary code while loading.
    Only use this on artifacts that come from a trusted source.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


# Static player table and the training tables; the training frames are used
# later only for their column names when a daily table is missing.
players = pd.read_csv('../input/mlb-player-digital-engagement-forecasting/players.csv')
rosters = pd.read_pickle(train_dir + 'rosters_train.pkl')
scores = pd.read_pickle(train_dir + 'playerBoxScores_train.pkl')

# Lookup tables applied with Series.map to build the label_* features.
player2num = _load_pickle(mydata_dir + 'player2num.pkl')
position2num = _load_pickle(mydata_dir + 'position2num.pkl')
teamid2num = _load_pickle(mydata_dir + 'teamid2num.pkl')
status2num = _load_pickle(mydata_dir + 'status2num.pkl')

# One fitted model per target (target1..target4); each is used via .predict().
model1 = _load_pickle(mydata_dir + 'model1.pkl')
model2 = _load_pickle(mydata_dir + 'model2.pkl')
model3 = _load_pickle(mydata_dir + 'model3.pkl')
model4 = _load_pickle(mydata_dir + 'model4.pkl')

# Target statistics table, merged onto the test rows by playerId.
target_stat_df = pd.read_pickle(mydata_dir + 'target_stat_df.pkl')

In [None]:
# Display the loaded per-player target statistics table as a quick sanity check.
target_stat_df

In [None]:
# Competition inference API.
# NOTE(review): `mlb` is not a standard package - presumably it is only
# importable inside the competition runtime; confirm before running elsewhere.
import mlb
env = mlb.make_env()
# Iterator consumed by the prediction loop; each item unpacks to
# (test_df, sample_prediction_df), and results go back through env.predict().
iter_test = env.iter_test()

# Column subsets taken from each input table.
targets_cols = ['playerId', 'target1', 'target2', 'target3', 'target4', 'date']
players_cols = ['playerId', 'primaryPositionName']
rosters_cols = ['playerId', 'teamId', 'status']
scores_cols = [
    'playerId',
    'battingOrder', 'gamesPlayedBatting', 'flyOuts', 'groundOuts',
    'runsScored', 'doubles', 'triples', 'homeRuns', 'strikeOuts',
    'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch', 'atBats',
    'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
    'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
    'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference', 'pickoffs',
    'gamesPlayedPitching', 'gamesStartedPitching', 'completeGamesPitching',
    'shutoutsPitching', 'winsPitching', 'lossesPitching', 'flyOutsPitching',
    'airOutsPitching', 'groundOutsPitching', 'runsPitching',
    'doublesPitching', 'triplesPitching', 'homeRunsPitching',
    'strikeOutsPitching', 'baseOnBallsPitching', 'intentionalWalksPitching',
    'hitsPitching', 'hitByPitchPitching', 'atBatsPitching',
    'caughtStealingPitching', 'stolenBasesPitching', 'inningsPitched',
    'saveOpportunities', 'earnedRuns', 'battersFaced', 'outsPitching',
    'pitchesThrown', 'balls', 'strikes', 'hitBatsmen', 'balks',
    'wildPitches', 'pickoffsPitching', 'rbiPitching',
    'gamesFinishedPitching', 'inheritedRunners', 'inheritedRunnersScored',
    'catchersInterferencePitching', 'sacBuntsPitching', 'sacFliesPitching',
    'saves', 'holds', 'blownSaves',
    'assists', 'putOuts', 'errors', 'chances',
]

# playerId followed by mean/median/std/max/min for each of target1..target4.
target_stat_cols = ['playerId'] + [
    f'target{target_no}_{stat}'
    for target_no in range(1, 5)
    for stat in ('mean', 'median', 'std', 'max', 'min')
]

# Model input columns, in order: the four label-encoded ids, then every
# box-score column except playerId, then every target statistic except playerId.
feature_cols = (
    ['label_playerId', 'label_primaryPositionName', 'label_teamId', 'label_status']
    + scores_cols[1:]
    + target_stat_cols[1:]
)

# Without these names, eval() of the daily table text raises an error:
# they stand in for the null/true/false literals found in that text.
null, true, false = np.nan, True, False

    
# Python stand-ins for the literals that appear in the daily table text, so
# that eval() can parse it without relying on module-level globals.
_JSON_LITERALS = {'null': np.nan, 'true': True, 'false': False}


def _parse_daily_frame(raw, columns, player_ids):
    """Turn one raw ``test_df`` cell into a DataFrame restricted to ``columns``.

    Parameters
    ----------
    raw : str or NaN
        Text of the daily table, or a missing value when the table was not
        delivered for that day.
    columns : list of str
        Columns the returned frame must expose (must include 'playerId').
        Columns absent from the payload are created and filled with NaN.
    player_ids : pandas.Series
        Player ids of the rows to predict; used to build an all-NaN
        placeholder when the table is missing or carries no 'playerId'.

    Returns
    -------
    pandas.DataFrame
        Frame with exactly ``columns``, in that order.
    """
    frame = None
    if isinstance(raw, str):
        # NOTE(security): eval() executes arbitrary code. It is kept because
        # the payload comes from the competition API; do not reuse this on
        # untrusted text.
        frame = pd.DataFrame(eval(raw, dict(_JSON_LITERALS)))
    if frame is None or 'playerId' not in frame.columns:
        # Missing or empty table: one row per requested player, all NaN.
        frame = pd.DataFrame({'playerId': player_ids})
    return frame.reindex(columns=columns)


# Static lookups, de-duplicated once so that the left merges below can never
# multiply rows: one output row per row of sample_prediction_df is required.
_players_lookup = players[players_cols].drop_duplicates(subset='playerId')
_target_stats_lookup = target_stat_df.drop_duplicates(subset='playerId')
_models_by_target = {
    'target1': model1,
    'target2': model2,
    'target3': model3,
    'target4': model4,
}

# For each batch served by the API: build the feature table for the requested
# players, predict the four targets, and submit the filled-in frame.
for (test_df, sample_prediction_df) in iter_test:

    sample_prediction_df = sample_prediction_df.reset_index(drop=True)

    # create dataset: the player id is the second '_'-separated field of date_playerId
    sample_prediction_df['playerId'] = (
        sample_prediction_df['date_playerId'].str.split('_').str[1].astype(int)
    )
    player_ids = sample_prediction_df['playerId']

    # Daily tables; fall back to all-NaN placeholders when one is missing.
    test_rosters = _parse_daily_frame(test_df['rosters'].iloc[0], rosters_cols, player_ids)
    # Keep a single roster row per player so the merge stays one-to-one.
    test_rosters = test_rosters.drop_duplicates(subset='playerId')

    test_scores = _parse_daily_frame(test_df['playerBoxScores'].iloc[0], scores_cols, player_ids)
    # Aggregate only the feature columns, one row per player
    # (a player can have several box-score rows in the same batch).
    test_scores = test_scores.groupby('playerId').sum().reset_index()

    # Every merge is a left join keyed on playerId, so `test` keeps the row
    # count and order of sample_prediction_df. Players without target
    # statistics get NaN features instead of being dropped, which previously
    # made the prediction arrays shorter than the submission frame.
    test = sample_prediction_df[['playerId']].copy()
    test = test.merge(_players_lookup, on='playerId', how='left')
    test = test.merge(test_rosters, on='playerId', how='left')
    test = test.merge(test_scores, on='playerId', how='left')
    test = test.merge(_target_stats_lookup, on='playerId', how='left')

    # Label-encode the categorical ids; unseen values map to NaN.
    test['label_playerId'] = test['playerId'].map(player2num)
    test['label_primaryPositionName'] = test['primaryPositionName'].map(position2num)
    test['label_teamId'] = test['teamId'].map(teamid2num)
    test['label_status'] = test['status'].map(status2num)

    test_X = test[feature_cols]

    # predict: one model per target, clipped to the [0, 100] range
    for target_name, model in _models_by_target.items():
        sample_prediction_df[target_name] = np.clip(model.predict(test_X), 0, 100)

    # merge submission: no NaN may be submitted, and the helper column is removed
    sample_prediction_df = sample_prediction_df.fillna(0.).drop(columns='playerId')

    env.predict(sample_prediction_df)