### How to submit notebook
##### 1. Turn off the internet
##### 2. Save version -> Advanced Settings -> Save output for this version
##### 3. Submit! (This prevents the 'Your Notebook cannot use internet access in this competition' error.)

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import mean_absolute_error
from datetime import timedelta
from functools import reduce
from tqdm import tqdm
import lightgbm as lgbm
import mlb
import gc
import os
import warnings

# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings("ignore")
# Raise pandas' display limits so wide/long frames are shown with less truncation.
pd.options.display.max_rows = 200
pd.options.display.max_columns = 100

In [None]:
def reduce_mem_usage(df, verbose=True, allow_float16=True):
    """Downcast the numeric columns of ``df`` in place to the narrowest fitting dtype.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are considered; every other column is left untouched.

    Args:
        df: DataFrame to shrink. It is modified in place and also returned.
        verbose: When true, print the final memory footprint and the
            percentage saved.
        allow_float16: When true (the default, matching the historical
            behaviour) float columns may be stored as float16, which keeps
            only about three significant decimal digits. Pass ``False`` to
            make float32 the narrowest float dtype.

    Returns:
        The same ``df`` object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype's min/max still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # float64 is the fallback; it is also what NaN/inf extremes end up
            # with, because every comparison below is then false.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Quick look at the awards table: read only the first 100 rows and display the head.
a = pd.read_csv("../input/mlb-player-digital-engagement-forecasting/awards.csv", nrows = 100)
a.head()

In [None]:
# Read the first 100 rows of train.csv and display those whose 'awards' field is not null.
t = pd.read_csv("../input/mlb-player-digital-engagement-forecasting/train.csv", nrows = 100)
t[~t['awards'].isna()]

In [None]:
# Competition input directory; players.csv and teams.csv are read from here.
BASE_DIR = Path('../input/mlb-player-digital-engagement-forecasting')
# Directory of the flattened `*_train.csv` tables that the loading cell reads.
TRAIN_DIR = Path('../input/mlb-pdef-train-dataset')

In [None]:
# Columns read from the next-day engagement table: key, the four targets, and the date.
targets_cols = ['playerId', *(f'target{i}' for i in range(1, 5)), 'date']

# Columns read from players.csv.
players_cols = 'playerId primaryPositionName'.split()

# Columns read from teams.csv ('id' is renamed to 'teamId' after loading).
teams_cols = 'id leagueId divisionId'.split()

# Columns read from the rosters table (kept as a list: 'date' is removed later for inference).
rosters_cols = 'playerId teamId status date'.split()

# Columns read from the player box-score table: the player key, batting,
# pitching and fielding statistics, then 'date' and the 'home' flag.
# Written as a whitespace-separated block and split into a list.
scores_cols = """
    playerId
    battingOrder gamesPlayedBatting flyOuts groundOuts runsScored doubles
    triples homeRuns strikeOuts baseOnBalls intentionalWalks hits hitByPitch
    atBats caughtStealing stolenBases groundIntoDoublePlay
    groundIntoTriplePlay plateAppearances totalBases rbi leftOnBase sacBunts
    sacFlies catchersInterference pickoffs
    gamesPlayedPitching gamesStartedPitching completeGamesPitching
    shutoutsPitching winsPitching lossesPitching flyOutsPitching
    airOutsPitching groundOutsPitching runsPitching doublesPitching
    triplesPitching homeRunsPitching strikeOutsPitching baseOnBallsPitching
    intentionalWalksPitching hitsPitching hitByPitchPitching atBatsPitching
    caughtStealingPitching stolenBasesPitching inningsPitched
    saveOpportunities earnedRuns battersFaced outsPitching pitchesThrown
    balls strikes hitBatsmen balks wildPitches pickoffsPitching rbiPitching
    gamesFinishedPitching inheritedRunners inheritedRunnersScored
    catchersInterferencePitching sacBuntsPitching sacFliesPitching saves
    holds blownSaves
    assists putOuts errors chances
    date home
""".split()

# Columns read from the awards table.
awards_cols = 'date playerId awardId'.split()

# Columns read from the player Twitter-followers table.
playerTwitterFollowers_cols = 'playerId numberOfFollowers'.split()

# Columns read from the team Twitter-followers table.
teamTwitterFollowers_cols = 'teamId numberOfFollowers'.split()

# Columns read from the standings table: team key, overall and last-ten
# records, the date, then the home/away splits.
standings_cols = (
    'teamId wins losses lastTenWins lastTenLosses date '
    'homeWins homeLosses awayWins awayLosses'
).split()

# Model input columns, in the order the models are trained and queried with:
# label-encoded identifiers, the raw player id with the box-score statistics,
# per-player target summary statistics, award/follower counts, label-encoded
# league/division, and the team standings columns.
feature_cols = [
    # Label-encoded categorical identifiers.
    *'label_playerId label_primaryPositionName label_teamId label_status'.split(),
    # Raw player id followed by the batting, pitching and fielding statistics.
    *"""
        playerId
        battingOrder gamesPlayedBatting flyOuts groundOuts runsScored doubles
        triples homeRuns strikeOuts baseOnBalls intentionalWalks hits
        hitByPitch atBats caughtStealing stolenBases groundIntoDoublePlay
        groundIntoTriplePlay plateAppearances totalBases rbi leftOnBase
        sacBunts sacFlies catchersInterference pickoffs
        gamesPlayedPitching gamesStartedPitching completeGamesPitching
        shutoutsPitching winsPitching lossesPitching flyOutsPitching
        airOutsPitching groundOutsPitching runsPitching doublesPitching
        triplesPitching homeRunsPitching strikeOutsPitching
        baseOnBallsPitching intentionalWalksPitching hitsPitching
        hitByPitchPitching atBatsPitching caughtStealingPitching
        stolenBasesPitching inningsPitched saveOpportunities earnedRuns
        battersFaced outsPitching pitchesThrown balls strikes hitBatsmen
        balks wildPitches pickoffsPitching rbiPitching gamesFinishedPitching
        inheritedRunners inheritedRunnersScored catchersInterferencePitching
        sacBuntsPitching sacFliesPitching saves holds blownSaves
        assists putOuts errors chances
    """.split(),
    # target{1..4}_{mean, median, std, min, max}, grouped by target.
    *(
        f'{target}_{stat}'
        for target in ('target1', 'target2', 'target3', 'target4')
        for stat in ('mean', 'median', 'std', 'min', 'max')
    ),
    *'awardId_count playernumberOfFollowers teamnumberOfFollowers'.split(),
    *'label_leagueId label_divisionId'.split(),
    *'wins losses lastTenWins lastTenLosses'.split(),
    'home',
    *'homeWins homeLosses awayWins awayLosses'.split(),
]

# Make MLB_PDEF_train_dataset
### Train dataset JSON -> CSV

In [None]:
# !pip install pandarallel 

# import gc

# import numpy as np
# import pandas as pd
# from pathlib import Path

# from pandarallel import pandarallel
# pandarallel.initialize()

# BASE_DIR = Path('../input/mlb-player-digital-engagement-forecasting')
# train = pd.read_csv(BASE_DIR / 'train.csv')
# train = reduce_mem_usage(train)

# null = np.nan
# true = True
# false = False

# for col in train.columns:
#     if col == 'date': 
#         continue
#     _index = train[col].notnull()
#     train.loc[_index, col] = train.loc[_index, col].parallel_apply(lambda x: eval(x))

#     outputs = []
#     for index, date, record in train.loc[_index, ['date', col]].itertuples():
#         _df = pd.DataFrame(record)
#         _df['index'] = index
#         _df['date'] = date
#         _df = reduce_mem_usage(_df)
#         outputs.append(_df)

#     outputs = pd.concat(outputs).reset_index(drop=True)

#     outputs.to_csv(f'{col}_train.csv', index=False)

#     del outputs
#     del train[col]
#     gc.collect()

# Use some columns

In [None]:
# Load every source table with only the columns listed in the *_cols lists,
# aggregate where needed, and downcast each frame with reduce_mem_usage.

# Static lookup tables from the competition directory.
players = reduce_mem_usage(
    pd.read_csv(BASE_DIR / 'players.csv', usecols=players_cols)
)

teams = reduce_mem_usage(
    pd.read_csv(BASE_DIR / 'teams.csv', usecols=teams_cols)
    .rename(columns={'id': 'teamId'})
)

# Daily tables from the flattened training dataset.
rosters = reduce_mem_usage(
    pd.read_csv(TRAIN_DIR / 'rosters_train.csv', usecols=rosters_cols)
)

targets = reduce_mem_usage(
    pd.read_csv(TRAIN_DIR / 'nextDayPlayerEngagement_train.csv', usecols=targets_cols)
)

# Collapse box scores to one row per (player, date) by summing every column.
scores = reduce_mem_usage(
    pd.read_csv(TRAIN_DIR / 'playerBoxScores_train.csv', usecols=scores_cols)
    .groupby(['playerId', 'date'])
    .sum()
    .reset_index()
)

# Number of award rows per player over the whole training file.
awards = pd.read_csv(TRAIN_DIR / 'awards_train.csv', usecols=awards_cols)
awards_count = reduce_mem_usage(
    awards[['playerId', 'awardId']]
    .groupby('playerId')
    .count()
    .reset_index()
    .rename(columns={'awardId': 'awardId_count'})
)
del awards

# Follower counts are summed over all rows of each player / team.
# NOTE(review): if the file holds repeated snapshots per id, this sum is not
# a follower count at any point in time — confirm this is intended.
playerTwitterFollowers = reduce_mem_usage(
    pd.read_csv(TRAIN_DIR / 'playerTwitterFollowers_train.csv', usecols=playerTwitterFollowers_cols)
    .groupby('playerId')
    .sum()
    .reset_index()
    .rename(columns={'numberOfFollowers': 'playernumberOfFollowers'})
)

teamTwitterFollowers = reduce_mem_usage(
    pd.read_csv(TRAIN_DIR / 'teamTwitterFollowers_train.csv', usecols=teamTwitterFollowers_cols)
    .groupby('teamId')
    .sum()
    .reset_index()
    .rename(columns={'numberOfFollowers': 'teamnumberOfFollowers'})
)

standings = reduce_mem_usage(
    pd.read_csv(TRAIN_DIR / 'standings_train.csv', usecols=standings_cols)
)

gc.collect()

# Make Player_Target_Stats

In [None]:
# def make_colnames(t, f):
#     li = ['playerId']
#     for t_name in t:
#         li += [t_name + '_' + f_name for f_name in f]
#     return li
    

In [None]:
# # make player_target_stats
# target_name = ['target1', 'target2', 'target3', 'target4']
# function_name = ['mean', 'median', 'std', 'min', 'max']
# column_name = make_colnames(target_name, function_name)

# targets_stats = targets.groupby('playerId').agg({'target1' : ['mean', 'median', 'std', 'min', 'max'],
#                                                'target2' : ['mean', 'median', 'std', 'min', 'max'],
#                                                'target3' : ['mean', 'median', 'std', 'min', 'max'],
#                                                'target4' : ['mean', 'median', 'std', 'min', 'max']})
# targets_stats.reset_index(inplace = True)


# targets_stats.columns = column_name
# targets_stats.to_csv("player_target_stats.csv", index = False)

In [None]:
# Load the per-player target summary table from an attached dataset and display it.
# Presumably this is the file written by the commented-out aggregation cell above
# (mean/median/std/min/max of each target per player) — verify against that dataset.
player_target_stats = pd.read_csv("../input/player-target-stats2/player_target_stats.csv")
player_target_stats

In [None]:
# create dataset
# Build the training frame: start from the engagement targets and attach each
# feature table in turn.  Every merge is a left join except the one with
# player_target_stats, which is an inner join and therefore drops rows for
# players absent from that table.
print(targets[targets_cols].shape)

train = (
    targets[targets_cols].copy()
    .merge(players, on=['playerId'], how='left')
    .merge(rosters, on=['playerId', 'date'], how='left')
    .merge(scores, on=['playerId', 'date'], how='left')
    .merge(player_target_stats, how='inner', on="playerId")
    .merge(teams, on='teamId', how='left')
    .merge(awards_count, on='playerId', how='left')
)
# Players with no award rows get a count of zero instead of NaN.
train['awardId_count'] = train['awardId_count'].fillna(0)
gc.collect()

train = (
    train
    .merge(playerTwitterFollowers, how='left', on='playerId')
    .merge(teamTwitterFollowers, how='left', on='teamId')
    .merge(standings, how='left', on=['teamId', 'date'])
)
gc.collect()


# Label encoders: each distinct value (in order of first appearance) gets the
# next integer code.  The dictionaries are reused to encode the test data.
player2num = {value: code for code, value in enumerate(train['playerId'].unique())}
position2num = {value: code for code, value in enumerate(train['primaryPositionName'].unique())}
teamid2num = {value: code for code, value in enumerate(train['teamId'].unique())}
status2num = {value: code for code, value in enumerate(train['status'].unique())}
leagueId2num = {value: code for code, value in enumerate(train['leagueId'].unique())}
divisionId2num = {value: code for code, value in enumerate(train['divisionId'].unique())}


# Apply the encoders, adding one label_* column per encoded field.
train['label_playerId'] = train['playerId'].map(player2num)
train['label_primaryPositionName'] = train['primaryPositionName'].map(position2num)
train['label_teamId'] = train['teamId'].map(teamid2num)
train['label_status'] = train['status'].map(status2num)
train['label_leagueId'] = train['leagueId'].map(leagueId2num)
train['label_divisionId'] = train['divisionId'].map(divisionId2num)

In [None]:
# Separate model inputs from the four targets, then split by date:
# rows dated before 20210401 are used for fitting, the rest for validation.
train_X, train_y = train[feature_cols], train[['target1', 'target2', 'target3', 'target4']]

_index = train['date'].lt(20210401)
x_train, y_train = (
    part.loc[_index].reset_index(drop=True) for part in (train_X, train_y)
)
x_valid, y_valid = (
    part.loc[~_index].reset_index(drop=True) for part in (train_X, train_y)
)

In [None]:
def fit_lgbm(x_train, y_train, x_valid, y_valid, params: dict=None, verbose=100,
             early_stopping_rounds=None):
    """Fit one LightGBM regressor and score it on the validation split.

    Args:
        x_train, y_train: Training features and target.
        x_valid, y_valid: Validation features and target; used as the
            evaluation set during fitting and for the reported score.
        params: Keyword arguments for ``lgbm.LGBMRegressor``. ``None`` means
            "use the estimator defaults".
        verbose: Evaluation logging period in boosting rounds (0/False
            disables logging).
        early_stopping_rounds: Early-stopping patience. ``None`` keeps the
            historical behaviour of re-using a positive integer ``verbose``
            as the patience; 0 disables early stopping.

    Returns:
        Tuple ``(oof_pred, model, score)``: validation predictions, the
        fitted estimator, and the validation mean absolute error.
    """
    # `params` defaults to None, which cannot be **-unpacked.
    model = lgbm.LGBMRegressor(**(params or {}))

    if early_stopping_rounds is None:
        # Backward compatibility: this function used to pass `verbose` as the
        # early-stopping patience as well as the logging period.
        is_period = (
            isinstance(verbose, int)
            and not isinstance(verbose, bool)
            and verbose > 0
        )
        early_stopping_rounds = verbose if is_period else 0

    eval_set = [(x_valid, y_valid)]
    if hasattr(lgbm, 'log_evaluation'):
        # Callback API: the only one accepted by LightGBM >= 4, which removed
        # the `early_stopping_rounds` / `verbose` arguments of fit().
        callbacks = [lgbm.log_evaluation(period=int(verbose))]
        if early_stopping_rounds:
            callbacks.append(
                lgbm.early_stopping(
                    stopping_rounds=early_stopping_rounds,
                    verbose=bool(verbose),
                )
            )
        model.fit(x_train, y_train, eval_set=eval_set, callbacks=callbacks)
    else:
        # Older LightGBM without log_evaluation: keep the original keyword form.
        model.fit(
            x_train, y_train,
            eval_set=eval_set,
            early_stopping_rounds=early_stopping_rounds or None,
            verbose=verbose,
        )

    oof_pred = model.predict(x_valid)
    # scikit-learn's signature is (y_true, y_pred); MAE is symmetric, so the
    # value is the same as with the arguments swapped.
    score = mean_absolute_error(y_valid, oof_pred)
    print('mae:', score)
    return oof_pred, model, score



# LightGBM hyper-parameters shared by the four per-target models
# (names follow the parameter-tuning section of the LightGBM docs).
params = dict(
    objective='mae',
    metric='l1',

    # Regularisation / overfitting control.
    min_data_in_leaf=50,
    max_depth=9,
    early_stopping_round=100,
    feature_fraction=0.8,
    bagging_fraction=0.6,
    bagging_freq=3,            # bagging only takes effect with a non-zero bagging_freq

    # Capacity / accuracy.
    num_leaves=250,
    max_bin=300,
    learning_rate=0.05,
    num_iterations=500,
    # NOTE(review): LightGBM documents colsample_bytree as an alias of
    # feature_fraction, which is also set above with a different value —
    # confirm which one is intended.
    colsample_bytree=0.9,
    # NOTE(review): LightGBM documents early stopping as unavailable in dart
    # mode, so the early-stopping settings presumably have no effect — confirm.
    boosting='dart',

    random_state=42,
)

# One model per target, each trained on the same features and scored on the
# validation split.
oof1, model1, score1 = fit_lgbm(x_train, y_train['target1'], x_valid, y_valid['target1'], params)
oof2, model2, score2 = fit_lgbm(x_train, y_train['target2'], x_valid, y_valid['target2'], params)
oof3, model3, score3 = fit_lgbm(x_train, y_train['target3'], x_valid, y_valid['target3'], params)
oof4, model4, score4 = fit_lgbm(x_train, y_train['target4'], x_valid, y_valid['target4'], params)

# Overall score: unweighted mean of the four validation MAEs.
score = (score1 + score2 + score3 + score4) / 4
print(f'score: {score}')

In [None]:
# Inference: for every test day, rebuild the training-time feature frame and
# submit clipped predictions for the four targets.

# 'date' is not needed here: each iteration covers a single day and the merges
# below key on playerId / teamId only.  Rebinding to filtered copies (instead
# of list.remove) keeps this cell safe to run more than once.
rosters_cols = [name for name in rosters_cols if name != 'date']
scores_cols = [name for name in scores_cols if name != 'date']
standings_cols = [
    'teamId',
    'wins',
    'losses',
    'lastTenWins',
    'lastTenLosses',
    'homeWins',
    'homeLosses',
    'awayWins',
    'awayLosses'
]

# The nested tables are JSON text evaluated as Python expressions; these names
# let the JSON literals null/true/false resolve during eval().
null = np.nan
true = True
false = False


def _parse_nested(test_df, column):
    """Return the nested table stored in ``test_df[column]`` as a DataFrame.

    An empty DataFrame is returned when the column is missing or its first
    cell is not a string (e.g. NaN), instead of letting eval() raise.
    """
    if column not in test_df.columns:
        return pd.DataFrame()
    raw = test_df[column].iloc[0]
    if not isinstance(raw, str):
        return pd.DataFrame()
    # NOTE(security): eval() executes arbitrary Python.  It is kept because the
    # payload comes from the competition API and the null -> NaN mapping above
    # depends on it; do not reuse this on untrusted input.
    return pd.DataFrame(eval(raw))


def _merge_daily(test, frame, cols, key):
    """Left-merge ``frame[cols]`` onto ``test`` without changing its row count.

    ``frame`` is reduced to one row per ``key`` (first occurrence wins) so the
    merge cannot duplicate rows of ``test``.  When ``frame`` has no usable
    data, the non-key columns are added to ``test`` as NaN instead.
    """
    if frame.empty or key not in frame.columns:
        for col in cols:
            if col != key and col not in test.columns:
                test[col] = np.nan
        return test
    daily = frame.reindex(columns=cols).drop_duplicates(subset=key)
    return test.merge(daily, on=key, how='left')


env = mlb.make_env() # initialize the environment
iter_test = env.iter_test() # iterator which loops over each date in test set

# make predictions here
for (test_df, sample_prediction_df) in iter_test:
    sample_prediction_df = sample_prediction_df.reset_index(drop=True)

    # 'date_playerId' holds '<date>_<playerId>'; recover the integer player id.
    sample_prediction_df['playerId'] = sample_prediction_df['date_playerId']\
                                        .map(lambda x: int(x.split('_')[1]))

    test_rosters = _parse_nested(test_df, 'rosters')
    test_scores = _parse_nested(test_df, 'playerBoxScores')
    test_standings = _parse_nested(test_df, 'standings')

    # One box-score row per player: sum only the feature columns, so
    # unrelated columns of the payload never take part in the aggregation.
    if not test_scores.empty and 'playerId' in test_scores.columns:
        test_scores = (
            test_scores.reindex(columns=scores_cols)
            .groupby('playerId')
            .sum()
            .reset_index()
        )

    test = sample_prediction_df[['playerId']].copy()

    test = test.merge(
        players[players_cols],
        on='playerId',
        how='left'
    )

    test = _merge_daily(test, test_rosters, rosters_cols, 'playerId')
    test = _merge_daily(test, test_scores, scores_cols, 'playerId')

    # Left join (not inner): a player without target statistics must still get
    # a prediction row, otherwise the predictions no longer line up with
    # sample_prediction_df and the column assignment below fails.
    test = test.merge(
        player_target_stats,
        how='left',
        on='playerId'
    )

    test = test.merge(
        awards_count,
        how='left',
        on='playerId'
    )

    test = test.merge(
        teams,
        how='left',
        on='teamId'
    )
    test['awardId_count'] = test['awardId_count'].fillna(0)

    test = test.merge(
        playerTwitterFollowers,
        how='left',
        on='playerId'
    )

    test = test.merge(
        teamTwitterFollowers,
        how='left',
        on='teamId'
    )

    test = _merge_daily(test, test_standings, standings_cols, 'teamId')

    # Encode with the dictionaries fitted on the training data; values not
    # seen in training map to NaN.
    test['label_playerId'] = test['playerId'].map(player2num)
    test['label_primaryPositionName'] = test['primaryPositionName'].map(position2num)
    test['label_teamId'] = test['teamId'].map(teamid2num)
    test['label_status'] = test['status'].map(status2num)
    test['label_leagueId'] = test['leagueId'].map(leagueId2num)
    test['label_divisionId'] = test['divisionId'].map(divisionId2num)

    test_X = test[feature_cols]

    # predict
    pred1 = model1.predict(test_X)
    pred2 = model2.predict(test_X)
    pred3 = model3.predict(test_X)
    pred4 = model4.predict(test_X)

    # merge submission
    # Predictions are clipped to [0, 100]; rows are positionally aligned with
    # sample_prediction_df because every merge above is a left join on a
    # de-duplicated right-hand table.
    sample_prediction_df['target1'] = np.clip(pred1, 0, 100)
    sample_prediction_df['target2'] = np.clip(pred2, 0, 100)
    sample_prediction_df['target3'] = np.clip(pred3, 0, 100)
    sample_prediction_df['target4'] = np.clip(pred4, 0, 100)
    sample_prediction_df = sample_prediction_df.fillna(0.)
    del sample_prediction_df['playerId']

    env.predict(sample_prediction_df)