# Credits
* Original notebook: https://www.kaggle.com/ryanholbrook/getting-started-with-mlb-player-digital-engagement
* Data loading: https://www.kaggle.com/columbia2131/mlb-lightgbm-starter-dataset-code-en-ja (linked from https://www.kaggle.com/ranjeetshrivastav/mib-eda-xgboost)
* Submission loop: https://www.kaggle.com/tensorchoko/mlb-player-lightgbmx and, again, https://www.kaggle.com/columbia2131/mlb-lightgbm-starter-dataset-code-en-ja

# Imports

In [None]:
import gc
import sys
import warnings
from joblib import Parallel, delayed
from pathlib import Path
import copy

import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from statsmodels.tsa.deterministic import (CalendarFourier,
                                           CalendarSeasonality,
                                           CalendarTimeTrend,
                                           DeterministicProcess)

import torch
import torch.nn as nn
from torch.utils.data import Dataset
import lightgbm as lgbm
import xgboost as xgb

from skopt import dummy_minimize

import mlb

# Path Config

In [None]:
# Input locations: the competition files (CSV) and a separate dataset that
# holds the training tables as pickles.
BASE_DIR = Path('../input/mlb-player-digital-engagement-forecasting')
TRAIN_DIR = Path('../input/mlb-pdef-train-dataset')

# Player metadata comes from the competition directory; rosters, engagement
# targets and box scores come from the pickled training dataset.
players = pd.read_csv(BASE_DIR / 'players.csv')
rosters = pd.read_pickle(TRAIN_DIR / 'rosters_train.pkl')
targets = pd.read_pickle(TRAIN_DIR / 'nextDayPlayerEngagement_train.pkl')
scores = pd.read_pickle(TRAIN_DIR / 'playerBoxScores_train.pkl')

# Data loading

In [None]:
# Ids (as strings) of the players flagged for the test set / future
# predictions. A missing flag is treated as "not in the test set".
pids_test = players.loc[
    players['playerForTestSetAndFuturePreds'].fillna(False),
    'playerId',
].astype(str)


def make_playerBoxScores(dfs: dict, features):
    """Build a per-player, per-day box-score table.

    Args:
        dfs: Mapping that holds the raw box scores under 'playerBoxScores'.
        features: List of stat column names to keep.

    Returns:
        DataFrame with a daily-period 'date' column, a string 'playerId'
        column and one float32 column per feature; rows that share the same
        date and player are summed together.
    """
    wanted = ['gameDate', 'playerId'] + features
    float_types = {col: np.float32 for col in features}
    box = (
        dfs['playerBoxScores'][wanted]
        .astype(float_types)
        .astype({'playerId': str})
        .rename(columns={'gameDate': 'date'})
    )
    # Daily periods make several games on one calendar day collapse together
    box['date'] = pd.PeriodIndex(box['date'], freq='D')
    per_day = box.groupby(['date', 'playerId'], as_index=False)
    return per_day.sum()


def make_targets(training_dfs: dict, target_cols=None):
    """Build the engagement-target table, aligned to the feature dates.

    Args:
        training_dfs: Mapping that holds the raw targets under
            'nextDayPlayerEngagement'.
        target_cols: Names of the target columns to cast to float32. When
            omitted, every column whose name starts with 'target' is used.
            (Previously the function read a module-level ``targets`` name,
            which in this notebook is a DataFrame, not a list of names.)

    Returns:
        DataFrame with a daily-period 'date' column (shifted one day back
        from 'engagementMetricsDate'), a string 'playerId' column and the
        float32 target columns. The input frame is not modified.
    """
    Y = training_dfs['nextDayPlayerEngagement'].copy()
    if target_cols is None:
        target_cols = [c for c in Y.columns if str(c).startswith('target')]
    # Set dtypes
    Y = Y.astype({name: np.float32 for name in target_cols})
    Y = Y.astype({'playerId': str})
    # Match target dates to feature dates and create date index
    Y = Y.rename(columns={'engagementMetricsDate': 'date'})
    Y['date'] = pd.to_datetime(Y['date'])
    Y = Y.set_index('date').to_period('D')
    # Shift back one day so the index matches the feature date
    Y.index = Y.index - 1
    return Y.reset_index()


def join_datasets(dfs):
    """Join long tables on (date, playerId) and return them in long format.

    Each table is pivoted to one column per (value column, playerId), the
    wide tables are placed side by side, and the playerId level is stacked
    back into rows.

    Args:
        dfs: Iterable of DataFrames, each with 'date' and 'playerId' columns.

    Returns:
        DataFrame indexed by 'date' with a 'playerId' column followed by the
        value columns of all inputs.
    """
    wide_tables = []
    for table in dfs:
        wide_tables.append(table.pivot(index='date', columns='playerId'))
    side_by_side = pd.concat(wide_tables, axis=1)
    stacked = side_by_side.stack()
    return stacked.reset_index('playerId')


def make_training_data(training_dfs: dict,
                       features,
                       targets,
                       fourier=4,
                       test_size=30):
    """Build wide-format train/validation splits with calendar features.

    Args:
        training_dfs: Mapping with the raw 'playerBoxScores' and
            'nextDayPlayerEngagement' tables.
        features: Box-score column names to use as features.
        targets: Target column names to select from the joined table.
        fourier: Order of the ``CalendarFourier`` terms added to the features.
        test_size: Passed to ``train_test_split``; because ``shuffle=False``
            the validation split is taken from the end of the date index.

    Returns:
        ``(X_train, X_valid, y_train, y_valid, deterministic)`` where
        ``deterministic`` is the fitted ``DeterministicProcess`` so the same
        calendar terms can be generated for later dates.
    """
    # Process dataframes
    X = make_playerBoxScores(training_dfs, features)
    Y = make_targets(training_dfs)
    # Merge for processing
    df = join_datasets([X, Y])
    # Filter for players in test set
    df = df.loc[df.playerId.isin(pids_test), :]
    # Convert from long to wide format
    df = df.pivot(columns="playerId")
    # Restore features and targets
    X = df.loc(axis=1)[features, :]
    Y = df.loc(axis=1)[targets, :]
    # Fill missing values in features. Assign the result instead of filling
    # in place: X is a slice of df, and in-place edits on a slice are
    # unreliable (chained-assignment warnings / copy-on-write).
    X = X.fillna(-1)
    # Create temporal features
    fourier_terms = CalendarFourier(freq='A', order=fourier)
    deterministic = DeterministicProcess(
        index=X.index,
        order=0,
        seasonal=False,  # set to True for weekly seasonality
        additional_terms=[fourier_terms],
    )
    X = pd.concat([X, deterministic.in_sample()], axis=1)
    # Create train / validation splits
    X_train, X_valid, y_train, y_valid = train_test_split(
        X,
        Y,
        test_size=test_size,
        shuffle=False,
    )
    return X_train, X_valid, y_train, y_valid, deterministic

In [None]:
# Column selections used when merging the raw tables.
targets_cols = ['playerId', 'target1', 'target2', 'target3', 'target4', 'date']
players_cols = ['playerId', 'primaryPositionName']
rosters_cols_no_date = ['playerId', 'teamId', 'status']
rosters_cols = rosters_cols_no_date + ['date']

# Numeric box-score columns fed to the models as-is. The two lists derived
# below reuse this one so the three can never drift apart.
fixed_feature_cols = [
    'battingOrder',
    'gamesPlayedBatting',
    'flyOuts',
    'groundOuts',
    'runsScored',
    'doubles',
    'triples',
    'homeRuns',
    'strikeOuts',
    'baseOnBalls',
    'intentionalWalks',
    'hits',
    'hitByPitch',
    'atBats',
    'caughtStealing',
    'stolenBases',
    'groundIntoDoublePlay',
    'groundIntoTriplePlay',
    'plateAppearances',
    'totalBases',
    'rbi',
    'leftOnBase',
    'sacBunts',
    'sacFlies',
    'catchersInterference',
    'pickoffs',
    'gamesPlayedPitching',
    'gamesStartedPitching',
    'completeGamesPitching',
    'shutoutsPitching',
    'winsPitching',
    'lossesPitching',
    'flyOutsPitching',
    'airOutsPitching',
    'groundOutsPitching',
    'runsPitching',
    'doublesPitching',
    'triplesPitching',
    'homeRunsPitching',
    'strikeOutsPitching',
    'baseOnBallsPitching',
    'intentionalWalksPitching',
    'hitsPitching',
    'hitByPitchPitching',
    'atBatsPitching',
    'caughtStealingPitching',
    'stolenBasesPitching',
    'inningsPitched',
    'saveOpportunities',
    'earnedRuns',
    'battersFaced',
    'outsPitching',
    'pitchesThrown',
    'balls',
    'strikes',
    'hitBatsmen',
    'balks',
    'wildPitches',
    'pickoffsPitching',
    'rbiPitching',
    'gamesFinishedPitching',
    'inheritedRunners',
    'inheritedRunnersScored',
    'catchersInterferencePitching',
    'sacBuntsPitching',
    'sacFliesPitching',
    'saves',
    'holds',
    'blownSaves',
    'assists',
    'putOuts',
    'errors',
    'chances',
]

# Label-encoded identifier columns followed by the box-score stats
feature_cols = [
    'label_playerId',
    'label_primaryPositionName',
    'label_teamId',
    'label_status',
] + fixed_feature_cols

# Box-score stats plus the two merge keys
scores_cols = ['playerId'] + fixed_feature_cols + ['date']

In [None]:
# Start from the engagement targets and left-join player metadata, roster
# status and box-score stats onto them.
train = (
    targets[targets_cols]
    .merge(players[players_cols], on=['playerId'], how='left')
    .merge(rosters[rosters_cols], on=['playerId', 'date'], how='left')
    .merge(scores[scores_cols], on=['playerId', 'date'], how='left')
)

# Label encoding (kept below for reference, disabled): mapping categories to
# consecutive integers imposes an ordering and a distance between them that
# does not exist in the data -- nothing says one primaryPositionName should be
# "1" and another "2" rather than "3" -- yet the model would treat those
# numbers as meaningful.
#player2num = {c: i for i, c in enumerate(train['playerId'].unique())}
#position2num = {c: i for i, c in enumerate(train['primaryPositionName'].unique())}
#teamid2num = {c: i for i, c in enumerate(train['teamId'].unique())}
#status2num = {c: i for i, c in enumerate(train['status'].unique())}
#train['label_playerId'] = train['playerId'].map(player2num)
#train['label_primaryPositionName'] = train['primaryPositionName'].map(position2num)
#train['label_teamId'] = train['teamId'].map(teamid2num)
#train['label_status'] = train['status'].map(status2num)

# One-hot encode the categorical columns instead (another valid encoding
# would also do). playerId is left out: it is unclear whether it is a
# relevant feature.
#player_id_dummy = pd.get_dummies(train['playerId'], prefix="playerId")
primary_position_dummy = pd.get_dummies(train['primaryPositionName'], prefix="primaryPositionId")
team_id_dummy = pd.get_dummies(train['teamId'], prefix="teamId")
status_dummy = pd.get_dummies(train['status'], prefix="statusId")

# Numeric box-score features, taken over unchanged
train_X = train[fixed_feature_cols]

In [None]:
# Shapes of the numeric block and of each one-hot block, one per line
print(
    *(part.shape for part in (train_X,
                              primary_position_dummy,
                              team_id_dummy,
                              status_dummy)),
    sep='\n',
)

# Full feature matrix: numeric stats followed by the one-hot columns
X = pd.concat(
    [train_X, primary_position_dummy, team_id_dummy, status_dummy],
    axis='columns',
)

print(X.shape)

In [None]:
# Display the column names of the assembled feature matrix
X.columns

# Train/Valid/Test splits (Test = submission set)

TODO: add features describing each player's past performance, and add a recurrent model to the pipeline for ensemble prediction.
    Take care not to use the player id: treat the data as anonymous time series, and add the recurrent model's prediction to the ensemble voting pool.

In [None]:
# Targets and features share the row order of `train`
train_y = train[['target1', 'target2', 'target3', 'target4']]
train_features = X

# Rows dated before 20210401 form the training split.
# NOTE(review): this compares 'date' with an integer, so the column is
# presumably stored as yyyymmdd integers -- confirm.
_index = train['date'] < 20210401
X_train = train_features[_index].reset_index(drop=True)
y_train = train_y[_index].reset_index(drop=True)

In [None]:
# Everything not selected for training becomes the validation split
X_valid = train_features[~_index].reset_index(drop=True)
y_valid = train_y[~_index].reset_index(drop=True)

In [None]:
# Recombine the train and validation splits into one data set.
# Both splits were re-indexed from 0, so a plain concat would leave duplicate
# index labels; ignore_index=True gives every row a unique label and keeps the
# features and targets aligned row by row.
X_whole = pd.concat([X_train, X_valid], ignore_index=True)
y_whole = pd.concat([y_train, y_valid], ignore_index=True)
whole = pd.concat([X_whole, y_whole], axis=1)

Checking out what we've got:

In [None]:
# Display the feature column names
train_features.columns

In [None]:
# Display the combined (train + validation) feature frame
X_whole

# Dataset

In [None]:
class MLB_Train_Dataset(Dataset):
    """Torch dataset over a feature frame and its matching labels.

    Args:
        x: pandas DataFrame of features, one row per sample.
        y: pandas Series or DataFrame of labels, row-aligned with ``x``.
    """

    def __init__(self, x, y):
        self.features = x
        self.labels = y

    def __len__(self):
        # Number of samples (rows), not the number of cells
        return len(self.features)

    def __getitem__(self, idx):
        """Return ``(feature_row, label_row)`` for the sample at position ``idx``."""
        return (self.features.iloc[idx, :], self.labels.iloc[idx])

In [None]:
# LightGBM settings shared by all four target models
lgbm_params = dict(random_state=42, objective='regression')

# One booster is trained per target column; this one is for target1
lgbm_labels1 = y_train.loc[:, 'target1']
lgbm_dataset1 = lgbm.Dataset(data=X_train, label=lgbm_labels1)
model1 = lgbm.train(params=lgbm_params, train_set=lgbm_dataset1)


In [None]:
# Booster for target2, trained on the same features and settings
lgbm_labels2 = y_train.loc[:, 'target2']
lgbm_dataset2 = lgbm.Dataset(data=X_train, label=lgbm_labels2)
model2 = lgbm.train(params=lgbm_params, train_set=lgbm_dataset2)

In [None]:
# Booster for target3, trained on the same features and settings
lgbm_labels3 = y_train.loc[:, 'target3']
lgbm_dataset3 = lgbm.Dataset(data=X_train, label=lgbm_labels3)
model3 = lgbm.train(params=lgbm_params, train_set=lgbm_dataset3)

In [None]:
# Booster for target4, trained on the same features and settings
lgbm_labels4 = y_train.loc[:, 'target4']
lgbm_dataset4 = lgbm.Dataset(data=X_train, label=lgbm_labels4)
model4 = lgbm.train(params=lgbm_params, train_set=lgbm_dataset4)

In [None]:
#num_rounds = 5

#params = {}
#model5 = xgb()
#labels = whole[['target1', 'target2', 'target3', 'target4']]
#model5_dataset = xgb.DMatrix(whole, label=labels)
#model5 = xgb.train(params, model5_dataset, num_round=num_rounds)

In [None]:
# Create the competition environment object; the submission loop below pulls
# its test iterator from it.
env = mlb.make_env() # initialize the environment

In [None]:
iter_test = env.iter_test() # iterator which loops over each date in test set
for (test_df, sample_prediction_df) in iter_test: # make predictions here
    print(test_df.shape)
    print(sample_prediction_df.shape)
    
    sample_prediction_df = sample_prediction_df.reset_index(drop=True)
    
    #create dataset
    sample_prediction_df['playerId'] = sample_prediction_df['date_playerId']\
                                        .map(lambda x: int(x.split('_')[1]))
    
    print(test_df.shape)
    print(sample_prediction_df.shape)
    
    # Dealing with missing values
    if test_df['rosters'].iloc[0] == test_df['rosters'].iloc[0]:
        test_rosters = pd.DataFrame(eval(test_df['rosters'].iloc[0]))
    else:
        test_rosters = pd.DataFrame({'playerId': sample_prediction_df['playerId']})
        for col in rosters.columns:
            if col == 'playerId': continue
            test_rosters[col] = np.nan
            
    if test_df['playerBoxScores'].iloc[0] == test_df['playerBoxScores'].iloc[0]:
        test_scores = pd.DataFrame(eval(test_df['playerBoxScores'].iloc[0]))
    else:
        test_scores = pd.DataFrame({'playerId': sample_prediction_df['playerId']})
        for col in scores.columns:
            if col == 'playerId': continue
            test_scores[col] = np.nan
       
    print(test_df.shape)
    print(sample_prediction_df.shape)
    
    sub_df = sample_prediction_df[['playerId']].copy()
    sub_df = sub_df.merge(players[players_cols], on='playerId', how='left')
    sub_df = sub_df.merge(test_rosters[rosters_cols_no_date], on='playerId', how='left')
    sub_df = sub_df.merge(scores[scores_cols], on=['playerId', 'date'], how='left')
    sub_df = sub_df.merge(player_target_stats, how='inner', left_on=["playerId"],right_on=["playerId"])
    
    print(test_df.shape)
    print(sample_prediction_df.shape)
    print(X_submission.shape)
    
    primary_position_dummy = pd.get_dummies(sub_df['primaryPositionName'], prefix="primaryPositionId")
    team_id_dummy = pd.get_dummies(sub_df['teamId'], prefix="teamId")
    status_dummy = pd.get_dummies(sub_df['status'], prefix="statusId")
    X_submission = sub_df.loc[:, [fixed_feature_cols]]
    
    X_submission = pd.concat([X_submission, primary_position_dummy, team_id_dummy, status_dummy], axis=1)   
    
    
    # predict
    lgbm_pred1 = model1.predict(X_submission)
    lgbm_pred2 = model2.predict(X_submission)
    lgbm_pred3 = model3.predict(X_submission)
    lgbm_pred4 = model4.predict(X_submission)
    
    # merge submission
    sub_df['target1'] = lgbm_pred1
    sub_df['target2'] = lgbm_pred2
    sub_df['target3'] = lgbm_pred3
    sub_df['target4'] = lgbm_pred4
    
    print(test_df.columns)
    print(sample_prediction_df.columns)
    #sample_prediction_df = sample_prediction_df.fillna(0.)
    #del sample_prediction_df['playerId']

In [None]:
sub_df

In [None]:
sub_df.to_csv('submission.csv')