This notebook performs only the inference and submission tasks.

In [None]:
import gc
import sys
import warnings
from pathlib import Path

import os

import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
#warnings.simplefilter("ignore")

In [None]:
# Helper function to unpack json found in daily data
def unpack_json(json_str):
    '''
    Parse one JSON cell of the daily data into a DataFrame.

    json_str : JSON text, or a missing marker (None / NaN).

    Returns the parsed pandas DataFrame, or np.nan when json_str is missing.
    '''
    # Function-scope import keeps this cell self-contained.
    from io import StringIO

    if pd.isna(json_str):
        return np.nan
    # Recent pandas deprecates passing literal JSON text to read_json;
    # a file-like buffer is accepted by old and new versions alike.
    return pd.read_json(StringIO(json_str))

# helper function to add is_played column
def is_played_games(row):
    '''
    Flag whether a row carries a game date.

    Returns 0 when row['gameDate'] is null and 1 otherwise.
    '''
    return 0 if pd.isnull(row['gameDate']) else 1

# helper function to add bmi column
def BMI(row):
    '''
    Body-mass index for one row of players.csv.

    Reads row['weight'] and row['heightInches'] and returns
    (weight / heightInches**2) * 703.
    '''
    # NOTE(review): the 703 factor implies pounds and inches - confirm units.
    ratio = row['weight'] / row['heightInches'] ** 2
    return ratio * 703

In [None]:
# Helper function to unpack json found in daily data
def unpack_json(json_str):
    '''
    Turn a JSON string from the daily data into a DataFrame.

    Returns np.nan unchanged-in-spirit for missing input (None / NaN);
    otherwise returns the DataFrame parsed from json_str.
    '''
    from io import StringIO  # function-scope import: no new file-level dependency

    if pd.isna(json_str):
        return np.nan
    # read_json is given a buffer rather than the raw text because newer
    # pandas releases deprecate literal JSON strings as input.
    buffer = StringIO(json_str)
    return pd.read_json(buffer)

# helper function to add is_played column
def is_played_games(row):
    '''
    Return 1 if row['gameDate'] holds a value, 0 if it is null.
    '''
    game_date_missing = pd.isnull(row['gameDate'])
    if game_date_missing:
        return 0
    return 1

In [None]:
def age_now(row):
    '''
    Age in years on the row's date.

    row['date'] is parsed with the YYYYMMDD format and row["DOB"] must
    already be a datetime. The day difference is divided by a flat 365,
    so leap days are not corrected for.
    '''
    as_of = pd.to_datetime(row['date'], format='%Y%m%d')
    elapsed = as_of - row["DOB"]
    return elapsed.days / 365

def age_now_d(row):
    '''
    Age in years on the date embedded in row['date_playerId'].

    The key has the form "<YYYYMMDD>_<playerId>"; only the part before
    the first underscore is used. row["DOB"] must already be a datetime.
    The day difference is divided by a flat 365.
    '''
    date_part, _, _ = row['date_playerId'].partition('_')
    as_of = pd.to_datetime(date_part, format='%Y%m%d')
    return (as_of - row["DOB"]).days / 365

def mlbDebutDays_now(row):
    '''
    Whole days between row["mlbDebutDate"] and the row's date.

    row['date'] is parsed with the YYYYMMDD format; the debut date is
    parsed with pandas' default datetime inference.
    '''
    debut = pd.to_datetime(row["mlbDebutDate"])
    as_of = pd.to_datetime(row['date'], format='%Y%m%d')
    return (as_of - debut).days

In [None]:
def evalYear(row):
    '''
    Calendar year of the row's `date` attribute (YYYYMMDD format).
    '''
    return pd.to_datetime(row.date, format='%Y%m%d').year

In [None]:
def player_data_process(dataset):
    '''
    Prepare the players.csv frame for feature building.

    Works on a copy, so the caller's frame is left untouched.
    - "DOB" is converted to datetime.
    - New column: "bmi" = (weight / heightInches**2) * 703, the same
      formula as BMI().
    No age column is added here.
    '''
    players_df = dataset.copy()
    players_df["DOB"] = pd.to_datetime(players_df["DOB"])  # date of birth
    # Column arithmetic instead of a row-by-row apply: identical values,
    # and it also works when the frame has no rows.
    players_df['bmi'] = (players_df['weight'] / players_df['heightInches'] ** 2) * 703

    return players_df

## Loading Data

In [None]:
# Competition inputs: the example test file and the players table.
test = pd.read_csv("../input/mlb-player-digital-engagement-forecasting/example_test.csv")
players = pd.read_csv("../input/mlb-player-digital-engagement-forecasting/players.csv")
# Derived per-player table; merged on playerId inside the submission loop.
# NOTE(review): presumably supplies the t1_m..t4_m feature columns - confirm.
mean_target_by_player = pd.read_csv("../input/derived-data/mean_target_by_player.csv")

In [None]:
# Show the loaded example test frame as this cell's output.
test

## Loading Model

In [None]:
import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from keras import optimizers

In [None]:
def make_model(n_in):
    '''
    Build and compile the dense network ("ANN").

    n_in : number of input features per sample.

    Architecture: two 50-unit ReLU hidden layers ("d3", "d4") followed by
    a 4-unit linear output layer ("preds"). Compiled with mean absolute
    error loss and the Adamax optimizer. Returns the compiled model.
    '''
    inp = L.Input(name="inputs", shape=(n_in,))
    x = L.Dense(50, activation="relu", name="d3")(inp)
#     x = L.Dropout(0.2)(x)
    x = L.Dense(50, activation="relu", name="d4")(x)
#     x = L.Dropout(0.2)(x)
    preds = L.Dense(4, activation="linear", name="preds")(x)

    model = M.Model(inp, preds, name="ANN")
    # The optimizer is taken from tf.keras so it comes from the same package
    # as the layers and model, and `learning_rate` replaces the deprecated
    # `lr` keyword.
    # NOTE(review): newer Keras optimizers reject `decay`; switch to a
    # learning-rate schedule if the TensorFlow version is upgraded.
    optimizer = tf.keras.optimizers.Adamax(learning_rate=0.001, decay=1e-3)
    model.compile(loss="mean_absolute_error", optimizer=optimizer)
    return model

In [None]:
# Input width 7 matches the seven feature columns listed in FECOLS.
model = make_model(7)

In [None]:
# Load the saved weights from the checkpoint file into the model built above
model.load_weights("../input/weights/model_ANN2.cpkt")

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

## Prediction Task

In [None]:
# Column groups used by the submission loop: FECOLS is selected (in this
# order) as the input to model.predict, TGTCOLS receives the predictions.
FECOLS = ['t1_m','t2_m','t3_m','t4_m','is_played','age','bmi'] #feature columns 
TGTCOLS = ['target1', 'target2', 'target3', 'target4']  #target columns

In [None]:
def add_date_playerid(row):
    '''
    Build the "<next day>_<playerId>" key for a row.

    row['date'] is parsed with the YYYYMMDD format, moved forward by one
    day, and written back as YYYYMMDD digits before being joined with
    row['playerId'] by an underscore.
    '''
    following_day = pd.to_datetime(row['date'], format='%Y%m%d') + pd.DateOffset(1)
    # str(Timestamp) is "YYYY-MM-DD HH:MM:SS"; keep the date part, drop dashes
    day_digits = str(following_day).partition(" ")[0].replace("-", "")
    return day_digits + "_" + str(row['playerId'])

In [None]:
def process_prediction(test_df, sub_mode=True):
    '''
    Expand the per-day "rosters" JSON of test_df into one long frame.

    test_df  : frame with a "rosters" column holding JSON text per day.
    sub_mode : True  -> the day is taken from test_df's index (the frame
                        provided for submission has no formal date column);
               False -> the day is taken from the "date" column.

    Returns a DataFrame with one row per roster entry, a leading "date"
    column and a "date_playerId" key built by add_date_playerid. Days
    whose roster is missing are skipped. If nothing could be expanded, an
    empty frame with the columns date, playerId, date_playerId is returned.
    '''
    dates = test_df.index if sub_mode else test_df['date']

    frames = []
    for date, roster_json in zip(dates, test_df['rosters']):
        roster = unpack_json(roster_json)
        if not isinstance(roster, pd.DataFrame):
            # unpack_json returns NaN for a missing roster; nothing to expand
            continue
        roster.insert(0, 'date', date)  # inserting the given date
        frames.append(roster)

    if not frames:
        return pd.DataFrame(columns=['date', 'playerId', 'date_playerId'])

    # A single concat replaces the repeated DataFrame.append calls, which
    # were removed in pandas 2.0 and copied the whole frame every iteration.
    new_df = pd.concat(frames, ignore_index=True)
    new_df['date_playerId'] = new_df.apply(add_date_playerid, axis=1)
    return new_df

In [None]:
#processing players data (DOB parsed to datetime, bmi column added)
players_processed = player_data_process(players)

In [None]:
# Expand the example test rosters; sub_mode=False takes the day from the
# 'date' column instead of the index. The bare name displays the result.
tempx = process_prediction(test,sub_mode=False)
tempx

In [None]:
# Per-column flag: does the expanded frame contain any null value?
tempx.isnull().any()

In [None]:
# tempx[tempx.is_played==0]
# tempx[tempx.playerId==596049]

In [None]:
# X = tempx[FECOLS].values
# preds = model.predict(X)

In [None]:
# tempx[TGTCOLS] = np.clip(preds,0,100)
# tempx

In [None]:
# Run a garbage-collection pass before the submission loop.
gc.collect()

## Submission

In [None]:
# Set up the mlb environment and the iterator consumed by the submission loop.
# NOTE(review): presumably make_env() may only be called once per session - confirm.
import mlb
env = mlb.make_env() # initialize the environment
iter_test = env.iter_test() # iterator which loops over each date in test set

In [None]:
import json  # cell-local import: used to parse the rosters JSON without eval

# Submission loop: one iteration per (test_df, sample_prediction_df) pair
# yielded by iter_test; each iteration ends with env.predict().
for (test_df, sample_prediction_df) in iter_test:

    # Drop the placeholder targets and derive playerId from the
    # "<date>_<playerId>" key.
    sample_prediction_df = sample_prediction_df.reset_index(drop=True)
    sample_prediction_df.drop(TGTCOLS,axis=1,inplace=True)
    sample_prediction_df['playerId'] = sample_prediction_df['date_playerId']\
                                        .map(lambda x: int(x.split('_')[1]))


    # Dealing with missing values
    rosters_json = test_df['rosters'].iloc[0]
    # json.loads instead of eval: the cell is external data, and eval also
    # fails on the JSON literals null / true / false.
    roster_records = None if pd.isna(rosters_json) else json.loads(rosters_json)
    if roster_records:
        test_rosters = pd.DataFrame(roster_records)
    else:
        # No roster for this day. The previous fallback iterated over an
        # undefined `rosters` frame (NameError). 'gameDate' is the only
        # roster column read downstream (by is_played_games), so an all-NaN
        # 'gameDate' is enough and yields is_played == 0 for every player.
        test_rosters = pd.DataFrame({'playerId': sample_prediction_df['playerId'],
                                     'gameDate': np.nan})

    # NOTE: this rebinds the notebook-level name `test` on every iteration.
    test = sample_prediction_df[['playerId']].copy()
    test = test.merge(test_rosters, on='playerId', how='left')
    test = test.merge(players_processed[['playerId','bmi','DOB']],on='playerId',how='left')
    test.insert(0,'date',test_df.index[0]) #test_df fully extended here (rosters)


    #add new columns: age, is_played
    test['age'] = test.apply(age_now,axis=1)
    test['is_played'] = test.apply(is_played_games,axis=1)
    #adding mean_target_by_player
    test = test.merge(mean_target_by_player,on='playerId',how='left')


    #making predictions : preds
    X_ = test[FECOLS].values
    preds = model.predict(X_)

#     #to debug
#     sample_pred_temp = sample_pred_temp.append(sample_prediction_df)#,ignore_index=True)
#     test_temp = test_temp.append(test) #ignore_index=True)
#     #

    #merging prediction to submission dataframe
    # predictions are clipped to [0, 100]; any remaining NaN becomes 0
    sample_prediction_df[TGTCOLS] = np.clip(preds,0,100)
    sample_prediction_df = sample_prediction_df.fillna(0.)
    del sample_prediction_df['playerId']

    env.predict(sample_prediction_df)
    

In [None]:
# Inspect the first rows of the prediction frame left over from the last loop iteration.
sample_prediction_df.head()

In [None]:
# #For debugging (first uncomment 'to debug' section in the submission loop, then run twice)
# test_temp = test.copy()
# test_temp.drop(test_temp.index,axis=0,inplace=True)

# sample_pred_temp = sample_prediction_df.copy()
# sample_pred_temp.drop(sample_pred_temp.index,axis=0,inplace=True)

In [None]:
# # to reset env.predict() to work
# example_sample_submission = pd.read_csv("../input/mlb-player-digital-engagement-forecasting/example_sample_submission.csv")
# example_sample_submission
# env.predict(example_sample_submission) 