In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path
import gc
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place and return it.

    Integer columns are cast to the narrowest signed integer type (int8,
    int16, int32 or int64) whose range contains every value; this is
    lossless. Float columns are cast to float32 only when every value lies
    strictly inside the float16 range and are kept as float64 otherwise, so
    large values stored as floats never lose precision. Columns whose dtype
    is not listed in ``numerics`` are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)  # narrowest first
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a value equal to the type limit still fits.
            # An empty column has NaN min/max, matches nothing and is left as is.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
            # float16 itself is deliberately not used; float32 is the smallest float type produced.
            df[col] = df[col].astype(np.float32)
        else:
            df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a frame that reports zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
%%time
# Player table; its playerId and playerForTestSetAndFuturePreds columns are
# used later to select the players that need predictions.
players = pd.read_csv('/kaggle/input/mlb-player-digital-engagement-forecasting/players.csv')
# players = reduce_mem_usage(players,verbose = False)

# Optional tables, currently disabled (not referenced by the cells shown in
# this notebook); uncomment to load them.
# teams = pd.read_csv('/kaggle/input/mlb-player-digital-engagement-forecasting/teams.csv')
# teams = reduce_mem_usage(teams,verbose = False)

# seasons = pd.read_csv('/kaggle/input/mlb-player-digital-engagement-forecasting/seasons.csv')
# seasons = reduce_mem_usage(seasons,verbose = False)

# Training table: has a 'date' column plus columns holding JSON strings,
# which unpack_raw_data expands into one flat table per column.
train = pd.read_csv('/kaggle/input/mlb-player-digital-engagement-forecasting/train.csv')
# train = reduce_mem_usage(train,verbose = False)

# awards = pd.read_csv('/kaggle/input/mlb-player-digital-engagement-forecasting/awards.csv')
# awards = reduce_mem_usage(awards,verbose = False)

In [None]:
# playerId values of the rows whose playerForTestSetAndFuturePreds flag equals True.
pids_test = players.loc[players['playerForTestSetAndFuturePreds'].eq(True), 'playerId']

In [None]:
# ## Uncomment the lines below to see basic info about the columns of each
# ## metadata table and what its first few rows look like.
# display(players.info())
# display(teams.head())
# display(teams.info())
# display(seasons.head())
# display(seasons.info())
# display(train.head())
# display(train.info())
# display(awards.head())
# display(awards.info())

In [None]:
# Debugging aid: list the names currently defined in the interactive namespace.
%who

In [None]:
def unpack_raw_data(raw_data, dfs_name):
    """Expand JSON-string columns of ``raw_data`` into flat tables.

    For every column named in ``dfs_name``, each non-null cell is parsed with
    ``pd.read_json`` into its own DataFrame, tagged with the row's ``date``
    value in a new ``dailydate`` column, and all of them are concatenated.
    The result is passed through ``reduce_mem_usage`` before being stored.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Must contain a ``date`` column and every column listed in ``dfs_name``.
    dfs_name : iterable of str
        Names of the JSON-string columns to expand.

    Returns
    -------
    dict
        Maps each column name to its flattened DataFrame, with ``dailydate``
        as the first column. A column that has no non-null cells maps to an
        empty DataFrame whose only column is ``dailydate``.
    """
    # Local import keeps this cell self-contained; wrapping the payload in
    # StringIO avoids handing pd.read_json a literal JSON string, which newer
    # pandas versions deprecate.
    from io import StringIO

    unnested_data_dict = dict()

    for col in dfs_name:
        # Only rows that actually carry a payload for this column.
        has_payload = raw_data[col].notna()
        dates = raw_data.loc[has_payload, 'date']
        payloads = raw_data.loc[has_payload, col]

        daily_dfs_collection = []
        for date_value, payload in zip(dates, payloads):
            daily_df = pd.read_json(StringIO(payload))
            daily_df['dailydate'] = date_value
            daily_dfs_collection.append(daily_df)

        if daily_dfs_collection:
            unnested_table = (
                pd.concat(daily_dfs_collection, ignore_index=True)
                # Set and reset index to move 'dailydate' to the front of the frame.
                .set_index('dailydate')
                .reset_index()
            )
        else:
            # pd.concat raises on an empty list, so hand back an empty table instead.
            unnested_table = pd.DataFrame(columns=['dailydate'])

        unnested_data_dict[col] = reduce_mem_usage(unnested_table, verbose=False)

    return unnested_data_dict
    
    

In [None]:
# Columns that make_train_data keeps from the merged table.
features = [
    'dailydate',
    'engagementMetricsDate',
    'target1',
    'target2',
    'target3',
    'target4',
    'flyOuts',
    'strikeOuts',
    'stolenBases',
    'homeRunsPitching',
]
   

In [None]:
def make_train_data(raw_data, features):
    """Join engagement rows with box-score rows and keep the requested columns.

    Parameters
    ----------
    raw_data : dict
        Must hold DataFrames under the keys 'nextDayPlayerEngagement' and
        'playerBoxScores'; both need 'dailydate' and 'playerId' columns,
        which are the join keys.
    features : list of str
        Columns to select from the merged table.

    Returns
    -------
    tuple
        ``(train_data, features)``: the selected columns of the inner join as
        an independent DataFrame, and the ``features`` argument unchanged.
    """
    engagement = raw_data['nextDayPlayerEngagement']
    box_scores = raw_data['playerBoxScores']

    merged = pd.merge(engagement, box_scores, on=['dailydate', 'playerId'], how='inner')
    # Return an explicit copy so callers can modify the result (for example
    # fill missing values) without pandas treating it as a slice of `merged`.
    train_data = merged[features].copy()
    return train_data, features


In [None]:
%%time
# Expand the two JSON columns needed for training, merge them into one
# table, then drop the intermediate dict of unpacked tables.
raw_data = unpack_raw_data(
    train,
    ['nextDayPlayerEngagement', 'playerBoxScores'],
)
train_data, features = make_train_data(raw_data, features)
del raw_data


In [None]:
# Run a garbage-collection pass now that raw_data has been deleted.
gc.collect()

In [None]:
from sklearn.model_selection import train_test_split

# Encode missing values as -1. Rebinding (instead of inplace=True) avoids
# mutating a frame that was derived from another one.
train_data = train_data.fillna(-1)
sample_y = train_data[['target1','target2','target3','target4']]
sample_X = train_data[['flyOuts','strikeOuts','stolenBases','homeRunsPitching']]

# A fixed random_state makes the train/validation split identical on every
# Restart & Run All; the value itself is arbitrary.
X_train, X_test, y_train, y_test = train_test_split(sample_X, sample_y, random_state=42)
del train_data, sample_X, sample_y

In [None]:
# Debugging aid: list the names still defined in the interactive namespace.
%who

In [None]:
from keras.layers import Dense
from keras.models import Sequential

# Fully connected network: 4 inputs -> 6 (relu) -> 6 (relu) -> 4 linear outputs,
# trained and monitored with mean absolute error.
model = Sequential([
    Dense(6, input_dim=4, activation='relu'),
    Dense(6, activation='relu'),
    Dense(4),
])
model.compile(loss='mae', optimizer='adam', metrics=['mae'])

fit_model = model.fit(
    X_train,
    y_train,
    epochs=5,
    validation_data=(X_test, y_test),
)

In [None]:
# Settings for prediction time: the table to read stats from, and the
# stat columns fed to the model (this rebinds `features`).
primary_cols = 'playerBoxScores'
features = [
    'flyOuts',
    'strikeOuts',
    'stolenBases',
    'homeRunsPitching',
]

In [None]:
def make_features(unnested_data_dict, primary_cols, features, sample_prediction_df, player_ids=None):
    """Build one row of model inputs per row of ``sample_prediction_df``.

    The table stored under ``primary_cols`` is summed per ``playerId`` over
    the ``features`` columns, restricted to ``player_ids``, and aligned with
    the player id parsed from each ``date_playerId`` value (the integer after
    the first underscore). Anything missing becomes -1: a player without rows
    in the table, a player not in ``player_ids``, a feature column absent
    from the table, or a stat that is NaN in all of a player's rows.

    Parameters
    ----------
    unnested_data_dict : dict
        Maps table names to DataFrames; ``unnested_data_dict[primary_cols]``
        is the table read here.
    primary_cols : str
        Key of the table to aggregate.
    features : list of str
        Stat columns to aggregate and return, in this order.
    sample_prediction_df : pandas.DataFrame
        Needs a ``date_playerId`` column. It is not modified.
    player_ids : list-like, optional
        Player ids whose stats may be used. Defaults to the notebook-level
        ``pids_test``.

    Returns
    -------
    pandas.DataFrame
        Columns ``features``, a fresh 0..n-1 index, the same row order as
        ``sample_prediction_df`` and no NaN values.
    """
    if player_ids is None:
        player_ids = pids_test

    feature_cols = list(features)
    box_scores = unnested_data_dict[primary_cols]

    if box_scores.empty or 'playerId' not in box_scores.columns:
        # Nothing to aggregate: every player ends up with the -1 sentinel.
        per_player = pd.DataFrame(columns=feature_cols, dtype='float64')
    else:
        per_player = (
            # reindex tolerates a feature column missing from the table (it becomes NaN).
            box_scores.reindex(columns=feature_cols + ['playerId'])
            .groupby('playerId')
            # min_count=1 keeps an all-NaN stat as NaN rather than 0, so it is
            # encoded as -1 below, the same way the training cell fills NaN.
            .sum(min_count=1)
        )
        per_player = per_player.loc[per_player.index.isin(player_ids)]

    row_player_ids = sample_prediction_df['date_playerId'].map(lambda x: int(x.split('_')[1]))

    # Align to the submission rows; players with no aggregated row get NaN
    # here, and every remaining NaN is replaced so the model never sees one.
    test_set = per_player.reindex(row_player_ids.to_numpy())[feature_cols].fillna(-1)
    return test_set.reset_index(drop=True)

In [None]:
# Run a garbage-collection pass before the prediction loop starts.
gc.collect()

In [None]:
import mlb

env = mlb.make_env()  # initialize the environment
iter_test = env.iter_test()  # iterator which loops over each date in test set

for (test_df, sample_prediction_df) in iter_test:

    # Expose the index as a 'date' column, which unpack_raw_data requires.
    test_df = test_df.reset_index().rename(columns={'index': 'date'})

    if test_df[primary_cols].isna().all():
        # No payload for this batch (presumably a date without games -- TODO
        # confirm). unpack_raw_data would have nothing to parse, so feed the
        # model the same -1 "missing" value that is used for absent stats.
        fit_data = pd.DataFrame(
            -1.0,
            index=range(len(sample_prediction_df)),
            columns=features,
        )
    else:
        test_data = unpack_raw_data(test_df, [primary_cols])
        fit_data = make_features(test_data, primary_cols, features, sample_prediction_df)
        # Never hand NaN inputs to the model: they would yield NaN predictions.
        fit_data = fit_data.fillna(-1)

    pred = model.predict(fit_data)
    pred = pred.clip(0, 100)
    pred = pred.round(2)

    sample_prediction_df[['target1','target2','target3','target4']] = pred

    env.predict(sample_prediction_df)