In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import datetime
import json
import os

import datatable as dt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.base import BaseEstimator, TransformerMixin

import mlb

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Print the full path of every file found under the input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Render floats with two decimal places whenever pandas displays them.
pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
def unpack_json(json_like: str) -> pd.DataFrame:
    '''Convert json value into dataframe.

    Parameters
    ----------
    json_like: str
        Json-format string. Values that are not strings at all (e.g. ``None``
        or a float ``NaN`` coming from a missing cell) are tolerated.

    Return
    ------
    converted: pd.DataFrame
        If `json_like` is not a json-format string, return blank dataframe.
    '''
    try:
        parsed = json.loads(json_like)
    except (TypeError, ValueError):
        # TypeError: input is not str/bytes/bytearray (missing cell).
        # ValueError: malformed JSON (json.JSONDecodeError is a subclass).
        return pd.DataFrame()
    # Built outside the ``try`` so genuine constructor errors are not swallowed.
    return pd.DataFrame(parsed)

In [None]:
class Predictor(BaseEstimator, TransformerMixin):
    '''Naive "carry the last known value forward" baseline for target1..target4.

    The training file is read eagerly in the constructor and its
    `nextDayPlayerEngagement` json column is flattened into one long
    dataframe (`self.timeseries`). Predictions are plain look-ups in that
    dataframe; `fit` does nothing.

    Parameters
    ----------
    train_path:
        Path of the training csv. It must contain a `nextDayPlayerEngagement`
        column holding json strings.
    '''

    # Columns copied from the engagement history into every prediction.
    _TARGET_COLUMNS = ('target1', 'target2', 'target3', 'target4')

    def __init__(self, train_path):
        self.train_path = train_path
        self.timeseries = self.load_timeseries_engagement()

    def load_timeseries_engagement(self) -> pd.DataFrame:
        '''Read `self.train_path` and flatten the engagement json into one dataframe.

        Return
        ------
        timeseries: pd.DataFrame
            Concatenation of every unpacked `nextDayPlayerEngagement` cell, with
            `engagementMetricsDate` converted to datetime and an extra `date`
            column equal to `engagementMetricsDate` minus one day.
        '''
        train = dt.fread(self.train_path).to_pandas()
        timeseries = pd.concat(train.nextDayPlayerEngagement.apply(unpack_json).tolist())
        timeseries['engagementMetricsDate'] = pd.to_datetime(timeseries['engagementMetricsDate'])
        timeseries['date'] = timeseries.engagementMetricsDate + datetime.timedelta(days=-1)
        return timeseries

    def get_timeseries_engagement(self, copy=True):
        '''Return the engagement dataframe, as a copy unless `copy` is False.'''
        return self.timeseries.copy() if copy else self.timeseries

    def fit(self, X=None, y=None):
        '''No-op; everything needed is loaded in the constructor.'''
        return self

    def predict(self, X: pd.DataFrame) -> pd.DataFrame:
        '''Predict one row per entry of `X.date_playerId`.

        Parameters
        ----------
        X: pd.DataFrame
            Must have a `date_playerId` column of "YYYYMMDD_playerId" strings.

        Return
        ------
        prediction: pd.DataFrame
            Same index as `X`, columns `date`, `playerId`, `target1`..`target4`.
        '''
        return X.date_playerId.apply(self.predict_by_row)

    def _as_prediction(self, date, player_id, targets) -> pd.Series:
        '''Assemble the fixed-layout result row from anything indexable by target name.'''
        record = {'date': date, 'playerId': player_id}
        for name in self._TARGET_COLUMNS:
            record[name] = targets[name]
        return pd.Series(record)

    def predict_by_row(self, date_playerId: str) -> pd.Series:
        '''Predict the targets for a single "YYYYMMDD_playerId" key.

        Fallback order:

        1. player is known and has rows strictly before the date: use the
           most recent of them;
        2. player is known but has no earlier row: use the player's earliest row;
        3. player is unknown: use the all-player mean of the most recent
           earlier date;
        4. no earlier data at all: use the overall per-target median.

        Return
        ------
        prediction: pd.Series
            Keys `date`, `playerId`, `target1`..`target4`. `date`/`playerId`
            describe the source row and are NaN where no single row applies.
        '''
        parts = date_playerId.split('_')
        date = pd.to_datetime(parts[0], format='%Y%m%d')
        player_id = int(parts[1])
        target_columns = list(self._TARGET_COLUMNS)

        # The table is only read here, so skip the default full copy that
        # would otherwise be made for every single predicted row.
        timeseries = self.get_timeseries_engagement(copy=False)

        player_timeseries = timeseries[timeseries.playerId == player_id]
        if not player_timeseries.empty:  # Find player's engagement data
            player_history = player_timeseries[player_timeseries.date < date]
            if not player_history.empty:  # Find historical data
                source = player_history.sort_values('date').iloc[-1]
            else:  # Go to future...
                source = player_timeseries.sort_values('date').iloc[0]
            return self._as_prediction(source['date'], source['playerId'], source)

        # That player is not seen in training set
        all_player_history = timeseries[timeseries.date < date]
        if not all_player_history.empty:
            # Use the LATEST earlier day; taking the first row of the
            # ascending per-day means would pick the oldest day instead.
            latest_date = all_player_history.date.max()
            on_latest_date = all_player_history[all_player_history.date == latest_date]
            return self._as_prediction(latest_date, np.nan,
                                       on_latest_date[target_columns].mean())

        return self._as_prediction(np.nan, np.nan, timeseries[target_columns].median())
            
            

In [None]:
%%time
predictor = Predictor(train_path='/kaggle/input/mlb-player-digital-engagement-forecasting/train.csv')

In [None]:
%%time
env = mlb.make_env() # initialize the environment
iter_test = env.iter_test() # iterator which loops over each date in test set
for i, (test_df, sample_prediction_df) in enumerate(iter_test):
    prediction = predictor.predict(sample_prediction_df.copy())
    sample_prediction_df['target1'] = prediction['target1']
    sample_prediction_df['target2'] = prediction['target2']
    sample_prediction_df['target3'] = prediction['target3']
    sample_prediction_df['target4'] = prediction['target4']
    env.predict(sample_prediction_df)