In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import gc
from tqdm.auto import tqdm

### Config

In [None]:
### DATA CONFIG

# Columns to predict.
targets = [
    'target1',
    'target2',
    'target3',
    'target4',
]
# Rows dated before SPLIT are used for training; rows on or after it for validation.
SPLIT = pd.Timestamp('2020-01-01')
# Raw (non-aggregate) model inputs.
features = ['have_game']
# Columns that together identify one row: a player on a given date.
identifiers = [
    'playerId',
    'date',
]

### MODEL CONFIG



In [None]:
def compute_metric(ground_truth,predicted):
    """Return the mean absolute error per target, plus their average as 'CV'.

    Both frames are ordered by the `identifiers` columns and re-indexed from
    zero before subtracting, so rows are compared by position after sorting.

    Parameters
    ----------
    ground_truth, predicted : pd.DataFrame
        Frames containing the `identifiers` and `targets` columns.

    Returns
    -------
    pd.Series
        One entry per target column and a final 'CV' entry holding their mean.
    """
    def _ordered_targets(frame):
        # Sort by the identifier columns, drop the old index, keep only targets.
        return frame.sort_values(identifiers).reset_index(drop=True)[targets]

    abs_error = (_ordered_targets(ground_truth) - _ordered_targets(predicted)).abs()
    metric = abs_error.mean()
    metric.loc['CV'] = metric.mean()
    return metric

def pair_correlation(df1,df2,on=None):
    """Correlate the columns unique to `df1` with the columns unique to `df2`.

    The two frames are joined on the key columns, a correlation matrix is
    computed over the numeric/boolean columns of the joined frame, and only the
    cross block (rows: columns found only in `df1`, columns: columns found only
    in `df2`) is returned.

    Parameters
    ----------
    df1, df2 : pd.DataFrame
        Frames that both contain the key columns.
    on : list of str, optional
        Key columns to join on. Defaults to the notebook-level `identifiers`.

    Returns
    -------
    pd.DataFrame
        Correlations indexed by `df1`-only columns, with `df2`-only columns as
        the column axis.
    """
    keys = identifiers if on is None else on
    merged = pd.merge(df1,df2,on=keys)
    # Pick numeric/bool columns explicitly instead of relying on corr() to drop
    # datetime or string columns: since pandas 2.0 it no longer does so by default.
    correlation_dfs = merged.select_dtypes(include=['number','bool']).corr()
    cols1 = [x for x in df1.columns if x in correlation_dfs.columns and x not in df2.columns]
    cols2 = [x for x in df2.columns if x in correlation_dfs.columns and x not in df1.columns]
    return correlation_dfs.loc[cols1,cols2]

### Read Data

In [None]:
# Engagement targets: rename the date column and move it back one day.
engagements = (
    pd.read_csv('../input/mlb-train-processed-data/nextDayPlayerEngagement.csv', index_col=0)
    .rename(columns={'engagementMetricsDate': 'date'})
    .assign(date=lambda frame: pd.to_datetime(frame['date']) - pd.Timedelta(days=1))
)

# Box scores: one row per (playerId, date), flagged with have_game = 1.
player_box_scores = (
    pd.read_csv('../input/mlb-train-processed-data/playerBoxScores.csv', index_col=0)
    .rename(columns={'gameDate': 'date'})
    .assign(date=lambda frame: pd.to_datetime(frame['date']), have_game=1)
    .drop_duplicates(subset=['playerId', 'date'], keep='first')
)

# Left join keeps every engagement row; rows with no box score get have_game = 0.
df = engagements.merge(player_box_scores, on=['playerId', 'date'], how='left')
df['have_game'] = df['have_game'].fillna(0)

# Release the intermediate frames.
del player_box_scores, engagements
gc.collect()

In [None]:
# Summary statistics of each target, grouped by the have_game flag (groups as columns).
df.groupby('have_game')[targets].describe().transpose()

In [None]:
# Correlation of every numeric column with each target, ranked by the mean
# correlation across targets (top 30 shown).
# Numeric/bool columns are selected explicitly: since pandas 2.0, DataFrame.corr
# no longer drops non-numeric columns by default.
# The target list comes from the config cell rather than a second hard-coded copy.
correlations = df.select_dtypes(include=['number','bool']).corr()[targets]
correlations['mean_corr'] = correlations.mean(axis=1)
correlations.sort_values('mean_corr',ascending=False).head(30)

In [None]:
# Three random rows where the player has a box score (have_game == 1).
df.loc[df['have_game'] == 1].sample(n=3)

In [None]:
# Three random rows where the player has no box score (have_game == 0).
df.loc[df['have_game'] == 0].sample(n=3)

### Truncated Validation

In [None]:
## Row masks: dates before SPLIT train, dates on or after SPLIT validate.
before_split = df['date'] < SPLIT
from_split = df['date'] >= SPLIT

## Train Targets
train_targets = df.loc[before_split, identifiers + targets].reset_index(drop=True)
val_targets = df.loc[from_split, identifiers + targets].reset_index(drop=True)
print(train_targets.shape,val_targets.shape)

## Train Features
train_features = df.loc[before_split, identifiers + features].reset_index(drop=True)
val_features = df.loc[from_split, identifiers + features].reset_index(drop=True)
print(train_features.shape,val_features.shape)

## Compute Aggregate Features From Train
# Per-player median of each target over training rows with have_game == 0.
# train_targets and train_features share the same row order, so the mask lines up.
no_game_rows = train_targets[train_features['have_game'] == 0]
aggregate = no_game_rows.groupby('playerId')[targets].median().reset_index()
aggregate = aggregate.rename(
    columns={name: 'agg_' + name for name in aggregate.columns if 'target' in name}
)

# Attach the training-period aggregates to both splits (inner join on playerId).
train_features = train_features.merge(aggregate, on='playerId')
val_features = val_features.merge(aggregate, on='playerId')
print(train_features.shape,val_features.shape)
print(train_features.date.min(),train_features.date.max(),val_features.date.min(),val_features.date.max())
train_features.sample(3)

In [None]:
# Correlation of each training target (rows) with each training feature (columns).
pair_correlation(train_targets,train_features)

In [None]:
# Same cross-correlation on the validation split; the aggregate features were computed on training rows only.
pair_correlation(val_targets,val_features)

### Modelling

In [None]:
# Union of the game flag and all per-target aggregate columns.
# NOTE: the training and prediction loops reassign `selected_features` per target
# (features + ['agg_' + target]) before using it, so this value is only a reference list.
selected_features = ['have_game','agg_target1', 'agg_target2', 'agg_target3', 'agg_target4']

In [None]:
class Regressor():
    """Baseline model: a per-row baseline value plus a constant game-day offset.

    `X` must have exactly two columns, in this order: a binary `have_game`
    flag and a baseline value (the notebook passes the `agg_<target>` column).
    The prediction is `baseline + have_game * offset`, clipped to
    `[lower, upper]`.

    Parameters
    ----------
    lower, upper : float, optional
        Bounds applied to predictions. Default to 0 and 100.
    """

    def __init__(self,lower=0,upper=100):
        self.lower = lower
        self.upper = upper

    def fit(self,X,y):
        """Learn `offset` = median(y | flag high) - median(y | flag low).

        Parameters
        ----------
        X : pd.DataFrame
            Two columns: the game flag first, the baseline second. Only the flag
            is used for fitting.
        y : pd.Series
            Target values, index-aligned with `X`.

        Returns
        -------
        Regressor
            `self`, so calls can be chained.

        Raises
        ------
        ValueError
            If the flag column has more than two distinct values, or if `X`
            does not have exactly two columns.
        """
        temp = pd.concat([X,y],axis=1)
        temp.columns = ['have_game','_','target']
        # groupby sorts its keys, so the first median belongs to the lower flag value.
        medians = temp.groupby('have_game').target.median()
        if len(medians) == 2:
            neg,pos = medians.values
            self.offset = (pos - neg)
        elif len(medians) < 2:
            # Only one flag value seen: there is no contrast to learn from, so
            # fall back to the plain baseline instead of failing on the unpack.
            self.offset = 0.0
        else:
            raise ValueError(
                "have_game must take at most two distinct values, got %d" % len(medians)
            )
        return self

    def predict(self,X):
        """Return `baseline + flag * offset`, clipped to `[lower, upper]`.

        `X` uses the same two-column layout as in `fit`. Returns a numpy array
        with one value per row.
        """
        offset = X.values[:,0]*self.offset
        return np.clip(X.values[:,1] + offset,self.lower,self.upper)

In [None]:
%%time
from sklearn.linear_model import Ridge  # imported here but not used by the code in this cell

# Sort once by the identifier columns so targets, features and predictions
# line up row for row.
gt = train_targets.sort_values(identifiers).reset_index(drop=True)
train_features_sorted = train_features.sort_values(identifiers).reset_index(drop=True)
predicted = gt[identifiers].copy()

# One model per target, each fed the game flag and that target's aggregate column.
regs = {}
for target in targets:
    selected_features = features + ['agg_' + target]
    X = train_features_sorted[selected_features]
    y = gt[target]
    reg = Regressor()
    reg.fit(X, y)
    print(target,reg.offset)
    predicted[target] = reg.predict(X)
    regs[target] = reg
print("Train Metrics: ",compute_metric(gt,predicted).to_dict())

In [None]:
# Mean predicted value of each target on the training split, grouped by have_game.
predicted.merge(train_features, on=identifiers).groupby('have_game')[targets].mean()

In [None]:
# Mean actual value of each target on the training split, grouped by have_game.
train_targets.merge(train_features, on=identifiers).groupby('have_game')[targets].mean()

In [None]:
# Sort validation features once; predictions are written next to the sorted identifiers.
val_features_sorted = val_features.sort_values(identifiers).reset_index(drop=True)
predicted = val_targets.sort_values(identifiers).reset_index(drop=True)[identifiers]
for target in tqdm(targets):
    selected_features = features + ['agg_' + target]
    X = val_features_sorted[selected_features]
    predicted[target] = regs[target].predict(X)
print(val_targets.shape,predicted.shape)
print("Val Metrics: ",compute_metric(val_targets,predicted).to_dict())

In [None]:
# Mean predicted value of each target on the validation split, grouped by have_game.
predicted.merge(val_features, on=identifiers).groupby('have_game')[targets].mean()

In [None]:
# Mean actual value of each target on the validation split, grouped by have_game.
val_targets.merge(val_features, on=identifiers).groupby('have_game')[targets].mean()

In [None]:
# Side-by-side comparison, one column per source:
#   0 = mean over players of the per-player training medians,
#   1 = mean of the validation predictions,
#   2 = mean over players of the per-player validation medians.
pd.concat([train_targets.groupby('playerId').median().mean(),predicted.mean(),val_targets.groupby('playerId').median().mean()],axis=1)

In [None]:
# Column means side by side: 0 = training targets, 1 = validation predictions, 2 = validation targets.
pd.concat([train_targets.mean(),predicted.mean(),val_targets.mean()],axis=1)

### Training For Test

In [None]:
%%time

## Train Targets & Features
# Use every available row now that validation is done.
train_targets = df[identifiers+targets].reset_index(drop=True)
train_features = df[identifiers+features].reset_index(drop=True)

## Compute Aggregate Features From Train
# The per-player baseline is the median target on days WITHOUT a game, the same
# definition used in the truncated-validation cell. The regressor adds its
# game-day offset on top of this baseline, so computing the median over all days
# would fold part of the game-day effect into the baseline and make the submitted
# model differ from the validated one.
no_game = train_features['have_game'] == 0
aggregate = train_targets[no_game].groupby('playerId')[targets].median().reset_index()
aggregate.columns = ['agg_'+x if 'target' in x else x for x in aggregate.columns]
train_features = pd.merge(train_features,aggregate,on='playerId')

print(train_features.shape,train_targets.shape)

pair_correlation(train_targets,train_features)

In [None]:
%%time
# Sort once by the identifier columns so targets, features and predictions
# line up row for row.
gt = train_targets.sort_values(identifiers).reset_index(drop=True)
train_features_sorted = train_features.sort_values(identifiers).reset_index(drop=True)
predicted = gt[identifiers].copy()
regs = {}
for target in tqdm(targets):
    selected_features = features + ['agg_'+target]
    X = train_features_sorted[selected_features]
    y = gt[target]
    # Create a fresh model for every target. Re-fitting one shared instance would
    # leave every entry of `regs` pointing at the same object, holding only the
    # offset of the last target fitted.
    reg = Regressor()
    reg.fit(X, y)
    predicted[target] = reg.predict(X)
    regs[target] = reg
print("Train Metrics: ",compute_metric(gt,predicted).to_dict())

### Submitting

In [None]:
import json
import matplotlib.pyplot as plt


In [None]:
import mlb
from tqdm.auto import tqdm
env = mlb.make_env() # initialize the environment
iter_test = env.iter_test() # iterator which loops over each date in test set

for (test_df, sample_prediction_df) in tqdm(iter_test):
    # One row per requested prediction. reset_index() turns the frame's index into
    # a column; the merge below needs it to be called 'date'
    # (assumes the index is named 'date' -- TODO confirm against the mlb API).
    template = sample_prediction_df[['date_playerId']].reset_index()
    template['playerId'] = template['date_playerId'].str.split('_').str[1].astype(int)

    # Flatten the JSON box-score payloads; rows without a payload contribute nothing.
    box_score_records = [
        record
        for payload in test_df['playerBoxScores'].dropna()
        for record in json.loads(payload)
    ]
    if box_score_records:
        test_box_scores = pd.DataFrame(box_score_records).rename(columns={'gameDate':'date'})
        # 'YYYY-MM-DD' -> integer YYYYMMDD so it can be joined with the template's date.
        test_box_scores['date'] = test_box_scores['date'].str.replace('-','',regex=False).astype(int)
        test_box_scores['have_game'] = 1
        test_box_scores = test_box_scores.drop_duplicates(identifiers)
        df = pd.merge(template,test_box_scores,on=identifiers,how='left')
        df['have_game'] = df['have_game'].fillna(0)
    else:
        # No box scores at all for this date: an empty DataFrame has no 'date'
        # column to convert or join on, so mark every player as not playing.
        df = template.assign(have_game=0.0)

    test_features = df[identifiers+features+['date_playerId']].reset_index(drop=True)
    # Left join keeps one feature row per requested prediction even when a player
    # has no training aggregate; an inner join would drop the row and break the
    # shape check below.
    test_features = (
        pd.merge(test_features,aggregate,on='playerId',how='left')
        .sort_values(identifiers)
        .reset_index(drop=True)
    )
    # Players without an aggregate fall back to the median aggregate across players.
    agg_columns = ['agg_'+target for target in targets]
    test_features[agg_columns] = test_features[agg_columns].fillna(aggregate[agg_columns].median())

    # Build the output in the same row order as test_features so the predictions,
    # the date index and date_playerId all refer to the same rows.
    predicted = pd.DataFrame(index=pd.Index(test_features['date'], name='date'))
    for target in targets:
        selected_features = features + ['agg_'+target]
        X = test_features[selected_features]
        predicted[target] = regs[target].predict(X)

    predicted['date_playerId'] = test_features['date_playerId'].values
    print(predicted.shape,sample_prediction_df.shape)
    assert predicted.shape==sample_prediction_df.shape
    env.predict(predicted)