In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import mean_absolute_error
from datetime import timedelta
from functools import reduce
from tqdm import tqdm
import lightgbm as lgbm
import mlb

In [None]:
# Competition input directory; players.csv is read from here.
BASE_DIR = Path('../input/mlb-player-digital-engagement-forecasting')
# Directory with the training tables that later cells read as pickle/CSV files.
TRAIN_DIR = Path('../input/mlb-pdef-train-dataset')

In [None]:
# Player metadata shipped with the competition input.
players = pd.read_csv(BASE_DIR / 'players.csv')

# NOTE(review): unpickling can execute arbitrary code stored in the file, so
# these .pkl files must come from a dataset you trust.
rosters = pd.read_pickle(TRAIN_DIR / 'rosters_train.pkl')
targets = pd.read_pickle(TRAIN_DIR / 'nextDayPlayerEngagement_train.pkl')
followers = pd.read_pickle(TRAIN_DIR / 'playerTwitterFollowers_train.pkl')

# Rename the follower count column to `teamFollowers`
# (presumably to keep it apart from the player-level follower count -- confirm).
team_followers = (
    pd.read_pickle(TRAIN_DIR / 'teamTwitterFollowers_train.pkl')
    .rename(columns={'numberOfFollowers': 'teamFollowers'})
)

# Collapse box scores to one row per (playerId, date) by summing.
# numeric_only=True is explicit because pandas >= 2.0 no longer drops
# non-numeric columns by default when summing groups; this pins the
# numeric-only aggregation on every pandas version.
scores = (
    pd.read_pickle(TRAIN_DIR / 'playerBoxScores_train.pkl')
    .groupby(['playerId', 'date'])
    .sum(numeric_only=True)
    .reset_index()
)

In [None]:
import os

In [None]:
# Show which files are available in the training dataset directory.
os.listdir(TRAIN_DIR)

In [None]:
# Awards table; its `awardDate` and `playerId` columns are used in later cells.
awards = pd.read_csv(TRAIN_DIR / 'awards_train.csv')

In [None]:
# Preview the first rows of the awards table.
awards.head()

# **Target Visualization**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Style and font scale are set in one call: sns.set() re-applies its own
# default style, so a separate sns.set_style('whitegrid') placed before it
# would be discarded.
sns.set(style='whitegrid', font_scale=1.5)

fig, axs = plt.subplots(2, 2, figsize=(20, 10))

# One density plot per target, filling the 2x2 grid row by row.
for ax, target_col in zip(axs.flat, ['target1', 'target2', 'target3', 'target4']):
    sns.kdeplot(ax=ax, data=targets[target_col])

# Horizontal position of the figure title, computed from the right edges of
# the two top-row axes.
bbox = axs[0, 0].get_position()
bbox2 = axs[0, 1].get_position()

center = (bbox2.x1) * 0.4 + (bbox.x1) * 0.25
plt.suptitle('Distribution of targets', x=center)


In [None]:
def draw_kde_plot(col = 'target1'):
    """Plot the density of one target column under four transformations.

    Draws a 2x2 grid of KDE plots for ``targets[col]``: the raw values, the
    values squared, the values raised to the 4th power, and ``log(value + 1)``.
    Each panel's x-axis is labelled with the transformation it shows.

    Args:
        col: Name of the column of the module-level ``targets`` frame to plot.
    """
    # Style and font scale go in one call: sns.set() re-applies its default
    # style, so a preceding sns.set_style('whitegrid') would be discarded.
    sns.set(style='whitegrid', font_scale=1.5)
    fig, axs = plt.subplots(2, 2, figsize=(15, 10))

    values = targets[col]
    # (x-axis label, transformed series) in grid order, row by row.
    panels = [
        ('original', values),
        ('squared', values ** 2),
        ('power 4', values ** 4),
        ('log', np.log1p(values)),  # log(1 + x), defined for x == 0
    ]
    for ax, (label, series) in zip(axs.flat, panels):
        sns.kdeplot(ax=ax, data=series).set_xlabel(label)

    # Horizontal position of the figure title, computed from the right edges
    # of the two top-row axes.
    bbox = axs[0, 0].get_position()
    bbox2 = axs[0, 1].get_position()
    center = (bbox2.x1) * 0.4 + (bbox.x1) * 0.25
    plt.suptitle(f'Transformation of {col}', x=center)
    plt.tight_layout()


# **Target2 has the highest skewness**


In [None]:
# Render the transformation grid once for each of the four targets.
target_columns = ('target1', 'target2', 'target3', 'target4')
for target_name in target_columns:
    draw_kde_plot(target_name)

In [None]:
# Style and font scale go in one call: sns.set() re-applies its default
# style, so a preceding sns.set_style('whitegrid') would be discarded.
sns.set(style='whitegrid', font_scale=1.5)

# Draw the random sample once; with a fixed random_state every target is
# plotted from the same rows anyway, so re-sampling per line is wasted work.
n_points = 10000
sampled = targets.sample(n_points, random_state=500)
# 1-based position of each sampled row, used as the x-axis.
positions = np.arange(1, n_points + 1)

fig, axs = plt.subplots(1, 1, figsize=(20, 8))
# Overlay all four targets on the same axes.
for target_col in ['target1', 'target2', 'target3', 'target4']:
    sns.lineplot(ax=axs, x=positions,
                 y=sampled[target_col],
                 legend='full', label=target_col)

bbox = axs.get_position()
center = 0.5 * (bbox.x1)
plt.suptitle('Comparison of targets', x=center)



In [None]:
# Style and font scale go in one call: sns.set() re-applies its default
# style, so a preceding sns.set_style('ticks') would be discarded.
sns.set(style='ticks', font_scale=1.5)

# Draw the random sample once; with a fixed random_state every panel is
# plotted from the same rows anyway.
n_points = 10000
sampled = targets.sample(n_points, random_state=500)
# 1-based position of each sampled row, used as the x-axis.
positions = np.arange(1, n_points + 1)

fig, axs = plt.subplots(2, 2, figsize=(20, 8))
# One panel per target, filling the 2x2 grid row by row.
for ax, target_col in zip(axs.flat, ['target1', 'target2', 'target3', 'target4']):
    sns.lineplot(ax=ax, x=positions,
                 y=sampled[target_col],
                 legend='full', label=target_col)

# Figure-level title: plt.title() would only label the current axes, not the
# whole 2x2 figure.
fig.suptitle('Comparison of targets, side by side view')


In [None]:
# Parse `date` as YYYYMMDD and keep only the calendar year.
# Note: this adds a `year` column to `targets` in place; later plotting cells rely on it.
targets['year'] = pd.to_datetime(targets['date'], format = '%Y%m%d').dt.year

# We have less data for the fourth year, since the period we need to predict lies in the future

Maybe we should use a different validation strategy

In [None]:
# Bar chart of how many target rows fall in each year.
rows_per_year = targets['year'].value_counts()
rows_per_year.plot.bar()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Style and font scale go in ONE call: each sns.set() re-applies the default
# style for anything not passed, so a second sns.set(font_scale=2) would
# discard the whitegrid style. The former bare sns.color_palette("Set2") only
# returned a palette without applying it (and every plot below passes its own
# palette), so it is dropped.
sns.set(style="whitegrid", font_scale=2)

# Draw the random sample once; with a fixed random_state every panel is
# plotted from the same rows anyway.
n_points = 10000
sampled = targets.sample(n_points, random_state=100)
# 1-based position of each sampled row, used as the x-axis.
positions = np.arange(1, n_points + 1)

fig, axs = plt.subplots(4, 1, figsize=(20, 20))
# One stacked panel per target, with lines coloured by the `year` column.
for ax, target_col in zip(axs, ['target1', 'target2', 'target3', 'target4']):
    sns.lineplot(ax=ax,
                 x=positions,
                 data=sampled,
                 y=target_col,
                 hue='year',
                 palette='tab10',
                 linewidth=2.5)

bbox = axs[0].get_position()
center = 0.5 * (bbox.x1)
plt.suptitle('targets over years', x=center)

In [None]:

def plot_target_for_player(col = 'target1', playerid = 683734, years = (2018, 2019, 2020, 2021)):
    """Overlay one player's target values for several years on a single plot.

    For each requested year, the player's rows of the module-level ``targets``
    frame are plotted against their position within that year (0, 1, 2, ...),
    so the yearly curves can be compared on a common x-axis.

    Args:
        col: Name of the target column to plot on the y-axis.
        playerid: Value of ``playerId`` selecting the player.
        years: Years to draw, one line each; matched against the ``year``
            column of ``targets``. Years with no rows for the player are
            skipped.
    """
    # Style and font scale go in one call: each sns.set() re-applies the
    # default style for anything not passed, so splitting them would discard
    # the whitegrid style.
    sns.set(style="whitegrid", font_scale=2)

    fig, axs = plt.subplots(1, 1, figsize=(20, 8))

    player_rows = targets[targets.playerId == playerid]
    for year in years:
        # Order by date so the positional x-axis really is "n-th record of the year".
        year_rows = player_rows[player_rows.year == year].sort_values('date')
        if year_rows.empty:
            continue
        sns.lineplot(ax=axs,
                     # Size the x-axis from the data instead of assuming
                     # 365/366/120 rows: seaborn rejects an x vector whose
                     # length differs from `data`.
                     x=np.arange(len(year_rows)),
                     data=year_rows,
                     y=col,
                     label=str(year),
                     linewidth=2.5)

    bbox = axs.get_position()
    center = 0.5 * (bbox.x1)
    plt.suptitle(f'player Id {playerid}', x=center)

# There is definitely seasonality in the targets. It also seems like we could remove the year 2018 from modelling

In [None]:
# Default arguments: target1 for player 683734.
plot_target_for_player()

In [None]:
# target2 for the default player (683734).
plot_target_for_player('target2')

In [None]:
# target3 for the default player (683734).
plot_target_for_player('target3')

In [None]:
# target4 for the default player (683734).
plot_target_for_player('target4')

In [None]:
# Same plot for a second player (477132), starting with target1.
plot_target_for_player('target1',477132)

In [None]:
# target2 for player 477132.
plot_target_for_player('target2',477132)

In [None]:
# target3 for player 477132.
plot_target_for_player('target3',477132)

In [None]:
# target4 for player 477132.
plot_target_for_player('target4',477132)

In [None]:
# Parse `awardDate` (YYYY-MM-DD) once and derive both calendar fields from the
# same parsed series instead of parsing the column twice.
award_dates = pd.to_datetime(awards['awardDate'], format = '%Y-%m-%d')
awards['awardDateMonth'] = award_dates.dt.month
awards['awardDateYear'] = award_dates.dt.year

In [None]:
def plot_awards(playerid=477132):
    """Show a bar chart of how many awards a player has per award year.

    Counts the ``awardDateYear`` values of the rows in the module-level
    ``awards`` frame whose ``playerId`` equals ``playerid``. Nothing is drawn
    when there are no award years to count.

    Args:
        playerid: Value of ``playerId`` selecting the player.
    """
    is_player = awards.playerId == playerid
    awards_per_year = awards[is_player]['awardDateYear'].value_counts()
    # Guard clause: skip plotting entirely when the player has nothing to show.
    if not len(awards_per_year) > 0:
        return
    awards_per_year.plot(kind = 'bar')
    plt.show()

In [None]:
# Awards per year for player 477132, the second player plotted in the target cells.
plot_awards(477132)

# There seems to be a relationship between the number of awards and the targets: the more awards a player has, the more popular the player is

In [None]:
from scipy.stats import boxcox

# Box-Cox needs strictly positive input, hence the +1 shift; the fitted
# lambda (second return value) is not used.
xt, _ = boxcox(targets['target1'].values + 1)
# sns.distplot is deprecated; histplot with a density-normalised histogram
# and a KDE overlay is its documented replacement. bins=50 mirrors the upper
# bound distplot applied to its automatic bin count.
ax = sns.histplot(xt, kde=True, stat='density', bins=50)
ax.set_xlabel('Box-Cox(target1 + 1)')

In [None]:
# Box-Cox needs strictly positive input, hence the +1 shift; the fitted
# lambda (second return value) is not used.
xt, _ = boxcox(targets['target4'].values + 1)
# sns.distplot is deprecated; histplot with a density-normalised histogram
# and a KDE overlay is its documented replacement. bins=50 mirrors the upper
# bound distplot applied to its automatic bin count.
ax = sns.histplot(xt, kde=True, stat='density', bins=50)
ax.set_xlabel('Box-Cox(target4 + 1)')

In [None]:
# Box-Cox needs strictly positive input, hence the +1 shift; the fitted
# lambda (second return value) is not used.
xt, _ = boxcox(targets['target3'].values + 1)
# sns.distplot is deprecated; histplot with a density-normalised histogram
# and a KDE overlay is its documented replacement. bins=50 mirrors the upper
# bound distplot applied to its automatic bin count.
ax = sns.histplot(xt, kde=True, stat='density', bins=50)
ax.set_xlabel('Box-Cox(target3 + 1)')