In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import mean_absolute_error
from datetime import timedelta
from functools import reduce
from tqdm import tqdm
import lightgbm as lgbm
import mlb
import matplotlib.pyplot as plt

import plotly.express as px

features that we have now

In [None]:
# Input folders: BASE_DIR holds the csv files, TRAIN_DIR the pickled train tables.
INPUT_ROOT = Path('../input')
BASE_DIR = INPUT_ROOT / 'mlb-player-digital-engagement-forecasting'
TRAIN_DIR = INPUT_ROOT / 'mlb-pdef-train-dataset'

#### In the current data, we use only 
> rosters_train.pkl <br>
> nextDayPlayerEngagement_train.pkl <br>
> playerBoxScores_train.pkl <br>
> player_target_stats.csv

In [None]:
# Static player table from the csv folder.
players = pd.read_csv(BASE_DIR / 'players.csv')

# Pickled train tables.
rosters = pd.read_pickle(TRAIN_DIR / 'rosters_train.pkl')
targets = pd.read_pickle(TRAIN_DIR / 'nextDayPlayerEngagement_train.pkl')
followers = pd.read_pickle(TRAIN_DIR / 'playerTwitterFollowers_train.pkl')

# Team follower counts get their own column name so they cannot be confused
# with the players' `numberOfFollowers`.
team_followers = (
    pd.read_pickle(TRAIN_DIR / 'teamTwitterFollowers_train.pkl')
    .rename(columns={'numberOfFollowers': 'teamFollowers'})
)

# Box scores are summed so that there is one row per (playerId, date).
scores = (
    pd.read_pickle(TRAIN_DIR / 'playerBoxScores_train.pkl')
    .groupby(['playerId', 'date'])
    .sum()
    .reset_index()
)

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to a smaller dtype that holds their range.

    Only columns whose dtype is int16/int32/int64 or float16/float32/float64
    are touched; every other column is left as is.

    * Integer columns are cast to the narrowest of int8/int16/int32/int64
      whose range contains the column's min and max (bounds inclusive, so a
      column spanning exactly -128..127 becomes int8).
    * Float columns are cast to float16 or float32 when min and max lie
      strictly inside that type's finite range, otherwise to float64.

    Note: the float downcast looks only at the value range, not at precision,
    so values stored as float16/float32 may be rounded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        Print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the downcast columns.
    """
    numerics = {'int16', 'int32', 'int64', 'float16', 'float32', 'float64'}
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = str(df[col].dtype)
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if col_type.startswith('int'):
            # Narrowest type first; inclusive bounds so that values sitting
            # exactly on a type's limits still fit that type.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min < c_min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Shrink each loaded table in turn (same order as before, so the printed
# memory reports appear in the same sequence). `targets` is not downcast.
players, rosters, followers, team_followers, scores = (
    reduce_mem_usage(frame)
    for frame in (players, rosters, followers, team_followers, scores)
)

# Rosters EDA

- playerId - Unique identifier for a player.
- gameDate - date of the game
- teamId - teamId that player is on that date.
- statusCode - Roster status abbreviation.
- status - Descriptive roster status.

# Target EDA

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# style and font_scale are passed in one sns.set() call: a separate
# sns.set(font_scale=...) re-applies seaborn's default "darkgrid" style and
# would undo an earlier set_style('whitegrid').
sns.set(style='whitegrid', font_scale=1.5)
fig, axs = plt.subplots(2, 2, figsize=(20, 10))
# One KDE per target, filling the grid row by row.
for ax, target_col in zip(axs.flat, ['target1', 'target2', 'target3', 'target4']):
    sns.kdeplot(ax=ax, data=targets[target_col])
bbox = axs[0, 0].get_position()
bbox2 = axs[0, 1].get_position()

# Horizontal position of the figure title, derived from the right edges of
# the two top panels.
center = (bbox2.x1) * 0.4 + (bbox.x1) * 0.25
plt.suptitle('Distribution of targets', x=center)


In [None]:
def draw_kde_plot(col = 'target1'):
    """Plot KDEs of one target column under four transformations.

    Draws a 2x2 grid for ``targets[col]``: the original values, the values
    squared, the values raised to the 4th power, and ``log(value + 1)``.

    Parameters
    ----------
    col : str, default 'target1'
        Column of the module-level ``targets`` frame to plot.
    """
    # style and font_scale go in one call: sns.set(font_scale=...) on its own
    # re-applies the default "darkgrid" style.
    sns.set(style='whitegrid', font_scale=1.5)
    fig, axs = plt.subplots(2, 2, figsize=(15, 10))

    values = targets[col]
    panels = [
        ('original', values),
        ('squared', values**2),
        ('power 4', values**4),
        ('log', np.log(values + 1)),
    ]
    for ax, (xlabel, transformed) in zip(axs.flat, panels):
        g = sns.kdeplot(ax=ax, data=transformed)
        g.set_xlabel(xlabel)

    # Horizontal position of the figure title, derived from the right edges
    # of the two top panels.
    bbox = axs[0, 0].get_position()
    bbox2 = axs[0, 1].get_position()
    center = (bbox2.x1) * 0.4 + (bbox.x1) * 0.25
    plt.suptitle(f'Transformation of {col}', x=center)
    plt.tight_layout()


# **Target2 has highest Skewness**


In [None]:
# One transformation grid per target column.
for col in ('target1', 'target2', 'target3', 'target4'):
    draw_kde_plot(col=col)

In [None]:
# All four targets on one axes, drawn from the same 10,000-row random sample.
# style and font_scale go in one sns.set() call: a separate
# sns.set(font_scale=...) re-applies seaborn's default "darkgrid" style.
sns.set(style='whitegrid', font_scale=1.5)

# Sample once; with a fixed random_state every repeated call returned the
# same rows anyway.
sampled_targets = targets.sample(10000, random_state=500)
sample_positions = np.arange(1, 10001)

fig, axs = plt.subplots(1, 1, figsize=(20, 8))
for target_col in ['target1', 'target2', 'target3', 'target4']:
    sns.lineplot(ax=axs, x=sample_positions,
                 y=sampled_targets[target_col],
                 legend='full', label=target_col)

bbox = axs.get_position()
center = 0.5 * (bbox.x1)
plt.suptitle('Comparision of targets', x=center)



In [None]:
# One panel per target, all drawn from the same 10,000-row random sample.
# style and font_scale go in one sns.set() call: a separate
# sns.set(font_scale=...) re-applies seaborn's default "darkgrid" style.
sns.set(style='ticks', font_scale=1.5)

# Sample once; with a fixed random_state every repeated call returned the
# same rows anyway.
sampled_targets = targets.sample(10000, random_state=500)
sample_positions = np.arange(1, 10001)

fig, axs = plt.subplots(2, 2, figsize=(20, 8))
for ax, target_col in zip(axs.flat, ['target1', 'target2', 'target3', 'target4']):
    sns.lineplot(ax=ax, x=sample_positions,
                 y=sampled_targets[target_col],
                 legend='full', label=target_col)

# plt.title() labels only the current (last created) panel; the heading
# belongs to the whole figure.
fig.suptitle('Comparision of targets, side by side view')


In [None]:
# Calendar year of each row, parsed from the YYYYMMDD `date` column.
target_dates = pd.to_datetime(targets['date'], format='%Y%m%d')
targets['year'] = target_dates.dt.year

# We have less data for the fourth year (2021), since we need to predict for the future

Maybe we should use a different validation strategy.

In [None]:
# Number of target rows per year, as a bar chart.
rows_per_year = targets['year'].value_counts()
rows_per_year.plot.bar()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# style and font_scale go in one sns.set() call: a separate
# sns.set(font_scale=...) re-applies seaborn's default "darkgrid" style.
sns.set(style="whitegrid", font_scale=2)

# Sample once; with a fixed random_state every repeated call returned the
# same rows anyway.
yearly_sample = targets.sample(10000, random_state=100)
sample_positions = np.arange(1, 10001)

# One stacked panel per target, with one line colour per year.
fig, axs = plt.subplots(4, 1, figsize=(20, 20))
for ax, target_col in zip(axs, ['target1', 'target2', 'target3', 'target4']):
    sns.lineplot(ax=ax,
                 x=sample_positions,
                 data=yearly_sample,
                 y=target_col,
                 hue='year',
                 palette='tab10',
                 linewidth=2.5)

bbox = axs[0].get_position()
center = 0.5 * (bbox.x1)
plt.suptitle('targets over years', x=center)

In [None]:

def plot_target_for_player(col = 'target1', playerid = 683734):
    """Draw one line per year (2018-2021) of ``col`` for a single player.

    Reads the module-level ``targets`` frame, which must already contain the
    ``year`` column. For each year, the x value of a point is the row's
    position within that player's rows for the year, so the years are
    overlaid on a common axis.

    Parameters
    ----------
    col : str, default 'target1'
        Target column to plot.
    playerid : int, default 683734
        Value of ``playerId`` to select.
    """
    # style and font_scale go in one call: sns.set(font_scale=...) on its own
    # re-applies the default "darkgrid" style.
    sns.set(style="whitegrid", font_scale=2)

    fig, axs = plt.subplots(1, 1, figsize=(20, 8))

    player_rows = targets[targets.playerId == playerid]
    for year in (2018, 2019, 2020, 2021):
        year_rows = player_rows[player_rows.year == year]
        if year_rows.empty:
            # Nothing to draw for this player in this year.
            continue
        # The x vector is sized from the data itself; a fixed length
        # (365/366/120) fails whenever the player has a different number of
        # rows in that year.
        sns.lineplot(ax=axs,
                     x=np.arange(len(year_rows)),
                     data=year_rows,
                     y=col,
                     label=str(year),
                     linewidth=2.5)

    bbox = axs.get_position()
    center = 0.5 * (bbox.x1)
    plt.suptitle(f'player Id {playerid}', x=center)

# There is definitely seasonality in the targets; it seems we could remove year 2018 from modelling

In [None]:
# Default player, target1 (the function's defaults written out).
plot_target_for_player(col='target1', playerid=683734)

In [None]:
# Default player, target2.
plot_target_for_player(col='target2')

In [None]:
# Default player, target3.
plot_target_for_player(col='target3')

In [None]:
# Default player, target4.
plot_target_for_player(col='target4')

In [None]:
# Second player (477132), target1.
plot_target_for_player(col='target1', playerid=477132)

In [None]:
# Second player (477132), target2.
plot_target_for_player(col='target2', playerid=477132)

In [None]:
# Second player (477132), target3.
plot_target_for_player(col='target3', playerid=477132)

In [None]:
# Second player (477132), target4.
plot_target_for_player(col='target4', playerid=477132)

# There seems to be a relationship between the number of awards and the targets: the more awards a player has, the more popular the player is

In [None]:
from scipy.stats import boxcox

# Box-Cox needs strictly positive input, so the target is shifted by 1.
xt, _ = boxcox(targets['target1'].values + 1)
# histplot with a density histogram and a KDE overlay is the documented
# replacement for the deprecated sns.distplot.
sns.histplot(xt, kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Box-Cox needs strictly positive input, so the target is shifted by 1.
xt, _ = boxcox(targets['target4'].values + 1)
# histplot with a density histogram and a KDE overlay is the documented
# replacement for the deprecated sns.distplot.
sns.histplot(xt, kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Box-Cox needs strictly positive input, so the target is shifted by 1.
xt, _ = boxcox(targets['target3'].values + 1)
# histplot with a density histogram and a KDE overlay is the documented
# replacement for the deprecated sns.distplot.
sns.histplot(xt, kde=True, stat='density', kde_kws=dict(cut=3))

**Target4 has highest correlation with Twitter follower count**

In [None]:
import seaborn as sns

In [None]:
# Per-player median follower count, joined with the per-player median of
# each target.
followers_agg = followers.groupby('playerId')['numberOfFollowers'].median().reset_index()
targets_agg = (
    targets.groupby('playerId')[['target1', 'target2', 'target3', 'target4']]
    .median()
    .reset_index()
)
followers_agg.columns = ['playerId', '#Followers']
followers_agg = followers_agg.merge(targets_agg, on=['playerId'], how='left')

plt.figure(figsize=(10, 2))
plt.xticks(rotation=45)
plt.suptitle("Median Target vs Median Twitter Followers", fontsize=15)

corr = followers_agg.drop(columns=['playerId']).corr()

# Only the '#Followers' column of the correlation matrix is shown, laid out
# as a single heatmap row.
x_axis_labels = ['#Followers', 'target1', 'target2', 'target3', 'target4']
followers_corr_row = corr['#Followers'].to_numpy().reshape(1, 5)
sns.heatmap(
    followers_corr_row,
    annot=True,
    xticklabels=x_axis_labels,
    vmin=0,
    vmax=1,
    center=0,
    cmap="RdYlGn",
)

In [None]:
awards = pd.read_csv(TRAIN_DIR / 'awards_train.csv')

# Number of award rows per player, joined with the per-player median of each
# target.
awards_agg = awards.groupby('playerId')['awardId'].count().reset_index()
awards_agg.columns = ['playerId', '#Awards']
targets_agg = (
    targets.groupby('playerId')[['target1', 'target2', 'target3', 'target4']]
    .median()
    .reset_index()
)
# NOTE: the merged awards frame is stored under the existing name
# `followers_agg`, replacing whatever that name held before.
followers_agg = awards_agg.merge(targets_agg, on=['playerId'], how='left')

plt.figure(figsize=(10, 2))
plt.xticks(rotation=45)
plt.suptitle("Total number of awards vs Median targets", fontsize=15)
corr = followers_agg.drop(columns=['playerId']).corr()
mask = np.triu(np.ones_like(corr, dtype=bool))

# Only the '#Awards' column of the correlation matrix is shown, laid out as
# a single heatmap row.
x_axis_labels = ['#Awards', 'target1', 'target2', 'target3', 'target4']
awards_corr_row = corr['#Awards'].to_numpy().reshape(1, 5)
sns.heatmap(
    awards_corr_row,
    annot=True,
    xticklabels=x_axis_labels,
    vmin=0,
    vmax=1,
    center=0,
    cmap="RdYlGn",
)
# The raw awards table is no longer needed.
del awards

In [None]:
scores_cols = scores.columns.tolist()

# Identifier and context columns are not box-score statistics, so they are
# left out of the correlation plots.
scores_cols = [col for col in scores_cols if col not in ['playerId', 'date', 'home', 'gamePk', 'teamId', 'battingOrder']]

# The per-player target medians do not depend on the box-score column, so
# they are computed once instead of on every iteration.
targets_agg = targets.groupby('playerId')[['target1', 'target2', 'target3', 'target4']].agg('median').reset_index()

for col in scores_cols:
    # Per-player total of this statistic, joined with the target medians.
    scores_agg = scores.groupby('playerId')[col].agg('sum').reset_index()
    scores_agg.columns = ['playerId', "#"+col]
    scores_agg = pd.merge(scores_agg, targets_agg, on = ['playerId'], how = 'left')
    plt.figure(figsize=(10, 2))
    plt.suptitle(f"Total {col} vs Median targets", fontsize =15)
    corr = scores_agg.drop(columns =['playerId']).corr()
    sns.set(font_scale=1.4)
    plt.xticks(rotation=45)

    x_axis_labels = ["#"+col, 'target1','target2', 'target3', 'target4']

    # Only the statistic's own column of the correlation matrix is shown,
    # laid out as a single heatmap row.
    sns.heatmap(np.array(corr["#"+col]).reshape((1,5)),
                annot = True,
                xticklabels=x_axis_labels,
                vmin=0,
                vmax=1,
                center= 0,
                cmap="RdYlGn")

    plt.show()
    # Close each figure so that open figures do not accumulate over the loop.
    plt.close()

**about players followers**: Followers is highly correlated with Target4. <br>
**about player box scores**: Most of box scores are highly correlated with Target2. <br>
**about awards**: Awards are highly correlated with target1

In [None]:
# Season table, read from the csv folder.
seasons_path = BASE_DIR / 'seasons.csv'
seasons_df = pd.read_csv(seasons_path)

In [None]:
# Attach the season row whose seasonId equals the calendar year of each
# target row. NOTE: re-running this cell merges the season columns again.
target_dates = pd.to_datetime(targets['date'], format='%Y%m%d')
targets['year'] = target_dates.dt.year
targets = targets.merge(
    seasons_df,
    how='left',
    left_on='year',
    right_on='seasonId',
)

In [None]:
# Parse every date column once as datetime64 and do all date arithmetic on
# those values. Subtracting the object-dtype `.dt.date` columns can produce
# an object Series (depending on the pandas version), on which `.dt.days`
# is not available.
date_cols = [
    'engagementMetricsDate',
    'seasonEndDate',
    'seasonStartDate',
    'preSeasonEndDate',
    'preSeasonStartDate',
    'regularSeasonStartDate',
    'regularSeasonEndDate',
]
parsed_dates = {
    name: pd.to_datetime(targets[name], format='%Y-%m-%d')
    for name in date_cols
}

# The stored columns keep plain `datetime.date` values, as before.
for name, parsed in parsed_dates.items():
    targets[name] = parsed.dt.date

metrics_date = parsed_dates['engagementMetricsDate']
targets['days_to_season_end'] = (parsed_dates['seasonEndDate'] - metrics_date).dt.days
targets['days_to_season_start'] = (parsed_dates['seasonStartDate'] - metrics_date).dt.days

# 1/0 flags: does the engagement date fall inside the given period
# (both bounds inclusive)?
for flag_col, start_col, end_col in [
    ('during_season', 'seasonStartDate', 'seasonEndDate'),
    ('during_preseason', 'preSeasonStartDate', 'preSeasonEndDate'),
    ('during_regseason', 'regularSeasonStartDate', 'regularSeasonEndDate'),
]:
    targets[flag_col] = np.where(
        (parsed_dates[start_col] <= metrics_date)
        & (parsed_dates[end_col] >= metrics_date),
        1,
        0,
    )

In [None]:
# style and font_scale go in one sns.set() call: a separate
# sns.set(font_scale=...) re-applies seaborn's default "darkgrid" style.
sns.set(style="whitegrid", font_scale=1)

# For every target and year: median target value per days_to_season_start.
for target in ['target1', 'target2', 'target3', 'target4']:
    df = targets[['days_to_season_start', target, 'year']].groupby(['days_to_season_start', 'year'])[target].agg('median').reset_index()
    for year in [2018, 2019, 2020, 2021]:
        plt.figure(figsize=(20, 5))
        plot_ = sns.barplot(data = df[df.year==year], x = 'days_to_season_start', y = target)
        # Show only every tenth tick label so the axis stays readable.
        for label in plot_.get_xticklabels():
            # np.int was removed from NumPy; the builtin int is used instead.
            # Going through float() also accepts labels such as '-10.0'.
            label.set_visible(int(float(label.get_text())) % 10 == 0)
        plt.title(f'{target} - year {year}', fontsize = 20)
        plt.xlabel("days to season start", fontsize = 15)
        plt.ylabel(f"{target}", fontsize = 15)

        plt.show()
        plt.close()