In [None]:
from datetime import timedelta
from functools import reduce
from pathlib import Path

import lightgbm as lgbm
import matplotlib.pyplot as plt
import mlb
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm

In [None]:
# Input folders: the raw competition files and the pre-pickled training tables.
BASE_DIR = Path('../input') / 'mlb-player-digital-engagement-forecasting'
TRAIN_DIR = Path('../input') / 'mlb-pdef-train-dataset'

In [None]:
# Player reference table shipped with the competition data.
players = pd.read_csv(BASE_DIR / 'players.csv')

# Pre-pickled training tables.
# NOTE(review): unpickling can execute arbitrary code -- only load pickles from a trusted source.
rosters = pd.read_pickle(TRAIN_DIR / 'rosters_train.pkl')
targets = pd.read_pickle(TRAIN_DIR / 'nextDayPlayerEngagement_train.pkl')
followers = pd.read_pickle(TRAIN_DIR / 'playerTwitterFollowers_train.pkl')

# Rename the follower column so it cannot be confused with the players' `numberOfFollowers`.
team_followers = pd.read_pickle(TRAIN_DIR / 'teamTwitterFollowers_train.pkl')
team_followers = team_followers.rename(columns={'numberOfFollowers': 'teamFollowers'})

# Collapse the box scores to one row per (playerId, date) by summing the stat columns.
# numeric_only=True is explicit because pandas >= 2.0 no longer drops text columns
# silently: without it the sum would concatenate (or fail on) names and timestamps.
scores = (
    pd.read_pickle(TRAIN_DIR / 'playerBoxScores_train.pkl')
    .groupby(['playerId', 'date'])
    .sum(numeric_only=True)
    .reset_index()
)

# Rosters EDA

- playerId - Unique identifier for a player.
- gameDate - date of the game
- teamId - teamId that player is on that date.
- statusCode - Roster status abbreviation.
- status - Descriptive roster status.

In [None]:
# Preview the first rows of the roster table.
rosters.head()

In [None]:
# List the distinct descriptive roster statuses present in the data.
rosters.status.unique()

In [None]:
# Number of roster rows recorded under each status code.
statusCode = rosters['statusCode'].value_counts()

# Bar heights are log(count) so that rare codes stay visible; the raw counts are
# printed on the bars.
fig = px.bar(
    x=statusCode.index,
    y=np.log(statusCode.values),
    text=statusCode.values,
    color=statusCode.index,
    title='Number of players under each roster status(log)',
    labels={'x': 'roster status code', 'y': 'values'},
)

fig.show()

In [None]:
# Derive the calendar year from the `date` column (parsed as YYYYMMDD) for the per-year chart below.
rosters['year'] = pd.to_datetime(rosters['date'], format = '%Y%m%d').dt.year

In [None]:
# Roster rows per (year, statusCode). Naming the counts explicitly gives a 'count'
# column regardless of what name value_counts() assigns to its result.
status_change_per_year = (
    rosters.groupby('year')['statusCode']
    .value_counts()
    .rename('count')
    .reset_index()
)

# One animation frame per year; bar heights are log(count), raw counts are the bar text.
# animation_group must identify the *same bar* across frames, i.e. the status code.
# Grouping by the count itself would match unrelated bars that happen to share a value.
fig = px.bar(x=status_change_per_year['statusCode'], y=np.log(status_change_per_year['count']),
             text=status_change_per_year['count'],
             animation_frame=status_change_per_year['year'],
             animation_group=status_change_per_year['statusCode'],
             labels={'x': 'roster status code', 'y': 'values'},
             title='Number of players under each roster status per year(log)',
             color=status_change_per_year['statusCode'])

fig.show()

# Player twitter followers EDA

Twitter following data was collected by MLB from Twitter APIs for Major League players, on the first of each month, dating back to 1/1/2018. This dataset is not exhaustive of all players over all months, as not every player has/had a Twitter account, players may create/delete/reinstate accounts at random, or other scenarios preventing follower data from being collected on a given date. <br>

- date - Date of follower count.
- playerId - Unique identifier for a player.
- playerName
- accountName - Name on player twitter account.
- twitterHandle - Player twitter handle.
- numberOfFollowers - Number of followers

In [None]:
# Preview the first rows of the player Twitter-follower table.
followers.head()

In [None]:
# (rows, columns) of the player follower table.
followers.shape

In [None]:
# Number of distinct player names that have follower data.
followers.playerName.unique().shape

In [None]:
# Peak follower count per player name, largest first, limited to the top 20.
most_famous_player = (
    followers.groupby('playerName')['numberOfFollowers']
    .max()
    .sort_values(ascending=False)
    .head(20)
)

fig = px.bar(
    x=most_famous_player.index,
    y=most_famous_player.values,
    text=most_famous_player.values,
    color=most_famous_player.index,
    title='Top 20 the most famous players by tweeter followers',
    labels={'x': 'Name', 'y': 'followers'},
)
fig.show()

In [None]:
# Parse the `date` column (YYYYMMDD) into real timestamps, overwriting it in place.
followers['date'] = pd.to_datetime(followers['date'], format = '%Y%m%d')

In [None]:
# Smallest follower count per (playerName, date), flattened back to ordinary columns.
followers_growth = (
    followers.groupby(['playerName', 'date'])['numberOfFollowers']
    .min()
    .reset_index()
)

In [None]:
# Lowest follower count ever observed for each player name.
min_followers = followers_growth.groupby('playerName')['numberOfFollowers'].min().reset_index()
min_followers.columns = ['name', 'min_f']

# Highest follower count ever observed for each player name.
max_followers = followers_growth.groupby('playerName')['numberOfFollowers'].max().reset_index()
max_followers.columns = ['name', 'max_f']

In [None]:
def get_growth(min_max_dataset):
    """Return the percentage growth from ``min_f`` to ``max_f``.

    Parameters
    ----------
    min_max_dataset
        Anything indexable by the keys ``'min_f'`` and ``'max_f'`` (here a
        DataFrame holding each player's lowest and highest follower count).

    Returns
    -------
    ``(max_f - min_f) / min_f * 100``, evaluated element-wise when the inputs
    are pandas Series.
    """
    low = min_max_dataset['min_f']
    high = min_max_dataset['max_f']
    change = high - low
    return change / low * 100

In [None]:
# Combine each player's min and max follower counts, add relative ('growth%') and
# absolute ('growth_num') growth, and keep the 20 largest by absolute growth
# (relative growth breaks ties).
min_max_followers = (
    min_followers.merge(max_followers, on='name', how='left')
    .assign(**{
        'growth%': get_growth,
        'growth_num': lambda frame: frame['max_f'] - frame['min_f'],
    })
    .sort_values(by=['growth_num', 'growth%'], ascending=False)
    .head(20)
)

In [None]:
# Display the 20 players with the largest follower growth.
min_max_followers

In [None]:
# Bar height = absolute follower growth; the bar text is the rounded percentage growth.
fig = px.bar(
    x=min_max_followers['name'],
    y=min_max_followers['growth_num'],
    text=min_max_followers['growth%'].round(),
    color=min_max_followers['name'],
    title='Top 20 fastest growing players by tweeter followers',
    labels={'x': 'name', 'y': 'followers'},
)
fig.show()

# Team followers EDA

Twitter following data was collected by MLB from Twitter APIs for all 30 Major League teams, on the first of each month, dating back to 1/1/2018. <br>

- date - Date of follower count.
- teamId - Unique identifier for a team.
- teamName
- accountName - Name on team twitter account.
- twitterHandle - Team twitter handle.
- numberOfFollowers - Number of followers.

In [None]:
# Preview the first rows of the team Twitter-follower table.
team_followers.head()

In [None]:
# (rows, columns) of the team follower table.
team_followers.shape

In [None]:
# Peak follower count per team, largest first.
most_famous_teams = (
    team_followers.groupby('teamName')['teamFollowers']
    .max()
    .sort_values(ascending=False)
)

fig = px.bar(
    x=most_famous_teams.index,
    y=most_famous_teams.values,
    text=most_famous_teams.values,
    color=most_famous_teams.index,
    title='The most famous teams by tweeter followers',
    labels={'x': 'Team name', 'y': 'followers'},
)
fig.show()

# Player Box Scores EDA

- home - Binary, 1 if home team, 0 if away.
- gamePk - unique identifier for game.
- gameDate
- gameTimeUTC - First pitch time in UTC.
- teamId - Unique identifier for team.
- teamName
- playerId - Unique identifier for player.
- playerName
- jerseyNum
- positionCode - Number position code, details are here.
- positionName - Text position display, details are here.
- positionType - Position group, details are here.
- battingOrder - Format: “###”, where the first digit indicates the batting order spot, and the second two digits indicate the sequence in which that player occupied that batting order spot. Examples: “300”, which indicates the starter in the third spot in the batting order; “903,” which indicates the fourth player (after 900, 901 and 902) occupy the ninth spot in the batting order. Only populates if appeared in game.
- gamesPlayedBatting - 1 if player entered the game as a batter, runner or fielder.
- flyOuts - Game total fly outs.
- groundOuts - Game total ground outs.
- runsScored - Game total runs scored.
- doubles - Game total doubles.
- triples - Game total triples.
- homeRuns - Game total home runs.
- strikeOuts - Game total strike outs.
- baseOnBalls - Game total walks.
- intentionalWalks - Game total intentional walks.
- hits - Game total hits.
- hitByPitch - Game total hit by pitches.
- atBats - Game total at-bats.
- caughtStealing - Game total caught stealing.
- stolenBases - Game total stolen bases.
- groundIntoDoublePlay - Game total double plays grounded into.
- groundIntoTriplePlay - Game total triple plays grounded into.
- plateAppearances - Game total plate appearances.
- totalBases - Game total bases.
- rbi - Game total runs batted in.
- leftOnBase - Game total runners left on base.
- sacBunts - Game total sacrifice bunts.
- sacFlies - Game total sacrifice flies.
- catchersInterference - Game total catchers interference reached on.
- pickoffs - Game total number of times picked off base.
- gamesPlayedPitching - Binary, 1 if player entered the game as a pitcher.
- gamesStartedPitching - Binary, 1 if player was game’s starting pitcher.
- completeGamesPitching - Binary, 1 if credited with complete game.
- shutoutsPitching - Binary, 1 if credited with shutout.
- winsPitching - Binary, 1 if credited with win.
- lossesPitching - Binary, 1 if credited with loss.
- flyOutsPitching - Game total of flyouts allowed.
- airOutsPitching - Game total of air outs (flyouts + popouts) allowed.
- groundOutsPitching - Game total ground outs allowed.
- runsPitching - Game total runs allowed.
- doublesPitching - Game total doubles allowed.
- triplesPitching - Game total triples allowed.
- homeRunsPitching - Game total home runs allowed.
- strikeOutsPitching - Game total strike outs allowed.
- baseOnBallsPitching - Game total walks allowed.
- intentionalWalksPitching - Game total intentional walks allowed.
- hitsPitching - Game total hits allowed.
- hitByPitchPitching - Game total hit by pitches allowed.
- atBatsPitching - Game total at-bats against.
- caughtStealingPitching - Game total caught stealing against.
- stolenBasesPitching - Game total stolen bases allowed.
- inningsPitched - Game total innings pitched.
- saveOpportunities - Binary, 1 if credited with save opportunity.
- earnedRuns - Game total earned runs allowed.
- battersFaced - Game total batters faced.
- outsPitching - Game total outs recorded.
- pitchesThrown - Game total number of pitches thrown.
- balls - Game total balls thrown.
- strikes - Game total strikes thrown.
- hitBatsmen - Game total hit batters.
- balks - Game total balks.
- wildPitches - Game total number of wild pitches thrown.
- pickoffsPitching - Game total number of pickoffs.
- rbiPitching - Game total number of runs batted in allowed.
- inheritedRunners - Game total of inherited runners assumed.
- inheritedRunnersScored - Game total of inherited runners that scored.
- catchersInterferencePitching - Game total of catcher’s interference occurred by battery.
- sacBuntsPitching - Game total sacrifice bunts allowed.
- sacFliesPitching - Game total sacrifice flies allowed.
- saves - Binary, 1 if credited with save.
- holds - Binary, 1 if credited with hold.
- blownSaves - Binary, 1 if credited with blown save.
- assists - Game total number of assists.
- putOuts - Game total number of putouts.
- errors - Game total number of errors.
- chances - Game total fielding chances.

In [None]:
# Preview the per-player, per-date summed box scores.
scores.head()

In [None]:
# Split the columns after the two groupby keys (playerId, date) into two slices;
# presumably so that each describe() table below stays narrow enough to read.
# NOTE(review): 45 is a positional cut-off -- it silently shifts if the column set changes.
st = scores.columns[2:45]
nd = scores.columns[45:]

In [None]:
# Summary statistics for the first slice of box-score columns.
scores[st].describe()

In [None]:
# Summary statistics for the remaining box-score columns.
scores[nd].describe()

# Awards EDA

In [None]:
# Load the awards table from the training dataset folder.
awards = pd.read_csv(TRAIN_DIR / 'awards_train.csv')

In [None]:
# Parse the award date once (YYYY-MM-DD) and expose its month and year as columns.
_award_dates = pd.to_datetime(awards['awardDate'], format='%Y-%m-%d')
awards['awardDateMonth'] = _award_dates.dt.month
awards['awardDateYear'] = _award_dates.dt.year

In [None]:
# Preview the awards table with the derived month/year columns.
awards.head()

In [None]:
# Awards per (year, player). value_counts() orders each year's players by descending
# count, so the first rows of every year are that year's most-awarded players.
award_counts = (
    awards.groupby('awardDateYear')['playerName']
    .value_counts()
    .rename('num_of_awards')
    .reset_index()
)

# Keep the 20 most-awarded players of each season from 2018 to 2021.
# groupby().head() replaces four hand-written year slices stitched together with
# DataFrame.append, which was removed in pandas 2.0; row order is unchanged.
most_awarded_player_per_year = (
    award_counts[award_counts['awardDateYear'].isin([2018, 2019, 2020, 2021])]
    .groupby('awardDateYear')
    .head(20)
)

# One animation frame per year. animation_group must identify the same bar across
# frames, i.e. the player -- not the award count, which merely happens to repeat.
fig = px.bar(x=most_awarded_player_per_year['playerName'], y=most_awarded_player_per_year['num_of_awards'],
             text=most_awarded_player_per_year['num_of_awards'],
             animation_frame=most_awarded_player_per_year['awardDateYear'],
             animation_group=most_awarded_player_per_year['playerName'],
             labels={'y': 'number of awards', 'x': ' '},
             title='Top the most awarded players per year',
             color=most_awarded_player_per_year['playerName'])

fig.show()

In [None]:
# The 25 award names that occur most often, shown as shares of a pie.
most_frequent_award = awards['awardName'].value_counts().head(25)

fig = px.pie(
    most_frequent_award,
    names=most_frequent_award.index,
    values=most_frequent_award.values,
    title='Most frequent awards',
)
fig.show()

In [None]:
def plot_awards(playerid=477132):
    """Draw a bar chart of how many awards one player received in each year.

    Reads the notebook-level ``awards`` frame (needs ``playerId`` and
    ``awardDateYear``). Nothing is drawn when the player has no award years.

    Parameters
    ----------
    playerid : int
        ``playerId`` to look up (default 477132).
    """
    awards_per_year = awards.loc[awards.playerId == playerid, 'awardDateYear'].value_counts()
    if len(awards_per_year) == 0:
        return
    awards_per_year.plot(kind='bar')
    plt.show()

In [None]:
# Awards per year for player 477132.
plot_awards(477132)

# Conclusion

# Target EDA

In [None]:
# The style is passed to sns.set() together with the font scale: a separate
# sns.set(font_scale=...) call re-applies seaborn's default style and would
# silently undo an earlier sns.set_style('whitegrid').
sns.set(style='whitegrid', font_scale=1.5)

# One density estimate per target on a 2x2 grid (row-major: target1 .. target4).
fig, axs = plt.subplots(2, 2, figsize=(20, 10))
for ax, target_col in zip(axs.flat, ['target1', 'target2', 'target3', 'target4']):
    sns.kdeplot(ax=ax, data=targets[target_col])

# Horizontal position of the figure title, derived from the right edges of the two top panels.
bbox = axs[0, 0].get_position()
bbox2 = axs[0, 1].get_position()
center = (bbox2.x1) * 0.4 + (bbox.x1) * 0.25
plt.suptitle('Distribution of targets', x=center)


In [None]:
def draw_kde_plot(col = 'target1'):
    """Show density estimates of one target under four transformations.

    Draws a 2x2 grid from the notebook-level ``targets`` frame: the raw values,
    their square, their fourth power and ``log(value + 1)``.

    Parameters
    ----------
    col : str
        Column of ``targets`` to plot (default ``'target1'``).
    """
    # Style and font scale go in ONE call: sns.set(font_scale=...) on its own
    # re-applies seaborn's default style and would discard a prior set_style().
    sns.set(style='whitegrid', font_scale=1.5)
    fig, axs = plt.subplots(2, 2, figsize=(15, 10))

    values = targets[col]
    panels = [
        ('original', values),
        ('squared', values**2),
        ('power 4', values**4),
        ('log', np.log(values + 1)),  # +1 keeps zeros finite
    ]
    # axs.flat walks the grid row by row, matching the panel order above.
    for ax, (xlabel, data) in zip(axs.flat, panels):
        sns.kdeplot(ax=ax, data=data).set_xlabel(xlabel)

    # Horizontal position of the title, derived from the right edges of the two top panels.
    bbox = axs[0, 0].get_position()
    bbox2 = axs[0, 1].get_position()
    center = (bbox2.x1) * 0.4 + (bbox.x1) * 0.25
    plt.suptitle(f'Transformation of {col}', x=center)
    plt.tight_layout()


# **Target2 has the highest skewness**


In [None]:
# Draw the transformation grid once for each of the four targets.
for col in ('target1', 'target2', 'target3', 'target4'):
    draw_kde_plot(col)

In [None]:
# Style and font scale in one call: sns.set(font_scale=...) alone re-applies
# seaborn's default style and would discard a preceding set_style('whitegrid').
sns.set(style='whitegrid', font_scale=1.5)

# The same seeded sample was previously re-drawn for every target; draw it once.
sampled_targets = targets.sample(10000, random_state=500)
sample_positions = np.arange(1, 10001)

# Overlay all four targets on a single axis, x = position within the sample.
fig, axs = plt.subplots(1, 1, figsize=(20, 8))
for target_col in ['target1', 'target2', 'target3', 'target4']:
    sns.lineplot(ax=axs, x=sample_positions,
                 y=sampled_targets[target_col],
                 legend='full', label=target_col)

bbox = axs.get_position()
center = 0.5*(bbox.x1)
plt.suptitle('Comparision of targets', x=center)



In [None]:
# Style and font scale in one call: sns.set(font_scale=...) alone re-applies
# seaborn's default style and would discard a preceding set_style('ticks').
sns.set(style='ticks', font_scale=1.5)

# The same seeded sample was previously re-drawn for every target; draw it once.
sampled_targets = targets.sample(10000, random_state=500)
sample_positions = np.arange(1, 10001)

# One panel per target on a 2x2 grid (row-major: target1 .. target4).
fig, axs = plt.subplots(2, 2, figsize=(20, 8))
for ax, target_col in zip(axs.flat, ['target1', 'target2', 'target3', 'target4']):
    sns.lineplot(ax=ax, x=sample_positions,
                 y=sampled_targets[target_col],
                 legend='full', label=target_col)

# Title the whole figure; plt.title() would label only the current (last) subplot.
fig.suptitle('Comparision of targets, side by side view')


In [None]:
# Derive the calendar year from the `date` column (parsed as YYYYMMDD); the per-year plots below use it.
targets['year'] = pd.to_datetime(targets['date'], format = '%Y%m%d').dt.year

# We have less data for the fourth year (2021), because the training data stops partway through it — the remainder is the future we need to predict

In [None]:
# Number of target rows per calendar year.
targets['year'].value_counts().plot(kind = 'bar')

In [None]:
# One call sets both style and font scale: a second sns.set(font_scale=...) would
# re-apply seaborn's default style. The former bare sns.color_palette("Set2") call
# only returned a palette without applying it, and every plot below passes its own
# palette, so it is dropped.
sns.set(style="whitegrid", font_scale=2)

# The same seeded sample was previously re-drawn for every panel; draw it once.
sampled_targets = targets.sample(10000, random_state=100)
sample_positions = np.arange(1, 10001)

# One stacked panel per target, lines coloured by calendar year.
fig, axs = plt.subplots(4, 1, figsize=(20, 20))
for ax, target_col in zip(axs, ['target1', 'target2', 'target3', 'target4']):
    sns.lineplot(ax=ax,
                 x=sample_positions,
                 data=sampled_targets,
                 y=target_col,
                 hue='year',
                 palette='tab10',
                 linewidth=2.5)

bbox = axs[0].get_position()
center = 0.5*(bbox.x1)
plt.suptitle('targets over years', x=center)

In [None]:

def plot_target_for_player(col = 'target1', playerid = 683734):
    """Plot one engagement target for a single player, one line per calendar year.

    Reads the notebook-level ``targets`` frame, which must contain ``playerId``,
    the derived ``year`` column and the requested target column. Each year's
    rows are drawn in their stored order against a 0-based row index.

    Parameters
    ----------
    col : str
        Target column to draw (default ``'target1'``).
    playerid : int
        ``playerId`` whose rows are plotted (default 683734).
    """
    # One call sets both style and font scale; a second sns.set(font_scale=...)
    # would re-apply seaborn's default style.
    sns.set(style="whitegrid", font_scale=2)

    fig, axs = plt.subplots(1, 1, figsize=(20, 8))

    player_rows = targets[targets.playerId == playerid]
    # The x range is sized from the data rather than hard-coded per year, so a
    # player-year with a different number of rows no longer raises a length
    # mismatch. groupby() yields the years in ascending order.
    for year, year_rows in player_rows.groupby('year'):
        sns.lineplot(ax=axs,
                     x=np.arange(len(year_rows)),
                     data=year_rows,
                     y=col,
                     label=str(year),
                     linewidth=2.5)

    bbox = axs.get_position()
    center = 0.5*(bbox.x1)
    plt.suptitle(f'player Id {playerid}', x=center)

# There is definitely seasonality in the targets; it seems we can drop 2018 from modelling

In [None]:
# target1 for the default player (683734).
plot_target_for_player()

In [None]:
# target2 for the default player (683734).
plot_target_for_player('target2')

In [None]:
# target3 for the default player (683734).
plot_target_for_player('target3')

In [None]:
# target4 for the default player (683734).
plot_target_for_player('target4')

In [None]:
# target1 for player 477132 (the same player used in plot_awards above).
plot_target_for_player('target1',477132)

In [None]:
# target2 for player 477132.
plot_target_for_player('target2',477132)

In [None]:
# target3 for player 477132.
plot_target_for_player('target3',477132)

In [None]:
# target4 for player 477132.
plot_target_for_player('target4',477132)

# There seems to be a relationship between the number of awards and the targets: the more awards a player has, the more popular the player is

In [None]:
from scipy.stats import boxcox
# Box-Cox transform of target1; +1 shifts the values because boxcox needs strictly positive input.
# The second return value (the fitted lambda) is discarded.
xt, _ = boxcox(targets['target1'].values + 1)
# NOTE(review): sns.distplot is deprecated in recent seaborn releases -- confirm the installed version.
sns.distplot(xt)

In [None]:
# Box-Cox transform of target4 (shifted by +1 to keep the input strictly positive); overwrites `xt`.
xt, _ = boxcox(targets['target4'].values + 1)
sns.distplot(xt)

In [None]:
# Box-Cox transform of target3 (shifted by +1 to keep the input strictly positive); overwrites `xt`.
xt, _ = boxcox(targets['target3'].values + 1)
sns.distplot(xt)

In [None]:
import seaborn as sns

In [None]:
# Per-player statistics from a separate input dataset, joined with each player's
# median value of every target.
player_target_stats = pd.read_csv("../input/player-target-stats/player_target_stats.csv")
targets_agg = (
    targets.groupby('playerId')[['target1', 'target2', 'target3', 'target4']]
    .median()
    .reset_index()
)
player_target_stats_for_corr = player_target_stats.merge(targets_agg, on='playerId', how='left')

plt.figure(figsize=(20, 20))
plt.xticks(rotation=25)

# Pairwise correlations between all remaining columns (the id is not a feature).
corr = player_target_stats_for_corr.drop(columns=['playerId']).corr()

sns.heatmap(
    corr,
    annot=True,
    vmin=0,
    vmax=1,
    center=0,
    cmap="RdYlGn",
)
plt.title('targets corr')
plt.show()

In [None]:
# Median follower count per player, joined with the player's median targets.
followers_agg = followers.groupby('playerId')['numberOfFollowers'].median().reset_index()
followers_agg.columns = ['playerId', '#Followers']
targets_agg = (
    targets.groupby('playerId')[['target1', 'target2', 'target3', 'target4']]
    .median()
    .reset_index()
)
followers_agg = followers_agg.merge(targets_agg, on='playerId', how='left')

plt.figure(figsize=(10, 2))
plt.xticks(rotation=45)
plt.suptitle("Median Target vs Median Twitter Followers", fontsize =15)

corr = followers_agg.drop(columns=['playerId']).corr()

# Show only the '#Followers' column of the correlation matrix, laid out as a 1x5 strip.
x_axis_labels = ['#Followers', 'target1', 'target2', 'target3', 'target4']
sns.heatmap(
    corr['#Followers'].to_numpy().reshape(1, 5),
    annot=True,
    xticklabels=x_axis_labels,
    vmin=0,
    vmax=1,
    center=0,
    cmap="RdYlGn",
)

In [None]:
# Show the top-20 follower-growth table again for comparison with the correlations above.
min_max_followers

In [None]:
# Uses the `awards` frame loaded in the Awards EDA section. It is deliberately not
# re-read here: reloading replaced the frame and dropped the derived
# awardDateMonth / awardDateYear columns that plot_awards() depends on.

# Total number of awards per player, joined with the player's median targets.
awards_agg = awards.groupby('playerId')['awardId'].agg('count').reset_index()
awards_agg.columns = ['playerId', '#Awards']
targets_agg =targets.groupby('playerId')[['target1', 'target2', 'target3', 'target4']].agg('median').reset_index()
# NOTE(review): this reuses (and overwrites) the name `followers_agg` from the
# followers cell above; the name is kept so that any code relying on it is unaffected.
followers_agg = pd.merge(awards_agg, targets_agg, on = ['playerId'], how = 'left')
plt.figure(figsize=(10, 2))
plt.xticks(rotation=45)
plt.suptitle("Total number of awards vs Median targets", fontsize =15)
corr = followers_agg.drop(columns =['playerId']).corr()

# Show only the '#Awards' column of the correlation matrix, laid out as a 1x5 strip.
x_axis_labels = ['#Awards', 'target1','target2', 'target3', 'target4']
sns.heatmap(np.array(corr['#Awards']).reshape((1,5)),
            annot = True,
            xticklabels=x_axis_labels,
            vmin=0,
            vmax=1,
            center= 0,
            cmap="RdYlGn"
       )

In [None]:
# Stat columns to correlate with the targets: everything except identifiers and
# game-context fields.
scores_cols = [
    col for col in scores.columns.tolist()
    if col not in ['playerId', 'date', 'home', 'gamePk', 'teamId', 'battingOrder']
]

# Loop-invariant work, done once instead of once per stat:
# each player's total of every stat, and each player's median value of every target.
score_totals = scores.groupby('playerId')[scores_cols].sum()
targets_agg = targets.groupby('playerId')[['target1', 'target2', 'target3', 'target4']].agg('median').reset_index()

sns.set(font_scale=1.4)

for col in scores_cols:
    # Two-column frame (playerId, "#<stat>") joined with the median targets.
    scores_agg = score_totals[col].rename("#" + col).reset_index()
    scores_agg = pd.merge(scores_agg, targets_agg, on=['playerId'], how='left')

    plt.figure(figsize=(10, 2))
    plt.suptitle(f"Total {col} vs Median targets", fontsize =15)
    corr = scores_agg.drop(columns=['playerId']).corr()
    plt.xticks(rotation=45)

    x_axis_labels = ["#" + col, 'target1', 'target2', 'target3', 'target4']

    # Show only this stat's column of the correlation matrix, as a 1x5 strip.
    sns.heatmap(np.array(corr["#" + col]).reshape((1, 5)),
                annot=True,
                xticklabels=x_axis_labels,
                vmin=0,
                vmax=1,
                center=0,
                cmap="RdYlGn")

    plt.show()
    # Release the figure so that dozens of open figures do not accumulate.
    plt.close()