In [None]:
!pip install seedir

In [None]:
import os
import warnings
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import seedir as sd
from tqdm.notebook import tqdm

warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
# Root folder of the competition input files; later cells build CSV paths from it.
data_dir = '/kaggle/input/mlb-player-digital-engagement-forecasting'
# Print the folder tree of data_dir using seedir's 'emoji' style.
sd.seedir(data_dir, style='emoji')

Static files that do not change with time:
* players.csv 
* teams.csv
* seasons.csv
* awards.csv

Daily data:
* train.csv

Example test and submission:
* example_test.csv
* example_sample_submission.csv

The test data arrives as a data frame with the same format as train.csv, except that it does not contain the target values. In train.csv, all four targets are stored in the **nextDayPlayerEngagement** column, where each day's values are packed into one long JSON string.




In [None]:
# Load the raw daily training table (one row per date) and preview it.
train = pd.read_csv(f'{data_dir}/train.csv')
train.head()

In [None]:
# Show the first 500 characters of the nested target string for 2018-01-01.
# .iloc[0] takes the first matching row by position, so the lookup does not
# depend on that row happening to carry index label 0.
train.loc[train['date'] == 20180101, 'nextDayPlayerEngagement'].iloc[0][:500]

These strings clearly need to be preprocessed into regular (unnested) data frames. For that purpose, the code from this notebook is used: https://www.kaggle.com/naotaka1128/creating-unnested-dataset

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Recast the numeric columns of ``df`` in place and return ``df``.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are touched. The target dtype is chosen from the column's min/max using
    strict inequalities against the limits of a reference type:

    * integers: inside int8 limits -> int16, inside int16 limits -> int32,
      inside int32 or int64 limits -> int64, otherwise left unchanged;
    * floats: inside float16 limits -> float32, everything else (including
      columns whose min/max is NaN) -> float64.

    NOTE(review): every target is one step wider than the reference type whose
    limits are tested; this looks deliberate (conservative) — confirm before
    tightening, since the pickles written from the result keep these dtypes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to recast; it is modified in place.
    verbose : bool, default True
        When true, print the final memory usage and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # (type whose limits are tested, dtype to cast to), checked in order.
    int_rules = [(np.int8, np.int16), (np.int16, np.int32),
                 (np.int32, np.int64), (np.int64, np.int64)]

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for limit_type, target_type in int_rules:
                limits = np.iinfo(limit_type)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(target_type)
                    break
        else:
            limits = np.finfo(np.float16)
            if c_min > limits.min and c_max < limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # A frame that reports 0 bytes would otherwise divide by zero here.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Unnest every JSON-encoded column of example_test.csv and train.csv into its own
# flat table and store it as '<file>_<column>.pickle' in the working directory.
for file in ['example_test', 'train']:
    # dropna(axis=1, how='all') removes columns that are completely empty
    # (playerTwitterFollowers, teamTwitterFollowers in example_test).
    df = pd.read_csv(f"{data_dir}/{file}.csv").dropna(axis=1, how='all')
    daily_data_nested_df_names = df.drop('date', axis=1).columns.values.tolist()

    for df_name in daily_data_nested_df_names:
        # Keep only the dates on which this nested column holds data.
        date_nested_table = (df.loc[df[df_name].notna(), ['date', df_name]]
                             .reset_index(drop=True))

        daily_dfs_collection = []
        for daily_date, nested_json in zip(date_nested_table['date'],
                                           date_nested_table[df_name]):
            # Wrap the string in StringIO: passing literal JSON text straight to
            # read_json is deprecated in recent pandas releases.
            daily_df = pd.read_json(StringIO(nested_json))
            daily_df['dailyDataDate'] = daily_date
            # append() grows the list in place instead of copying it every day.
            daily_dfs_collection.append(daily_df)

        # Concatenate all daily frames; set_index + reset_index moves
        # 'dailyDataDate' to the front of the columns.
        unnested_table = (pd.concat(daily_dfs_collection, ignore_index=True)
                          .set_index('dailyDataDate')
                          .reset_index())
        reduce_mem_usage(unnested_table).to_pickle(f"{file}_{df_name}.pickle")

        # Release the per-column intermediates before the next column.
        del date_nested_table, daily_dfs_collection, unnested_table

# The raw nested table is no longer referenced by name after this point.
del train

## Content:

* [train_nextDayPlayerEngagement.pickle (Target)](#topic1)
* [players.csv](#topic2)
* [teams.csv](#topic3)
* [seasons.csv](#topic4)
* [awards.csv](#topic5)
  * [train_awards.pickle](#topic6)
  * [train_events.pickle](#topic7)
  * [train_games.pickle](#topic8)
  * [train_playerBoxScores.pickle](#topic9)
  * [train_playerTwitterFollowers.pickle](#topic10)
  * [train_rosters.pickle](#topic11)
  * [train_standings.pickle](#topic12)
  * [train_teamBoxScores.pickle](#topic13)
  * [train_teamTwitterFollowers.pickle](#topic14)
  * [train_transactions.pickle](#topic15)

<a id =topic1> </a>
# Target

In [None]:
# Load the unnested targets and turn both date columns into real datetimes:
# engagementMetricsDate is parsed as-is, dailyDataDate is parsed from its
# YYYYMMDD digits.
train_target = pd.read_pickle('train_nextDayPlayerEngagement.pickle')
train_target = train_target.assign(
    engagementMetricsDate=pd.to_datetime(train_target['engagementMetricsDate']),
    dailyDataDate=pd.to_datetime(train_target['dailyDataDate'].astype(str), format="%Y%m%d"),
)
train_target.head()

In [None]:
# Count the non-null playerId entries for every engagement date and plot them.
players_per_date = train_target.groupby('engagementMetricsDate')['playerId'].count()
ax = players_per_date.plot(figsize=(10,5))
ax.set_title('Number of players per date')
plt.show()

In [None]:
# One panel per target: the mean over all players for each engagement date.
fig, axes = plt.subplots(4, 1, figsize=(12,15))

target_cols = [f'target{i+1}' for i in range(len(axes))]
# Aggregate once, and only the target columns, instead of averaging every
# column of the frame again for each panel.
daily_target_means = train_target.groupby('engagementMetricsDate')[target_cols].mean()

for col, ax in zip(target_cols, axes):
    daily_target_means[col].plot(ax=ax)
    ax.set_title(f'mean {col}')

<a id =topic2> </a>
# Players

In [None]:
# Load the static players table and preview it.
players = pd.read_csv(f'{data_dir}/players.csv')
players.head()

In [None]:
# Horizontal bars: number of non-null playerId values per birth country,
# sorted ascending.
birth_country_counts = players.groupby('birthCountry')['playerId'].count().sort_values()
birth_country_counts.plot.barh(figsize=(5, 8))
plt.title('Player birth country')

In [None]:
# Players per primary position, sorted ascending; each tick label carries the
# position name and its count as '<name>_<count>'.
df_temp = players.groupby('primaryPositionName')['playerId'].count().sort_values()
y_pos = np.arange(len(df_temp))
position_labels = [f'{position}_{count}' for position, count in df_temp.items()]

plt.barh(y_pos, df_temp.values, align='center')
plt.yticks(y_pos, position_labels)
plt.title('Player primary position name')
plt.show()

In [None]:
# Daily mean of each target, one line per primary position; 'Pitcher' is drawn
# thicker and fully opaque so it stands out from the other positions.
target_cols = [f'target{i+1}' for i in range(4)]

# The figure is created at its final size. Passing figsize to .plot() together
# with ax= resizes the whole figure, which made the subplots() size misleading.
fig, axes = plt.subplots(4, 1, figsize=(17, 30))
fig.subplots_adjust(hspace=0.3)

# No `on=` given: merge joins on the columns the two frames share.
df_agg_temp = train_target.merge(players[['playerId', 'primaryPositionName']], how='left')
# dropna(): a missing position would select no rows and plotting an empty
# series raises.
for pos in df_agg_temp['primaryPositionName'].dropna().unique():
    lw, al = (3, 1) if pos == 'Pitcher' else (1, 0.5)
    # Select the target columns before mean(): averaging the whole frame would
    # also try to average the string position column.
    pos_daily_means = (df_agg_temp[df_agg_temp['primaryPositionName'] == pos]
                       .groupby('engagementMetricsDate')[target_cols].mean())
    for col, ax in zip(target_cols, axes):
        pos_daily_means[col].plot(ax=ax, label=pos, linewidth=lw, alpha=al)
        ax.set_title(f'mean {col}')
for ax in axes:
    ax.legend(loc="upper left")

# df_temp is the per-position count series built in the previous cell.
del df_agg_temp, df_temp

* ### Designated Hitters have the highest peaks, although this might be because there are so few of them (only 6).
* ### Target 2 shows some significant spikes for other positions (such as First Base and Outfielder).

In [None]:
# Bar chart: number of non-null playerId values for each value of the
# playerForTestSetAndFuturePreds flag.
test_flag_counts = players.groupby('playerForTestSetAndFuturePreds')['playerId'].count()
test_flag_counts.plot.bar()
plt.title('True if player is among those for whom predictions are to be made in test data')

In [None]:
# Daily mean of each target, split by whether the player is flagged for the
# test set (playerForTestSetAndFuturePreds True / False).
target_cols = [f'target{i+1}' for i in range(4)]

# The figure is created at its final size. Passing figsize to .plot() together
# with ax= resizes the whole figure, which made the subplots() size misleading.
fig, axes = plt.subplots(4, 1, figsize=(17, 30))
fig.subplots_adjust(hspace=0.3)

# No `on=` given: merge joins on the columns the two frames share.
df_agg_temp = train_target.merge(players[['playerId', 'playerForTestSetAndFuturePreds']], how='left')
for in_test in [True, False]:
    # Aggregate only the target columns, once per group, instead of averaging
    # the whole frame again for every panel.
    group_daily_means = (df_agg_temp[df_agg_temp['playerForTestSetAndFuturePreds'] == in_test]
                         .groupby('engagementMetricsDate')[target_cols].mean())
    for col, ax in zip(target_cols, axes):
        group_daily_means[col].plot(ax=ax, label=f'Player in test: {in_test}')
        ax.set_title(f'mean {col}')
for ax in axes:
    ax.legend(loc="upper left")

del df_agg_temp

<a id =topic3> </a>
# Teams

In [None]:
# Load the static teams table and preview it.
teams = pd.read_csv(f'{data_dir}/teams.csv')
teams.head()

In [None]:
# Mean of each target per team: targets -> rosters (same day, same player) ->
# teams (teamId == id).
train_rosters = pd.read_pickle('train_rosters.pickle')
# NOTE: this rebinds train_target to the table exactly as pickled. The Target
# section converted its date columns to datetime; the merge below joins on
# dailyDataDate as stored in the pickles (presumably so the key dtypes match
# — confirm). Every later cell that reads train_target sees this reloaded frame.
train_target = pd.read_pickle('train_nextDayPlayerEngagement.pickle')
teams_agg = pd.merge(train_target, train_rosters, on=['dailyDataDate', 'playerId'], how='left')
teams_agg = pd.merge(teams_agg, teams, left_on=['teamId'], right_on=['id'], how='left')

target_cols = [f'target{i+1}' for i in range(4)]
# Select the target columns before mean(): the merged frame also carries
# non-numeric columns, which a whole-frame mean() would try to average.
team_target_means = teams_agg.groupby('shortName')[target_cols].mean()
for col in target_cols:
    team_target_means[col].sort_values().plot.barh(figsize=(10, 5))
    plt.xlabel(f'mean {col}')
    plt.title(f'mean {col} per team')
    plt.show()

In [None]:
# Mean of each target per league, one small bar chart per target.
target_cols = [f'target{i+1}' for i in range(4)]
# Select the target columns before mean() so non-numeric columns of teams_agg
# are never aggregated.
league_target_means = teams_agg.groupby('leagueName')[target_cols].mean()
for col in target_cols:
    league_target_means[col].sort_values().plot.barh(figsize=(4, 2))
    plt.xlabel(f'mean {col}')
    plt.title(f'mean {col} per league')
    plt.show()

In [None]:
# Mean of each target per division, one bar chart per target.
target_cols = [f'target{i+1}' for i in range(4)]
# Select the target columns before mean() so non-numeric columns of teams_agg
# are never aggregated.
division_target_means = teams_agg.groupby('divisionName')[target_cols].mean()
for col in target_cols:
    division_target_means[col].sort_values().plot.barh(figsize=(10, 5))
    plt.xlabel(f'mean {col}')
    plt.title(f'mean {col} per division')
    plt.show()

<a id =topic4> </a>
# Seasons

In [None]:
# Load the static seasons table and preview it.
seasons = pd.read_csv(f'{data_dir}/seasons.csv')
seasons.head()

In [None]:
# Drop the reference to the seasons frame so its memory can be reclaimed.
del seasons

<a id =topic5> </a>
# Awards

In [None]:
# Load the static awards table and preview it.
awards = pd.read_csv(f'{data_dir}/awards.csv')
awards.head()

In [None]:
# Count award rows per player, sort ascending and chart the last 20 entries
# (the players with the most awards).
awards_per_player = awards.groupby('playerName')['awardId'].count().sort_values()
awards_per_player.iloc[-20:].plot.barh()
plt.title('Top 20 players per number of awards for period 1997-2017')

In [None]:
# Drop the reference to the static awards frame so its memory can be reclaimed.
del awards

<a id =topic6> </a>
# Train awards

In [None]:
# Load the unnested awards table written by the unnesting cell and preview it.
train_awards = pd.read_pickle(f'train_awards.pickle')
train_awards.head()

In [None]:
# Count award rows per player in train_awards, sort ascending and chart the
# last 20 entries (the players with the most awards).
train_awards_per_player = train_awards.groupby('playerName')['awardId'].count().sort_values()
train_awards_per_player.iloc[-20:].plot.barh()
plt.title('Top 20 players per number of awards for training period')

In [None]:
# Targets of player 624413 over time, with the days on which train_awards lists
# an award for that player marked in red.
top_player_id = 624413
award_dates = train_awards.loc[train_awards['playerId'] == top_player_id, 'dailyDataDate'].to_list()
# .copy() gives an independent frame, so adding the flag column does not write
# into a slice of train_target.
top_player_targets = train_target[train_target['playerId'] == top_player_id].copy()
top_player_targets['award_date'] = top_player_targets['dailyDataDate'].isin(award_dates).astype(int)

for i in range(4):
    top_player_targets[f'target{i+1}'].plot(figsize = (20, 5))
    # Markers only (linestyle='none') on the rows flagged as award days.
    top_player_targets[top_player_targets['award_date']==1][f'target{i+1}'].plot(
        figsize = (20, 5), style='o-',markerfacecolor='red', linestyle='none')
    plt.legend([f'target{i+1}', 'awards'])
    # NOTE(review): the player name in the title is hard-coded — confirm it
    # matches top_player_id.
    plt.title(f'Pete Alonso target{i+1} and awards')
    plt.show()

In [None]:
# Targets of player 605141 over time, with the days on which train_awards lists
# an award for that player marked in red.
second_player_id = 605141
award_dates = train_awards.loc[train_awards['playerId'] == second_player_id, 'dailyDataDate'].to_list()
# .copy() gives an independent frame, so adding the flag column does not write
# into a slice of train_target.
second_player_targets = train_target[train_target['playerId'] == second_player_id].copy()
second_player_targets['award_date'] = second_player_targets['dailyDataDate'].isin(award_dates).astype(int)

for i in range(4):
    second_player_targets[f'target{i+1}'].plot(figsize = (20, 5))
    # Markers only (linestyle='none') on the rows flagged as award days.
    second_player_targets[second_player_targets['award_date']==1][f'target{i+1}'].plot(
        figsize = (20, 5), style='o-',markerfacecolor='red', linestyle='none')
    plt.legend([f'target{i+1}', 'awards'])
    # NOTE(review): the player name in the title is hard-coded — confirm it
    # matches second_player_id.
    plt.title(f'Wander Franco target{i+1} and awards')
    plt.show()

In [None]:
# Drop the reference to train_awards so its memory can be reclaimed.
del train_awards

<a id =topic7> </a>
# Train events

In [None]:
# Load the unnested events table written by the unnesting cell and preview it.
train_events = pd.read_pickle('train_events.pickle')
train_events.head()

In [None]:
# Histogram of every numeric column of train_events on an 18 x 3 grid.
num_col = [col for col in train_events.columns if pd.api.types.is_numeric_dtype(train_events[col])]
fig, axes = plt.subplots(nrows=18, ncols=3)
plt.suptitle('Histograms for numeric columns in train_events data frame', y=0.9)
fig.set_figheight(40)
fig.set_figwidth(20)
fig.subplots_adjust(hspace=0.4)
columns = list(train_events.columns)

# zip() stops at the shorter sequence: surplus panels stay empty and columns
# beyond the grid are not drawn, so no exception is needed to end the loop.
for col, ax in zip(num_col, axes.flatten()):
    try:
        train_events[col].hist(ax=ax)
        ax.set_title(col)
    except Exception as err:
        # Stay best-effort, but show which column could not be drawn instead of
        # hiding the failure; unlike a bare except this lets Ctrl-C through.
        ax.set_title(f'{col} (skipped: {type(err).__name__})')

plt.show()

In [None]:
# Top 20 most frequent values of every non-numeric column of train_events.
# Filtering the column list (instead of a set difference) keeps the frame's
# column order, so the panels come out in the same order on every run.
# num_col is the numeric column list built in the previous cell.
object_col = [col for col in train_events.columns if col not in num_col]
fig, axes = plt.subplots(nrows=10, ncols=2)
plt.suptitle('Top 20 values for each non numeric columns', y=0.9)
fig.set_figheight(40)
fig.set_figwidth(20)
fig.subplots_adjust(hspace=0.4)

# zip() stops at the shorter sequence, so surplus panels simply stay empty.
for col, ax in zip(object_col, axes.flatten()):
    try:
        # Count only dailyDataDate per group rather than every column.
        train_events.groupby(col)['dailyDataDate'].count().sort_values()[-20:].plot.barh(ax=ax)
        ax.set_title(col)
    except Exception as err:
        # Stay best-effort, but show which column could not be drawn instead of
        # hiding the failure; unlike a bare except this lets Ctrl-C through.
        ax.set_title(f'{col} (skipped: {type(err).__name__})')

plt.show()

In [None]:
# Drop the reference to train_events so its memory can be reclaimed.
del train_events

<a id =topic8> </a>
# Train games

In [None]:
# Load the unnested games table written by the unnesting cell and preview it.
train_games = pd.read_pickle('train_games.pickle')
train_games.head()

In [None]:
# Histogram of every numeric column of train_games on a 7 x 3 grid.
num_col = [col for col in train_games.columns if pd.api.types.is_numeric_dtype(train_games[col])]
fig, axes = plt.subplots(nrows=7, ncols=3)
plt.suptitle('Histograms for numeric columns in train_games data frame', y=0.9)
fig.set_figheight(40)
fig.set_figwidth(20)
fig.subplots_adjust(hspace=0.4)

# zip() stops at the shorter sequence: surplus panels stay empty and columns
# beyond the grid are not drawn, so no exception is needed to end the loop.
for col, ax in zip(num_col, axes.flatten()):
    try:
        train_games[col].hist(ax=ax)
        ax.set_title(col)
    except Exception as err:
        # Stay best-effort, but show which column could not be drawn instead of
        # hiding the failure; unlike a bare except this lets Ctrl-C through.
        ax.set_title(f'{col} (skipped: {type(err).__name__})')

plt.show()

In [None]:
# Top 20 most frequent values of every non-numeric column of train_games.
# Filtering the column list (instead of a set difference) keeps the frame's
# column order, so the panels come out in the same order on every run.
# num_col is the numeric column list built in the previous cell.
object_col = [col for col in train_games.columns if col not in num_col]
fig, axes = plt.subplots(nrows=7, ncols=2, dpi=120)
plt.suptitle('Top 20 values for each non numeric columns in train_games data frame', y=0.9)
fig.set_figheight(30)
fig.set_figwidth(20)
fig.subplots_adjust(hspace=0.3)

# zip() stops at the shorter sequence, so surplus panels simply stay empty.
for col, ax in zip(object_col, axes.flatten()):
    try:
        # Count only dailyDataDate per group rather than every column.
        train_games.groupby(col)['dailyDataDate'].count().sort_values()[-20:].plot.barh(ax=ax)
        ax.set_title(col)
    except Exception as err:
        # Stay best-effort, but show which column could not be drawn instead of
        # hiding the failure; unlike a bare except this lets Ctrl-C through.
        ax.set_title(f'{col} (skipped: {type(err).__name__})')

plt.show()

In [None]:
# Drop the reference to train_games so its memory can be reclaimed.
del train_games

<a id =topic9> </a>
# Train BoxScores

In [None]:
# Load the unnested player box scores written by the unnesting cell and preview them.
train_playerBoxScores = pd.read_pickle(f'train_playerBoxScores.pickle')
train_playerBoxScores.head()

In [None]:
# Histogram of every numeric column of train_playerBoxScores on a 27 x 3 grid.
num_col = [col for col in train_playerBoxScores.columns
           if pd.api.types.is_numeric_dtype(train_playerBoxScores[col])]
fig, axes = plt.subplots(nrows=27, ncols=3)
plt.suptitle('Histograms for numeric columns in train_playerBoxScores data frame', y=0.9)
fig.set_figheight(60)
fig.set_figwidth(20)
fig.subplots_adjust(hspace=0.4)

# zip() stops at the shorter sequence: surplus panels stay empty and columns
# beyond the grid are not drawn, so no exception is needed to end the loop.
for col, ax in zip(num_col, axes.flatten()):
    try:
        train_playerBoxScores[col].hist(ax=ax)
        ax.set_title(col)
    except Exception as err:
        # Stay best-effort, but show which column could not be drawn instead of
        # hiding the failure; unlike a bare except this lets Ctrl-C through.
        ax.set_title(f'{col} (skipped: {type(err).__name__})')
plt.show()

In [None]:
# Top 20 most frequent values of every non-numeric column of train_playerBoxScores.
# Filtering the column list (instead of a set difference) keeps the frame's
# column order, so the panels come out in the same order on every run.
# num_col is the numeric column list built in the previous cell.
object_col = [col for col in train_playerBoxScores.columns if col not in num_col]
fig, axes = plt.subplots(nrows=4, ncols=2, dpi=120)
plt.suptitle('Top 20 values for each non numeric columns', y=0.9)
fig.set_figheight(20)
fig.set_figwidth(20)
fig.subplots_adjust(hspace=0.3)

# zip() stops at the shorter sequence, so surplus panels simply stay empty.
for col, ax in zip(object_col, axes.flatten()):
    try:
        # Count only dailyDataDate per group rather than every column.
        train_playerBoxScores.groupby(col)['dailyDataDate'].count().sort_values()[-20:].plot.barh(ax=ax)
        ax.set_title(col)
    except Exception as err:
        # Stay best-effort, but show which column could not be drawn instead of
        # hiding the failure; unlike a bare except this lets Ctrl-C through.
        ax.set_title(f'{col} (skipped: {type(err).__name__})')

plt.show()

In [None]:
# Drop the reference to train_playerBoxScores so its memory can be reclaimed.
del train_playerBoxScores

<a id =topic10> </a>
# Train Player Twitter Followers

In [None]:
# Load the unnested player Twitter follower table written by the unnesting cell and preview it.
train_playerTwitterFollowers = pd.read_pickle(f'train_playerTwitterFollowers.pickle')
train_playerTwitterFollowers.head()

In [None]:
# Highest follower count recorded for each player; chart the 20 largest.
max_followers_per_player = (train_playerTwitterFollowers
                            .groupby('playerName')['numberOfFollowers']
                            .max()
                            .sort_values())
max_followers_per_player.iloc[-20:].plot.barh()
plt.title('Top 20 players with the most twitter followers')

In [None]:
# Drop the reference to train_playerTwitterFollowers so its memory can be reclaimed.
del train_playerTwitterFollowers

<a id =topic11> </a>
# Train rosters

In [None]:
# Load the unnested rosters table written by the unnesting cell and preview it.
train_rosters = pd.read_pickle('train_rosters.pickle')
train_rosters.head()

In [None]:
# Distribution of the number of distinct teamId values seen for each player.
teams_per_player = train_rosters.groupby('playerId')['teamId'].nunique()
ax = teams_per_player.hist()
ax.set_title('The number of different teams that the player changed')
plt.show()

In [None]:
# Attach player names to the roster rows, count the distinct teams per name and
# chart the 40 names with the most teams.
rosters_with_names = train_rosters.merge(players[['playerId', 'playerName']],
                                         on=['playerId'], how='left')
teams_per_name = rosters_with_names.groupby('playerName')['teamId'].nunique().sort_values()
teams_per_name.iloc[-40:].plot.barh(figsize=(5,10))
plt.title('Players who changed the most different teams')
plt.show()

In [None]:
# Drop the reference to train_rosters; later cells reload it from the pickle.
del train_rosters

<a id =topic12> </a>
# Train standings

In [None]:
# Load the unnested standings table written by the unnesting cell and preview it.
train_standings = pd.read_pickle('train_standings.pickle')
train_standings.head()

In [None]:
# Correlation between the targets and the standings of the player's team:
# targets -> rosters (same day, same player) -> standings (same day, same team).
train_rosters = pd.read_pickle('train_rosters.pickle')
# NOTE(review): joins on dailyDataDate, so train_target must still hold that
# column in the same form as the pickled tables — confirm the cell order.
df_temp = pd.merge(train_target, train_rosters, on=['dailyDataDate', 'playerId'], how='left')
df_temp = df_temp.drop(columns=['engagementMetricsDate', 'gameDate', 'status', 'statusCode'])
df_temp = pd.merge(df_temp, train_standings, on=['dailyDataDate', 'teamId'], how='left')

# Correlate numeric/boolean columns only: recent pandas no longer drops
# non-numeric columns inside corr() and raises on them instead.
df_corr = df_temp.select_dtypes(include=['number', 'bool']).corr()
plt.rcParams["figure.figsize"] = (17,17)
sns.heatmap(df_corr, xticklabels=df_corr.columns, yticklabels=df_corr.columns, annot=True)
plt.title('Correlation between target columns and columns from standing data frame')

In [None]:
# Drop the roster and standings frames; df_temp stays in the namespace here.
del train_rosters, train_standings

<a id =topic13> </a>
# Train teamBoxScores

In [None]:
# Load the unnested team box scores written by the unnesting cell and preview them.
train_teamBoxScores = pd.read_pickle('train_teamBoxScores.pickle')
train_teamBoxScores.head()

In [None]:
# Correlation between the targets and the box score of the player's team:
# targets -> rosters (same day, same player) -> team box scores (same day, same team).
train_rosters = pd.read_pickle('train_rosters.pickle')
# NOTE(review): joins on dailyDataDate, so train_target must still hold that
# column in the same form as the pickled tables — confirm the cell order.
df_temp = pd.merge(train_target, train_rosters, on=['dailyDataDate', 'playerId'], how='left')
df_temp = df_temp.drop(columns=['engagementMetricsDate', 'gameDate', 'status', 'statusCode'])
df_temp = pd.merge(df_temp, train_teamBoxScores, on=['dailyDataDate', 'teamId'], how='left')

# Correlate numeric/boolean columns only: recent pandas no longer drops
# non-numeric columns inside corr() and raises on them instead.
df_corr = df_temp.select_dtypes(include=['number', 'bool']).corr()
plt.rcParams["figure.figsize"] = (17,17)
sns.heatmap(df_corr, xticklabels=df_corr.columns, yticklabels=df_corr.columns, annot=True)
plt.title('Correlation between target columns and columns from teamBoxScores data frame')

In [None]:
# Drop the roster frame, the merged correlation input and the team box scores.
del train_rosters, df_temp, train_teamBoxScores

<a id =topic14> </a>
# Train teamTwitterFollowers

In [None]:
# Load the unnested team Twitter follower table written by the unnesting cell and preview it.
train_teamTwitterFollowers = pd.read_pickle('train_teamTwitterFollowers.pickle')
train_teamTwitterFollowers.head()

In [None]:
# Reset the default figure size (the heatmap cells raised it to 17 x 17), then
# chart the 20 teams with the highest recorded follower count.
plt.rcParams["figure.figsize"] = (7,7)
max_followers_per_team = (train_teamTwitterFollowers
                          .groupby('teamName')['numberOfFollowers']
                          .max()
                          .sort_values())
max_followers_per_team.iloc[-20:].plot.barh()
plt.title('Top 20 teams with the most twitter followers')

In [None]:
# Drop the reference to train_teamTwitterFollowers so its memory can be reclaimed.
del train_teamTwitterFollowers

<a id =topic15> </a>
# Train transactions

In [None]:
# Load the unnested transactions table written by the unnesting cell and preview it.
train_transactions = pd.read_pickle('train_transactions.pickle')
train_transactions.head()