In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

In [None]:
# Input locations, relative to the notebook's working directory.
TRAIN_DIR = Path('../input/mlb-pdef-train-dataset')
BASE_DIR = Path('../input/mlb-player-digital-engagement-forecasting')

# Player master data (CSV) and the next-day engagement targets (pickle).
# NOTE(review): unpickling can execute arbitrary code, so only load this
# file from a source you trust.
players = pd.read_csv(BASE_DIR.joinpath('players.csv'))
targets = pd.read_pickle(TRAIN_DIR.joinpath('nextDayPlayerEngagement_train.pkl'))

### Short analysis to see if age has any impact on the Social Media noise a Player generates

In [None]:
# Parse the date columns once, up front, so date arithmetic works below.
players['DOB'] = pd.to_datetime(players['DOB'], format='%Y-%m-%d')
targets['date'] = pd.to_datetime(targets['date'], format='%Y%m%d')
targets['year'] = targets['date'].dt.year

# Attach player attributes to every (date, playerId) target row. A left join
# keeps all target rows even when a player has no entry in `players`.
# NOTE(review): assumes `playerId` is unique in `players`; duplicates would
# repeat target rows and inflate the sums plotted later -- TODO confirm.
players_target = pd.merge(targets,
                          players,
                          on='playerId',
                          how='left')

# Age in years on the observation date. `DOB` is already datetime64 (parsed
# above), so it is not re-parsed here. Dividing by 365.25 rather than 365
# accounts for leap years; a plain 365 overstates every age slightly.
DAYS_PER_YEAR = 365.25
players_target['PLAYER_AGE'] = (
    (players_target['date'] - players_target['DOB']).dt.days / DAYS_PER_YEAR
)

In [None]:
# Mean and median player age, each rounded UP to the next whole year; these
# are the values used for the reference lines and legend labels of the
# age histogram.
mean, median = (
    np.ceil(statistic)
    for statistic in players_target['PLAYER_AGE'].agg(['mean', 'median'])
)

In [None]:
# Histogram of player age with vertical markers at the (ceiled) mean and
# median computed in the previous cell.
plt.figure(figsize=(10, 5))
sns.set_style('white')
hist_plot = sns.histplot(players_target['PLAYER_AGE'])

# One spec per reference line; both share the same line width.
reference_lines = [
    dict(x=mean, color='r', linestyle='--', label=f'mean-{mean}'),
    dict(x=median, color='g', linestyle='-', label=f'median-{median}'),
]
for line_spec in reference_lines:
    hist_plot.axvline(linewidth=4, **line_spec)

hist_plot.figure.suptitle("Players Age Distribution")
hist_plot.legend()

In [None]:
# Collapse fractional ages to whole years so that each age is one bar.
# NOTE: this overwrites PLAYER_AGE in place, so cells that read the column
# after this one see the rounded integer ages. `astype('int')` raises if any
# age is NaN (e.g. a row with a missing DOB).
players_target['PLAYER_AGE'] = players_target['PLAYER_AGE'].round().astype('int')

# Configure the theme BEFORE creating the figure: axes pick up style settings
# when they are created, so styling after `plt.subplots` has no effect on
# them. A single call also avoids `sns.set(font_scale=...)` silently
# resetting the style back to the default.
sns.set_theme(style='white', font_scale=1)

# One row per target: median per age on the left, sum per age on the right.
target_cols = ['target1', 'target2', 'target3', 'target4']
fig, axs = plt.subplots(len(target_cols), 2, figsize=(20, 12))

for i, target in enumerate(target_cols):
    target_median = (
        players_target
        .groupby('PLAYER_AGE')[target]
        .agg(['median', 'sum'])
        .reset_index()
    )
    sns.barplot(ax=axs[i, 0], x=target_median['PLAYER_AGE'], y=target_median['median'])
    axs[i, 0].set(ylabel=f"{target} " + "MEDIAN")
    sns.barplot(ax=axs[i, 1], x=target_median['PLAYER_AGE'], y=target_median['sum'])
    axs[i, 1].set(ylabel=f"{target} " + "SUM")

# Figure-level title and layout only need to be applied once, after every
# panel has been drawn.
plt.suptitle("Player Age vs Target Median/Sum", y=1.03)
plt.tight_layout()

In [None]:
# List the distinct players that have at least one row with PLAYER_AGE >= 45.
# NOTE(review): if the cell that rounds PLAYER_AGE to whole years has already
# run, this threshold is applied to the rounded ages.
print("Players making more noise")
is_oldest_group = players_target['PLAYER_AGE'].ge(45)
for name in players_target.loc[is_oldest_group, 'playerName'].unique():
    print(name)

### Other than the players listed above, the rest of the age groups do not seem interesting, because the differences in medians are not significant by visual inspection, and the sums have a distribution similar to the distribution of age.