This notebook creates a descriptive-statistics dataset covering all four targets (target1–target4) for every player.

In [None]:
import numpy as np
import pandas as pd
from numpy import mean,std
from scipy.stats import norm
import statistics as st

import os
import gc

import matplotlib.pyplot as plt
import seaborn as sns
import json

In [None]:
# Load the competition training table. Later cells read its 'date' column and
# the 'nextDayPlayerEngagement' column, whose cells are parsed as JSON.
train = pd.read_csv("../input/mlb-player-digital-engagement-forecasting/train.csv")

In [None]:
#thanks to Alok Pattani https://www.kaggle.com/alokpattani
from io import StringIO

# Names of the "nested" (JSON-encoded) columns of the daily training set to
# unnest. To unnest every nested column instead, use:
#   train.drop('date', axis = 1).columns.values.tolist()
daily_data_nested_df_names = ['nextDayPlayerEngagement']

for df_name in daily_data_nested_df_names:
    # Keep only the dates that actually carry a payload for this column.
    date_nested_table = train.loc[train[df_name].notna(), ['date', df_name]]

    daily_dfs_collection = []

    for date_value, nested_json in zip(date_nested_table['date'],
                                       date_nested_table[df_name]):
        # Wrap the payload in StringIO: passing a literal JSON string to
        # read_json is deprecated, and a file-like object is never mistaken
        # for a path.
        daily_df = pd.read_json(StringIO(nested_json))

        daily_df['dailyDataDate'] = date_value

        # In-place append; rebuilding the list with `+` on every iteration
        # copies it each time and makes the loop quadratic.
        daily_dfs_collection.append(daily_df)

    # Concatenate all daily dfs into single df for each row
    unnested_table = (pd.concat(daily_dfs_collection,
      ignore_index = True).
      # Set and reset index to move 'dailyDataDate' to front of df
      set_index('dailyDataDate').
      reset_index()
      )

    # Creates 1 pandas df per unnested df from daily data read in, with same name
    globals()[df_name] = unnested_table

    # Clean up tables and collection of daily data frames for this df
    del date_nested_table, daily_dfs_collection, unnested_table

print(daily_data_nested_df_names)

In [None]:
# The raw training frame is not referenced again after the unnesting step;
# drop the name and run a garbage-collection pass.
del train
gc.collect()

In [None]:
# Display the unnested engagement frame (created via globals() in the
# unnesting cell) for a quick visual check.
nextDayPlayerEngagement

In [None]:
# Parse the engagement date column a single time and derive both calendar
# parts from that one index, instead of re-parsing the column per part.
_engagement_dates = pd.DatetimeIndex(nextDayPlayerEngagement['engagementMetricsDate'])
nextDayPlayerEngagement['year'] = _engagement_dates.year
nextDayPlayerEngagement['month'] = _engagement_dates.month
# Temporary only; do not leave it behind in the notebook namespace.
del _engagement_dates

In [None]:
# Keep only rows whose engagement date falls in 2021, in April or any later
# month, selected with one combined boolean mask.
new_df = nextDayPlayerEngagement.loc[
    (nextDayPlayerEngagement['year'] == 2021)
    & (nextDayPlayerEngagement['month'] >= 4)
]
new_df

In [None]:
# Distinct player ids present in the filtered frame, as a plain Python list.
playerId_list = new_df['playerId'].unique().tolist()
# For a quick trial run, truncate the list first, e.g.:
#   playerId_list = playerId_list[:10]

In [None]:
import warnings
# Install a blanket "ignore" filter: every warning category is silenced from
# this point on.
# NOTE(review): this presumably exists to quiet numerical warnings from the
# statistics computed in this cell; it will also hide any other warning, so
# confirm that is intended.
warnings.simplefilter('ignore')

def calc_probs(pid,df,temp):
    """Append one row of per-target descriptive statistics for a player to ``df``.

    For each of ``target1`` .. ``target4`` in ``temp`` the row holds, in this
    order: mean, median, population standard deviation (``np.std``), minimum,
    maximum, and the point on a 50-step grid spanning [min, max] at which a
    normal distribution fitted to the observations has its highest density.

    Args:
        pid: Player identifier, stored in the first column of the new row.
        df: Accumulator frame with 25 columns (the id followed by six
            statistics for each of the four targets). Modified in place.
        temp: Frame holding this player's observations; must contain the
            columns ``target1`` .. ``target4``.

    Returns:
        The same ``df`` object, with the new row added at label ``len(df)``.
    """
    targets = ['target1', 'target2', 'target3', 'target4']
    stats_per_target = 6
    row = [pid]

    for target in targets:
        observations = temp[target].tolist()

        if not observations:
            # No data for this player: record missing values rather than
            # failing inside median/min/max.
            row.extend([np.nan] * stats_per_target)
            continue

        target_mean = np.mean(observations)
        target_std = np.std(observations)
        target_median = st.median(observations)
        lowest = min(observations)
        highest = max(observations)

        if target_std > 0:
            # Evaluate the fitted normal density on an evenly spaced grid
            # between the observed extremes and keep the grid point where it
            # peaks. The index refers to the grid, so it must be looked up in
            # the grid -- not in the list of observations, which has a
            # different length and ordering.
            grid = np.linspace(lowest, highest)
            densities = norm(target_mean, target_std).pdf(grid)
            most_likely = grid[int(np.argmax(densities))]
        else:
            # Zero (or undefined) spread: a normal with scale 0 has no usable
            # density, so the distribution collapses onto the mean itself.
            most_likely = target_mean

        row.extend([target_mean, target_median, target_std,
                    lowest, highest, most_likely])

    df.loc[len(df)] = row
    return df
    

### CREATE DATAFRAME to store probabilities
# One id column followed by six statistics for each of the four targets.
column_names = ["playerId"] + [
    f"{target}_{stat}"
    for target in ("target1", "target2", "target3", "target4")
    for stat in ("mean", "median", "std", "min", "max", "prob")
]
player_target_probs = pd.DataFrame(columns = column_names)

# calc_probs fills the accumulator in place and returns that same frame, so
# both names refer to one object. Binding the result name up front keeps it
# defined even when playerId_list is empty.
player_target_stats = player_target_probs

# Group the rows once; looking a player up in the grouping avoids re-scanning
# the whole frame with a boolean mask for every player id.
rows_by_player = new_df.groupby('playerId')

for pid in playerId_list:
    temp = rows_by_player.get_group(pid)
    player_target_stats=calc_probs(pid,player_target_probs,temp)

player_target_stats

In [None]:
# Write the per-player statistics table to the working directory, without the
# row index column.
player_target_stats.to_csv('player_target_stats.csv', index = False)