In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
from sklearn.preprocessing import StandardScaler
import gc
from pathlib import Path

In [None]:
# Root directory holding the competition's input files
input_file_path = Path('/kaggle/input/mlb-player-digital-engagement-forecasting/')

# Lookup table pairing each small CSV file with the name of the data frame it
# is loaded into. The large 'train' data set is NOT listed here; it is read in
# separately.
csv_and_df_names = pd.DataFrame(data = {
  'csv_name': ['seasons', 'teams', 'players', 'awards'],
  'df_name': ['seasons', 'teams', 'players', 'awards_pre2018']
  })

# Tabbed output: one Output widget per data frame to be loaded
kaggle_data_tabs = widgets.Tab()
kaggle_data_tabs.children = [widgets.Output() for _ in csv_and_df_names['df_name']]

for tab_position, (csv_name, df_name) in enumerate(
        zip(csv_and_df_names['csv_name'], csv_and_df_names['df_name'])):

    # Read the CSV and publish the frame under its target name, because later
    # cells refer to these frames by name
    loaded_frame = pd.read_csv(input_file_path / f"{csv_name}.csv")
    globals()[df_name] = loaded_frame

    # Label the tab with the frame's name and render the frame inside it
    kaggle_data_tabs.set_title(tab_position, df_name)
    with kaggle_data_tabs.children[tab_position]:
        display(loaded_frame)

display(kaggle_data_tabs)

In [None]:
# Load the daily training table
train = pd.read_csv(input_file_path / 'train.csv')

# Convert training data date field to pandas datetime type
train['date'] = pd.to_datetime(train['date'], format = "%Y%m%d")

# info() prints its summary itself and returns None, so it is called directly;
# wrapping it in display() would additionally render a stray "None"
train.info()

display(train)

In [None]:
# Get names of all "nested" data frames in daily training set
daily_data_nested_df_names = train.drop('date', axis = 1).columns.values.tolist()

for df_name in daily_data_nested_df_names:
    # Keep only the dates on which this nested column actually holds data
    date_nested_table = (
      train.loc[train[df_name].notna(), ['date', df_name]].
      reset_index(drop = True)
      )

    # Parse each day's JSON into its own frame, tagged with that day's date.
    # list.append keeps the accumulation linear; rebuilding the list with `+`
    # on every iteration copied it each time.
    daily_dfs_collection = []
    for daily_date, daily_json in zip(date_nested_table['date'],
                                      date_nested_table[df_name]):
        daily_df = pd.read_json(daily_json)
        daily_df['dailyDataDate'] = daily_date
        daily_dfs_collection.append(daily_df)

    # Concatenate all daily dfs into single df for each row
    unnested_table = (pd.concat(daily_dfs_collection,
      ignore_index = True).
      # Set and reset index to move 'dailyDataDate' to front of df
      set_index('dailyDataDate').
      reset_index()
      )

    # Creates 1 pandas df per unnested df from daily data read in, with same name
    # (later cells reference these frames by name)
    globals()[df_name] = unnested_table

    # Clean up tables and collection of daily data frames for this df
    del date_nested_table, daily_dfs_collection, unnested_table

# Set up for tabbed output
daily_data_unnested_tabs = widgets.Tab()

# Add Output widgets for each DF as tabs' children
daily_data_unnested_tabs.children = [widgets.Output()
  for _ in daily_data_nested_df_names]

for index, df_name in enumerate(daily_data_nested_df_names):
    # Rename tab bar titles to df names
    daily_data_unnested_tabs.set_title(index, df_name)

    # Display corresponding table output for this tab name; a plain globals()
    # lookup replaces eval(), which would execute the name as code
    with daily_data_unnested_tabs.children[index]:
        display(globals()[df_name])

display(daily_data_unnested_tabs)

In [None]:
# The nested training table has been unnested into separate frames above;
# release it and ask the garbage collector to reclaim the memory
del train

gc.collect()

**Correlation between targets**

In [None]:
# Pairwise correlation of the four engagement targets, drawn as an annotated heatmap
target_names = ['target1', 'target2', 'target3', 'target4']
cor = nextDayPlayerEngagement[target_names].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds, ax=ax)
plt.show()

**Relationship between important player-level stats with our target**

*Compute the average of the four targets*

In [None]:
# Work on a copy so the unnested engagement table itself is left untouched
player_eng_info = nextDayPlayerEngagement.copy()

# Row-wise mean of the four targets
player_eng_info['target1To4Avg'] = (
    player_eng_info[['target1', 'target2', 'target3', 'target4']].mean(axis = 1)
)

In [None]:
# Box-score columns carried over to the engagement table
box_score_columns = [
    'dailyDataDate', 'playerId', 'gamePk', 'teamId', 'playerName',
    'runsScored', 'atBats', 'homeRuns', 'flyOuts', 'hits', 'strikes',
    'balks', 'errors', 'chances', 'rbi',
]

# Keep rows dated 2018-03-29 or later, then attach the same-day box-score
# stats of each player (inner join: rows without a box score are dropped)
player_eng_info = (
    player_eng_info
    .loc[player_eng_info['dailyDataDate'] >= '2018-03-29']
    .merge(
        playerBoxScores[box_score_columns],
        on = ['dailyDataDate', 'playerId'],
        how = 'inner',
    )
)


In [None]:
# Preview the first rows of the merged engagement / box-score table
player_eng_info.head()

**Correlation matrix between them**

In [None]:
# Correlation of the averaged target with the selected box-score stats
player_stat_columns = [
    'target1To4Avg', 'runsScored', 'atBats', 'homeRuns', 'flyOuts', 'hits',
    'strikes', 'balks', 'errors', 'chances', 'rbi',
]
cor = player_eng_info[player_stat_columns].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds, ax=ax)
plt.show()

*Note: some features are highly correlated with each other, so we may keep only one feature from each such group and drop the others.*

In [None]:
# Standings columns attached to every player row via the player's team and date
standings_columns = [
    'dailyDataDate', 'teamId', 'wins', 'losses', 'pct', 'xWinLossPct',
    'divisionRank', 'leagueRank', 'wildCardRank', 'lastTenWins', 'lastTenLosses',
]

# Inner join: player rows without a standings entry for that team/date are dropped
standings_stats = player_eng_info.merge(
    standings[standings_columns],
    on = ['dailyDataDate', 'teamId'],
    how = 'inner',
)

**Correlation between important stats from standings data and target**

In [None]:
# Correlation of the averaged target with the team standings stats
standings_stat_columns = [
    'target1To4Avg', 'wins', 'losses', 'pct', 'xWinLossPct', 'divisionRank',
    'leagueRank', 'wildCardRank', 'lastTenWins', 'lastTenLosses',
]
cor = standings_stats[standings_stat_columns].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds, ax=ax)
plt.show()

In [None]:
# Columns averaged over all player rows of each day
daily_mean_columns = [
    'target1To4Avg', 'runsScored', 'atBats', 'homeRuns', 'flyOuts', 'hits',
    'strikes', 'balks', 'errors', 'chances', 'rbi', 'pct', 'xWinLossPct',
    'divisionRank', 'leagueRank', 'wildCardRank', 'lastTenWins', 'lastTenLosses',
]

# One row per date (the date becomes the index) holding the per-day means
averaged_data = standings_stats.groupby('dailyDataDate')[daily_mean_columns].mean()

In [None]:
# Preview the first rows of the per-day averages
averaged_data.head()

**Plot Average of targets of all players for each day across time and compare it with average of strikes**

In [None]:
# Daily mean target and daily mean strikes on a shared axis, indexed by date
averaged_data[['target1To4Avg','strikes']].plot()
plt.show()

*note : There is some similarity in their behavior across time*

**Plot Average of targets of all players for each day across time and compare it with average of atBats**

In [None]:
# Scale atBats by 10 for the shared-axis plot that follows.
# NOTE: this overwrites the column in place, so re-running the cell multiplies it again.
averaged_data['atBats'] = averaged_data['atBats']*10

In [None]:
# Daily mean target and the rescaled daily mean atBats on a shared axis
averaged_data[['target1To4Avg','atBats']].plot()
plt.show()

*note : There is some similarity in their behavior across time*

**Plot Average of targets of all players for each day across time and compare it with average of xWinLossPct**

In [None]:
# Distribution of the daily mean xWinLossPct.
# NOTE(review): distplot is reported as deprecated in recent seaborn releases
# (histplot/displot are the suggested replacements) — confirm against the installed version.
sns.distplot(averaged_data['xWinLossPct'])
plt.show()

In [None]:
# Scale xWinLossPct by 50 for the shared-axis plot that follows.
# NOTE: this overwrites the column in place, so re-running the cell multiplies it again.
averaged_data['xWinLossPct'] = averaged_data['xWinLossPct']*50

In [None]:
# Daily mean target and the rescaled daily mean xWinLossPct on a shared axis
averaged_data[['target1To4Avg','xWinLossPct']].plot()
plt.show()

*note : There is some similarity in their behavior across time*

In [None]:
# Correlation between the per-day averages of the target and the selected stats
averaged_stat_columns = [
    'target1To4Avg', 'runsScored', 'atBats', 'homeRuns', 'hits',
    'balks', 'chances', 'rbi', 'pct', 'xWinLossPct', 'divisionRank',
    'leagueRank', 'wildCardRank', 'lastTenWins', 'lastTenLosses',
]
cor = averaged_data[averaged_stat_columns].corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds, ax=ax)
plt.show()

**We see here that 'pct' variable has strong positive correlation with target variable after computing averages of them per every unique day**

**'lastTenWins' variable has strong positive correlation with target and of course 'lastTenLosses' has negative correlation with target also**

**Also 'divisionRank' variable has strong negative correlation with target**

**'leagueRank' variable has strong negative correlation with target**

**'wildCardRank' variable has strong negative correlation with target**


*Before grouping data by unique days and computing averages of variables, the correlation between target and 'pct' was only **0.16** and was **-0.16** with 'divisionRank'*

**We can also see here that the 'divisionRank' variable behaves in the opposite way to the target across time**

In [None]:
# Scale divisionRank by 10 for the shared-axis plot that follows.
# NOTE: this overwrites the column in place, so re-running the cell multiplies it again.
averaged_data['divisionRank'] = averaged_data['divisionRank']*10

In [None]:
# Daily mean target and the rescaled daily mean divisionRank on a shared axis
averaged_data[['target1To4Avg','divisionRank']].plot()
plt.show()

**And here we see that 'Current winning percentage' or 'pct' variable has almost the same behavior as target**

In [None]:
# Scale pct by 50 for the shared-axis plot that follows.
# NOTE: this overwrites the column in place, so re-running the cell multiplies it again.
averaged_data['pct'] = averaged_data['pct']*50

In [None]:
# Daily mean target and the rescaled daily mean pct on a shared axis
averaged_data[['target1To4Avg','pct']].plot()
plt.show()

In [None]:
# NOTE: the order of the steps below fixes the column order of `final_data`,
# which later cells slice by position — keep the sequence as is.

# Start from the standings-enriched table without the engagement-metrics date
# and the averaged target
final_data = standings_stats.drop(columns = ['engagementMetricsDate', 'target1To4Avg'])

# Attach any award recorded for the player on that date
final_data = final_data.merge(
    awards[['dailyDataDate', 'awardName', 'playerId']],
    on = ['dailyDataDate', 'playerId'],
    how = 'left',
)

# Every missing value so far (including "no award") becomes 0 ...
final_data = final_data.fillna(0)

# ... so the award name collapses to a 0/1 flag
final_data['awardName'] = [int(award != 0) for award in final_data['awardName']]

# Player followers, then team followers; both source columns are called
# 'numberOfFollowers', so pandas suffixes them _x (player) and _y (team)
final_data = final_data.merge(
    playerTwitterFollowers[['dailyDataDate', 'numberOfFollowers', 'playerId']],
    on = ['dailyDataDate', 'playerId'],
    how = 'left',
)
final_data = final_data.merge(
    teamTwitterFollowers[['dailyDataDate', 'numberOfFollowers', 'teamId']],
    on = ['dailyDataDate', 'teamId'],
    how = 'left',
)

# Game-level information for the game the box score belongs to
game_columns = ['dailyDataDate', 'gamePk', 'isTie', 'gamesInSeries', 'seriesDescription',
                'homeWinPct', 'awayWinPct', 'homeId', 'awayId']
final_data = final_data.merge(
    games[game_columns],
    on = ['dailyDataDate', 'gamePk'],
    how = 'left',
)

# Absolute gap between the two teams' winning percentages
final_data['pct_diff'] = (final_data['homeWinPct'] - final_data['awayWinPct']).abs()

# Same-day standings ranks, joined once for the home team and once for the away team
team_ranks = standings[['dailyDataDate', 'teamId', 'divisionRank', 'leagueRank', 'wildCardRank']]
final_data = final_data.merge(
    team_ranks,
    left_on = ['dailyDataDate', 'homeId'],
    right_on = ['dailyDataDate', 'teamId'],
    how = 'left',
)
final_data = final_data.merge(
    team_ranks,
    left_on = ['dailyDataDate', 'awayId'],
    right_on = ['dailyDataDate', 'teamId'],
    how = 'left',
)

# After the two joins the suffixes mean: _x = player's own team, _y = home team,
# no suffix = away team
final_data = final_data.rename(columns = {
    'divisionRank_x': 'player_divisionRank',
    'divisionRank_y': 'home_divisionRank',
    'divisionRank': 'away_divisionRank',
    'leagueRank_x': 'player_leagueRank',
    'leagueRank_y': 'home_leagueRank',
    'leagueRank': 'away_leagueRank',
    'wildCardRank_x': 'player_wildCardRank',
    'wildCardRank_y': 'home_wildCardRank',
    'wildCardRank': 'away_wildCardRank',
})

# Absolute home/away gap for each rank
for rank_name in ('divisionRank', 'leagueRank', 'wildCardRank'):
    final_data[f'{rank_name}_diff'] = (
        final_data[f'home_{rank_name}'] - final_data[f'away_{rank_name}']
    ).abs()

# Remove identifier and text columns
final_data = final_data.drop(columns = [
    'playerName', 'homeId', 'awayId',
    'seriesDescription',
    'teamId_x', 'teamId_y', 'teamId',
])


In [None]:
# Additional box-score stats appended after all columns of `final_data`
extra_box_score_columns = [
    'assists', 'balls', 'baseOnBalls',
    'baseOnBallsPitching', 'battersFaced', 'battingOrder', 'blownSaves',
    'catchersInterference', 'catchersInterferencePitching',
    'caughtStealing', 'caughtStealingPitching',
    'completeGamesPitching', 'doubles', 'doublesPitching', 'earnedRuns',
    'totalBases', 'triples', 'triplesPitching', 'wildPitches',
    'winsPitching',
]

# merge() returns a new frame, so `final_data` itself is left unchanged
data = final_data.merge(
    playerBoxScores[['dailyDataDate', 'playerId'] + extra_box_score_columns],
    on = ['dailyDataDate', 'playerId'],
    how = 'left',
)

In [None]:
# (rows, columns) of the assembled modelling table
data.shape

In [None]:
# gamePk served as the join key for the games merge; drop it before the
# positional feature/target split below.
# NOTE: re-running this cell raises a KeyError because the column is already gone.
data = data.drop(['gamePk'],axis=1)

In [None]:
# Replace missing values left by the left joins before casting
data = data.fillna(0)

# Positional split of the columns: positions 2-5 are the targets, position 7
# onwards the features.
# NOTE(review): the column at position 6 lands in neither list — confirm that
# this is intended.
target_columns = data.columns[2:6].tolist()
feature_columns = data.columns[7:].tolist()

# Features are stored as 32-bit floats
data[feature_columns] = data[feature_columns].astype(np.float32)

data = data.fillna(0)

In [None]:
def remove_outliers(data):
    """Return the rows of `data` that contain no IQR outlier in any column.

    A value is an outlier when it lies more than 1.5 * IQR below the first
    quartile or above the third quartile of its column. The input frame is
    not modified; the filtered frame is returned (the previous version bound
    it to a local name and returned None, which made the function a no-op).

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric frame to filter.

    Returns
    -------
    pandas.DataFrame
        The rows of `data` whose values are all within the IQR fences,
        keeping their original index labels.
    """
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    is_outlier = (data < (Q1 - 1.5 * IQR)) | (data > (Q3 + 1.5 * IQR))
    return data[~is_outlier.any(axis=1)]
    
# NOTE: the result of this call is not assigned to anything, so `data` itself
# is left unfiltered by this cell.
remove_outliers(data[feature_columns])

In [None]:
def log_transform(data, feature_columns):
    """Replace each listed column of `data` with log10(value + 1), in place.

    `data` is modified directly and nothing is returned.
    """
    for column in feature_columns:
        shifted = data[column] + 1
        data[column] = np.log10(shifted)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    data[feature_columns],
    data[target_columns],
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits
sc = StandardScaler()
sc.fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [None]:
# (rows, features) of the scaled training matrix
x_train.shape

In [None]:
# Inspect the second target column of the training labels
y_train.iloc[:,1]

In [None]:
# Disabled experiment: the gradient-boosting baseline below is wrapped in a
# string literal, so it is not executed (the cell only echoes the string).
'''from sklearn.ensemble import GradientBoostingRegressor
gb = GradientBoostingRegressor()
gb.fit(x_train,y_train.iloc[:,1])
gb.score(x_train,y_train.iloc[:,1])'''

In [None]:
# All Keras names come from the public `tensorflow.keras` namespace. The
# previous version mixed the private `tensorflow.python.keras` package with the
# standalone `keras` package and imported `Dense` twice.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Dense, Flatten
from tensorflow.keras import backend as K

# Input width follows the training matrix instead of a hard-coded 55, so the
# network stays valid if the feature list changes
n_features = x_train.shape[1]

# Fully connected regressor with an hourglass layout (64-32-16-32-64) and one
# linear output per target
model = Sequential()
model.add(Dense(64, input_dim=n_features, kernel_initializer='he_uniform', activation='relu'))
model.add(Dense(32, activation="relu"))
model.add(Dense(16, activation="relu"))
model.add(Dense(32, activation="relu"))
model.add(Dense(64, activation="relu"))
model.add(Dense(4, activation='linear'))

# Mean absolute error is used both as the loss and as the reported metric
model.compile(loss='mae', optimizer='sgd', metrics=['mae'])

# The held-out split doubles as validation data during training
history = model.fit(x_train, y_train, epochs=50, batch_size=100, verbose=1,
                    validation_data=(x_test, y_test))

In [None]:
print(history.history.keys())

# Training and validation loss per epoch, drawn in that order so the legend
# labels line up with the curves
for loss_key in ('loss', 'val_loss'):
    plt.plot(history.history[loss_key])

plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
import mlb

# Columns kept from each unnested test table
games_test_columns = ['date','gamePk','isTie','gamesInSeries','seriesDescription',
                      'homeWinPct','awayWinPct','homeId','awayId']

players_test_columns = ['date','playerId','gamePk','teamId','runsScored', 'atBats', 'homeRuns',
                        'flyOuts','hits','strikes','balks','errors','chances','rbi','assists', 'balls',
                        'baseOnBalls', 'baseOnBallsPitching', 'battersFaced', 'battingOrder',
                        'blownSaves', 'catchersInterference', 'catchersInterferencePitching',
                        'caughtStealing', 'caughtStealingPitching', 'completeGamesPitching',
                        'doubles', 'doublesPitching', 'earnedRuns', 'totalBases', 'triples',
                        'triplesPitching', 'wildPitches', 'winsPitching']

standings_test_columns = ['date','teamId','wins','losses','pct','xWinLossPct','divisionRank',
                          'leagueRank','wildCardRank','lastTenWins','lastTenLosses']

# Model inputs, in the order they are handed to the scaler and the network.
# 'chances' keeps its training-time name: the training table has a column
# called 'chances', and a scaler fitted on a DataFrame can reject frames whose
# column names differ from the ones seen during fit.
test_feature_columns = [
    'atBats', 'homeRuns', 'flyOuts', 'hits', 'strikes', 'balks', 'errors',
    'chances', 'rbi',
    'wins', 'losses', 'pct', 'xWinLossPct',
    'player_divisionRank', 'player_leagueRank', 'player_wildCardRank',
    'lastTenWins', 'lastTenLosses',
    'awardName', 'numberOfFollowers_x', 'numberOfFollowers_y',
    'isTie', 'gamesInSeries', 'homeWinPct', 'awayWinPct', 'pct_diff',
    'home_divisionRank', 'home_leagueRank', 'home_wildCardRank',
    'away_divisionRank', 'away_leagueRank', 'away_wildCardRank',
    'divisionRank_diff', 'leagueRank_diff', 'wildCardRank_diff',
    'assists', 'balls', 'baseOnBalls', 'baseOnBallsPitching', 'battersFaced',
    'battingOrder', 'blownSaves', 'catchersInterference',
    'catchersInterferencePitching', 'caughtStealing', 'caughtStealingPitching',
    'completeGamesPitching', 'doubles', 'doublesPitching', 'earnedRuns',
    'totalBases', 'triples', 'triplesPitching', 'wildPitches', 'winsPitching',
]

prediction_targets = ['target1', 'target2', 'target3', 'target4']


def _unnest_test_column(test_df, column, keep_columns):
    """Expand one nested JSON column of `test_df` into a flat data frame.

    Each row's JSON string in `column` is parsed with `pd.read_json` and tagged
    with that row's `date`; the per-row frames are concatenated, reduced to
    `keep_columns` (which must include 'date'), and `date` is converted from
    its YYYYMMDD form to pandas datetime.

    NOTE(review): unlike the training unnesting, rows with a missing value in
    `column` are not skipped here, so `pd.read_json` would fail on them —
    confirm the test iterator never yields missing nested data.
    """
    daily_frames = []
    for daily_date, daily_json in zip(test_df['date'], test_df[column]):
        daily_df = pd.read_json(daily_json)
        daily_df['date'] = daily_date
        daily_frames.append(daily_df)

    unnested = pd.concat(daily_frames, ignore_index = True)[keep_columns].copy()
    unnested['date'] = pd.to_datetime(unnested['date'], format = '%Y%m%d')
    return unnested


env = mlb.make_env() # initialize the environment
iter_test = env.iter_test() # iterator which loops over each date in test set

for (test_df, sample_prediction_df) in iter_test:

    test_df = test_df.reset_index().rename(columns = {'index':'date'})

    # Keep the raw date as 'ddate' and derive the join keys (datetime date,
    # integer playerId) from the submission template
    sample_prediction_df = sample_prediction_df.reset_index().rename(columns = {'date':'ddate'})
    sample_prediction_df['date'] = pd.to_datetime(sample_prediction_df['ddate'], format = '%Y%m%d')
    sample_prediction_df['playerId'] = (
        sample_prediction_df['date_playerId'].str.split('_').str[1].astype(int)
    )

    # Unnest the three daily tables the features are built from
    games_test = _unnest_test_column(test_df, 'games', games_test_columns)
    players_test = _unnest_test_column(test_df, 'playerBoxScores', players_test_columns)
    standings_test = _unnest_test_column(test_df, 'standings', standings_test_columns)
    standings_ranks = standings_test[['date','teamId','divisionRank', 'leagueRank','wildCardRank']]

    # Same join sequence as for the training table: player box score -> game ->
    # player's team standings -> home team ranks -> away team ranks
    final_test = (
        sample_prediction_df
        .merge(players_test.drop_duplicates(subset = ['date','playerId']),
               on = ['playerId','date'], how = 'left')
        .merge(games_test, on = ['date','gamePk'], how = 'left')
        .merge(standings_test, on = ['date','teamId'], how = 'left')
        .merge(standings_ranks,
               left_on = ['date','homeId'], right_on = ['date','teamId'], how = 'left')
        .merge(standings_ranks,
               left_on = ['date','awayId'], right_on = ['date','teamId'], how = 'left')
    )

    # Suffix meaning after the joins: _x = player's own team, _y = home team,
    # no suffix = away team
    final_test = final_test.rename(columns = {
        'divisionRank_x': 'player_divisionRank',
        'divisionRank_y': 'home_divisionRank',
        'divisionRank': 'away_divisionRank',
        'leagueRank_x': 'player_leagueRank',
        'leagueRank_y': 'home_leagueRank',
        'leagueRank': 'away_leagueRank',
        'wildCardRank_x': 'player_wildCardRank',
        'wildCardRank_y': 'home_wildCardRank',
        'wildCardRank': 'away_wildCardRank',
    })

    final_test['divisionRank_diff'] = (final_test['home_divisionRank'] - final_test['away_divisionRank']).abs()
    final_test['leagueRank_diff'] = (final_test['home_leagueRank'] - final_test['away_leagueRank']).abs()
    final_test['wildCardRank_diff'] = (final_test['home_wildCardRank'] - final_test['away_wildCardRank']).abs()
    final_test['pct_diff'] = (final_test['homeWinPct'] - final_test['awayWinPct']).abs()

    # Follower counts and awards are not built from the test data; feed zeros
    final_test['numberOfFollowers_x'] = 0
    final_test['numberOfFollowers_y'] = 0
    final_test['awardName'] = 0

    # Missing values become 0; selecting the feature columns also discards all
    # identifier columns
    final_test = final_test.fillna(0)[test_feature_columns]
    final_test = sc.transform(final_test).astype(np.float32)

    # Predict once for all four targets (previously the network was evaluated
    # four times per date) and clip into the 0-100 range
    predictions = np.clip(model.predict(final_test), 0, 100)

    # Submission frame: indexed by the datetime date, holding the id column and
    # the four targets
    sample_prediction_df = (
        sample_prediction_df
        .set_index('date')
        [['date_playerId'] + prediction_targets]
        .copy()
    )
    for target_position, target_name in enumerate(prediction_targets):
        sample_prediction_df[target_name] = predictions[:, target_position]
    sample_prediction_df = sample_prediction_df.fillna(0.)
    env.predict(sample_prediction_df)

In [None]:
# Disabled code: these statements are wrapped in a string literal, so they are
# not executed (the cell only echoes the string).
'''sample_prediction_df = sample_prediction_df.drop(['date'],axis=1)
sample_prediction_df = sample_prediction_df.rename(columns={'ddate':'date'})
sample_prediction_df = sample_prediction_df[['date','date_playerId','target1', 'target2', 'target3', 'target4']]
sample_prediction_df = sample_prediction_df.drop(['date'],axis=1)'''

In [None]:
# Preview the predictions produced for the last date served by the test iterator
sample_prediction_df.head()

In [None]:
#sample_prediction_df.to_csv('submission.csv',index=False)