参考文献  
https://www.kaggle.com/fumiyakomatsu/explanation-of-train-csv-each-variable-ver
https://www.kaggle.com/chumajin/eda-of-mlb-for-starter-version

# 0.モジュールのインポート

In [None]:
import gc
import sys
import warnings
from pathlib import Path
import os
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Suppress every warning category for the remainder of the session.
warnings.simplefilter("ignore")
# Print the full path of each file found below the Kaggle input directory.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# 1.何を予測するか確認する

In [None]:
# Load the example submission file shipped with the competition data.
example_sample_submission = pd.read_csv("../input/mlb-player-digital-engagement-forecasting/example_sample_submission.csv")
# Bare expression: the notebook renders this DataFrame as the cell output.
example_sample_submission

# 2.どんな情報から推測するか確認する

In [None]:
# Load the example test file; the cells below unpack its "games" and
# "rosters" columns with unpack_json.
example_test = pd.read_csv("../input/mlb-player-digital-engagement-forecasting/example_test.csv")
# Bare expression: the notebook renders this DataFrame as the cell output.
example_test

In [None]:
# Helper function to unpack json found in daily data
def unpack_json(json_str):
    """Parse one JSON-encoded cell of the daily data into a DataFrame.

    Parameters
    ----------
    json_str : str or missing value
        JSON text taken from one cell of the daily data, or a missing
        value (``None`` / ``NaN``) for days without that table.

    Returns
    -------
    pandas.DataFrame or float
        The parsed table, or ``np.nan`` when ``json_str`` is missing.
    """
    # Local import keeps this helper self-contained within its cell.
    from io import StringIO

    if pd.isna(json_str):
        return np.nan
    # Passing a literal JSON string to read_json is deprecated since
    # pandas 2.1; a file-like wrapper works on old and new versions alike.
    return pd.read_json(StringIO(json_str))

In [None]:
# Preview the first three rows of the example test data.
example_test.head(3)

In [None]:
# Unpack and display the "games" JSON of the first example test row.
unpack_json(example_test["games"].iloc[0])

In [None]:
# Unpack and display the "rosters" JSON of the first example test row.
unpack_json(example_test["rosters"].iloc[0])

# 3.train.csv

In [None]:
# Load the daily training table: one row per date, with the remaining
# columns holding JSON-encoded tables (unpacked later with unpack_json).
training = pd.read_csv("../input/mlb-player-digital-engagement-forecasting/train.csv")
# Parse the date column using the YYYYMMDD format.
training['date'] = pd.to_datetime(training['date'], format="%Y%m%d")
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in display() only added a stray "None".
training.info()

In [None]:
# Preview the first three rows of the training table.
training.head(3)

In [None]:
# NOTE(review): 'date' was already converted with this same call in the
# cell that loads train.csv, so this repeat looks redundant — confirm
# before removing.
training['date'] = pd.to_datetime(training['date'], format="%Y%m%d")

# 3.1 nextDayPlayerEngagement
目的変数(予測したい情報)を含んだデータ  
このtarget1~4を予測します

In [None]:
# Unpack the nextDayPlayerEngagement JSON from the first training row.
# This sample variable is later replaced by the full unnested table that
# the data-merge cell stores under the same global name.
nextDayPlayerEngagement = unpack_json(training['nextDayPlayerEngagement'].iloc[0])
# Show the column names of the unpacked table.
nextDayPlayerEngagement.columns

In [None]:
# Render the sample engagement table as the cell output.
nextDayPlayerEngagement

# 3.2 games
試合情報  
初めのデータが2018-2-23でこの日から'gameType'がSであることから、春季トレーニングのデータが入っているとわかる


In [None]:
# Unpack the games JSON from row position 53 of the training table
# (presumably the first date with game data, per the note above — verify).
games = unpack_json(training['games'].iloc[53])
# Show the column names of the unpacked table.
games.columns

In [None]:
# Render the sample games table as the cell output.
games

# 3.3 rosters
チーム名簿情報  
ケガやマイナーへの降格情報など確認できる

In [None]:
# Unpack the rosters JSON from the first training row.
rosters=unpack_json(training['rosters'].iloc[0])
# Show the column names of the unpacked table.
rosters.columns

In [None]:
# Render the sample rosters table as the cell output.
rosters

# 3.4 playerBoxScores
選手の試合成績  
試合ごとに集計されており、最初のデータが2018-3-29であることから、この日にレギュラーシーズンが始まったとわかる。

In [None]:
# Unpack the playerBoxScores JSON from row position 87 of the training table
# (presumably the first date with box scores, per the note above — verify).
playerBoxScores = unpack_json(training['playerBoxScores'].iloc[87])
# Show the column names of the unpacked table.
playerBoxScores.columns

In [None]:
# Render the sample player box score table as the cell output.
playerBoxScores

# 3.5 teamBoxScores
チームごとの試合情報  
その日に行われる試合数は異なるため、行数が日によって違う

In [None]:
# Unpack the teamBoxScores JSON from row position 87 of the training table.
teamBoxScores=unpack_json(training['teamBoxScores'].iloc[87])
# Show the column names of the unpacked table.
teamBoxScores.columns

In [None]:
# Render the sample team box score table as the cell output.
teamBoxScores

# 3.6 transactions
選手やチームのトランザクション  
一行目を見てみると、選手のトレード情報だとわかる。

In [None]:
# Unpack the transactions JSON from row position 1 (the second row) of the
# training table.
transactions=unpack_json(training['transactions'].iloc[1])
# Show the column names of the unpacked table.
transactions.columns

In [None]:
# Render the sample transactions table as the cell output.
transactions

# 3.7 standings
チームの順位情報で全30チーム分のデータがある

In [None]:
# Unpack the standings JSON from row position 87 of the training table.
standings=unpack_json(training['standings'].iloc[87])
# Show the column names of the unpacked table.
standings.columns

In [None]:
# Render the sample standings table as the cell output.
standings

# 3.8 awards
選手の表彰情報

In [None]:
# Unpack the awards JSON from row position 14 of the training table.
awards=unpack_json(training['awards'].iloc[14])
# Show the column names of the unpacked table.
awards.columns

In [None]:
# Render the sample awards table as the cell output.
awards

# 3.9 events
フィールド上で起きた出来事のデータ

In [None]:
# Unpack the events JSON from row position 87 of the training table.
events=unpack_json(training['events'].iloc[87])
# Show the column names of the unpacked table.
events.columns

In [None]:
# Render the sample events table as the cell output.
events

# 3.10 playerTwitterFollowers
選手のTwitterフォロワー数

In [None]:
# Unpack the playerTwitterFollowers JSON from the first training row.
playerTwitterFollowers=unpack_json(training['playerTwitterFollowers'].iloc[0])
# Show the column names of the unpacked table.
playerTwitterFollowers.columns

In [None]:
# Render the sample player follower table as the cell output.
playerTwitterFollowers

# 3.11 teamTwitterFollowers
全30チームの公式Twitterアカウントのフォロワー数

In [None]:
# Unpack the teamTwitterFollowers JSON from the first training row.
teamTwitterFollowers=unpack_json(training['teamTwitterFollowers'].iloc[0])
# Show the column names of the unpacked table.
teamTwitterFollowers.columns

In [None]:
# Render the sample team follower table as the cell output.
teamTwitterFollowers

# 4.Data Merge

In [None]:
# Names of the static CSV tables (file stem == variable name) and the
# directory that contains them.
df_names = ['seasons', 'teams', 'players', 'awards']
path = "../input/mlb-player-digital-engagement-forecasting"
# One tab per table, each backed by its own Output widget.
kaggle_data_tabs = widgets.Tab()
kaggle_data_tabs.children = [widgets.Output() for _ in df_names]

In [None]:
# Title each tab after its table and render that table's CSV inside it.
for index, df_name in enumerate(df_names):
    kaggle_data_tabs.set_title(index, df_name)
    df = pd.read_csv(os.path.join(path, df_name + ".csv"))
    with kaggle_data_tabs.children[index]:
        display(df)
# Show the assembled tab widget as the cell output.
display(kaggle_data_tabs)

In [None]:
# Load each static CSV into a module-level variable named after the table
# (seasons, teams, players, awards). The data-merge cell later rebinds the
# global `awards` to the unnested daily awards table.
for name in df_names:
    csv_path = os.path.join(path, name) + ".csv"
    globals()[name] = pd.read_csv(csv_path)

In [None]:
#### Unnest various nested data within training (daily) data ####
# Every column of `training` other than 'date' holds one JSON table per day.
# For each such column, unpack every non-null day, tag the rows with that
# day's date ('dailyDataDate'), and stack all days into one long table.
nested_table_names = training.drop('date', axis = 1).columns.values.tolist()
unnested_tables = []
for nestedTableName in nested_table_names:
    nestedTableName = str(nestedTableName)
    # Keep only the dates on which this nested table is present
    date_nested_table = (
        training.loc[training[nestedTableName].notna(), ['date', nestedTableName]].
        reset_index(drop = True)
        )
    daily_dfs_collection = []
    for daily_date, daily_json in zip(
            date_nested_table['date'], date_nested_table[nestedTableName]):
        daily_df = unpack_json(daily_json)
        daily_df['dailyDataDate'] = daily_date
        # append() is O(1); rebuilding the list with `+ [x]` was quadratic
        daily_dfs_collection.append(daily_df)
    if daily_dfs_collection:
        # set_index + reset_index moves 'dailyDataDate' to the first column
        unnested_table = (
            pd.concat(daily_dfs_collection, ignore_index = True).
            set_index('dailyDataDate').
            reset_index()
            )
    else:
        # pd.concat raises on an empty list; fall back to an empty table
        # when a column has no data on any date
        unnested_table = pd.DataFrame(columns = ['dailyDataDate'])
    # Creates 1 pandas df per unnested df from daily data read in, with same name
    # (this rebinds the sample variables of the same names created earlier)
    globals()[nestedTableName] = unnested_table
    unnested_tables.append(unnested_table)
# Lookup table of (name, unnested DataFrame). The 'df' column is assigned in
# one step: the previous chained assignment `frame['df'][i] = value` is not
# guaranteed to write through, and is a no-op under pandas copy-on-write.
daily_data_unnested_dfs = pd.DataFrame(data = {'dfName': nested_table_names})
daily_data_unnested_dfs['df'] = unnested_tables
# The raw nested table is no longer needed; free its memory
del training
gc.collect()

#### Get some information on each date in daily data (using season dates of interest) ####
# One row per distinct daily-data date found in the unnested engagement table
dates = pd.DataFrame(data = {'dailyDataDate': nextDayPlayerEngagement['dailyDataDate'].unique()})
dates['date'] = pd.to_datetime(dates['dailyDataDate'].astype(str))
dates['year'] = dates['date'].dt.year
dates['month'] = dates['date'].dt.month
# Attach the season calendar of the matching year (pd.merge defaults to an
# inner join, so dates whose year has no row in `seasons` are dropped)
dates_with_info = pd.merge(
  dates,seasons,
  left_on = 'year',right_on = 'seasonId'
  )
# NOTE(review): the season boundary columns come straight from read_csv, so
# the comparisons below presumably rely on pandas parsing date strings.
# between() includes both endpoints by default in every pandas version; the
# former `inclusive = True` is rejected by pandas >= 2.0, which requires a
# string such as 'both'.
dates_with_info['inSeason'] = (
  dates_with_info['date'].between(
    dates_with_info['regularSeasonStartDate'],
    dates_with_info['postSeasonEndDate']
    )
  )
# np.select takes the first condition that holds for each row, so the
# conditions are ordered chronologically through the season calendar.
dates_with_info['seasonPart'] = np.select(
  [ dates_with_info['date'] < dates_with_info['preSeasonStartDate'],
    dates_with_info['date'] < dates_with_info['regularSeasonStartDate'],
    dates_with_info['date'] <= dates_with_info['lastDate1stHalf'],
    dates_with_info['date'] < dates_with_info['firstDate2ndHalf'],
    dates_with_info['date'] <= dates_with_info['regularSeasonEndDate'],
    dates_with_info['date'] < dates_with_info['postSeasonStartDate'],
    dates_with_info['date'] <= dates_with_info['postSeasonEndDate'],
    dates_with_info['date'] > dates_with_info['postSeasonEndDate']],
  [ 'Offseason',
    'Preseason',
    'Reg Season 1st Half',
    'All-Star Break',
    'Reg Season 2nd Half',
    'Between Reg and Postseason',
    'Postseason',
    'Offseason'],
  # A float NaN default has no common dtype with the string choices: NumPy 2
  # raises TypeError, and NumPy 1 silently produced the string 'nan'. None
  # gives an object array in which unmatched rows are genuinely missing.
  default = None
  )
#### Add some pitching stats/pieces of info to player game level stats ####
# Work on a copy of the box scores, renaming the team columns so they are
# recognisable as "team the player appeared for in this game".
player_game_stats = playerBoxScores.copy().rename(
    columns={'teamId': 'gameTeamId', 'teamName': 'gameTeamName'}
)

# Innings pitched as a true fraction: the decimal part is rescaled by 10/3
# (so .1 becomes 1/3 and .2 becomes 2/3), which makes the values summable.
innings_pitched = player_game_stats['inningsPitched']
whole_innings = np.floor(innings_pitched)
player_game_stats['inningsPitchedAsFrac'] = np.where(
    pd.isna(innings_pitched),
    np.nan,
    whole_innings + (innings_pitched - whole_innings) * 10 / 3,
)

# Tom Tango pitching game score
# (https://www.mlb.com/glossary/advanced-stats/game-score).
# The "+ 2 * outs" term is not included here, matching the existing
# calculation in this notebook.
pitching_score_weights = {
    'strikeOutsPitching': 1,
    'baseOnBallsPitching': -2,
    'hitsPitching': -2,
    'runsPitching': -3,
    'homeRunsPitching': -6,
}
pitching_game_score = 40
for stat_column, stat_weight in pitching_score_weights.items():
    pitching_game_score = (
        pitching_game_score + stat_weight * player_game_stats[stat_column]
    )
player_game_stats['pitchingGameScore'] = pitching_game_score

# Individual no-hitter flag: started the game, pitched at least 9 innings
# and allowed no hits.
started_game = player_game_stats['gamesStartedPitching'] == 1
pitched_nine_plus = player_game_stats['inningsPitched'] >= 9
allowed_no_hits = player_game_stats['hitsPitching'] == 0
player_game_stats['noHitter'] = np.where(
    started_game & pitched_nine_plus & allowed_no_hits, 1, 0
)

# Collapse to one row per player per date.
player_date_groups = player_game_stats.groupby(
    ['dailyDataDate', 'playerId'], as_index=False
)
# Stats that can simply be added up across a player's games on one date.
summed_stat_columns = [
    'runsScored', 'homeRuns', 'strikeOuts', 'baseOnBalls', 'hits',
    'hitByPitch', 'atBats', 'caughtStealing', 'stolenBases',
    'groundIntoDoublePlay', 'groundIntoTriplePlay', 'plateAppearances',
    'totalBases', 'rbi', 'leftOnBase', 'sacBunts', 'sacFlies',
    'gamesStartedPitching', 'runsPitching', 'homeRunsPitching',
    'strikeOutsPitching', 'baseOnBallsPitching', 'hitsPitching',
    'inningsPitchedAsFrac', 'earnedRuns',
    'battersFaced', 'saves', 'blownSaves', 'pitchingGameScore',
    'noHitter',
]
player_date_stats_agg = pd.merge(
    # Aggregations that are not plain sums. numTeams is normally 1; the
    # known exception is playerId 518617 (Jake Diekman), who has 2 games for
    # different teams on 5/19/19 because a game resumed after he was traded.
    # gameTeamId takes the minimum to keep a single team per player-date.
    player_date_groups.agg(
        numGames=('gamePk', 'nunique'),
        numTeams=('gameTeamId', 'nunique'),
        gameTeamId=('gameTeamId', 'min'),
    ),
    player_date_groups[summed_stat_columns].sum(),
    on=['dailyDataDate', 'playerId'],
    how='inner',
)
#### Turn games table into 1 row per team-game, then merge with team box scores ####
# Keep only regular-season / postseason game types that have both scores.
counted_game_types = ['R', 'F', 'D', 'L', 'W', 'C', 'P']
games_for_stats = games[
    games['gameType'].isin(counted_game_types)
    & games['homeScore'].notna()
    & games['awayScore'].notna()
]

# Home-team view: "home*" columns become "team*", "away*" become "opp*".
games_home_perspective = games_for_stats.rename(
    columns=lambda name: name.replace('home', 'team').replace('away', 'opp')
).assign(isHomeTeam=1)
# Away-team view: the same renaming with the roles swapped.
games_away_perspective = games_for_stats.rename(
    columns=lambda name: name.replace('home', 'opp').replace('away', 'team')
).assign(isHomeTeam=0)
# Stack both views to get one row per team per game.
team_games = pd.concat(
    [games_home_perspective, games_away_perspective],
    ignore_index=True,
)

# Suffix every team box score stat with "Team" so it cannot be confused with
# a player-level stat after joining; key/descriptor columns keep their names.
unsuffixed_team_columns = (
    'dailyDataDate', 'home', 'teamId', 'gamePk', 'gameDate', 'gameTimeUTC'
)
team_game_stats = teamBoxScores.rename(
    columns=lambda name: (
        name if name in unsuffixed_team_columns else name + 'Team'
    )
)

# Inner join: games without a team box score (spring training, postponed
# games, ...) drop out, which is acceptable for the stats built here.
team_games_with_stats = team_games.merge(
    # These descriptor columns already exist on team_games.
    team_game_stats.drop(columns=['home', 'gameDate', 'gameTimeUTC']),
    on=['dailyDataDate', 'gamePk', 'teamId'],
    how='inner',
)

# Aggregate per date, team, game type and opponent.
team_date_groups = team_games_with_stats.groupby(
    ['dailyDataDate', 'teamId', 'gameType', 'oppId', 'oppName'],
    as_index=False,
)
team_date_stats_agg = team_date_groups.agg(
    numGamesTeam=('gamePk', 'nunique'),
    winsTeam=('teamWinner', 'sum'),
    lossesTeam=('oppWinner', 'sum'),
    runsScoredTeam=('teamScore', 'sum'),
    runsAllowedTeam=('oppScore', 'sum'),
)
# Prepare standings table for merge w/ player digital engagement data
# Keep only the standings fields of interest, with 'pct' renamed to 'winPct'.
standings_selected_fields = standings[[
    'dailyDataDate', 'teamId', 'streakCode', 'divisionRank', 'leagueRank',
    'wildCardRank', 'pct',
]].rename(columns={'pct': 'winPct'})
# Suffix the non-key columns with "Team" to mark them as team standings.
standings_selected_fields = standings_selected_fields.rename(
    columns=lambda name: (
        name if name in ('dailyDataDate', 'teamId') else name + 'Team'
    )
)

# Streak length is the streak code with its W/L letters removed.
streak_code = standings_selected_fields['streakCodeTeam']
standings_selected_fields['streakLengthTeam'] = (
    streak_code.str.replace('[WL]', '', regex=True).astype(float)
)
# Split the streak into separate winning and losing streak columns; each is
# NaN when the code does not start with the corresponding letter.
streak_length = standings_selected_fields['streakLengthTeam']
standings_selected_fields['winStreakTeam'] = streak_length.where(
    streak_code.str[0] == 'W'
)
standings_selected_fields['lossStreakTeam'] = streak_length.where(
    streak_code.str[0] == 'L'
)

# Flag each standings row with whether its date is in season, keep only the
# in-season rows, then drop the helper columns that are no longer needed.
standings_with_season_flag = standings_selected_fields.merge(
    dates_with_info[['dailyDataDate', 'inSeason']],
    on=['dailyDataDate'],
    how='left',
)
standings_for_digital_engagement_merge = (
    standings_with_season_flag
    .query("inSeason")
    .drop(columns=['streakCodeTeam', 'streakLengthTeam', 'inSeason'])
    .reset_index(drop=True)
)
#### Merge together various data frames to add date, player, roster, and team info ####
# Start from a copy of the engagement targets and add the row-wise mean of
# the four targets (convenient for studying all of them at once).
player_engagement_with_info = nextDayPlayerEngagement.copy()
player_engagement_with_info['targetAvg'] = (
    player_engagement_with_info[['target1', 'target2', 'target3', 'target4']]
    .mean(axis=1)
)

# Right-hand tables for the left joins below.
date_info = dates_with_info[
    ['dailyDataDate', 'date', 'year', 'month', 'inSeason', 'seasonPart']
]
player_info = players[[
    'playerId', 'playerName', 'DOB', 'mlbDebutDate', 'birthCity',
    'birthStateProvince', 'birthCountry', 'primaryPositionName',
]]
roster_info = rosters[
    ['dailyDataDate', 'playerId', 'statusCode', 'status', 'teamId']
].rename(columns={
    'statusCode': 'rosterStatusCode',
    'status': 'rosterStatus',
    'teamId': 'rosterTeamId',
})
roster_team_names = teams[['id', 'teamName']].rename(
    columns={'id': 'rosterTeamId', 'teamName': 'rosterTeamName'}
)
game_team_names = teams[['id', 'teamName']].rename(
    columns={'id': 'gameTeamId', 'teamName': 'gameTeamName'}
)
team_stats_by_game_team = team_date_stats_agg.rename(
    columns={'teamId': 'gameTeamId'}
)
standings_by_game_team = standings_for_digital_engagement_merge.rename(
    columns={'teamId': 'gameTeamId'}
)

# Every step is a left join, so no engagement row is dropped.
# NOTE: player transactions are not merged in by this code.
player_engagement_with_info = (
    player_engagement_with_info
    # Date information
    .merge(date_info, on=['dailyDataDate'], how='left')
    # Static player information
    .merge(player_info, on=['playerId'], how='left')
    # Roster status and roster team on that date
    .merge(roster_info, on=['dailyDataDate', 'playerId'], how='left')
    # Name of the roster team
    .merge(roster_team_names, on=['rosterTeamId'], how='left')
    # Player game stats aggregated for that date (adds gameTeamId)
    .merge(player_date_stats_agg, on=['dailyDataDate', 'playerId'], how='left')
    # Name of the team the player appeared for
    .merge(game_team_names, on=['gameTeamId'], how='left')
    # Team game results aggregated for that date
    .merge(team_stats_by_game_team, on=['dailyDataDate', 'gameTeamId'], how='left')
    # In-season team standings for that date
    .merge(standings_by_game_team, on=['dailyDataDate', 'gameTeamId'], how='left')
)
display(player_engagement_with_info)

In [None]:
# Print a summary (columns, non-null counts, dtypes) of the merged table.
player_engagement_with_info.info()

In [None]:
# Save the merged table to a pickle file in the working directory.
player_engagement_with_info.to_pickle("player_engagement_with_info.pkl")

In [None]:
# Median of each target over the merged training table; these four values
# are used as constant predictions in the submission cells.
t1_median, t2_median, t3_median, t4_median = (
    player_engagement_with_info[target_column].median()
    for target_column in ("target1", "target2", "target3", "target4")
)

In [None]:
# Print the four target medians.
print(t1_median,t2_median,t3_median,t4_median)

In [None]:
"""
if 'kaggle_secrets' in sys.modules:  # only run while on Kaggle
    import mlb

    env = mlb.make_env()
    iter_test = env.iter_test()

    for (test_df, sample_prediction_df) in iter_test:
    
        # Example: unpack a dataframe from a json column
        today_games = unpack_json(test_df['games'].iloc[0])
    
        # Make your predictions for the next day's engagement
        sample_prediction_df['target1'] = 100.00
    
        # Submit your predictions 
        env.predict(sample_prediction_df)


"""

In [None]:
# Import the competition API only when the 'kaggle_secrets' module is loaded.
# NOTE(review): the following cells call mlb.make_env() unconditionally, so
# they raise NameError whenever this guard is false.
if 'kaggle_secrets' in sys.modules:  # only run while on Kaggle
    import mlb

In [None]:
# Create the competition environment and the iterator over test batches.
# Each item of iter_test is unpacked below as (test_df, sample_prediction_df).
env = mlb.make_env()
iter_test = env.iter_test()

In [None]:
# Pull a single batch from iter_test and display it; `break` stops after the
# first item so the remaining batches stay in the iterator for later cells.
for (test_df, sample_prediction_df) in iter_test:
    display(test_df)
    display(sample_prediction_df)
    break

In [None]:
# Fill each target column of the current batch with its training median.
for target_column, median_value in zip(
    ("target1", "target2", "target3", "target4"),
    (t1_median, t2_median, t3_median, t4_median),
):
    sample_prediction_df[target_column] = median_value
# Bare expression: the notebook renders the filled frame as the cell output.
sample_prediction_df

In [None]:
# Submit the predictions for the current batch.
env.predict(sample_prediction_df)

In [None]:
# Pull the next batch from the same iterator and display it; `break` again
# stops after one item.
for (test_df, sample_prediction_df) in iter_test:
    display(test_df)
    display(sample_prediction_df)
    break

In [None]:
"""
if 'kaggle_secrets' in sys.modules:  # only run while on Kaggle
    import mlb

    env = mlb.make_env()
    iter_test = env.iter_test()

    for (test_df, sample_prediction_df) in iter_test:
    
        # Example: unpack a dataframe from a json column
        today_games = unpack_json(test_df['games'].iloc[0])
    
        # Make your predictions for the next day's engagement
        sample_prediction_df['target1'] = 100.00
    
        # Submit your predictions 
        env.predict(sample_prediction_df)


"""

In [None]:
# Second submission: fill the current batch with the training medians and
# submit it.
for target_column, median_value in zip(
    ("target1", "target2", "target3", "target4"),
    (t1_median, t2_median, t3_median, t4_median),
):
    sample_prediction_df[target_column] = median_value
env.predict(sample_prediction_df)

In [None]:
# Process every batch still left in the iterator.
median_by_target = {
    "target1": t1_median,
    "target2": t2_median,
    "target3": t3_median,
    "target4": t4_median,
}
for test_df, sample_prediction_df in iter_test:
    # Predict the training median for each target on every row.
    for target_column, median_value in median_by_target.items():
        sample_prediction_df[target_column] = median_value
    # Submit the predictions for this batch.
    env.predict(sample_prediction_df)