## Hey everyone! **Please upvote this notebook if it helped.**

### This notebook acts as a data descriptor for all data files in this [competition](https://www.kaggle.com/c/mlb-player-digital-engagement-forecasting/overview) 

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input mount, one full path per line.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# All competition CSVs live in one read-only input directory; keep the path in a
# single constant so the individual reads cannot drift apart.
DATA_DIR = "../input/mlb-player-digital-engagement-forecasting"

# Submission template and the small example of the test-time input.
sample_submission = pd.read_csv(f"{DATA_DIR}/example_sample_submission.csv")
example_test = pd.read_csv(f"{DATA_DIR}/example_test.csv")

# Supplementary lookup tables.
players = pd.read_csv(f"{DATA_DIR}/players.csv")
seasons = pd.read_csv(f"{DATA_DIR}/seasons.csv")
awards = pd.read_csv(f"{DATA_DIR}/awards.csv")
teams = pd.read_csv(f"{DATA_DIR}/teams.csv")

# Daily training data (one row per day, nested JSON in most columns).
train = pd.read_csv(f"{DATA_DIR}/train.csv")

## 1. Sample submission !!

In [None]:
# The competition asks us to predict, for each playerId, how much fans will engage with
# that player's digital content (e.g. "reactions" and "actions") on the following day.

# There are four targets (target1 to target4); each is a separate engagement measure on a 0-100 scale.

sample_submission.head(3)

## 2. Example test !!

In [None]:
# At first glance, the player IDs that appear in the submission are not directly visible here:
# most of the information is stored in JSON format inside individual cells.

# The test data has one row per day.

# Using a helper function from the starter code, each cell can be expanded as shown in the next cells.

example_test.head(3)

In [None]:
## A small helper function to extract data from the JSON format.

def unpack_json(json_str):
    """Expand one nested-JSON cell into a DataFrame.

    Parameters
    ----------
    json_str : str or missing value
        JSON text taken from a single cell, or a missing value (NaN / None).

    Returns
    -------
    pandas.DataFrame or float
        The parsed table, or ``np.nan`` when the cell is missing.
    """
    if pd.isna(json_str):
        return np.nan
    # Imported locally so this cell stays runnable on its own.
    from io import StringIO
    # Passing literal JSON text straight to read_json is deprecated in newer
    # pandas; wrapping it in a file-like object works on old and new versions.
    return pd.read_json(StringIO(json_str))

unpack_json(example_test["playerBoxScores"].iloc[0]).head(3)

In [None]:
# From the information in this area, I think it is a competition to estimate the expected value 
# of the evaluation items target1 to 4 on the next day for each player id.

unpack_json(example_test["games"].iloc[0]).head(3)

## 3. Training !!

In [None]:
# Data for 1216 days (one row per day). No day is missing, but individual columns
# contain NaN on some days; compare the non-null counts printed by info() below.

# Parse the YYYYMMDD-formatted `date` column into datetimes (overwrites the column in place).
train['date'] = pd.to_datetime(train['date'], format="%Y%m%d")
train.info()

Data for 1216 days (one row per day). No day is missing, but individual columns contain NaN here and there.

In [None]:
# From here, let's take a look at one json where there is data for each column as an example.

# As mentioned above, each cell of train.csv holds a JSON string, and each JSON string
# itself expands into a DataFrame, so the structure is fairly complicated.

# (In effect, about 11 DataFrames - fewer on days where some columns are NaN - hang off
#  each of the 1216 daily rows, which adds up to a considerable amount of information.

# So this section is a little long; feel free to skim it just to get the overall picture.)

train.columns

In [None]:
# Typing this out for every column is tedious, so wrap it in a helper: drop the NaN
# entries, unpack the n-th remaining one (0 = first) into a DataFrame, and show its
# column names and contents.

def exshow(col, n):
    """Print the column names of, and return, the n-th non-null nested table in ``train[col]``."""
    non_null_cells = train[col].dropna()
    nested_df = unpack_json(non_null_cells.iloc[n])
    print(nested_df.columns)
    return nested_df

In [None]:
# Nested JSON containing all modeling targets from the next day onwards.

train.head(3)

In [None]:
# engagementMetricsDate - Date of the player engagement metrics, based on US Pacific Time
# (corresponds to the previous day's games, rosters, on-field stats, transactions, awards, etc.).
#     playerId
#     target1
#     target2
#     target3
#     target4

# target1-target4 is a daily index of digital engagement on a scale of 0 to 100.

# Here we pull out playerId and target1 to target4 for the following day.

exshow("nextDayPlayerEngagement",0).head(3)

In [None]:
# Nested JSON that contains all the game information for a particular day. 
# Includes spring training and exhibition games in addition to regular season, postseason and all-star games.

exshow("games",1).head(3)

In [None]:
# Nested JSON that contains all the roster information for a particular day. 
# Includes in-season and off-season team rosters.

# playerId-Unique identifier for the player.
#     gameDate
#     teamId-The team the player is on that day.
#     statusCode-Abbreviation for the roster status.
#     status-Descriptive roster status.

# (The next cell covers 1.4 playerBoxScores, the 4th column in train.csv.)

exshow("rosters",0).head(3)

In [None]:
# Nested JSON containing game stats aggregated at the player game level for a particular day. 
# Includes regular season, postseason and all-star games.

exshow("playerBoxScores",0).head(3)

In [None]:
# Expand the next cell ↓ for the meaning of each column. (It is long, so it is hidden by default.)

In [None]:
# home: Binary, 1 for home team, 0 for away team.
# gamePk: Unique identifier for the game.
# gameDate:
# gameTimeUTC: First pitch time in UTC.
# teamId: A unique identifier for the team.
# teamName:
# playerId: Unique identifier for the player.
# playerName:
# jerseyNum:
# positionCode: Numeric position code (see the competition data page for details).
# positionName: Text display of the position (see the competition data page for details).
# positionType: Position group (see the competition data page for details).
# battingOrder: Format: "###". The first digit is the batting-order spot, and the next two digits give the sequence in which players occupied that spot. Example: "300" is the starter in the 3rd spot of the batting order; "903" is the 4th player (after 900, 901, 902) to occupy the 9th spot. Populated only if the player appeared in the game.
# gamesPlayedBatting: 1 if the player entered the game as a batter, runner, or fielder.
# flyOuts: Game total fly outs.
# groundOuts: Game total ground outs.
# runsScored: Game total runs scored.
# doubles: Game total doubles.
# triples: Game total triples.
# homeRuns: Game total home runs.
# strikeOuts: Game total strikeouts.
# baseOnBalls: Game total walks.
# intentionalWalks: Game total intentional walks.
# hits: Game total hits.
# hitByPitch: Game total times hit by a pitch.
# atBats: Game total at-bats.
# caughtStealing: Game total times caught stealing.
# stolenBases: Game total stolen bases.
# groundIntoDoublePlay: Game total times grounded into a double play.
# groundIntoTriplePlay: Game total times grounded into a triple play.
# plateAppearances: Game total plate appearances.
# totalBases: Game total bases.
# rbi: Game total runs batted in (RBI).
# leftOnBase: Game total runners left on base.
# sacBunts: Game total sacrifice bunts.
# sacFlies: Game total sacrifice flies.
# catchersInterference: Game total catcher's interferences drawn.
# pickoffs: Game total times picked off the bases.
# gamesPlayedPitching: Binary, 1 if the player entered the game as a pitcher.
# gamesStartedPitching: Binary, 1 if the player was the game's starting pitcher.
# completeGamesPitching: Binary, 1 if credited with a complete game.
# shutoutsPitching: Binary, 1 if credited with a shutout.
# winsPitching: Binary, 1 if credited with the win.
# lossesPitching: Binary, 1 if credited with the loss.
# flyOutsPitching: Game total fly outs allowed.
# airOutsPitching: Game total air outs (fly outs + pop outs) allowed.
# groundOutsPitching: Game total ground outs allowed.
# runsPitching: Game total runs allowed.
# doublesPitching: Game total doubles allowed.
# triplesPitching: Game total triples allowed.
# homeRunsPitching: Game total home runs allowed.
# strikeOutsPitching: Game total strikeouts recorded as a pitcher.
# baseOnBallsPitching: Game total walks allowed.
# intentionalWalksPitching: Game total intentional walks allowed.
# hitsPitching: Game total hits allowed.
# hitByPitchPitching: Game total hit-by-pitches allowed.
# atBatsPitching: Game total at-bats against.
# caughtStealingPitching: Game total runners caught stealing.
# stolenBasesPitching: Game total stolen bases allowed.
# inningsPitched: Game total innings pitched.
# saveOpportunities: Binary, 1 if credited with a save opportunity.
# earnedRuns: Game total earned runs allowed.
# battersFaced: Game total batters faced.
# outsPitching: Game total outs recorded.
# pitchesThrown: Game total pitches thrown.
# balls: Game total balls thrown.
# strikes: Game total strikes thrown.
# hitBatsmen: Game total batters hit by a pitch.
# balks: Game total balks.
# wildPitches: Game total wild pitches thrown.
# pickoffsPitching: Game total pickoffs.
# rbiPitching: Game total RBI allowed.
# inheritedRunners: Game total inherited runners.
# inheritedRunnersScored: Game total inherited runners who scored.
# catchersInterferencePitching: Game total catcher's interferences committed by the battery.
# sacBuntsPitching: Game total sacrifice bunts allowed.
# sacFliesPitching: Game total sacrifice flies allowed.
# saves: Binary, 1 if credited with a save.
# holds: Binary, 1 if credited with a hold.
# blownSaves: Binary, 1 if credited with a blown save.
# assists: Game total assists.
# putOuts: Game total putouts.
# errors: Game total errors.
# chances: Game total fielding chances.

In [None]:
# Nested JSON containing game stats aggregated at the team game level for a particular day.
# Includes regular season, postseason and all-star games.

exshow("teamBoxScores",0).head(3)

In [None]:
# Expand the next cell ↓ for the meaning of each column. (It is long, so it is hidden by default.)

In [None]:
# home: Binary, 1 for home team, 0 for away team.
# teamId: A unique identifier for the team.
# gamePk: Unique identifier for the game.
# gameDate:
# gameTimeUTC: First pitch time in UTC.
# flyOuts: Game total fly outs.
# groundOuts: Game total ground outs.
# runsScored: Game total runs scored.
# doubles: Game total doubles.
# triples: Game total triples.
# homeRuns: Game total home runs.
# strikeOuts: Game total strikeouts.
# baseOnBalls: Game total walks.
# intentionalWalks: Game total intentional walks.
# hits: Game total hits.
# hitByPitch: Game total times hit by a pitch.
# atBats: Game total at-bats.
# caughtStealing: Game total times caught stealing.
# stolenBases: Game total stolen bases.
# groundIntoDoublePlay: Game total times grounded into a double play.
# groundIntoTriplePlay: Game total times grounded into a triple play.
# plateAppearances: Game total plate appearances.
# totalBases: Game total bases.
# rbi: Game total runs batted in (RBI).
# leftOnBase: Game total runners left on base.
# sacBunts: Game total sacrifice bunts.
# sacFlies: Game total sacrifice flies.
# catchersInterference: Game total catcher's interferences drawn.
# pickoffs: Game total times picked off the bases.
# airOutsPitching: Game total air outs (fly outs + pop outs) allowed.
# groundOutsPitching: Game total ground outs allowed.
# runsPitching: Game total runs allowed.
# doublesPitching: Game total doubles allowed.
# triplesPitching: Game total triples allowed.
# homeRunsPitching: Game total home runs allowed.
# strikeOutsPitching: Game total strikeouts recorded by the team's pitchers.
# baseOnBallsPitching: Game total walks allowed.
# intentionalWalksPitching: Game total intentional walks allowed.
# hitsPitching: Game total hits allowed.
# hitByPitchPitching: Game total hit-by-pitches allowed.
# atBatsPitching: Game total at-bats against.
# caughtStealingPitching: Game total runners caught stealing.
# stolenBasesPitching: Game total stolen bases allowed.
# inningsPitched: Game total innings pitched.
# earnedRuns: Game total earned runs allowed.
# battersFaced: Game total batters faced.
# outsPitching: Game total outs recorded.
# hitBatsmen: Game total batters hit by a pitch.
# balks: Game total balks.
# wildPitches: Game total wild pitches thrown.
# pickoffsPitching: Game total pickoffs.
# rbiPitching: Game total RBI allowed.
# inheritedRunners: Game total inherited runners.
# inheritedRunnersScored: Game total inherited runners who scored.
# catchersInterferencePitching: Game total catcher's interferences committed by the battery.
# sacBuntsPitching: Game total sacrifice bunts allowed.
# sacFliesPitching: Game total sacrifice flies allowed.

In [None]:
# Nested JSON that contains all transaction information related to the MLB team for a particular day.

# transactionId: A unique identifier for the transaction.
# playerId: Unique identifier for the player.
# playerName:
# date:
# fromTeamId: A unique identifier for the team the player is coming from.
# fromTeamName:
# toTeamId: A unique identifier for the team the player goes to.
# toTeamName:
# effectiveDate:
# resolutionDate:
# typeCode: Abbreviation for transaction status.
# typeDesc: A description of the transaction status.
# description: A textual description of the transaction.

exshow("transactions",1) .head(2)

In [None]:
# A nested JSON that contains all the standings information about the MLB teams for a particular day.

exshow("standings",0).head(3)

In [None]:
# Expand the next cell ↓ for the meaning of each column. (It is long, so it is hidden by default.)

In [None]:
# season:
# gameDate:
# divisionId: A unique identifier for the division this team belongs to.
# teamId: A unique identifier for the team.
# teamName:
# streakCode: Abbreviation for the team's current winning or losing streak. The first letter indicates win (W) or loss (L), and the number is the length of the streak in games.
# divisionRank: The team's current rank in its division.
# leagueRank: The team's current rank in its league.
# wildCardRank: The team's current rank for a wild card berth.
# leagueGamesBack: Games back of the league leader.
# sportGamesBack: Games back across all of MLB.
# divisionGamesBack: Games back of the division leader.
# wins: Current wins.
# losses: Current losses.
# pct: Current winning percentage.
# runsAllowed: Runs allowed during the season.
# runsScored: Runs scored during the season.
# divisionChamp: true if the team has clinched the division title.
# divisionLeader: true if the team is leading the division race.
# wildCardLeader: true if the team is a wild card leader.
# eliminationNumber: Number of games away from elimination from the division race (team losses + opponent wins).
# wildCardEliminationNumber: Number of games away from elimination from the wild card race (team losses + opponent wins).
# homeWins: Home wins during the season.
# homeLosses: Home losses during the season.
# awayWins: Away wins during the season.
# awayLosses: Away losses during the season.
# lastTenWins: Wins in the last 10 games.
# lastTenLosses: Losses in the last 10 games.
# extraInningWins: Extra-inning wins during the season.
# extraInningLosses: Extra-inning losses during the season.
# oneRunWins: Wins in one-run games during the season.
# oneRunLosses: Losses in one-run games during the season.
# dayWins: Day-game wins during the season.
# dayLosses: Day-game losses during the season.
# nightWins: Night-game wins during the season.
# nightLosses: Night-game losses during the season.
# grassWins: Wins on grass fields during the season.
# grassLosses: Losses on grass fields during the season.
# turfWins: Wins on turf fields during the season.
# turfLosses: Losses on turf fields during the season.
# divWins: Wins against division opponents during the season.
# divLosses: Losses against division opponents during the season.
# alWins: Wins against AL teams during the season.
# alLosses: Losses against AL teams during the season.
# nlWins: Wins against NL teams during the season.
# nlLosses: Losses against NL teams during the season.
# xWinLossPct: Expected winning percentage based on runs scored and runs allowed.

In [None]:
# Nested JSON containing all awards or honors distributed on a particular day.

# awardId:
# awardName:
# awardDate: Date the award was given.
# awardSeason: Season the award was from.
# playerId: Unique identifier for the player.
# playerName:
# awardPlayerTeamId:

exshow("awards",0).head(3)

In [None]:
# Nested JSON that contains all the on-field game events for a particular day. 
# Includes regular and postseason matches.

exshow("events",0).head(3)

In [None]:
# A nested JSON that contains the number of Twitter followers for some players of the day.

# Twitter follower data is collected by MLB from the Twitter API for Major League Baseball players on the first day of every month, going back to January 1, 2018. Because not all players have (or had) a Twitter account, because players sometimes create / delete / restore accounts, and because follower data could not always be collected on a particular day, this data set does not cover all players for every month.

#     date: The date of the number of followers.
#     playerId: Unique identifier for the player.
#     playerName:
#     accountName: The name of the player's Twitter account.
#     twitterHandle: The player's Twitter handle.
#     numberOfFollowers: Number of followers

exshow("playerTwitterFollowers",0).head(3)

In [None]:
# A nested JSON that contains the number of Twitter followers for each team on that day.

# Twitter follower data was collected by MLB from the Twitter API for all 30 Major League Baseball teams on the 1st of every month, going back to January 1, 2018.

#     date: The date of the number of followers.
#     teamId: A unique identifier for the team.
#     teamName:
#     accountName: The name of the team's Twitter account.
#     twitterHandle: Team Twitter handle.

exshow("teamTwitterFollowers",0).head(3)

## 4. additional data ( awards.csv, players.csv, seasons.csv, teams.csv)

In [None]:
# seasonId: Season ID
# seasonStartDate: Season start date
# seasonEndDate: Season end date
# preSeasonStartDate: Preseason start date
# preSeasonEndDate: Last day of the preseason
# regularSeasonStartDate: The start date of the regular season
# regularSeasonEndDate: The last day of the regular season
# lastDate1stHalf: Last day of 1st half
# allStarDate: Date of the All-Star Game
# firstDate2ndHalf: Start date of 2nd half
# postSeasonStartDate: Postseason start date
# postSeasonEndDate: Last day of the postseason
    
seasons.head(3)

In [None]:
# id --team ID
# name: name
# teamName: The name of the team
# teamCode: Team code
# shortName: Short name
# abbreviation: Abbreviation
# locationName: The name of the location
# leagueId: league id
# leagueName: The name of the league
# divisionId: divisionid
# divisionName: divisionname
# venueId: Venue id
# venueName: Venue name

teams.head(3)

In [None]:
# playerId: Unique identifier for a player.
# playerName: The name of the player
# DOB: Player's date of birth.
# mlbDebutDate: MLB debut date
# birthCity: City where the player was born
# birthStateProvince: State or province of birth
# birthCountry: Country of birth
# heightInches: Height (inches)
# weight: Weight
# primaryPositionCode: Player's primary position code
# primaryPositionName: Player's primary position name
# playerForTestSetAndFuturePreds: Boolean, true if the player is among those for whom predictions are to be made in the test data

players.head(3)


In [None]:
# This file contains the awards won by the players in the training set before the start of the daily data (that is, before 2018).

# awardDate: Date the award was given.
# awardSeason: Season the award was from.
# awardId: award id
# awardName: award name
# playerId: Unique identifier for a player.
# playerName: The name of the player
# awardPlayerTeamId: Player's team ID

awards.head(3)

In [None]:
## Data Merge

# Names of the supplementary CSV tables. In the visible part of the notebook this
# list is referenced only by the commented-out reload loop in the next cell.
df_names = ['seasons', 'teams', 'players', 'awards']
# gc is referenced only by the commented-out pipeline below (gc.collect() after `del train`).
import gc
# Competition input directory; the commented-out reload loop joins it with each table name.
path = "../input/mlb-player-digital-engagement-forecasting"


In [None]:
# for name in df_names:
#     globals()[name] = pd.read_csv(os.path.join(path,name)+ ".csv")

# #### Unnest various nested data within train (daily) data ####
# daily_data_unnested_dfs = pd.DataFrame(data = {
#   'dfName': train.drop('date', axis = 1).columns.values.tolist()
#   })

# daily_data_unnested_dfs['df'] = [pd.DataFrame() for row in 
#   daily_data_unnested_dfs.iterrows()]

# for df_index, df_row in daily_data_unnested_dfs.iterrows():
#     nestedTableName = str(df_row['dfName'])
    
#     date_nested_table = train[['date', nestedTableName]]
    
#     date_nested_table = (date_nested_table[
#       ~pd.isna(date_nested_table[nestedTableName])
#       ].
#       reset_index(drop = True)
#       )
    
#     daily_dfs_collection = []
    
#     for date_index, date_row in date_nested_table.iterrows():
#         daily_df = unpack_json(date_row[nestedTableName])
        
#         daily_df['dailyDataDate'] = date_row['date']
        
#         daily_dfs_collection = daily_dfs_collection + [daily_df]

#     unnested_table = pd.concat(daily_dfs_collection,
#       ignore_index = True).set_index('dailyDataDate').reset_index()

#     # Creates 1 pandas df per unnested df from daily data read in, with same name
#     globals()[df_row['dfName']] = unnested_table    
    
#     daily_data_unnested_dfs['df'][df_index] = unnested_table

# del train
# gc.collect()



# #### Get some information on each date in daily data (using season dates of interest) ####
# dates = pd.DataFrame(data = 
#   {'dailyDataDate': nextDayPlayerEngagement['dailyDataDate'].unique()})

# dates['date'] = pd.to_datetime(dates['dailyDataDate'].astype(str))

# dates['year'] = dates['date'].dt.year
# dates['month'] = dates['date'].dt.month

# dates_with_info = pd.merge(
#   dates,
#   seasons,
#   left_on = 'year',
#   right_on = 'seasonId'
#   )

# dates_with_info['inSeason'] = (
#   dates_with_info['date'].between(
#     dates_with_info['regularSeasonStartDate'],
#     dates_with_info['postSeasonEndDate'],
#     inclusive = True
#     )
#   )

# dates_with_info['seasonPart'] = np.select(
#   [
#     dates_with_info['date'] < dates_with_info['preSeasonStartDate'], 
#     dates_with_info['date'] < dates_with_info['regularSeasonStartDate'],
#     dates_with_info['date'] <= dates_with_info['lastDate1stHalf'],
#     dates_with_info['date'] < dates_with_info['firstDate2ndHalf'],
#     dates_with_info['date'] <= dates_with_info['regularSeasonEndDate'],
#     dates_with_info['date'] < dates_with_info['postSeasonStartDate'],
#     dates_with_info['date'] <= dates_with_info['postSeasonEndDate'],
#     dates_with_info['date'] > dates_with_info['postSeasonEndDate']
#   ], 
#   [
#     'Offseason',
#     'Preseason',
#     'Reg Season 1st Half',
#     'All-Star Break',
#     'Reg Season 2nd Half',
#     'Between Reg and Postseason',
#     'Postseason',
#     'Offseason'
#   ], 
#   default = np.nan
#   )

# #### Add some pitching stats/pieces of info to player game level stats ####

# player_game_stats = (playerBoxScores.copy().
#   # Change team Id/name to reflect these come from player game, not roster
#   rename(columns = {'teamId': 'gameTeamId', 'teamName': 'gameTeamName'})
#   )

# # Adds in field for innings pitched as fraction (better for aggregation)
# player_game_stats['inningsPitchedAsFrac'] = np.where(
#   pd.isna(player_game_stats['inningsPitched']),
#   np.nan,
#   np.floor(player_game_stats['inningsPitched']) +
#     (player_game_stats['inningsPitched'] -
#       np.floor(player_game_stats['inningsPitched'])) * 10/3
#   )

# # Add in Tom Tango pitching game score (https://www.mlb.com/glossary/advanced-stats/game-score)
# player_game_stats['pitchingGameScore'] = (40
# #     + 2 * player_game_stats['outs']
#     + 1 * player_game_stats['strikeOutsPitching']
#     - 2 * player_game_stats['baseOnBallsPitching']
#     - 2 * player_game_stats['hitsPitching']
#     - 3 * player_game_stats['runsPitching']
#     - 6 * player_game_stats['homeRunsPitching']
#     )

# # Add in criteria for no-hitter by pitcher (individual, not multiple pitchers)
# player_game_stats['noHitter'] = np.where(
#   (player_game_stats['gamesStartedPitching'] == 1) &
#   (player_game_stats['inningsPitched'] >= 9) &
#   (player_game_stats['hitsPitching'] == 0),
#   1, 0
#   )

# player_date_stats_agg = pd.merge(
#   (player_game_stats.
#     groupby(['dailyDataDate', 'playerId'], as_index = False).
#     # Some aggregations that are not simple sums
#     agg(
#       numGames = ('gamePk', 'nunique'),
#       # Should be 1 team per player per day, but adding here for 1 exception:
#       # playerId 518617 (Jake Diekman) had 2 games for different teams marked
#       # as played on 5/19/19, due to resumption of game after he was traded
#       numTeams = ('gameTeamId', 'nunique'),
#       # Should be only 1 team for almost all player-dates, taking min to simplify
#       gameTeamId = ('gameTeamId', 'min')
#       )
#     ),
#   # Merge with a bunch of player stats that can be summed at date/player level
#   (player_game_stats.
#     groupby(['dailyDataDate', 'playerId'], as_index = False)
#     [['runsScored', 'homeRuns', 'strikeOuts', 'baseOnBalls', 'hits',
#       'hitByPitch', 'atBats', 'caughtStealing', 'stolenBases',
#       'groundIntoDoublePlay', 'groundIntoTriplePlay', 'plateAppearances',
#       'totalBases', 'rbi', 'leftOnBase', 'sacBunts', 'sacFlies',
#       'gamesStartedPitching', 'runsPitching', 'homeRunsPitching', 
#       'strikeOutsPitching', 'baseOnBallsPitching', 'hitsPitching',
#       'inningsPitchedAsFrac', 'earnedRuns', 
#       'battersFaced','saves', 'blownSaves', 'pitchingGameScore', 
#       'noHitter'
#       ]].
#     sum()
#     ),
#   on = ['dailyDataDate', 'playerId'],
#   how = 'inner'
#   )

# #### Turn games table into 1 row per team-game, then merge with team box scores ####
# # Filter to regular or Postseason games w/ valid scores for this part
# games_for_stats = games[
#   np.isin(games['gameType'], ['R', 'F', 'D', 'L', 'W', 'C', 'P']) &
#   ~pd.isna(games['homeScore']) &
#   ~pd.isna(games['awayScore'])
#   ]

# # Get games table from home team perspective
# games_home_perspective = games_for_stats.copy()

# # Change column names so that "team" is "home", "opp" is "away"
# games_home_perspective.columns = [
#   col_value.replace('home', 'team').replace('away', 'opp') for 
#     col_value in games_home_perspective.columns.values]

# games_home_perspective['isHomeTeam'] = 1

# # Get games table from away team perspective
# games_away_perspective = games_for_stats.copy()

# # Change column names so that "opp" is "home", "team" is "away"
# games_away_perspective.columns = [
#   col_value.replace('home', 'opp').replace('away', 'team') for 
#     col_value in games_away_perspective.columns.values]

# games_away_perspective['isHomeTeam'] = 0

# # Put together games from home/away perspective to get df w/ 1 row per team game
# team_games = (pd.concat([
#   games_home_perspective,
#   games_away_perspective
#   ],
#   ignore_index = True)
#   )

# # Copy over team box scores data to modify
# team_game_stats = teamBoxScores.copy()

# # Change column names to reflect these are all "team" stats - helps 
# # to differentiate from individual player stats if/when joining later
# team_game_stats.columns = [
#   (col_value + 'Team') 
#   if (col_value not in ['dailyDataDate', 'home', 'teamId', 'gamePk',
#     'gameDate', 'gameTimeUTC'])
#     else col_value
#   for col_value in team_game_stats.columns.values
#   ]

# # Merge games table with team game stats
# team_games_with_stats = pd.merge(
#   team_games,
#   team_game_stats.
#     # Drop some fields that are already present in team_games table
#     drop(['home', 'gameDate', 'gameTimeUTC'], axis = 1),
#   on = ['dailyDataDate', 'gamePk', 'teamId'],
#   # Doing this as 'inner' join excludes spring train games, postponed games,
#   # etc. from original games table, but this may be fine for purposes here 
#   how = 'inner'
#   )

# team_date_stats_agg = (team_games_with_stats.
#   groupby(['dailyDataDate', 'teamId', 'gameType', 'oppId', 'oppName'], 
#     as_index = False).
#   agg(
#     numGamesTeam = ('gamePk', 'nunique'),
#     winsTeam = ('teamWinner', 'sum'),
#     lossesTeam = ('oppWinner', 'sum'),
#     runsScoredTeam = ('teamScore', 'sum'),
#     runsAllowedTeam = ('oppScore', 'sum')
#     )
#    )

# # Prepare standings table for merge w/ player digital engagement data
# # Pick only certain fields of interest from standings for merge
# standings_selected_fields = (standings[['dailyDataDate', 'teamId', 
#   'streakCode', 'divisionRank', 'leagueRank', 'wildCardRank', 'pct'
#   ]].
#   rename(columns = {'pct': 'winPct'})
#   )

# # Change column names to reflect these are all "team" standings - helps 
# # to differentiate from player-related fields if/when joining later
# standings_selected_fields.columns = [
#   (col_value + 'Team') 
#   if (col_value not in ['dailyDataDate', 'teamId'])
#     else col_value
#   for col_value in standings_selected_fields.columns.values
#   ]

# standings_selected_fields['streakLengthTeam'] = (
#   standings_selected_fields['streakCodeTeam'].
#     str.replace('W', '').
#     str.replace('L', '').
#     astype(float)
#     )

# # Add fields to separate winning and losing streak from streak code
# standings_selected_fields['winStreakTeam'] = np.where(
#   standings_selected_fields['streakCodeTeam'].str[0] == 'W',
#   standings_selected_fields['streakLengthTeam'],
#   np.nan
#   )

# standings_selected_fields['lossStreakTeam'] = np.where(
#   standings_selected_fields['streakCodeTeam'].str[0] == 'L',
#   standings_selected_fields['streakLengthTeam'],
#   np.nan
#   )

# standings_for_digital_engagement_merge = (pd.merge(
#   standings_selected_fields,
#   dates_with_info[['dailyDataDate', 'inSeason']],
#   on = ['dailyDataDate'],
#   how = 'left'
#   ).
#   # Limit down standings to only in season version
#   query("inSeason").
#   # Drop fields no longer necessary (in derived values, etc.)
#   drop(['streakCodeTeam', 'streakLengthTeam', 'inSeason'], axis = 1).
#   reset_index(drop = True)
#   )

# #### Merge together various data frames to add date, player, roster, and team info ####
# # Copy over player engagement df to add various pieces to it
# player_engagement_with_info = nextDayPlayerEngagement.copy()

# # Take "row mean" across targets to add (helps with studying all 4 targets at once)
# player_engagement_with_info['targetAvg'] = np.mean(
#   player_engagement_with_info[['target1', 'target2', 'target3', 'target4']],
#   axis = 1)

# # Merge in date information
# player_engagement_with_info = pd.merge(
#   player_engagement_with_info,
#   dates_with_info[['dailyDataDate', 'date', 'year', 'month', 'inSeason',
#     'seasonPart']],
#   on = ['dailyDataDate'],
#   how = 'left'
#   )

# # Merge in some player information
# player_engagement_with_info = pd.merge(
#   player_engagement_with_info,
#   players[['playerId', 'playerName', 'DOB', 'mlbDebutDate', 'birthCity',
#     'birthStateProvince', 'birthCountry', 'primaryPositionName']],
#    on = ['playerId'],
#    how = 'left'
#    )

# # Merge in some player roster information by date
# player_engagement_with_info = pd.merge(
#   player_engagement_with_info,
#   (rosters[['dailyDataDate', 'playerId', 'statusCode', 'status', 'teamId']].
#     rename(columns = {
#       'statusCode': 'rosterStatusCode',
#       'status': 'rosterStatus',
#       'teamId': 'rosterTeamId'
#       })
#     ),
#   on = ['dailyDataDate', 'playerId'],
#   how = 'left'
#   )
    
# # Merge in team name from player's roster team
# player_engagement_with_info = pd.merge(
#   player_engagement_with_info,
#   (teams[['id', 'teamName']].
#     rename(columns = {
#       'id': 'rosterTeamId',
#       'teamName': 'rosterTeamName'
#       })
#     ),
#   on = ['rosterTeamId'],
#   how = 'left'
#   )

# # Merge in some player game stats (previously aggregated) from that date
# player_engagement_with_info = pd.merge(
#   player_engagement_with_info,
#   player_date_stats_agg,
#   on = ['dailyDataDate', 'playerId'],
#   how = 'left'
#   )

# # Merge in team name from player's game team
# player_engagement_with_info = pd.merge(
#   player_engagement_with_info,
#   (teams[['id', 'teamName']].
#     rename(columns = {
#       'id': 'gameTeamId',
#       'teamName': 'gameTeamName'
#       })
#     ),
#   on = ['gameTeamId'],
#   how = 'left'
#   )

# # Merge in some team game stats/results (previously aggregated) from that date
# player_engagement_with_info = pd.merge(
#   player_engagement_with_info,
#   team_date_stats_agg.rename(columns = {'teamId': 'gameTeamId'}),
#   on = ['dailyDataDate', 'gameTeamId'],
#   how = 'left'
#   )

# # Merge in player transactions of note on that date
    
# # Merge in some pieces of team standings (previously filter/processed) from that date
# player_engagement_with_info = pd.merge(
#   player_engagement_with_info,
#   standings_for_digital_engagement_merge.
#     rename(columns = {'teamId': 'gameTeamId'}),
#   on = ['dailyDataDate', 'gameTeamId'],
#   how = 'left'
#   )

# display(player_engagement_with_info)

In [None]:
# Saving the merged data: it has already been uploaded as a dataset, so the
# save step below is kept only for reference.
# player_engagement_with_info.to_pickle("player_engagement_with_info.pkl")

## Load the merged data that was produced by the (commented-out) merge code above.
## NOTE(review): pd.read_pickle can execute arbitrary code while unpickling, so
## only load pickle files that come from a source you trust.
player_engagement_with_info = pd.read_pickle('../input/mlb-player-digital-engagement-merged-data/player_engagement_with_info.pkl')

In [None]:
# player_engagement_with_info = pd.read_pickle('./player_engagement_with_info.pkl')

In [None]:
# Show the column labels of the merged frame.
player_engagement_with_info.columns

In [None]:
## Per-player median of each target, stored as a nested dict:
##   mean_player[playerId] -> {'target1': ..., 'target2': ..., 'target3': ..., 'target4': ...}
## NOTE: the variable keeps its original name `mean_player` because later cells
## read it, but the statistic stored is the median, not the mean.

# A single grouped pass replaces re-filtering the whole frame four times per
# player. groupby() skips rows whose playerId is missing (its default).
mean_player = (
    player_engagement_with_info
    .groupby('playerId')[['target1', 'target2', 'target3', 'target4']]
    .median()
    .to_dict(orient='index')
)

# Spot-check one player's stored values.
mean_player[656669]

## Submission file creation : 

### Please refer to this notebook for any queries regarding submission !! [link](https://www.kaggle.com/chumajin/eda-of-mlb-for-starter-english-ver#-MLB-Player-Digital-Engagement-Competition%F0%9F%98%80)

In [None]:
import sys
if 'kaggle_secrets' in sys.modules:  # only run while on Kaggle
    import mlb

In [None]:
## Execute this cell only once !!
## NOTE(review): the original note read "only for one" — presumably
## mlb.make_env() cannot be called a second time in the same session; confirm.
## `mlb` is imported only when running on Kaggle (see the import cell), so this
## cell raises NameError anywhere else.

env = mlb.make_env()
iter_test = env.iter_test()

In [None]:
## Getting first sample !!

# for (test_df, sample_prediction_df) in iter_test:
#     display(test_df)
#     display(sample_prediction_df)
#     break

In [None]:
## Function to create processed data that will be submitted !!

def process_pred(data, player_targets=None):
    """Fill a sample-prediction frame with each player's stored target values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``date_playerId`` column whose values end in
        ``_<playerId>``. ``date`` may be either the index or a regular column.
    player_targets : dict, optional
        Mapping ``playerId -> {'target1': ..., 'target2': ..., 'target3': ...,
        'target4': ...}``. Defaults to the notebook-level ``mean_player``.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by ``date`` whose ``target1``..``target4`` columns
        hold the looked-up values. Rows whose player is absent from the lookup
        (or whose stored value is NaN) keep the value they arrived with rather
        than raising ``KeyError``. The input frame is not modified.

    Raises
    ------
    KeyError
        If ``data`` has neither a ``date`` index nor a ``date`` column, or has
        no ``date_playerId`` column.
    """
    if player_targets is None:
        # Resolved at call time so the lookup built in an earlier cell is used.
        player_targets = mean_player

    # Calling reset_index() on a frame that already carries a 'date' column
    # would add a stray 'index' column (or fail if the index is also 'date'),
    # so the old index is dropped in that case.
    if 'date' in data.columns:
        out = data.reset_index(drop=True)
    else:
        out = data.reset_index()

    # The playerId is everything after the last underscore of date_playerId.
    player_ids = out['date_playerId'].str.rsplit('_', n=1).str[-1].astype(int)

    # Restrict the lookup to players present in this batch; unknown players are
    # simply left out and handled by the fallback below.
    known = {
        pid: player_targets[pid]
        for pid in player_ids.unique().tolist()
        if pid in player_targets
    }

    for col in ('target1', 'target2', 'target3', 'target4'):
        # map() yields NaN for ids missing from the dict.
        looked_up = player_ids.map({pid: vals[col] for pid, vals in known.items()})
        if col in out.columns:
            # Fall back to the incoming value where no usable value exists.
            looked_up = looked_up.fillna(out[col])
        # Whole-column assignment avoids writing floats into an int column in place.
        out[col] = looked_up

    return out.set_index('date', drop=True)

In [None]:
# process_pred(sample_submission)
# process_pred(sample_submission.set_index('date', drop = True))

In [None]:
## Loop over every batch served by the test iterator and submit a prediction for each !!

# test_df is not used in the loop body: process_pred only needs the sample
# prediction frame.
for (test_df, sample_prediction_df) in iter_test:
    # Replace the frame with the version returned by process_pred.
    sample_prediction_df = process_pred(sample_prediction_df)
    # Hand the processed frame back to the environment.
    env.predict(sample_prediction_df)

In [None]:
sample_prediction_df.head()