# Train 및 기타 데이터 살펴보기
#### Reference: 
  * https://www.kaggle.com/chumajin/eda-of-mlb-for-starter-english-ver
  * https://www.kaggle.com/ryanholbrook/getting-started-with-mlb-player-digital-engagement

## Train 데이터 살펴보기

In [1]:
from pathlib import Path
import os.path
import sys
import warnings
import gc

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets

import json

warnings.simplefilter("ignore")

In [2]:
file_path = '.../Kaggle/MLB_FAN_ENGAGEMENT/data'

train = pd.read_csv(file_path + '/train.csv')
train.head(5)

Unnamed: 0,date,nextDayPlayerEngagement,games,rosters,playerBoxScores,teamBoxScores,transactions,standings,awards,events,playerTwitterFollowers,teamTwitterFollowers
0,20180101,"[{""engagementMetricsDate"":""2018-01-02"",""player...",,"[{""playerId"":400121,""gameDate"":""2018-01-01"",""t...",,,"[{""transactionId"":340732,""playerId"":547348,""pl...",,,,"[{""date"":""2018-01-01"",""playerId"":545361,""playe...","[{""date"":""2018-01-01"",""teamId"":147,""teamName"":..."
1,20180102,"[{""engagementMetricsDate"":""2018-01-03"",""player...",,"[{""playerId"":134181,""gameDate"":""2018-01-02"",""t...",,,"[{""transactionId"":339458,""playerId"":621173,""pl...",,,,,
2,20180103,"[{""engagementMetricsDate"":""2018-01-04"",""player...",,"[{""playerId"":425492,""gameDate"":""2018-01-03"",""t...",,,"[{""transactionId"":347527,""playerId"":572389,""pl...",,,,,
3,20180104,"[{""engagementMetricsDate"":""2018-01-05"",""player...",,"[{""playerId"":282332,""gameDate"":""2018-01-04"",""t...",,,"[{""transactionId"":339549,""playerId"":545343,""pl...",,,,,
4,20180105,"[{""engagementMetricsDate"":""2018-01-06"",""player...",,"[{""playerId"":282332,""gameDate"":""2018-01-05"",""t...",,,"[{""transactionId"":341195,""playerId"":628336,""pl...",,,,,


In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1216 entries, 0 to 1215
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   date                     1216 non-null   int64 
 1   nextDayPlayerEngagement  1216 non-null   object
 2   games                    639 non-null    object
 3   rosters                  1216 non-null   object
 4   playerBoxScores          538 non-null    object
 5   teamBoxScores            538 non-null    object
 6   transactions             1103 non-null   object
 7   standings                531 non-null    object
 8   awards                   294 non-null    object
 9   events                   536 non-null    object
 10  playerTwitterFollowers   40 non-null     object
 11  teamTwitterFollowers     40 non-null     object
dtypes: int64(1), object(11)
memory usage: 114.1+ KB


In [4]:
# to_datetime

train['date'] = pd.to_datetime(train['date'], format="%Y%m%d")

In [5]:
# json parser

def unpack_json(json_str):
    return np.nan if pd.isna(json_str) else pd.read_json(json_str)

In [6]:
# column별 반복을 위한 코드 
# train의 각 칼럼 별로 데이터프레임 생성

def exshow(col,n):
    tmp = train[col]
    tmp = tmp.dropna() # null 제거
    tmpdf = unpack_json(tmp.iloc[n]) # json parsing
    print(tmpdf.columns) # 칼럼명 확인
    return tmpdf # 어떤 날의 해당 칼럼의 정보를 나타내는 데이터프레임 출력

### nextDayPlayerEngagement

In [7]:
nextdayPE = exshow('nextDayPlayerEngagement',0)
nextdayPE

Index(['engagementMetricsDate', 'playerId', 'target1', 'target2', 'target3',
       'target4'],
      dtype='object')


Unnamed: 0,engagementMetricsDate,playerId,target1,target2,target3,target4
0,2018-01-02,628317,0.011167,4.474708,0.005168,5.735294
1,2018-01-02,547989,0.042993,5.593385,0.045033,2.794118
2,2018-01-02,519317,0.974327,56.177043,13.693746,64.166667
3,2018-01-02,607625,0.006700,2.675097,0.005168,1.862745
4,2018-01-02,592547,0.001117,0.632296,0.002953,0.931373
...,...,...,...,...,...,...
2056,2018-01-02,605525,0.000000,0.000000,0.000000,0.098039
2057,2018-01-02,573131,0.000000,0.048638,0.000000,1.176471
2058,2018-01-02,664199,0.000000,0.000000,0.000000,0.196078
2059,2018-01-02,663399,0.000000,0.000000,0.000000,0.098039


In [8]:
nextdayPE.describe()

Unnamed: 0,playerId,target1,target2,target3,target4
count,2061.0,2061.0,2061.0,2061.0,2061.0
mean,587590.91606,0.213906,2.889072,1.096593,1.49863
std,71127.947793,3.17814,5.708526,8.136906,4.534034
min,112526.0,0.0,0.0,0.0,0.0
25%,543343.0,0.0,0.048638,0.0,0.04902
50%,605388.0,0.0,1.55642,0.0,0.441176
75%,642758.0,0.005025,3.356031,0.005168,1.22549
max,685503.0,100.0,100.0,100.0,100.0


* engagementMetricsDate
* playerID
* target1
* target2
* target3
* target4
  - target1 ~ 4는 0 ~100의 수로 digital engagement를 나타내는 수치

### games

In [9]:
# exshow('games',0)
games = exshow('games',1)
games

Index(['gamePk', 'gameType', 'season', 'gameDate', 'gameTimeUTC', 'resumeDate',
       'resumedFrom', 'codedGameState', 'detailedGameState', 'isTie',
       'gameNumber', 'doubleHeader', 'dayNight', 'scheduledInnings',
       'gamesInSeries', 'seriesDescription', 'homeId', 'homeName',
       'homeAbbrev', 'homeWins', 'homeLosses', 'homeWinPct', 'homeWinner',
       'homeScore', 'awayId', 'awayName', 'awayAbbrev', 'awayWins',
       'awayLosses', 'awayWinPct', 'awayWinner', 'awayScore'],
      dtype='object')


Unnamed: 0,gamePk,gameType,season,gameDate,gameTimeUTC,resumeDate,resumedFrom,codedGameState,detailedGameState,isTie,...,homeWinner,homeScore,awayId,awayName,awayAbbrev,awayWins,awayLosses,awayWinPct,awayWinner,awayScore
0,534461,E,2018,2018-02-22,2018-02-22T18:05:00Z,,,F,Final,False,...,True,6,228,Florida Southern College Mocs,FSC,0,1,0,False,1
1,545334,E,2018,2018-02-22,2018-02-22T18:05:00Z,,,F,Final,False,...,True,6,231,University of Tampa Spartans,UT,0,1,0,False,0
2,547295,E,2018,2018-02-22,2018-02-22T03:33:00Z,,,F,Final,False,...,True,4,227,Boston College Eagles,BC,0,1,0,False,2
3,533784,E,2018,2018-02-22,2018-02-22T23:05:00Z,,,F,Final,False,...,True,2,4864,Minnesota Gophers,UM,0,1,0,False,1
4,547296,E,2018,2018-02-22,2018-02-22T18:05:00Z,,,F,Final,False,...,True,15,343,Northeastern University Huskies,NEU,0,1,0,False,2


In [10]:
games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 32 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gamePk             5 non-null      int64  
 1   gameType           5 non-null      object 
 2   season             5 non-null      int64  
 3   gameDate           5 non-null      object 
 4   gameTimeUTC        5 non-null      object 
 5   resumeDate         0 non-null      float64
 6   resumedFrom        0 non-null      float64
 7   codedGameState     5 non-null      object 
 8   detailedGameState  5 non-null      object 
 9   isTie              5 non-null      bool   
 10  gameNumber         5 non-null      int64  
 11  doubleHeader       5 non-null      object 
 12  dayNight           5 non-null      object 
 13  scheduledInnings   5 non-null      int64  
 14  gamesInSeries      5 non-null      int64  
 15  seriesDescription  5 non-null      object 
 16  homeId             5 non-null 

### rosters

In [11]:
rosters = exshow('rosters',0)
rosters

Index(['playerId', 'gameDate', 'teamId', 'statusCode', 'status'], dtype='object')


Unnamed: 0,playerId,gameDate,teamId,statusCode,status
0,400121,2018-01-01,116,A,Active
1,408045,2018-01-01,142,A,Active
2,425492,2018-01-01,120,A,Active
3,429664,2018-01-01,136,A,Active
4,431151,2018-01-01,121,A,Active
...,...,...,...,...,...
1175,657228,2018-01-01,108,A,Active
1176,657240,2018-01-01,114,A,Active
1177,657253,2018-01-01,115,A,Active
1178,657557,2018-01-01,138,A,Active


In [12]:
rosters.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1180 entries, 0 to 1179
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   playerId    1180 non-null   int64 
 1   gameDate    1180 non-null   object
 2   teamId      1180 non-null   int64 
 3   statusCode  1180 non-null   object
 4   status      1180 non-null   object
dtypes: int64(2), object(3)
memory usage: 46.2+ KB


### playerBoxScores

In [13]:
playerBS = exshow('playerBoxScores',0)
playerBS

Index(['home', 'gamePk', 'gameDate', 'gameTimeUTC', 'teamId', 'teamName',
       'playerId', 'playerName', 'jerseyNum', 'positionCode', 'positionName',
       'positionType', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
       'groundOutsPitching', 'runsPitching', 'doublesPitching',
       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
       'hit

Unnamed: 0,home,gamePk,gameDate,gameTimeUTC,teamId,teamName,playerId,playerName,jerseyNum,positionCode,...,catchersInterferencePitching,sacBuntsPitching,sacFliesPitching,saves,holds,blownSaves,assists,putOuts,errors,chances
0,1,529418,2018-03-29,2018-03-29T23:08:00Z,119,Los Angeles Dodgers,605131,Austin Barnes,15,12,...,,,,,,,,,,
1,1,529406,2018-03-29,2018-03-29T20:00:00Z,139,Tampa Bay Rays,605480,Mallex Smith,0,7,...,,,,,,,0.0,0.0,0.0,0.0
2,0,529416,2018-03-29,2018-03-29T20:10:00Z,143,Philadelphia Phillies,546318,Odubel Herrera,37,8,...,,,,,,,0.0,0.0,0.0,0.0
3,0,529412,2018-03-29,2018-03-29T20:05:00Z,108,Los Angeles Angels,527043,Jefry Marte,19,3,...,,,,,,,0.0,1.0,0.0,1.0
4,1,529408,2018-03-29,2018-03-29T20:15:00Z,118,Kansas City Royals,449181,Paulo Orlando,16,8,...,,,,,,,0.0,2.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401,0,529413,2018-03-29,2018-03-29T20:10:00Z,158,Milwaukee Brewers,502624,Chase Anderson,57,1,...,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0
402,1,529419,2018-03-29,2018-03-29T17:10:00Z,121,New York Mets,592789,Noah Syndergaard,34,1,...,0.0,0.0,0.0,,,,3.0,0.0,0.0,3.0
403,1,529414,2018-03-29,2018-03-29T19:05:00Z,110,Baltimore Orioles,605164,Dylan Bundy,37,1,...,0.0,0.0,0.0,,,,0.0,1.0,0.0,1.0
404,1,529413,2018-03-29,2018-03-29T20:10:00Z,135,San Diego Padres,453385,Clayton Richard,3,1,...,0.0,0.0,0.0,,,,3.0,0.0,0.0,3.0


In [14]:
playerBS.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 406 entries, 0 to 405
Data columns (total 85 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   home                          406 non-null    int64  
 1   gamePk                        406 non-null    int64  
 2   gameDate                      406 non-null    object 
 3   gameTimeUTC                   406 non-null    object 
 4   teamId                        406 non-null    int64  
 5   teamName                      406 non-null    object 
 6   playerId                      406 non-null    int64  
 7   playerName                    406 non-null    object 
 8   jerseyNum                     406 non-null    int64  
 9   positionCode                  406 non-null    int64  
 10  positionName                  406 non-null    object 
 11  positionType                  406 non-null    object 
 12  battingOrder                  337 non-null    float64
 13  games

### teamBoxScores

In [15]:
teamBS = exshow('teamBoxScores',0)
teamBS

Index(['home', 'teamId', 'gamePk', 'gameDate', 'gameTimeUTC', 'flyOuts',
       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs', 'airOutsPitching', 'groundOutsPitching', 'runsPitching',
       'doublesPitching', 'triplesPitching', 'homeRunsPitching',
       'strikeOutsPitching', 'baseOnBallsPitching', 'intentionalWalksPitching',
       'hitsPitching', 'hitByPitchPitching', 'atBatsPitching',
       'caughtStealingPitching', 'stolenBasesPitching', 'inningsPitched',
       'earnedRuns', 'battersFaced', 'outsPitching', 'hitBatsmen', 'balks',
       'wildPitches', 'pickoffsPitching', 'rbiPitching', 'inheritedRunners',
       'inheritedRunnersScored', 'catchersInterference

Unnamed: 0,home,teamId,gamePk,gameDate,gameTimeUTC,flyOuts,groundOuts,runsScored,doubles,triples,...,hitBatsmen,balks,wildPitches,pickoffsPitching,rbiPitching,inheritedRunners,inheritedRunnersScored,catchersInterferencePitching,sacBuntsPitching,sacFliesPitching
0,1,109,529410,2018-03-29,2018-03-30T02:10:00Z,4,9,8,2,1,...,0,0,0,0,2,0,0,0,1,0
1,0,114,529409,2018-03-29,2018-03-30T02:10:00Z,4,9,1,1,0,...,0,0,0,0,2,0,0,0,0,0
2,1,121,529419,2018-03-29,2018-03-29T17:10:00Z,2,10,9,2,0,...,0,0,0,0,4,0,0,0,0,0
3,1,139,529406,2018-03-29,2018-03-29T20:00:00Z,2,6,6,1,1,...,0,0,0,0,4,0,0,0,0,0
4,1,140,529411,2018-03-29,2018-03-29T19:35:00Z,9,4,1,1,0,...,0,0,0,0,4,0,0,0,0,1
5,1,146,529407,2018-03-29,2018-03-29T16:40:00Z,4,13,4,2,1,...,3,0,0,0,8,0,0,1,0,0
6,0,158,529413,2018-03-29,2018-03-29T20:10:00Z,4,19,2,2,0,...,0,0,0,0,1,0,0,0,0,0
7,1,119,529418,2018-03-29,2018-03-29T23:08:00Z,3,11,0,0,0,...,0,0,0,0,1,0,0,0,0,0
8,1,135,529413,2018-03-29,2018-03-29T20:10:00Z,6,12,1,0,0,...,1,0,0,1,2,0,0,0,0,0
9,0,142,529414,2018-03-29,2018-03-29T19:05:00Z,6,11,2,0,0,...,0,0,0,0,3,0,0,0,1,0


In [16]:
teamBS.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26 entries, 0 to 25
Data columns (total 57 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   home                          26 non-null     int64  
 1   teamId                        26 non-null     int64  
 2   gamePk                        26 non-null     int64  
 3   gameDate                      26 non-null     object 
 4   gameTimeUTC                   26 non-null     object 
 5   flyOuts                       26 non-null     int64  
 6   groundOuts                    26 non-null     int64  
 7   runsScored                    26 non-null     int64  
 8   doubles                       26 non-null     int64  
 9   triples                       26 non-null     int64  
 10  homeRuns                      26 non-null     int64  
 11  strikeOuts                    26 non-null     int64  
 12  baseOnBalls                   26 non-null     int64  
 13  intenti

### transactions

In [17]:
transactions = exshow('transactions',2)
transactions

Index(['transactionId', 'playerId', 'playerName', 'date', 'fromTeamId',
       'fromTeamName', 'toTeamId', 'toTeamName', 'effectiveDate',
       'resolutionDate', 'typeCode', 'typeDesc', 'description'],
      dtype='object')


Unnamed: 0,transactionId,playerId,playerName,date,fromTeamId,fromTeamName,toTeamId,toTeamName,effectiveDate,resolutionDate,typeCode,typeDesc,description
0,347527,572389,Josh Prince,2018-01-03,,,109,Arizona Diamondbacks,2018-01-03,2018-01-03,SFA,Signed as Free Agent,Arizona Diamondbacks signed free agent 2B Josh...
1,339518,571697,Scooter Gennett,2018-01-03,,,113,Cincinnati Reds,2018-01-03,2018-01-03,NUM,Number Change,Scooter Gennett changed number to 3.
2,341623,542669,Roman Mendez,2018-01-03,,,120,Washington Nationals,2018-01-03,2018-01-03,SFA,Signed as Free Agent,Washington Nationals signed free agent RHP Rom...
3,339945,502544,Nick Buss,2018-01-03,,,142,Minnesota Twins,2018-01-03,2018-01-03,SFA,Signed as Free Agent,Minnesota Twins signed free agent OF Nick Buss...
4,339514,607355,Logan Moore,2018-01-03,,,143,Philadelphia Phillies,2018-01-03,2018-01-03,ASG,Assigned,Philadelphia Phillies invited non-roster C Log...
5,339517,605520,Mitch Walding,2018-01-03,,,143,Philadelphia Phillies,2018-01-03,2018-01-03,ASG,Assigned,Philadelphia Phillies invited non-roster 3B Mi...
6,339504,518700,Eric Fryer,2018-01-03,,,143,Philadelphia Phillies,2018-01-03,2018-01-03,SFA,Signed as Free Agent,Philadelphia Phillies signed free agent C Eric...
7,339512,519025,Will Middlebrooks,2018-01-03,,,143,Philadelphia Phillies,2018-01-03,2018-01-03,SFA,Signed as Free Agent,Philadelphia Phillies signed free agent 3B Wil...
8,339510,542344,Heiker Meneses,2018-01-03,,,143,Philadelphia Phillies,2018-01-03,2018-01-03,SFA,Signed as Free Agent,Philadelphia Phillies signed free agent 3B Hei...
9,339506,544993,Steve Geltz,2018-01-03,,,143,Philadelphia Phillies,2018-01-03,2018-01-03,SFA,Signed as Free Agent,Philadelphia Phillies signed free agent RHP St...


In [18]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   transactionId   15 non-null     int64         
 1   playerId        15 non-null     int64         
 2   playerName      15 non-null     object        
 3   date            15 non-null     datetime64[ns]
 4   fromTeamId      0 non-null      float64       
 5   fromTeamName    0 non-null      float64       
 6   toTeamId        15 non-null     int64         
 7   toTeamName      15 non-null     object        
 8   effectiveDate   15 non-null     object        
 9   resolutionDate  15 non-null     object        
 10  typeCode        15 non-null     object        
 11  typeDesc        15 non-null     object        
 12  description     15 non-null     object        
dtypes: datetime64[ns](1), float64(2), int64(3), object(7)
memory usage: 1.6+ KB


### standings

In [19]:
standings = exshow('standings',0)
standings

Index(['season', 'gameDate', 'divisionId', 'teamId', 'teamName', 'streakCode',
       'divisionRank', 'leagueRank', 'wildCardRank', 'leagueGamesBack',
       'sportGamesBack', 'divisionGamesBack', 'wins', 'losses', 'pct',
       'runsAllowed', 'runsScored', 'divisionChamp', 'divisionLeader',
       'wildCardLeader', 'eliminationNumber', 'wildCardEliminationNumber',
       'homeWins', 'homeLosses', 'awayWins', 'awayLosses', 'lastTenWins',
       'lastTenLosses', 'extraInningWins', 'extraInningLosses', 'oneRunWins',
       'oneRunLosses', 'dayWins', 'dayLosses', 'nightWins', 'nightLosses',
       'grassWins', 'grassLosses', 'turfWins', 'turfLosses', 'divWins',
       'divLosses', 'alWins', 'alLosses', 'nlWins', 'nlLosses', 'xWinLossPct'],
      dtype='object')


Unnamed: 0,season,gameDate,divisionId,teamId,teamName,streakCode,divisionRank,leagueRank,wildCardRank,leagueGamesBack,...,grassLosses,turfWins,turfLosses,divWins,divLosses,alWins,alLosses,nlWins,nlLosses,xWinLossPct
0,2018,2018-03-29,205,112,Chicago Cubs,W1,1,3,3.0,-,...,0,0,0,0,0,0,0,0,0,0.0
1,2018,2018-03-29,204,146,Miami Marlins,L1,4,12,12.0,1.0,...,1,0,0,0,0,0,1,0,1,0.0
2,2018,2018-03-29,204,121,New York Mets,W1,2,5,5.0,-,...,0,0,0,0,0,1,0,1,0,1.0
3,2018,2018-03-29,200,140,Texas Rangers,L1,5,14,13.0,1.0,...,1,0,0,0,1,0,0,0,0,0.0
4,2018,2018-03-29,204,144,Atlanta Braves,W1,1,2,2.0,-,...,0,0,0,1,0,0,0,0,0,0.0
5,2018,2018-03-29,201,111,Boston Red Sox,L1,4,9,8.0,1.0,...,0,0,1,0,1,0,0,0,0,0.0
6,2018,2018-03-29,202,116,Detroit Tigers,,2,8,7.0,0.5,...,0,0,0,0,0,0,0,0,0,
7,2018,2018-03-29,200,117,Houston Astros,W1,1,3,2.0,-,...,0,0,0,1,0,0,0,0,0,0.0
8,2018,2018-03-29,201,139,Tampa Bay Rays,W1,3,7,6.0,-,...,0,1,0,1,0,0,0,0,0,0.0
9,2018,2018-03-29,205,113,Cincinnati Reds,,3,7,7.0,0.5,...,0,0,0,0,0,0,0,0,0,


In [20]:
standings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 47 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   season                     30 non-null     int64  
 1   gameDate                   30 non-null     object 
 2   divisionId                 30 non-null     int64  
 3   teamId                     30 non-null     int64  
 4   teamName                   30 non-null     object 
 5   streakCode                 26 non-null     object 
 6   divisionRank               30 non-null     int64  
 7   leagueRank                 30 non-null     int64  
 8   wildCardRank               29 non-null     float64
 9   leagueGamesBack            30 non-null     object 
 10  sportGamesBack             30 non-null     object 
 11  divisionGamesBack          30 non-null     object 
 12  wins                       30 non-null     int64  
 13  losses                     30 non-null     int64  
 

### awards

In [21]:
awards = exshow('awards',0)
awards

Index(['awardId', 'awardName', 'awardDate', 'awardSeason', 'playerId',
       'playerName', 'awardPlayerTeamId'],
      dtype='object')


Unnamed: 0,awardId,awardName,awardDate,awardSeason,playerId,playerName,awardPlayerTeamId
0,VWLCPOY,VWL Comeback Player of the Year,2018-01-15,2017,150119,Freddy Garcia,699
1,VWLPOY,VWL Pitcher of the Year,2018-01-15,2017,446861,Guillermo Moscoso,699
2,VWLMOY,VWL Manager of the Year,2018-01-15,2017,492527,Mike Rojas,695
3,VWLRLOY,VWL Reliever of the Year,2018-01-15,2017,519246,Andres Santiago,694
4,VWLOPOY,VWL Offensive Player of the Year,2018-01-15,2017,543874,Jose Vargas,699
5,VWLROY,VWL Rookie of the Year,2018-01-15,2017,620444,Alexander Palma,695


In [22]:
awards.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   awardId            6 non-null      object
 1   awardName          6 non-null      object
 2   awardDate          6 non-null      object
 3   awardSeason        6 non-null      int64 
 4   playerId           6 non-null      int64 
 5   playerName         6 non-null      object
 6   awardPlayerTeamId  6 non-null      int64 
dtypes: int64(3), object(4)
memory usage: 464.0+ bytes


### events

In [23]:
events = exshow('events',0)
events

Index(['gamePk', 'gameDate', 'gameTimeUTC', 'season', 'gameType', 'playId',
       'eventId', 'inning', 'halfInning', 'homeScore', 'awayScore',
       'menOnBase', 'atBatIndex', 'atBatDesc', 'atBatEvent', 'hasOut',
       'pitcherTeamId', 'isPitcherHome', 'pitcherTeam', 'hitterTeamId',
       'hitterTeam', 'pitcherId', 'pitcherName', 'isStarter', 'pitcherHand',
       'hitterId', 'hitterName', 'batSide', 'pitchNumber', 'balls', 'strikes',
       'isGB', 'isLD', 'isFB', 'isPU', 'launchSpeed', 'launchAngle',
       'totalDistance', 'event', 'description', 'rbi', 'pitchType', 'call',
       'outs', 'inPlay', 'isPaOver', 'startSpeed', 'endSpeed', 'nastyFactor',
       'breakAngle', 'breakLength', 'breakY', 'spinRate', 'spinDirection',
       'pX', 'pZ', 'aX', 'aY', 'aZ', 'pfxX', 'pfxZ', 'vX0', 'vY0', 'vZ0', 'x',
       'y', 'x0', 'y0', 'z0', 'type', 'zone'],
      dtype='object')


Unnamed: 0,gamePk,gameDate,gameTimeUTC,season,gameType,playId,eventId,inning,halfInning,homeScore,...,vX0,vY0,vZ0,x,y,x0,y0,z0,type,zone
0,529409,2018-03-29,2018-03-30T02:10:00Z,2018,R,,5,9,top,2,...,,,,,,,,,action,
1,529416,2018-03-29,2018-03-29T20:10:00Z,2018,R,,0,3,bottom,0,...,,,,,,,,,action,
2,529416,2018-03-29,2018-03-29T20:10:00Z,2018,R,,4,9,top,5,...,,,,,,,,,action,
3,529411,2018-03-29,2018-03-29T19:35:00Z,2018,R,,4,3,bottom,0,...,,,,,,,,,action,
4,529406,2018-03-29,2018-03-29T20:00:00Z,2018,R,,5,8,top,0,...,,,,,,,,,action,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4312,529409,2018-03-29,2018-03-30T02:10:00Z,2018,R,9cdd94a2-e9e3-497a-9eaa-12c17234536d,4,6,top,2,...,8.42,-136.79,-3.76,100.03,169.80,-1.73,50.0,5.16,pitch,6.0
4313,529413,2018-03-29,2018-03-29T20:10:00Z,2018,R,aab52193-4ba9-4d96-b7e2-3c833f0ffc8f,4,11,bottom,1,...,1.42,-128.94,-2.83,101.97,125.52,-0.87,50.0,5.70,pitch,13.0
4314,529416,2018-03-29,2018-03-29T20:10:00Z,2018,R,31f59de4-bdb3-4d6b-8711-f1ea08318a52,4,5,top,0,...,2.07,-114.66,2.09,157.49,159.38,-2.10,50.0,5.09,pitch,11.0
4315,529407,2018-03-29,2018-03-29T16:40:00Z,2018,R,fee561a5-844b-4e93-a2dc-421af9e4919a,3,9,bottom,4,...,-9.89,-134.46,-7.56,120.52,188.03,2.58,50.0,6.17,pitch,8.0


In [24]:
events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4317 entries, 0 to 4316
Data columns (total 71 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   gamePk         4317 non-null   int64  
 1   gameDate       4317 non-null   object 
 2   gameTimeUTC    4317 non-null   object 
 3   season         4317 non-null   int64  
 4   gameType       4317 non-null   object 
 5   playId         3995 non-null   object 
 6   eventId        4317 non-null   int64  
 7   inning         4317 non-null   int64  
 8   halfInning     4317 non-null   object 
 9   homeScore      4317 non-null   int64  
 10  awayScore      4317 non-null   int64  
 11  menOnBase      4023 non-null   object 
 12  atBatIndex     4317 non-null   int64  
 13  atBatDesc      4317 non-null   object 
 14  atBatEvent     4317 non-null   object 
 15  hasOut         4317 non-null   int64  
 16  pitcherTeamId  4317 non-null   int64  
 17  isPitcherHome  4317 non-null   int64  
 18  pitcherT

### playerTwitterFollowers

In [25]:
playerTwitter = exshow('playerTwitterFollowers',0)
playerTwitter

Index(['date', 'playerId', 'playerName', 'accountName', 'twitterHandle',
       'numberOfFollowers'],
      dtype='object')


Unnamed: 0,date,playerId,playerName,accountName,twitterHandle,numberOfFollowers
0,2018-01-01,545361,Mike Trout,Mike Trout,@miketrout,2452409
1,2018-01-01,506433,Yu Darvish,Yu Darvish,@faridyu,1945081
2,2018-01-01,434378,Justin Verlander,Justin Verlander,@justinverlander,1795985
3,2018-01-01,430897,Nick Swisher,Nick Swisher,@nickswisher,1711807
4,2018-01-01,120074,David Ortiz,David Ortiz,@davidortiz,1515463
...,...,...,...,...,...,...
808,2018-01-01,489119,Wade Miley,Wade Miley,@wademiley36,31
809,2018-01-01,453307,David Huff,David Huff,@therealdavehuff,15
810,2018-01-01,605125,Cody Asche,Cody Asche,@aschecody,8
811,2018-01-01,425766,James Loney,James Loney,@james_loney,7


In [26]:
playerTwitter.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 813 entries, 0 to 812
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date               813 non-null    datetime64[ns]
 1   playerId           813 non-null    int64         
 2   playerName         813 non-null    object        
 3   accountName        813 non-null    object        
 4   twitterHandle      813 non-null    object        
 5   numberOfFollowers  813 non-null    int64         
dtypes: datetime64[ns](1), int64(2), object(3)
memory usage: 38.2+ KB


### teamTwitterFollowers

In [27]:
teamTwitter = exshow('teamTwitterFollowers',0)
teamTwitter

Index(['date', 'teamId', 'teamName', 'accountName', 'twitterHandle',
       'numberOfFollowers'],
      dtype='object')


Unnamed: 0,date,teamId,teamName,accountName,twitterHandle,numberOfFollowers
0,2018-01-01,147,New York Yankees,New York Yankees,@Yankees,3130482
1,2018-01-01,112,Chicago Cubs,Chicago Cubs,@Cubs,2373710
2,2018-01-01,141,Toronto Blue Jays,Toronto Blue Jays,@BlueJays,2196352
3,2018-01-01,111,Boston Red Sox,Boston Red Sox,@RedSox,1950737
4,2018-01-01,119,Los Angeles Dodgers,Los Angeles Dodgers,@Dodgers,1949542
5,2018-01-01,143,Philadelphia Phillies,Philadelphia Phillies,@Phillies,1776288
6,2018-01-01,137,San Francisco Giants,San Francisco Giants,@SFGiants,1742156
7,2018-01-01,116,Detroit Tigers,Detroit Tigers,@tigers,1413168
8,2018-01-01,140,Texas Rangers,Texas Rangers,@Rangers,1380307
9,2018-01-01,117,Houston Astros,Houston Astros,@astros,1264937


In [28]:
teamTwitter.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date               30 non-null     datetime64[ns]
 1   teamId             30 non-null     int64         
 2   teamName           30 non-null     object        
 3   accountName        30 non-null     object        
 4   twitterHandle      30 non-null     object        
 5   numberOfFollowers  30 non-null     int64         
dtypes: datetime64[ns](1), int64(2), object(3)
memory usage: 1.5+ KB


## 기타 데이터 살펴보기

###  awards

In [29]:
awards_o = pd.read_csv(file_path + '/awards.csv') # 위의 awards와 헷갈리지 않게 다른 이름으로 데이터프레임 저장
awards_o

Unnamed: 0,awardDate,awardSeason,awardId,awardName,playerId,playerName,awardPlayerTeamId
0,2017-12-21,2017,WARRENSPAHN,Warren Spahn Award,477132,Clayton Kershaw,119.0
1,2017-12-20,2017,MILBORGAS,MiLB.com Organization All-Star,474319,Brandon Snyder,120.0
2,2017-12-20,2017,MILBORGAS,MiLB.com Organization All-Star,592530,Jose Marmolejos,120.0
3,2017-12-20,2017,MILBORGAS,MiLB.com Organization All-Star,593833,Wander Suero,120.0
4,2017-12-20,2017,MILBORGAS,MiLB.com Organization All-Star,600466,Raudy Read,120.0
...,...,...,...,...,...,...,...
11251,2000-08-27,2000,NLPOW,NL Player of the Week,134181,Adrian Beltre,119.0
11252,2000-07-09,2000,FUTURES,Futures Game Selection,282332,CC Sabathia,402.0
11253,1998-09-01,1998,TLPSAS,TEX Post-Season All-Star,134181,Adrian Beltre,510.0
11254,1998-07-07,1998,ALAS,AL All-Star,112526,Bartolo Colon,114.0


In [30]:
awards_o.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11256 entries, 0 to 11255
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   awardDate          11256 non-null  object 
 1   awardSeason        11256 non-null  int64  
 2   awardId            11256 non-null  object 
 3   awardName          11256 non-null  object 
 4   playerId           11256 non-null  int64  
 5   playerName         11256 non-null  object 
 6   awardPlayerTeamId  11243 non-null  float64
dtypes: float64(1), int64(2), object(4)
memory usage: 615.7+ KB


### players

In [31]:
players = pd.read_csv(file_path + '/players.csv')
players

Unnamed: 0,playerId,playerName,DOB,mlbDebutDate,birthCity,birthStateProvince,birthCountry,heightInches,weight,primaryPositionCode,primaryPositionName,playerForTestSetAndFuturePreds
0,665482,Gilberto Celestino,1999-02-13,2021-06-02,Santo Domingo,,Dominican Republic,72,170,8,Outfielder,False
1,593590,Webster Rivas,1990-08-08,2021-05-28,Nagua,,Dominican Republic,73,219,3,First Base,True
2,661269,Vladimir Gutierrez,1995-09-18,2021-05-28,Havana,,Cuba,73,190,1,Pitcher,True
3,669212,Eli Morgan,1996-05-13,2021-05-28,Rancho Palos Verdes,CA,USA,70,190,1,Pitcher,True
4,666201,Alek Manoah,1998-01-09,2021-05-27,Homestead,FL,USA,78,260,1,Pitcher,True
...,...,...,...,...,...,...,...,...,...,...,...,...
2056,672695,Geraldo Perdomo,1999-10-22,,Santo Domingo,,Dominican Republic,74,203,6,Shortstop,True
2057,672911,Jesus Cruz,1995-04-15,,Salinas de Hidalgo,San Luis Potosi,Mexico,73,230,1,Pitcher,False
2058,676103,Damon Jones,1994-09-30,,Twin Falls,ID,USA,77,233,1,Pitcher,True
2059,676755,Isaac Mattson,1995-07-14,,Erie,PA,USA,74,205,1,Pitcher,True


In [32]:
players.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2061 entries, 0 to 2060
Data columns (total 12 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   playerId                        2061 non-null   int64 
 1   playerName                      2061 non-null   object
 2   DOB                             2061 non-null   object
 3   mlbDebutDate                    2025 non-null   object
 4   birthCity                       2061 non-null   object
 5   birthStateProvince              1516 non-null   object
 6   birthCountry                    2061 non-null   object
 7   heightInches                    2061 non-null   int64 
 8   weight                          2061 non-null   int64 
 9   primaryPositionCode             2061 non-null   object
 10  primaryPositionName             2061 non-null   object
 11  playerForTestSetAndFuturePreds  2057 non-null   object
dtypes: int64(3), object(9)
memory usage: 193.3+ KB


### teams

In [33]:
teams = pd.read_csv(file_path + '/teams.csv')
teams

Unnamed: 0,id,name,teamName,teamCode,shortName,abbreviation,locationName,leagueId,leagueName,divisionId,divisionName,venueId,venueName
0,108,Los Angeles Angels,Angels,ana,LA Angels,LAA,Anaheim,103,American League,200,American League West,1,Angel Stadium
1,109,Arizona Diamondbacks,D-backs,ari,Arizona,ARI,Phoenix,104,National League,203,National League West,15,Chase Field
2,110,Baltimore Orioles,Orioles,bal,Baltimore,BAL,Baltimore,103,American League,201,American League East,2,Oriole Park at Camden Yards
3,111,Boston Red Sox,Red Sox,bos,Boston,BOS,Boston,103,American League,201,American League East,3,Fenway Park
4,112,Chicago Cubs,Cubs,chn,Chi Cubs,CHC,Chicago,104,National League,205,National League Central,17,Wrigley Field
5,113,Cincinnati Reds,Reds,cin,Cincinnati,CIN,Cincinnati,104,National League,205,National League Central,2602,Great American Ball Park
6,114,Cleveland Indians,Indians,cle,Cleveland,CLE,Cleveland,103,American League,202,American League Central,5,Progressive Field
7,115,Colorado Rockies,Rockies,col,Colorado,COL,Denver,104,National League,203,National League West,19,Coors Field
8,116,Detroit Tigers,Tigers,det,Detroit,DET,Detroit,103,American League,202,American League Central,2394,Comerica Park
9,117,Houston Astros,Astros,hou,Houston,HOU,Houston,103,American League,200,American League West,2392,Minute Maid Park


In [34]:
teams.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            30 non-null     int64 
 1   name          30 non-null     object
 2   teamName      30 non-null     object
 3   teamCode      30 non-null     object
 4   shortName     30 non-null     object
 5   abbreviation  30 non-null     object
 6   locationName  30 non-null     object
 7   leagueId      30 non-null     int64 
 8   leagueName    30 non-null     object
 9   divisionId    30 non-null     int64 
 10  divisionName  30 non-null     object
 11  venueId       30 non-null     int64 
 12  venueName     30 non-null     object
dtypes: int64(4), object(9)
memory usage: 3.2+ KB


### seasons

In [35]:
seasons = pd.read_csv(file_path + '/seasons.csv')
seasons

Unnamed: 0,seasonId,seasonStartDate,seasonEndDate,preSeasonStartDate,preSeasonEndDate,regularSeasonStartDate,regularSeasonEndDate,lastDate1stHalf,allStarDate,firstDate2ndHalf,postSeasonStartDate,postSeasonEndDate
0,2017,2017-04-02,2017-11-01,2017-02-22,2017-04-01,2017-04-02,2017-10-01,2017-07-09,2017-07-11,2017-07-14,2017-10-03,2017-11-01
1,2018,2018-03-29,2018-10-28,2018-02-21,2018-03-27,2018-03-29,2018-10-01,2018-07-15,2018-07-17,2018-07-19,2018-10-02,2018-10-28
2,2019,2019-03-20,2019-10-30,2019-02-21,2019-03-26,2019-03-20,2019-09-29,2019-07-07,2019-07-09,2019-07-11,2019-10-01,2019-10-30
3,2020,2020-07-23,2020-10-28,2020-02-21,2020-07-22,2020-07-23,2020-09-27,2020-08-25,,2020-08-26,2020-09-29,2020-10-28
4,2021,2021-02-28,2021-10-31,2021-02-28,2021-03-30,2021-04-01,2021-10-03,2021-07-11,2021-07-13,2021-07-15,2021-10-04,2021-10-31


In [36]:
seasons.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   seasonId                5 non-null      int64 
 1   seasonStartDate         5 non-null      object
 2   seasonEndDate           5 non-null      object
 3   preSeasonStartDate      5 non-null      object
 4   preSeasonEndDate        5 non-null      object
 5   regularSeasonStartDate  5 non-null      object
 6   regularSeasonEndDate    5 non-null      object
 7   lastDate1stHalf         5 non-null      object
 8   allStarDate             4 non-null      object
 9   firstDate2ndHalf        5 non-null      object
 10  postSeasonStartDate     5 non-null      object
 11  postSeasonEndDate       5 non-null      object
dtypes: int64(1), object(11)
memory usage: 608.0+ bytes


## 위젯 활용하기

In [37]:
df_names = ['seasons', 'teams', 'players', 'awards']

for name in df_names:
    globals()[name] = pd.read_csv(file_path +'/' + f"{name}.csv" )
    
data_tabs = widgets.Tab()
data_tabs.children = list([widgets.Output() for df_name in df_names]) # add output

for i in range(0, len(df_names)):
    data_tabs.set_title(i, df_names[i]) # 각 tab bar 이름 지정
    
    with data_tabs.children[i]:
        display(eval(df_names[i])) # 각 tab에 해당하는 output 표시

display(data_tabs)

Tab(children=(Output(), Output(), Output(), Output()), _titles={'0': 'seasons', '1': 'teams', '2': 'players', …

## 데이터 합치기
* nextDayPlayerEngagement의 target1~4 & 기타 데이터를 활용하여 player engagement를 예측하는 것이 대회의 목표
* nestedJSON 형태와 아닌 것들을 나눠서 merge 진행

In [40]:
daily_data_unnested_dfs = pd.DataFrame(data = 
                                      {'dfName': train.drop('date', axis=1).columns.values.tolist()})

daily_data_unnested_dfs['df'] = [pd.DataFrame() for row in
                                daily_data_unnested_dfs.iterrows()]

for df_index, df_row in daily_data_unnested_dfs.iterrows():
    nestedTableName = str(df_row['dfName'])
    
    date_nested_table = train[['date', nestedTableName]]
    
    date_nested_table = (date_nested_table[
        ~pd.isna(date_nested_table[nestedTableName])].reset_index(drop=True))
    
    daily_dfs_collection = []
    
    for date_index, date_row in date_nested_table.iterrows():
        daily_df = unpack_json(date_row[nestedTableName])
        
        daily_df['dailyDataDate'] = date_row['date']
        
        daily_dfs_collection = daily_dfs_collection + [daily_df]
        
    unnested_table = pd.concat(daily_dfs_collection, 
                               ignore_index=True).set_index('dailyDataDate').reset_index()
    
    globals()[df_row['dfName']] = unnested_table
    
    daily_data_unnested_dfs['df'][df_index] = unnested_table
    
del train
gc.collect() # 메모리 관리

# 해당 날짜의 데이터를 가져오기
dates = pd.DataFrame(data=
                    {'dailyDataDate': nextDayPlayerEngagement['dailyDataDate'].unique()})

dates['date'] = pd.to_datetime(dates['dailyDataDate'].astype(str))

dates['year'] = dates['date'].dt.year
dates['month'] = dates['date'].dt.month

dates_with_info = pd.merge(
    dates,
    seasons,
    left_on = 'year',
    right_on = 'seasonId')

dates_with_info['inSeason'] = (
    dates_with_info['date'].between(
        dates_with_info['regularSeasonStartDate'],
        dates_with_info['postSeasonEndDate'],
        inclusive=True)) # inclusive: 시작과 끝을 포함할지 말지

dates_with_info['seasonPart'] = np.select(
  [
    dates_with_info['date'] < dates_with_info['preSeasonStartDate'], 
    dates_with_info['date'] < dates_with_info['regularSeasonStartDate'],
    dates_with_info['date'] <= dates_with_info['lastDate1stHalf'],
    dates_with_info['date'] < dates_with_info['firstDate2ndHalf'],
    dates_with_info['date'] <= dates_with_info['regularSeasonEndDate'],
    dates_with_info['date'] < dates_with_info['postSeasonStartDate'],
    dates_with_info['date'] <= dates_with_info['postSeasonEndDate'],
    dates_with_info['date'] > dates_with_info['postSeasonEndDate']
  ], 
 [
    'Offseason',
    'Preseason',
    'Reg Season 1st Half',
    'All-Star Break',
    'Reg Season 2nd Half',
    'Between Reg and Postseason',
    'Postseason',
    'Offseason'
  ],
    default = np.nan)

# player stats 관련 데이터
player_game_stats = playerBS.copy().rename(columns={'teamID': 'gameTeamID', 'teamName': 'gameTeamName' }) # roster와 혼동되지 않도록 이름 변경

# 각종 stat 정리

# innings pitched
player_game_stats['inningsPitchedAsFrac'] = np.where(
  pd.isna(player_game_stats['inningsPitched']),
  np.nan,
  np.floor(player_game_stats['inningsPitched']) +
    (player_game_stats['inningsPitched'] -
      np.floor(player_game_stats['inningsPitched'])) * 10/3
  )

# Tom Tango pitching score (https://www.mlb.com/glossary/advanced-stats/game-score)
'''
• Start with 40 points
• Add 2 points for each out recorded (or 6 points per inning)
• Add 1 additional point for every strikeout
• Remove 2 points for each walk allowed
• Remove 2 points for each hit allowed
• Remove 3 points for each run allowed (earned or unearned)
• Remove 6 additional points for each home run allowed
'''
player_game_stats['pitchingGameScore'] = (40
  #  + 2 * player_game_stats['outs']
    + 1 * player_game_stats['strikeOutsPitching']
    - 2 * player_game_stats['baseOnBallsPitching']
    - 2 * player_game_stats['hitsPitching']
    - 3 * player_game_stats['runsPitching']
    - 6 * player_game_stats['homeRunsPitching']
    )

# no-hitter
player_game_stats['noHitter'] = np.where(
    (player_game_stats['gamesStartedPitching'] == 1)&
    (player_game_stats['inningsPitched'] >= 9)&
    (player_game_stats['hitsPitching'] == 0),
    1, 0 
) # True면 1, False면 0

player_date_stats_agg = pd.merge(
  (player_game_stats.
    groupby(['dailyDataDate', 'playerId'], as_index=False).
    # 따로 더할 필요성이 있는 데이터들을 더해주고
    agg(
      numGames = ('gamePK', 'nunique'),
      numTeams = ('gameTeamId', 'nunique'),
      gameTeamId = ('gameTeamId', 'min')
      )
  ),
    # 나머지 sum()으로 구할 수 있는 데이터들을 따로 정리한다
    (player_game_stats.groupby(['dailyDataDate', 'playerId'], as_index=False)
    [['runsScored', 'homeRuns', 'strikeOuts', 'baseOnBalls', 'hits',
      'hitByPitch', 'atBats', 'caughtStealing', 'stolenBases',
      'groundIntoDoublePlay', 'groundIntoTriplePlay', 'plateAppearances',
      'totalBases', 'rbi', 'leftOnBase', 'sacBunts', 'sacFlies',
      'gamesStartedPitching', 'runsPitching', 'homeRunsPitching', 
      'strikeOutsPitching', 'baseOnBallsPitching', 'hitsPitching',
      'inningsPitchedAsFrac', 'earnedRuns', 
      'battersFaced','saves', 'blownSaves', 'pitchingGameScore', 
      'noHitter'
      ]].sum()
     ),
    on=['dailyDataDate', 'playerId'],
    how='inner'
)

# games를 팀들간 경기 당 1row로 정리하고 teamBS와 합친다
# 정규시즌 및 

KeyError: 'dailyDataDate'