In [None]:
from PIL import Image

# Banner picture for the notebook header; leaving the bare name on the last
# line makes Jupyter render the image inline.
banner_path = '../input/mlb-img/mlb.jpg'
img = Image.open(banner_path)
img

Major League Baseball (MLB) is an American professional baseball organization and the oldest of the major professional sports leagues in the United States and Canada. A **total of 30 teams** play in Major League Baseball: **15 teams in the National League (NL)** and **15 in the American League (AL)**. The NL and AL were formed in 1876 and 1901, respectively. Beginning in 1903, the two leagues cooperated but remained legally separate entities until 2000 when they merged into a single organization led by the Commissioner of Baseball. The league is headquartered in Midtown Manhattan. 
* **Most recent champion(s):** Los Angeles Dodgers (7th title)
* **Most titles:** New York Yankees (27 titles)

In [None]:
# Colour-scale names handed to Plotly as `colorscale`; the sunburst cells pick
# one of them at random for every trace.
colorscales = """
aggrnyl agsunset algae amp armyrose balance blackbody bluered blues blugrn
bluyl brbg brwnyl bugn bupu burg burgyl cividis curl darkmint deep delta
dense earth edge electric emrld fall geyser gnbu gray greens greys haline
hot hsv ice icefire inferno jet magenta magma matter mint mrybm mygbm
oranges orrd oryel oxy peach phase picnic pinkyl piyg plasma plotly3
portland prgn pubu pubugn puor purd purp purples purpor rainbow rdbu rdgy
rdpu rdylbu rdylgn redor reds solar spectral speed sunset sunsetdark teal
tealgrn tealrose tempo temps thermal tropic turbid turbo twilight viridis
ylgn ylgnbu ylorbr ylorrd
""".split()


In [None]:
# Imports and global plotting configuration for the whole notebook.
import warnings 
# Silence every warning so library deprecation notices do not clutter cell output.
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import json
import gc
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from plotly import tools
from plotly.offline import iplot,init_notebook_mode
# Switch Plotly's offline mode on for this notebook session.
init_notebook_mode()
# NOTE(review): the 'seaborn-notebook' style name may not be available in
# newer matplotlib releases — confirm against the installed version.
plt.style.use('seaborn-notebook')
#%matplotlib inline
import random

In [None]:
%%time
trainn=pd.read_csv('../input/mlb-player-digital-engagement-forecasting/train.csv')
example_test=pd.read_csv('../input/mlb-player-digital-engagement-forecasting/example_test.csv')
awards=pd.read_csv('../input/mlb-player-digital-engagement-forecasting/awards.csv')
seasons=pd.read_csv('../input/mlb-player-digital-engagement-forecasting/seasons.csv')
teams=pd.read_csv('../input/mlb-player-digital-engagement-forecasting/teams.csv')
sub=pd.read_csv('../input/mlb-player-digital-engagement-forecasting/example_sample_submission.csv')
players=pd.read_csv('../input/mlb-player-digital-engagement-forecasting/players.csv')
train=trainn.copy(deep=True)

##process date
all_date_train=train.date.values
train['year']=[int(str(val)[:4]) for val in all_date_train]
train['month']=[int(str(val)[4:6]) for val in all_date_train]
train['day']=[int(str(val)[6:]) for val in all_date_train]
target_cols=['target1', 'target2', 'target3', 'target4']

train.head()

# Process train.csv

### Engagement 


In [None]:
############################### ENGAGEMENT COLUMNS PARSING ################################
# Flatten the JSON stored in `train.nextDayPlayerEngagement` into one row per
# engagement record, tagged with the `date` of the train row it came from.

all_nextday_engage=train.nextDayPlayerEngagement.values
engagement_cols=['engagementMetricsDate','date_tr','playerId','target1','target2','target3','target4']
next_engagement_processed={col:[] for col in engagement_cols}

### convert JSON string to list-of-dict format
# Keep the parsed objects in their own list and use that list below. The raw
# `.values` array is not guaranteed to reflect the column assignment, so
# iterating it again could walk over JSON *strings* instead of parsed records.
# Entries that are already parsed (cell re-run) are passed through unchanged.
parsed_engagements=[json.loads(raw) if isinstance(raw,str) else raw
                    for raw in tqdm(all_nextday_engage)]
train['nextDayPlayerEngagement']=parsed_engagements

### number of engagement records attached to every train row
eng_lengths=[len(records) for records in parsed_engagements]
train['eng_len']=eng_lengths
max_len_json=max(eng_lengths,default=0)

### flatten the parsed records into column lists
for date_tr,records in tqdm(zip(train.date.values,parsed_engagements),total=len(parsed_engagements)):
    for record in records:
        for key,value in record.items():
            next_engagement_processed[key].append(value)
        # remember which train row (date) this record came from
        next_engagement_processed['date_tr'].append(date_tr)

engagement_df=pd.DataFrame(next_engagement_processed)
# 'engagementMetricsDate' is the first column because it is the first key of the dict.
engagement_df['engagementMetricsDate']=pd.to_datetime(engagement_df['engagementMetricsDate'])
engagement_df['day']=engagement_df['engagementMetricsDate'].dt.day
engagement_df['month']=engagement_df['engagementMetricsDate'].dt.month
engagement_df['year']=engagement_df['engagementMetricsDate'].dt.year
engagement_df.sample(5)

In [None]:
# Report the skewness of each target, then show the summary statistics table.
for target_name in target_cols:
    target_skew=engagement_df[target_name].skew()
    print(f"Skewness of {target_name} : ",target_skew)

target_summary=engagement_df[target_cols].describe()
target_summary


In [None]:
# Pairwise target relationships on a random 10% sample, coloured by year.
temp=engagement_df[target_cols].sample(frac=0.1)
# Index alignment attaches the year of exactly the sampled rows.
year_labels=engagement_df['year'].astype(str)
temp['year']=year_labels

g=sns.pairplot(data=temp,hue='year',height=2.5)
g.add_legend()
plt.suptitle(" Target variable distribution across all Seasons")

### Games representation

In [None]:
############################## GAMES COLUMNS PARSING ########################
# Flatten the JSON in `train.games` into per-key lists (one entry per game),
# each tagged with the `date` of the train row it came from.

### select only NON-NAN entries
temp=train[train.games.notna()]
all_games=temp.games.values

### the keys of the very first game record define the output columns
first_game=json.loads(all_games[0])[0]
game_keys=list(first_game.keys())
games_processed={key:[] for key in game_keys}
games_processed['date_tr']=[]

### parse and store data
for date,raw_games in tqdm(zip(temp.date.values,all_games)):
    for record in json.loads(raw_games):
        for key,value in record.items():
            games_processed[key].append(value)
        games_processed['date_tr'].append(date)

In [None]:
# Turn the parsed game records into a DataFrame and derive calendar parts
# (day, month, year) from the parsed `gameDate`.
games_df=pd.DataFrame(games_processed)
game_dates=pd.to_datetime(games_df['gameDate'])
games_df['gameDate']=game_dates
for part in ('day','month','year'):
    games_df[part]=getattr(game_dates.dt,part)
games_df.sample(5)

In [None]:
# Lookup table of game-type codes and their human-readable descriptions.
gametype_desc=[
    {"id":"S","description":"Spring Training"},
    {"id":"R","description":"Regular Season"},
    {"id":"F","description":"Wild Card Game"},
    {"id":"D","description":"Division Series"},
    {"id":"L","description":"League Championship Series"},
    {"id":"W","description":"World Series"},
    {"id":"C","description":"Championship"},
    {"id":"N","description":"Nineteenth Century Series"},
    {"id":"P","description":"Playoffs"},
    {"id":"A","description":"All-Star Game"},
    {"id":"I","description":"Intrasquad"},
    {"id":"E","description":"Exhibition"},
]
gametype_map={entry['id']:entry['description'] for entry in gametype_desc}

# Replace codes by descriptions. Values without an entry in the map keep their
# current value instead of turning into NaN; this also makes the cell safe to
# re-run, because already-translated descriptions are left untouched.
games_df['gameType']=games_df['gameType'].map(gametype_map).fillna(games_df['gameType'])


In [None]:
def _bar_with_counts(ax, counts, cmap, title, xlabel=None, ylabel=None, ylim=None):
    """Draw a colour-graded bar chart of ``counts`` on ``ax`` and label every bar.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    counts : dict mapping bar label -> count (insertion order = bar order).
    cmap : callable turning a float in [0, 1) into a colour, e.g. ``plt.cm.jet``.
    title : title of the axes.
    xlabel, ylabel : optional axis labels; left untouched when ``None``.
    ylim : optional upper y-limit (the lower limit is fixed at 0).
    """
    labels = list(counts)
    heights = list(counts.values())
    bar_colors = [cmap(i / float(len(labels))) for i in range(len(labels))]
    ax.bar(labels, heights, color=bar_colors)

    # Write the count just above each bar.
    for label, height in counts.items():
        ax.text(label, height + 1, height, fontsize=14,
                horizontalalignment='center', verticalalignment='center')

    if xlabel is not None:
        ax.set_xlabel(xlabel)
    if ylabel is not None:
        ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', labelrotation=45, labelsize=12)
    if ylim is not None:
        ax.set_ylim(0, ylim)
    ax.set_title(title, fontsize=14)


################################  game type / season  ################################
gametype_dict = games_df['gameType'].value_counts().to_dict()
# Prefix every season with "Season_" so the bar labels are strings.
season_dict = {f'Season_{season}': n_games
               for season, n_games in games_df['season'].value_counts().to_dict().items()}

fig = plt.figure(figsize=(20, 8))
ax = fig.add_subplot(1, 2, 1)
_bar_with_counts(ax, gametype_dict, plt.cm.Spectral,
                 "Weightage of gaming sessions", ylim=7000)
ax1 = fig.add_subplot(1, 2, 2)
_bar_with_counts(ax1, season_dict, plt.cm.RdGy_r, "Number of games per season")

################################  resumedFrom / resumeDate  ################################
# Cast to str so that missing values show up as their own 'nan' bar.
games_df['resumedFrom'] = games_df['resumedFrom'].astype(str)
games_df['resumeDate'] = games_df['resumeDate'].astype(str)
resumed_from_counts = games_df['resumedFrom'].value_counts().to_dict()
resume_date_counts = games_df['resumeDate'].value_counts().to_dict()

fig = plt.figure(figsize=(20, 8))
ax = fig.add_subplot(1, 2, 1)
# The y label is set explicitly on this panel (it used to be written to the
# x axis a second time), and both panels get titles describing their column.
_bar_with_counts(ax, resumed_from_counts, plt.cm.Spectral,
                 "Number of games per resumedFrom value",
                 xlabel="Resumed date", ylabel="Resumed date count", ylim=9000)
ax1 = fig.add_subplot(1, 2, 2)
_bar_with_counts(ax1, resume_date_counts, plt.cm.jet,
                 "Number of games per resumeDate value")


###### Inference #####

* Most games are Regular Season games; Spring Training games make up roughly 1/6th of the total number of games. There are very few Exhibition games and only rare All-Star games.
* There is full-season data for the 2018-19 and 2019-20 seasons, but for the 2020-21 season only roughly 1/3rd of the games are present, because the COVID-19 pandemic took over. The 2021-22 season is still in progress, with around 1/4th of the games played.
* A total of 2 games in 2018, 2020 and 2021, and 4 games in 2018, were not completed at the scheduled time, so they were resumed on later match days.


* resumeDate - Time game was resumed (if abandoned, otherwise null).
* resumedFrom - Time game was originally abandoned (if abandoned, otherwise null).
* codedGameState - Game status code, various types can be found here.
* detailedGameState - Game status, various types can be found here.

In [None]:
# Bar charts of how many games fall into each coded / detailed game state.
# Each panel: (column, colour map, optional upper y-limit).
state_panels = [
    ('codedGameState', plt.cm.Spectral, 9000),
    ('detailedGameState', plt.cm.jet, None),
]

fig = plt.figure(figsize=(20, 8))
for panel_no, (column, cmap, y_max) in enumerate(state_panels, start=1):
    # Cast to str so that missing values are counted as their own 'nan' bar.
    games_df[column] = games_df[column].astype(str)
    state_counts = games_df[column].value_counts().to_dict()
    states = list(state_counts)
    colors = [cmap(i / float(len(states))) for i in range(len(states))]

    ax = fig.add_subplot(1, 2, panel_no)
    ax.bar(states, list(state_counts.values()), color=colors)

    # Write the count just above each bar.
    for state, n_games in state_counts.items():
        ax.text(state, n_games + 1, n_games, fontsize=14,
                horizontalalignment='center', verticalalignment='center')

    # Labels and title name the plotted column on both panels.
    ax.set_xlabel(column)
    ax.set_ylabel(f"{column} count")
    ax.tick_params(axis='x', labelrotation=90, labelsize=12)
    if y_max is not None:
        ax.set_ylim(0, y_max)
    ax.set_title(f"Number of games per {column}", fontsize=14)


* homeWins - Number of current wins on season for home team.
* homeScore - Runs scored by home team.
* awayWins - Number of current wins on season for away team.
* awayLosses - Number of current losses on season for away team.


In [None]:
# 2x2 grid of sunburst charts: each panel sums one games column per season.
# `homeScore`/`awayScore` are runs scored by the home/away team; summing the
# `homeWinner`/`awayWinner` flags gives the number of wins.
all_titles=('Total runs scored by home teams per season','Total home-team wins per season',
            'Total runs scored by away teams per season','Total away-team wins per season')

fig = make_subplots(rows=2, cols=2, subplot_titles=all_titles,
                    specs=[[{"type": "sunburst"} for _ in range(2)] for _ in range(2)])

for i,cols in enumerate(['homeScore',"homeWinner",'awayScore','awayWinner']):
    # Let plotly express do the aggregation, then copy its trace data into a
    # graph_objects trace that can be placed in the subplot grid.
    fig_ = px.sunburst(games_df, path=["season"], values=cols)
    source = fig_['data'][0]
    trace = go.Sunburst(
        labels=source['labels'].tolist(),
        parents=source['parents'].tolist(),
        values=source['values'].tolist(),
        ids=source['ids'].tolist(),
        # a random colour scale per panel (not seeded, so it varies between runs)
        marker={'colorscale': random.choice(colorscales)},
    )
    # panels are filled row by row: i=0 -> (1,1), 1 -> (1,2), 2 -> (2,1), 3 -> (2,2)
    grid_row, grid_col = divmod(i, 2)
    fig.add_trace(trace, row=grid_row + 1, col=grid_col + 1)

# update_layout is the last expression of the cell, so its return value is what
# the notebook displays.
fig.update_layout(title_text='Game stats per season', title_x=0.5, height=800, width=900,
                  margin=dict(r=10, t=50, b=40, l=60))

##### Inference #####

* Total home and away runs scored are almost the same in every season; however, the total number of home-game wins outweighs the number of away-game wins by roughly 10%-15% in all seasons.

### Roster Data

In [None]:
  
############################## ROSTER COLUMN PARSING ########################
# Flatten the JSON in `train.rosters` into one row per roster record, tagged
# with the `date` of the train row it came from.

### select only NON-NAN entries
temp=train[train.rosters.notna()]
all_roster=temp.rosters.values

### the keys of the very first roster record define the output columns
first_roster=json.loads(all_roster[0])[0]
roster_keys=list(first_roster.keys())
roster_processed={key:[] for key in roster_keys}
roster_processed['date_tr']=[]

### parse and store data
for date,raw_roster in tqdm(zip(temp.date.values,all_roster)):
    for record in json.loads(raw_roster):
        for key,value in record.items():
            roster_processed[key].append(value)
        roster_processed['date_tr'].append(date)

roster_df=pd.DataFrame(roster_processed)
roster_df.head(2)

In [None]:
# Side-by-side value-count bar charts for the two roster status columns.
fig = plt.figure(figsize=(20, 4))

for panel_no, cols in enumerate(['statusCode', 'status'], start=1):
    ax = fig.add_subplot(1, 2, panel_no)

    # Cast to str so that missing values are counted as their own bar.
    roster_df[cols] = roster_df[cols].astype('str')
    temp_dict = roster_df[cols].value_counts().to_dict()
    temp_keys = list(temp_dict.keys())
    n_bars = len(temp_keys)
    colors = [plt.cm.jet(idx / float(n_bars)) for idx in range(n_bars)]

    ax.bar(temp_keys, temp_dict.values(), color=colors)

    # Write the count just above each bar.
    for label, count in temp_dict.items():
        ax.text(label, count + 1, count, fontsize=14,
                horizontalalignment='center', verticalalignment='center')

    # `ax` is the current axes here, so these match the pyplot-level calls.
    ax.set_xlabel(cols)
    ax.set_ylabel(f"{cols} count")
    ax.tick_params(axis='x', labelrotation=45, labelsize=12)
    ax.set_title(f"{cols} Value Count plot", fontsize=14)

### Player Box Scores

In [None]:
############################## PLAYERBOXSCORES COLUMN PARSING ########################
# Flatten the JSON in `train.playerBoxScores` into one row per box-score
# record, tagged with the `date` of the train row it came from.

### select only NON-NAN entries
temp=train[train.playerBoxScores.notna()]
all_playerBoxScores=temp.playerBoxScores.values

### the keys of the very first record define the output columns
first_box_score=json.loads(all_playerBoxScores[0])[0]
playerBoxScores_keys=list(first_box_score.keys())
playerBoxScores_processed={key:[] for key in playerBoxScores_keys}
playerBoxScores_processed['date_tr']=[]

### parse and store data
for date,raw_box_scores in tqdm(zip(temp.date.values,all_playerBoxScores)):
    for record in json.loads(raw_box_scores):
        for key,value in record.items():
            playerBoxScores_processed[key].append(value)
        playerBoxScores_processed['date_tr'].append(date)

boxscores_df=pd.DataFrame(playerBoxScores_processed)

### calendar parts of the game date
boxscores_df['gameDate']=pd.to_datetime(boxscores_df['gameDate'])
game_dates=boxscores_df['gameDate'].dt
boxscores_df['year']=game_dates.year
boxscores_df['month']=game_dates.month
boxscores_df['day']=game_dates.day

boxscores_df.head(2)

In [None]:
# JSON-style aliases so the position table below can be written with JSON booleans.
true=True;false=False

# Position lookup table, stored once as (field names, rows) and expanded into
# the list of dicts the rest of the cell works with.
_POSITION_FIELDS=("shortName","fullName","abbrev","code","type","formalName",
                  "gamePosition","pitcher","fielder","outfield","displayName")
_POSITION_ROWS=[
    ("Pitcher","Pitcher","P","1","Pitcher","Pitcher",true,true,false,false,"Pitcher"),
    ("Catcher","Catcher","C","2","Catcher","Catcher",true,false,true,false,"Catcher"),
    ("1st Base","First Base","1B","3","Infielder","First Baseman",true,false,true,false,"First Base"),
    ("2nd Base","Second Base","2B","4","Infielder","Second Baseman",true,false,true,false,"Second Base"),
    ("3rd Base","Third Base","3B","5","Infielder","Third Baseman",true,false,true,false,"Third Base"),
    ("Shortstop","Shortstop","SS","6","Infielder","Shortstop",true,false,true,false,"Shortstop"),
    ("Left Field","Outfielder","LF","7","Outfielder","Left Fielder",true,false,true,true,"Outfielder"),
    ("Center Field","Outfielder","CF","8","Outfielder","Center Fielder",true,false,true,true,"Outfielder"),
    ("Right Field","Outfielder","RF","9","Outfielder","Right Fielder",true,false,true,true,"Outfielder"),
    ("Designated Hitter","Designated Hitter","DH","10","Hitter","Designated Hitter",true,false,false,false,"Designated Hitter"),
    ("Pinch Hitter","Pinch Hitter","PH","11","Hitter","Pinch Hitter",true,false,false,false,"Pinch Hitter"),
    ("Pinch Runner","Pinch Runner","PR","12","Runner","Pinch Runner",true,false,false,false,"Pinch Runner"),
    ("Extra Hitter","Extra Hitter","EH","13","Hitter","Extra Hitter",true,false,false,false,"Extra Hitter"),
    ("Base Runner","Base Runner","BR","BR","Runner","Base Runner",false,false,false,false,"Base Runner"),
    ("Outfield","Outfield","OF","O","Outfielder","Outfield",false,false,true,true,"Outfield"),
    ("Infield","Infield","IF","I","Infielder","Infield",false,false,true,false,"Infield"),
    ("Starting Pitcher","Starting Pitcher","SP","S","Pitcher","Starting Pitcher",false,true,false,false,"Starting Pitcher"),
    ("Relief Pitcher","Relief Pitcher","RP","E","Pitcher","Relief Pitcher",false,true,false,false,"Relief Pitcher"),
    ("Closer","Closer","CP","C","Pitcher","Closer",false,true,false,false,"Closer"),
    ("Utility","Utility","UT","U","Infielder","Utility",false,false,true,false,"Utility"),
    ("Utility Infielder","Utility Infielder","UI","V","Infielder","Utility Infielder",false,false,true,false,"Utility Infielder"),
    ("Utility Outfielder","Utility Outfielder","UO","W","Outfielder","Utility Outfielder",false,false,true,true,"Utility Outfielder"),
    ("Right-Handed Pitcher","Right-Handed Pitcher","RHP","K","Pitcher","Right-Handed Pitcher",false,true,false,false,"Right-Handed Pitcher"),
    ("Left-Handed Pitcher","Left-Handed Pitcher","LHP","L","Pitcher","Left-Handed Pitcher",false,true,false,false,"Left-Handed Pitcher"),
    ("Right-Handed Starter","Right-Handed Starter","RHS","M","Pitcher","Right-Handed Starter",false,true,false,false,"Right-Handed Starter"),
    ("Left-Handed Starter","Left-Handed Starter","LHS","N","Pitcher","Left-Handed Starter",false,true,false,false,"Left-Handed Starter"),
    ("Left-Handed Reliever","Left-Handed Reliever","LHR","G","Pitcher","Left-Handed Reliever",false,true,false,false,"Left-Handed Reliever"),
    ("Right-Handed Reliever","Right-Handed Reliever","RHR","F","Pitcher","Right-Handed Reliever",false,true,false,false,"Right-Handed Reliever"),
    ("Pitcher - Infielder","Pitcher - Infielder","P-IF","A","Two-Way Player","Pitcher - Infielder",false,true,true,false,"Pitcher - Infielder"),
    ("Pitcher - Outfielder","Pitcher - Outfielder","P-OF","J","Two-Way Player","Pitcher - Outfielder",false,true,true,true,"Pitcher - Outfielder"),
    ("Pitcher - Utility","Pitcher - Utility","P-UT","Z","Two-Way Player","Pitcher - Utility",false,true,true,false,"Pitcher - Utility"),
    ("Two-Way Player","Two-Way Player","TWP","Y","Two-Way Player","Two-Way Player",false,true,false,false,"Two-Way Player"),
    ("Batter","Batter","B","10","Batter","Batter",false,false,false,false,"Batter"),
    ("Unknown","Unknown","X","X","Unknown","Unknown",false,false,false,false,"Unknown"),
    ("Runner on First","Runner on First","R1","R1","Runner","Runner on First",false,false,false,false,"Runner on First"),
    ("Runner on Second","Runner on Second","R2","R2","Runner","Runner on Second",false,false,false,false,"Runner on Second"),
    ("Runner on Third","Runner on Third","R3","R3","Runner","Runner on Third",false,false,false,false,"Runner on Third"),
]
position_code=[dict(zip(_POSITION_FIELDS,row)) for row in _POSITION_ROWS]
# Same table under its second name; copied per entry so the two lists stay independent.
position_type=[dict(entry) for entry in position_code]

# code -> formal position name. Code "10" occurs twice in the table
# (Designated Hitter and Batter); setdefault keeps the first entry so the
# Designated Hitter is not silently overwritten by the later "Batter" row.
position_code_map=dict()
for val in position_code:
    position_code_map.setdefault(val['code'],val['formalName'])
# NOTE(review): the map keys are strings; if `positionCode` holds numbers this
# lookup matches nothing and the column becomes all NaN — confirm the dtype.
boxscores_df['positionCode']=boxscores_df['positionCode'].map(position_code_map)

# code -> position type, with the same first-entry-wins rule for code "10".
position_type_map=dict()
for val in position_type:
    position_type_map.setdefault(val['code'],val['type'])

boxscores_df['positionType']=boxscores_df['positionType'].astype(str)
# NOTE(review): this lookup is keyed by position *code*. If `positionType`
# already contains type names rather than codes, nothing matches, every value
# becomes NaN and the dtype shown below is float — which would explain the
# unexpected dtype. Verify the raw column values before relying on this.
boxscores_df['positionType']=boxscores_df['positionType'].map(position_type_map)
boxscores_df['positionType'].dtypes

In [None]:
# NOTE(review): notebook imports normally live in the top import cell.
import missingno as msno

# Draw missingno's matrix view of boxscores_df to inspect where values are missing.
msno.matrix(boxscores_df)

##### Inference #####

Some columns at the beginning and towards the end of the dataframe consist entirely of missing data, so they are removed from the dataframe.

In [None]:

# Columns that carry no information: a single distinct value, or more than
# 90% of the rows missing.
temp = [
    name
    for name in boxscores_df.columns
    if boxscores_df[name].nunique() == 1
    or boxscores_df[name].isna().sum() > 0.9 * len(boxscores_df)
]
boxscores_df.drop(columns=temp, inplace=True)

####### Fill NaN values ########

# From the 14th remaining column onwards, replace missing entries with the
# column mean and round every value to a whole number.
for cols in boxscores_df.columns[13:]:
    column_mean = boxscores_df[cols].mean()
    boxscores_df[cols] = boxscores_df[cols].fillna(column_mean).round(decimals=0)


* runsScored - Game total runs scored.
* doubles - Game total doubles.
* triples - Game total triples.
* homeRuns - Game total home runs.

In [None]:
# Subplot titles, listed in the same order as the statistics plotted below.
all_titles=('Total runs scored throughout all seasons for each teams','Total doubles hit throughout all season for each team',
'Total triples hit throughout all season for each team','Total home_runs scored throughout all season for each team')

# 2 x 2 grid in which every cell hosts a sunburst trace.
fig = make_subplots(rows=2, cols=2,subplot_titles=all_titles,
    specs=[[ {"type": "sunburst"},{"type": "sunburst"}],[{"type": "sunburst"},{"type": "sunburst"}]])

# Grid position (row, column) of the i-th statistic.
row_=[1,1,2,2];col_=[1,2,1,2]

for i,cols in enumerate(['runsScored',"doubles",'triples','homeRuns']):
    # Let plotly express build the year -> team hierarchy, then copy its arrays
    # into a graph_objects trace that can be placed in the subplot grid.
    fig_ = px.sunburst(boxscores_df, path=[ "year","teamName"],values=cols)
    source = fig_['data'][0]
    trace = go.Sunburst(
        labels=source['labels'].tolist(),
        parents=source['parents'].tolist(),
        values=source['values'].tolist(),
        ids=source['ids'].tolist(),
        # Colour scale is picked at random, so colours differ between runs.
        marker={'colorscale': random.choice(colorscales)},
    )

    fig.add_trace(trace,row=row_[i],col=col_[i])

# The sunbursts are grouped by season and then by team, so the figure title
# describes that hierarchy. update_layout returns the figure, so leaving it as
# the last expression of the cell renders the plot.
fig.update_layout(title_text='Game stats by season and team', title_x=0.5, height=800,width=900,
                  margin=dict(r=10, t=50, b=40, l=60))

##### Inference #####

* Year 2018

* 900+ runs scored: 1 team (Boston Red Sox - 960)
* 250+ home runs: 2 teams (highest - New York Yankees)
* 35+ triples hit: 7 teams (highest - Arizona Diamondbacks)
* 300+ doubles hit: 6 teams (highest - Boston Red Sox - 380)


* Year 2019
* 900+ runs scored: 6 teams (highest - Houston Astros - 991)
* 250+ home runs: 3 teams (highest - New York Yankees)
* 35+ triples hit: - teams (highest - Colorado Rockies)
* 300+ doubles hit: 9 teams (highest - Houston Astros - 348)

In [None]:
# Count the games recorded for each season and show the counts as a bar chart.
num_games_season = games_df.groupby('season').gamePk.count().reset_index()
sns.barplot(data=num_games_season, x='season', y='gamePk')
plt.title("Number of games played in respective seasons")

##### Note #####

* The plot above shows that far fewer games were played in the 2020 and 2021 seasons than in the other seasons. Comparing the **mean stats (flyOuts, runsScored, etc.) of all teams across seasons** directly would therefore be **unfair, because the number of games played is unequal**. This motivates the adjusted mean: the **per-game stat** of each season is computed and then **multiplied by the number of games played in the 2018 season (the reference season)**.
* The adjusted mean is a rough value and is used only to build intuition.

In [None]:
# Generate a list of random hex colours (50 unless num_colors says otherwise)
def generate_color(num_colors=50):
    """Return ``num_colors`` random colour strings of the form ``'#RRGGBB'``.

    Each channel is drawn with ``random.randint`` from a range whose lower
    bound (red, blue) or upper bound (green) is itself picked at random from
    a short list of presets. The result depends on the state of the global
    ``random`` generator.
    """
    def _random_hex():
        # Draw order (red, green, blue; bound first, then value) matters for
        # reproducibility under a fixed seed.
        red = random.randint(random.choice([50,100,0,150]), 255)
        green = random.randint(0, random.choice([200,255]))
        blue = random.randint(random.choice([150,100,0]), 255)
        return '#%02X%02X%02X' % (red, green, blue)

    return [_random_hex() for _ in range(num_colors)]


# Games per season as a {season: game count} mapping; getplayer_box_scores reads
# this global when it computes its adjusted means.
# NOTE(review): this rebinds num_games_season, which the bar-plot cell above
# assigns as a DataFrame -- re-running that cell afterwards would break the
# num_games_season[<year>] lookups.
num_games_season=dict(games_df.groupby('season').gamePk.count())

def getplayer_box_scores(cols2_plot,fig_size,orient):
    """Plot stacked per-team season totals for each requested box-score column.

    One subplot is added per entry of ``cols2_plot``. In each subplot every
    team gets a horizontal bar made of four stacked segments (seasons 2018,
    2019, 2020 and 2021) holding the sum of that column in the global
    ``boxscores_df``. Dashed vertical lines mark the cumulative mean over
    teams after each season and, for 2019-2021, an "adjusted" mean: the
    season mean divided by that season's game count and multiplied by the
    2018 game count, both read from the global ``num_games_season`` mapping.

    Parameters
    ----------
    cols2_plot : sequence of str
        Column names of ``boxscores_df`` to plot, one subplot each.
    fig_size : tuple
        Forwarded to ``plt.figure(figsize=...)``.
    orient : {'single_row', 'grid'}
        ``'single_row'`` lays the subplots out in one column (one subplot per
        row); ``'grid'`` uses two columns.

    Raises
    ------
    ValueError
        If ``orient`` is neither ``'single_row'`` nor ``'grid'``.
    """
    n_plots = len(cols2_plot)
    if orient == 'single_row':
        n_rows, n_cols = n_plots, 1
    elif orient == 'grid':
        # add_subplot needs integer grid dimensions; round up so that an odd
        # number of columns still gets enough rows.
        n_rows, n_cols = (n_plots + 1) // 2, 2
    else:
        # Fail early instead of drawing on an unbound or stale axes object.
        raise ValueError(f"orient must be 'single_row' or 'grid', got {orient!r}")

    fig = plt.figure(figsize=fig_size)
    fig.subplots_adjust(wspace=0.5)
    random_colors = generate_color(num_colors=50)
    seasons = [2018, 2019, 2020, 2021]
    reference_games = num_games_season[2018]

    def _dashed_line(ax, position):
        # Vertical marker line in a randomly chosen colour.
        return ax.axvline(position, color=random.choice(random_colors), linewidth = 3, linestyle='--')

    for i,cols in enumerate(cols2_plot):
        ax = fig.add_subplot(n_rows, n_cols, i+1)

        # Season totals per team, then one column per season aligned on team
        # name (teams without rows in a season get 0).
        temp = boxscores_df.groupby(['year','teamName'])[cols].sum().reset_index()
        temp_df = pd.DataFrame()
        temp_df['teamName'] = temp['teamName'].unique()
        for year in seasons:
            season_rows = temp[temp.year==year]
            totals_by_team = dict(zip(season_rows['teamName'], season_rows[cols]))
            temp_df[year] = temp_df['teamName'].map(totals_by_team).fillna(0)

        x = temp_df.teamName.values
        y1, y2, y3, y4 = (np.array(temp_df[year]) for year in seasons)
        m1, m2, m3, m4 = np.mean(y1), np.mean(y2), np.mean(y3), np.mean(y4)

        # Stack the seasons left to right; the plain mean lines are cumulative
        # so that they line up with the stacked segments.
        line1 = ax.barh(x, y1, color=random.choice(random_colors))
        line1_mean = _dashed_line(ax, m1)

        line2 = ax.barh(x, y2, left=y1, color=random.choice(random_colors))
        line2_mean = _dashed_line(ax, m1 + m2)
        line2_mean_adj = _dashed_line(ax, (m2 / num_games_season[2019]) * reference_games)

        line3 = ax.barh(x, y3, left=y1+y2, color=random.choice(random_colors))
        line3_mean_adj = _dashed_line(ax, (m3 / num_games_season[2020]) * reference_games)
        line3_mean = _dashed_line(ax, m1 + m2 + m3)

        line4 = ax.barh(x, y4, left=y1+y2+y3, color=random.choice(random_colors))
        line4_mean_adj = _dashed_line(ax, (m4 / num_games_season[2021]) * reference_games)
        line4_mean = _dashed_line(ax, m1 + m2 + m3 + m4)

        ax.legend(
            [line1, line2, line3, line4,
             line1_mean, line2_mean, line2_mean_adj,
             line3_mean_adj, line3_mean, line4_mean_adj, line4_mean],
            ['season_2018', 'season_2019', 'season_2020', 'season_2021',
             f'{cols}_mean_2018', f'{cols}_mean_2019', f'{cols}_adjusted_mean_2019',
             f'{cols}_adjusted_mean_2020', f'{cols}_mean_2020',
             f'{cols}_adjusted_mean_2021', f'{cols}_mean_2021'])

        ax.set_ylabel("Teams")
        ax.set_xlabel(f"{cols} Sum")
        ax.tick_params(axis='x', labelrotation = 90, labelsize = 12)
        ax.set_title(f"Total {cols} for all team across all seasons(2018-21)", fontsize = 14)

* flyOuts - Game total fly outs.
* groundOuts - Game total ground outs.
* strikeOuts - Game total strike outs.


In [None]:
# Out statistics: one stacked-bar subplot per column, laid out one per row.
cols2_plot=['groundOuts','strikeOuts','flyOuts']
getplayer_box_scores(cols2_plot,fig_size=(18,27),orient='single_row')

* plateAppearances - Game total plate appearances.
* totalBases - Game total bases.
* rbi - Game total runs batted in.
* leftOnBase - Game total runners left on base.

In [None]:
# Plate-appearance, total-base, RBI and left-on-base totals, one subplot per row.
cols2_plot=['plateAppearances','totalBases','rbi','leftOnBase']
getplayer_box_scores(cols2_plot,fig_size=(18,36),orient='single_row')

* baseOnBalls - Game total walks.
* intentionalWalks - Game total intentional walks.
* hits - Game total hits.
* hitByPitch - Game total hit by pitches.

In [None]:
# Walk, intentional-walk, hit and hit-by-pitch totals, one subplot per row.
cols2_plot=['baseOnBalls','intentionalWalks','hits','hitByPitch']
getplayer_box_scores(cols2_plot,fig_size=(18,36),orient='single_row')

* sacBunts - Game total sacrifice bunts.
* sacFlies - Game total sacrifice flies.
* catchersInterference - Game total catchers interference reached on.
* pickoffs - Game total number of times picked off base.

In [None]:
# Sacrifice, catcher's-interference and pickoff totals, drawn with the two-column 'grid' layout.
cols2_plot=['sacBunts','sacFlies','catchersInterference','pickoffs']
getplayer_box_scores(cols2_plot,fig_size=(20,20),orient='grid')

* assists - Game total number of assists.
* putOuts - Game total number of putouts.
* errors - Game total number of errors.
* chances - Game total fielding chances.

In [None]:
# Fielding totals (assists, putouts, errors, chances), one subplot per row.
cols2_plot=['assists', 'putOuts', 'errors', 'chances']
getplayer_box_scores(cols2_plot,fig_size=(18,36),orient='single_row')