In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import gc
import sys
import warnings
import pickle
from pathlib import Path

import ipywidgets as widgets

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import catboost
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from statsmodels.tsa.seasonal import seasonal_decompose

## Load dataset

In [None]:
# Names of the pickled training tables; each one is read from "<path>/<name>_train.pkl".
df_names = ['playerBoxScores', 'teamBoxScores', 'playerTwitterFollowers', 'teamTwitterFollowers', 'awards', 'events',
           'games', 'standings', 'rosters', 'transactions']
path = "../input/mlb-pdef-train-dataset"

# Map table name -> DataFrame so later cells can look tables up by name.
train_dict = {
    table_name: pd.read_pickle(os.path.join(path, table_name) + "_train.pkl")
    for table_name in df_names
}

In [None]:
# CSV tables shipped with the competition data; each one is read from "<path>/<name>.csv".
df_names = ['seasons', 'teams', 'players', 'awards']
path = "../input/mlb-player-digital-engagement-forecasting"

# Map table name -> DataFrame.
my_dict = {
    table_name: pd.read_csv(os.path.join(path, table_name) + ".csv")
    for table_name in df_names
}

In [None]:
# Load the engagement targets and drop the 'index' column, which is not used below.
target = pd.read_pickle(
    '../input/mlb-pdef-train-dataset/nextDayPlayerEngagement_train.pkl'
).drop(columns=['index'])

### Visualize the mean and correlation of target values

After averaging over players, we can see how the mean of each target moves over time and how the targets correlate with one another. 

In [None]:
# Average every column per engagement date, then keep only the target columns
# by discarding the averaged identifiers.
daily_mean = target.groupby('engagementMetricsDate').mean()
target_date = daily_mean.drop(columns=['playerId', 'date'])
target_date.plot(figsize=(20, 5))

In the plot above, "target 2" shows a much wider variation, but the four engagement measurements change in a similar way since they are correlated with each other. In the correlation matrix shown below, we find that "target 1" is highly correlated with "target 2" and "target 3", while "target 4" is less correlated with the other three values.  

In [None]:
# Average each target per player, then correlate the per-player means.
# Only the four target columns are averaged: 'engagementMetricsDate' still holds
# strings at this point (a mean over it is rejected by recent pandas versions),
# and the mean of 'date' is meaningless and would add a spurious row/column
# to the correlation matrix.
target_columns = ['target1', 'target2', 'target3', 'target4']
target_player = target.groupby('playerId')[target_columns].mean()
target_corr = target_player.corr()
sns.heatmap(target_corr, vmin = 0.5)

### Seasonal Decomposition

Apart from correlation, the time series plots also show a calendar-related effect, for example, a peak around March (the start of each season). To better understand the seasonal properties, we apply a seasonal decomposition to investigate the trend and seasonality. Since the amplitude of the seasonal variation does not change as the trend rises or falls, I use an additive decomposition.

In [None]:
# Decompose each daily-mean target series into trend and seasonal components
# using a one-year (365-day) period.
# The additive model is used, as argued in the text above: the seasonal
# amplitude does not scale with the level of the trend. (The previous
# 'multiplicative' setting contradicted that reasoning.)
trend, seasonal = pd.DataFrame(), pd.DataFrame()
for column in target_date:
    result = seasonal_decompose(target_date[column], model='additive', period=365)
    trend[column] = result.trend
    seasonal[column] = result.seasonal

The trend component shows the rises and falls in the four series, ignoring the seasonality and any small random fluctuations.

From the four plots below, we find that target 1 and target 2 have almost opposite trends, and target 4's trend is totally different from the other 3 measurements.

* Target 1 goes down from the end of 2019 and remains low until around July 2020 (this may be due to COVID-19).
* Target 2 increases when target 1 decreases and starts to decrease when target 1 starts to rise.
* Target 3 drops sharply from July 2019 and starts to fluctuate from July 2020 (a trend similar to target 1).
* Target 4 is the only target that shows a cycle; a single cycle lasts about 3-4 months.

In [None]:
# One panel per target, filled row by row: target1, target2 / target3, target4.
fig, axes = plt.subplots(2, 2, figsize = (16, 8))
for panel, target_name in zip(axes.flat, ['target1', 'target2', 'target3', 'target4']):
    panel.plot(trend[target_name])
    panel.set_title('trend of ' + target_name)
    # Label every 180th date so the tick labels stay readable.
    panel.set_xticks(trend.index[::180])

The seasonality plot shows that all the target measurements have a roughly one-year pattern. Target 1 has an obvious spike around March every year, which should be related to the start of each season. Targets 2 and 3 also peak around March, but they fluctuate before and after the peak. From target 1 to 4, the seasonal pattern becomes less obvious. As for target 4, it shows a big variation throughout the year instead of a seasonal pattern.

In [None]:
# One panel per target, filled row by row: target1, target2 / target3, target4.
fig, axes = plt.subplots(2, 2, figsize = (16, 8))
for panel, target_name in zip(axes.flat, ['target1', 'target2', 'target3', 'target4']):
    panel.plot(seasonal[target_name])
    panel.set_title('seasonality of ' + target_name)
    # Label every 240th date so the tick labels stay readable.
    panel.set_xticks(seasonal.index[::240])

## Anything Different in 2020?

Due to COVID-19, the regular season started in July instead of March. I expect that fan engagement will be different in 2020. To verify my guess, I plot the difference of each target between 2020 and 2019 (from March to July), and then compare it with the difference between 2019 and 2018.

In [None]:
# Parse the engagement date strings once, then derive the calendar year from them.
engagement_dates = pd.to_datetime(target['engagementMetricsDate'], format = "%Y-%m-%d")
target['engagementMetricsDate'] = engagement_dates
target['year'] = engagement_dates.dt.year

In [None]:
def _daily_mean_for_year(frame, year):
    """Return the per-day mean of the remaining columns for rows of the given year.

    The 'date', 'year' and 'playerId' columns are dropped before averaging, and
    'engagementMetricsDate' is returned as an ordinary column.
    """
    one_year = frame[frame['year'] == year].drop(['date', 'year', 'playerId'], axis = 1)
    return one_year.groupby('engagementMetricsDate').mean().reset_index()


target_2020 = _daily_mean_for_year(target, 2020)
target_2019 = _daily_mean_for_year(target, 2019)
target_2018 = _daily_mean_for_year(target, 2018)

In [None]:
def _rows_between(frame, after, before):
    """Return rows whose engagement date is strictly after `after` and strictly before `before`.

    The result gets a fresh 0..n-1 index (the old index is kept as an 'index'
    column) so that frames for different years can be compared row by row.
    """
    dates = frame['engagementMetricsDate']
    in_window = (dates < pd.Timestamp(before)) & (dates > pd.Timestamp(after))
    return frame[in_window].reset_index()


# March 1st to July 31st of each year (2020 is a leap year, hence Feb 29).
value_2020 = _rows_between(target_2020, '2020-02-29', '2020-08-01')
value_2019 = _rows_between(target_2019, '2019-02-28', '2019-08-01')
value_2018 = _rows_between(target_2018, '2018-02-28', '2018-08-01')

In [None]:
# Year-over-year difference of the daily mean targets for March 1st - July 31st.
# The value_* frames share a 0..n-1 index, so the subtraction pairs the same
# calendar day of consecutive years.
target_columns = ['target1', 'target2', 'target3', 'target4']

# Select the targets before subtracting so the date and helper columns are not
# part of the arithmetic.
diff_2020 = (value_2020[target_columns] - value_2019[target_columns]).set_index(
    pd.date_range(start='2020-03-01', end='2020-07-31', freq='D'))
# Label the 2019-vs-2018 difference with 2019 dates; it was previously indexed
# with the 2020 range, which mislabelled the x-axis of the comparison plots.
diff_2019 = (value_2019[target_columns] - value_2018[target_columns]).set_index(
    pd.date_range(start='2019-03-01', end='2019-07-31', freq='D'))

Since we choose the same period each year, if nothing special happens, the values in different years should remain similar and the difference series will fluctuate around 0. As for the difference series of target 1, there is a slight decrease in 2020, especially between April and July.

In [None]:
fig, axs = plt.subplots(1, 2, figsize = (15,5), sharey = True)

# Left: 2020 vs 2019 (black). Right: 2019 vs 2018 (default colour). Red dashes mark zero.
panels = [
    (diff_2020, 'Target 1: difference between 2020 and 2019', {'color': 'black'}),
    (diff_2019, 'Target 1: difference between 2019 and 2018', {}),
]
for panel, (diff, title, line_style) in zip(axs, panels):
    panel.plot(diff['target1'].index, diff['target1'], **line_style)
    panel.axhline(y=0, linestyle='--', color='red')
    panel.set_title(title)

However, it is totally different for target 2: the values in 2020 are consistently bigger than in 2019, while the values in 2019 and 2018 are similar to each other. Therefore, the postponement of the regular season actually increased target 2. 

In [None]:
fig, axs = plt.subplots(1, 2, figsize = (15,5), sharey = True)

# Left: 2020 vs 2019 (black). Right: 2019 vs 2018 (default colour). Red dashes mark zero.
panels = [
    (diff_2020, 'Target 2: difference between 2020 and 2019', {'color': 'black'}),
    (diff_2019, 'Target 2: difference between 2019 and 2018', {}),
]
for panel, (diff, title, line_style) in zip(axs, panels):
    panel.plot(diff['target2'].index, diff['target2'], **line_style)
    panel.axhline(y=0, linestyle='--', color='red')
    panel.set_title(title)

Target 3 did not change much during 2020, except for a sharp decrease around March (the start of regular season).

In [None]:
fig, axs = plt.subplots(1, 2, figsize = (15,5), sharey = True)

# Left: 2020 vs 2019 (black). Right: 2019 vs 2018 (default colour). Red dashes mark zero.
panels = [
    (diff_2020, 'Target 3: difference between 2020 and 2019', {'color': 'black'}),
    (diff_2019, 'Target 3: difference between 2019 and 2018', {}),
]
for panel, (diff, title, line_style) in zip(axs, panels):
    panel.plot(diff['target3'].index, diff['target3'], **line_style)
    panel.axhline(y=0, linestyle='--', color='red')
    panel.set_title(title)

In 2020, target 4 is lower than in 2019, which may be due to the influence of COVID-19.

In [None]:
fig, axs = plt.subplots(1, 2, figsize = (15,5), sharey = True)

# Left: 2020 vs 2019 (black). Right: 2019 vs 2018 (default colour). Red dashes mark zero.
panels = [
    (diff_2020, 'Target 4: difference between 2020 and 2019', {'color': 'black'}),
    (diff_2019, 'Target 4: difference between 2019 and 2018', {}),
]
for panel, (diff, title, line_style) in zip(axs, panels):
    panel.plot(diff['target4'].index, diff['target4'], **line_style)
    panel.axhline(y=0, linestyle='--', color='red')
    panel.set_title(title)

## Find Important Features

We have found that the target values are related to time. In this part, we are going to explore other features (features related to players, teams, games, etc.).

### Construct player related features

In [None]:
# Player box-score table loaded earlier into train_dict.
player_score = train_dict["playerBoxScores"]

I construct features according to the following page: [MLB stats GLOSSARY](http://www.mlb.com/glossary/). 

I choose 2 features from the "Defense" category:
* "FPCT": Fielding percentage
* "Out": including a strikeout, a groundout, a popout or a flyout.

5 features from the "Offense" category:
* "HR": home run. 
* "IBB": intentional walk.
* "TB": total bases.
* "RBI": runs batted in.
* "AVG": batting average.

5 features from the "Pitching" category:
* "SV_percnt": save percentage.
* "HLD": hold.
* "BS": blown save.
* "IR": inherited runner.
* "IP": innings pitched.

5 features from the "Advanced" category:
* "ISO": isolated power.
* "BABIP": batting average on balls in play.
* "RC": runs created.
* "PA/SO": plate appearances per strikeout.
* "K/BB": strikeout-to-walk ratio.

In [None]:
# Raw box-score counts that are reused by several of the ratios below.
at_bats = player_score['atBats']
doubles = player_score['doubles']
triples = player_score['triples']
hits = player_score['hits']
plate_appearances = player_score['plateAppearances']
total_bases = player_score['totalBases']
walks = player_score['baseOnBalls']
strikeouts = player_score['strikeOuts']
home_runs = player_score['homeRuns']

# Divisions by zero are left as inf/NaN here; they are replaced in the data
# preparation step further down.
perform_stats = pd.DataFrame({
    # For calculation
    'AB': at_bats,
    'B2': doubles,
    'B3': triples,
    'H': hits,
    'PA': plate_appearances,
    'TB': total_bases,
    'BB': walks,
    'SO': strikeouts,

    # Defense
    'FPCT': (player_score['putOuts'] + player_score['assists']) / player_score['chances'],
    'Out': player_score['putOuts'] + player_score['groundOuts'] + strikeouts + player_score['flyOuts'],

    # Offence
    'HR': home_runs,
    'IBB': player_score['intentionalWalks'],
    'RBI': player_score['rbi'],
    'AVG': hits / at_bats,

    # Pitching
    'SV_percnt': player_score['saves'] / player_score['saveOpportunities'],
    'HLD': player_score['holds'],
    'BS': player_score['blownSaves'],
    'IR': player_score['inheritedRunners'],
    'IP': player_score['inningsPitched'],

    # Advanced Offence
    'ISO': (doubles + 2 * triples + 3 * home_runs) / at_bats,
    'BABIP': (hits - home_runs) / (at_bats - strikeouts - home_runs + player_score['sacFlies']),
    'RC': total_bases * (hits + walks) / (at_bats + walks),
    'PA/SO': plate_appearances / strikeouts,
    'K/BB': strikeouts / walks,
})

In [None]:
# Keep only target rows that have a date to join on.
target = target[target['date'].notna()]

# Select the engineered player features and attach the join keys.
# .assign() returns a new frame, so the keys are not written onto a slice of
# perform_stats (which triggers SettingWithCopyWarning); values are aligned on
# the shared row index of player_score.
player_stats = perform_stats[
    ['FPCT', 'Out', 'HR', 'IBB', 'TB', 'RBI', 'AVG', 'SV_percnt', 'HLD', 'BS', 'IR', 'IP',
     'ISO', 'BABIP', 'RC', 'PA/SO', 'K/BB']
].assign(date=player_score['date'], playerId=player_score['playerId'])

# Left join: every target row is kept; rows without a matching box score get NaN features.
merge_1 = target.merge(player_stats, on = ['date', 'playerId'], how = 'left')

I then construct some similar features using the team scores, and I add a team-specific measurement:
* "Pytha_WR": pythagorean winning percentage, it determines the number of games that a team "should" have won.

In [None]:
# Team box-score table loaded earlier into train_dict.
team_score = train_dict["teamBoxScores"]

In [None]:
# Raw team box-score counts that are reused by several of the ratios below.
team_at_bats = team_score['atBats']
team_doubles = team_score['doubles']
team_triples = team_score['triples']
team_hits = team_score['hits']
team_walks = team_score['baseOnBalls']
team_strikeouts = team_score['strikeOuts']
team_home_runs = team_score['homeRuns']
team_total_bases = team_score['totalBases']
runs_scored_sq = team_score['runsScored']**2

team_stats = pd.DataFrame({
    # For calculation
    'AB_team': team_at_bats,
    'B2_team': team_doubles,
    'B3_team': team_triples,
    'H_team': team_hits,
    'PA_team': team_score['plateAppearances'],
    'BB_team': team_walks,
    'SO_team': team_strikeouts,

    # Offence
    'HR_team': team_home_runs,
    'IBB_team': team_score['intentionalWalks'],
    'TB_team': team_total_bases,
    'RBI_team': team_score['rbi'],
    'AVG_team': team_hits / team_at_bats,

    # Pitching
    'IR_team': team_score['inheritedRunners'],
    'IP_team': team_score['inningsPitched'],

    # Advanced Offence
    'ISO_team': (team_doubles + 2 * team_triples + 3 * team_home_runs) / team_at_bats,
    'BABIP_team': (team_hits - team_home_runs) / (
        team_at_bats - team_strikeouts - team_home_runs + team_score['sacFlies']),
    'RC_team': team_total_bases * (team_hits + team_walks) / (team_at_bats + team_walks),
    # Pythagorean winning percentage: runs scored^2 / (runs scored^2 + runs allowed^2)
    'Pytha_WR': runs_scored_sq / (runs_scored_sq + team_score['runsPitching']**2),
})

In [None]:
# Attach each player's team by joining on (date, playerId).
# merge_1 carries a fresh 0..n-1 index from the previous merge, so assigning
# player_score['teamId'] directly would align on row position and give each row
# the team of an unrelated box-score row.
# One team per (date, playerId) keeps the row count of merge_1 unchanged even
# when a player has several box-score rows on the same date.
player_team = player_score[['date', 'playerId', 'teamId']].drop_duplicates(
    subset=['date', 'playerId'])
# Dropping any existing 'teamId' first keeps this cell safe to re-run.
merge_1 = merge_1.drop(columns=['teamId'], errors='ignore').merge(
    player_team, on = ['date', 'playerId'], how = 'left')

# Select the engineered team features and attach the join keys; .assign()
# returns a new frame instead of writing onto a slice of team_stats.
team_stats = team_stats[
    ['HR_team', 'IBB_team', 'TB_team', 'RBI_team', 'AVG_team', 'IR_team', 'IP_team',
     'ISO_team', 'BABIP_team', 'RC_team', 'Pytha_WR']
].assign(date=team_score['date'], teamId=team_score['teamId'])

# Rows without a box score have no teamId and therefore get NaN team features.
merge_2 = merge_1.merge(team_stats, on = ['date', 'teamId'], how = 'left')

In addition to the player and team performance features, the "team_rank" features below describe the rankings and wins in divisions and leagues.

In [None]:
# Ranking and win columns taken from the standings table, plus the join keys.
standings_columns = ['teamId', 'divisionRank', 'leagueRank', 'wildCardRank', 'divWins', 'alWins',
                     'nlWins', 'pct', 'homeWins', 'xWinLossPct', 'date']
team_rank = train_dict['standings'][standings_columns]

In [None]:
# Left join the standings onto the feature table by date and team.
merge_3 = merge_2.merge(team_rank, how = 'left', on = ['date', 'teamId'])

The feature below shows whether a player has been involved in a transaction or not.

In [None]:
# Flag every (date, playerId) that appears in the transactions table.
# .assign() builds a new frame, so the flag is not written onto a slice of the
# original table (which triggers SettingWithCopyWarning).
transaction = train_dict['transactions'][['playerId', 'date', 'typeCode']].assign(transacted=1)

# Rows without a transaction get NaN for 'typeCode' and 'transacted'.
merge_4 = merge_3.merge(transaction, on = ['date', 'playerId'], how = 'left')

The "awarded_recnt" feature indicates whether a player has received an award after 2018 or not.

In [None]:
# Flag every (date, playerId) that appears in the awards table.
# .assign() builds a new frame, so the flag is not written onto a slice of the
# original table (which triggers SettingWithCopyWarning).
awards_current = train_dict['awards'][['date', 'playerId']].assign(awarded_recnt=1)

# Rows without an award get NaN for 'awarded_recnt'.
merge_5 = merge_4.merge(awards_current, on = ['date', 'playerId'], how = 'left')

I also include the number of players' twitter followers and teams' twitter followers as features in the model.

In [None]:
# Player follower counts, joined onto the feature table by date and player.
player_twit_columns = ['date', 'playerId', 'numberOfFollowers']
player_twit = train_dict['playerTwitterFollowers'][player_twit_columns]

merge_6 = merge_5.merge(player_twit, how = 'left', on = ['date', 'playerId'])

In [None]:
# Team follower counts; the column is renamed so it does not clash with the
# players' 'numberOfFollowers' column.
team_twit = (
    train_dict['teamTwitterFollowers'][['date', 'teamId', 'numberOfFollowers']]
    .rename(columns = {"numberOfFollowers": "team_followers"})
)

merge_7 = merge_6.merge(team_twit, how = 'left', on = ['date', 'teamId'])

The "rosters" feature shows a player's status (for example, injured or active) at a certain time.

In [None]:
# Roster status per player and date, joined onto the feature table.
roster_columns = ['playerId', 'date', 'status']
rosters = train_dict['rosters'][roster_columns]
merge_8 = merge_7.merge(rosters, how = 'left', on = ['playerId', 'date'])

### Prepare data for random forest

In this part, I fill the NaN values with -99, which is outside the range of the dataset. I also deal with infinite values and make sure all the features are in the correct format for the modelling procedure.

In [None]:
# Modelling table: the fully merged features without the engagement date column.
df = merge_8.drop(columns=['engagementMetricsDate'])

In [None]:
# Cast 'pct' to float so it is treated as a numeric feature rather than label-encoded.
pct_as_float = df['pct'].astype('float')
df['pct'] = pct_as_float

In [None]:
# Collect the object-dtype columns; these are label-encoded below.
object_encode = df.select_dtypes(include='object')

def label_encode(df, column):
    """Replace the values of ``df[column]`` in place with integer codes.

    Each distinct non-missing value is mapped to its position in
    ``df[column].unique()``; missing values receive no code and stay missing
    after the mapping. Returns None.
    """
    codes = {}
    for position, value in enumerate(df[column].unique()):
        if pd.isna(value):
            continue
        codes[value] = position
    df[column] = df[column].map(codes)

# Encode every object column, then write the encoded columns back into df.
encoded_names = object_encode.columns
for column_name in encoded_names:
    label_encode(object_encode, column_name)
df[encoded_names] = object_encode

In [None]:
# Positive infinities (from divisions by zero) become 0; every remaining NaN
# becomes the sentinel -99.
df = df.replace(np.inf, 0).fillna(-99)

To investigate how the different time periods affect fans engagement, I break down the date into "year", "month" and "day".

In [None]:
# Parse the YYYYMMDD date once and expose its calendar components as features.
parsed_date = pd.to_datetime(df['date'], format = "%Y%m%d")
df['date'] = parsed_date
for date_part in ('year', 'month', 'day'):
    df[date_part] = getattr(parsed_date.dt, date_part)

In [None]:
# Move the identifiers into the index so they are not used as model features.
df = df.set_index(keys=['playerId', 'date'])

### Run Random Forest Model and Extract Feature Importance

It is an ensemble learning method based on decision trees and the feature importances are calculated using Gini importance. The hyperparameters I use below are selected from random search.

In [None]:
# Hyperparameters chosen by the random search mentioned above; the fixed seed
# keeps the fitted forest reproducible.
rf_params = {
    'n_estimators': 200,
    'max_depth': 20,
    'max_features': 'sqrt',
    'min_samples_split': 6,
    'random_state': 0,
}
regr = RandomForestRegressor(**rf_params)

In [None]:
# Split the modelling table into the feature matrix and the four targets.
target_columns = ['target1', 'target2', 'target3', 'target4']
X = df.drop(columns=target_columns)
y = df[target_columns]

In [None]:
%%time
# Collect one column of feature importances per target, reusing a saved model
# when one is available and training (then saving) one otherwise.
#
# Pre-trained models come from a Kaggle dataset under "../input/", which is
# read-only (see the note in the first cell), so freshly trained models are
# written to a folder in the working directory instead. Both locations are
# checked when looking for a saved model. The previous save path was missing a
# path separator and therefore never matched the path that is checked here.
pretrained_dir = "../input/target-models"
trained_dir = "target-models"

feature_imp = pd.DataFrame(index = X.columns)
for i in y.columns:
    filename = i + "_model.sav"
    candidates = [os.path.join(pretrained_dir, filename), os.path.join(trained_dir, filename)]
    saved_models = [candidate for candidate in candidates if os.path.isfile(candidate)]

    if saved_models:
        # NOTE: pickle.load can execute arbitrary code; only load model files you trust.
        with open(saved_models[0], 'rb') as fin:
            model = pickle.load(fin)
    else:
        # `regr` stays the unfitted-template estimator with the chosen
        # hyperparameters; it is never replaced by a loaded model.
        model = regr.fit(X, y[i])
        os.makedirs(trained_dir, exist_ok=True)
        with open(os.path.join(trained_dir, filename), 'wb') as fout:
            pickle.dump(model, fout)

    feature_imp[i] = model.feature_importances_

Before I start to look into the importance of the features, I put the features into 5 categories: 

* The "player performance" includes features of a players performance in a game.
* Other player related features include transaction, twitter followers and players' status.
* The "team performance" category include teams' performance in games.
* Other team related features include team ranking in division and league, team twitter statistics and team winning percentages.
* The "time features" include "day", "month" and "year".

In [None]:
# Feature names grouped by category; used to colour and aggregate the importances.

# Player performance in a game
player_perform = [
    'FPCT', 'Out',
    'HR', 'IBB', 'TB', 'RBI', 'AVG',
    'SV_percnt', 'HLD', 'BS', 'IR', 'IP',
    'ISO', 'BABIP', 'RC', 'PA/SO', 'K/BB',
]
# Team performance in a game
team_perform = [
    'teamId',
    'HR_team', 'IBB_team', 'TB_team', 'RBI_team', 'AVG_team',
    'IR_team', 'IP_team',
    'ISO_team', 'BABIP_team', 'RC_team', 'Pytha_WR',
]
# Other team-related features
team_features = [
    'divisionRank', 'leagueRank', 'wildCardRank',
    'divWins', 'alWins', 'nlWins', 'pct', 'homeWins', 'xWinLossPct',
    'team_followers',
]
# Other player-related features
player_features = [
    'typeCode', 'transacted', 'numberOfFollowers', 'status', 'awarded_recnt',
]
# Calendar features
time_features = ['year', 'month', 'day']

In [None]:
# Bar colour per feature category, checked in this order; anything not listed
# in a category (the time features) falls back to cyan.
category_colors = [
    (player_perform, 'r'),
    (player_features, 'y'),
    (team_perform, 'g'),
    (team_features, 'b'),
]


def _color_for(feature_name):
    """Return the plot colour of the first category that contains the feature."""
    for members, color in category_colors:
        if feature_name in members:
            return color
    return 'cyan'


feature_imp['color'] = [_color_for(feature_name) for feature_name in feature_imp.index]

In [None]:
def _ranked_importance(target_name):
    """Return the importances and colours for one target, least important first."""
    return feature_imp[[target_name, 'color']].sort_values(by=target_name, ascending = True)


y1_imp = _ranked_importance('target1')
y2_imp = _ranked_importance('target2')
y3_imp = _ranked_importance('target3')
y4_imp = _ranked_importance('target4')

From the 4 feature importance plots below, we can see that all the targets are related to time features (the cyan bars) and player performance (the red bars). 

In addition, the players' performance (red bar), status and twitter followers (yellow bar) are the most important for all the target measurements. 

However, team performance (green bar) and rankings (blue) in a season do not have much influence.

In [None]:
fig, axs = plt.subplots(2, 2, figsize = (24,25), sharex = True)

# One horizontal bar chart per target, filled row by row.
importance_panels = [
    (y1_imp, 'target1', 'feature importance for y1'),
    (y2_imp, 'target2', 'feature importance for y2'),
    (y3_imp, 'target3', 'feature importance for y3'),
    (y4_imp, 'target4', 'feature importance for y4'),
]
for panel, (ranked, target_name, title) in zip(axs.flat, importance_panels):
    panel.barh(ranked.index, ranked[target_name], color = ranked.color.values)
    panel.set_title(title)

If I enlarge the above 4 pictures and focus on the first 10 features, we can see something in common for the 4 measurements:
* time features: "day" and "month" are always the most important for the 4 measurements;
* player performances are also important factors.

However, there are differences among the four measurements:
* For target 1, player performances are the second most important features.

* For target 2, number of team twitter followers has the most influence and the effect is much bigger than time features and player performances. Also, target 2 is affected by the combination of team expected winning percentage, player status, time and players' performances.

* Target 3 is similar to target 2: apart from the time features, team twitter followers has some influence on it, but not as much as it has on target 2. The awards a player received also influence the values of target 3. 

* For target 4, time, players' awards and status, and also performances are important, and so are team twitter followers.

In [None]:
fig, axs = plt.subplots(2, 2, figsize = (16,12), sharex = True)

# The frames are sorted ascending, so the last 10 rows are the 10 most important features.
importance_panels = [
    (y1_imp, 'target1', 'feature importance for y1'),
    (y2_imp, 'target2', 'feature importance for y2'),
    (y3_imp, 'target3', 'feature importance for y3'),
    (y4_imp, 'target4', 'feature importance for y4'),
]
for panel, (ranked, target_name, title) in zip(axs.flat, importance_panels):
    top_ten = ranked.tail(10)
    panel.barh(top_ten.index, top_ten[target_name], color = top_ten.color.values)
    panel.set_title(title)

If we look at the percentage of different feature categories in a whole, we can see that time is always the most important. Except for that, target 1 is affected a lot by players' performance in a game and target 2 is heavily affected by team related features such as twitter followers, team rankings and winning percentage, instead of team performances. 

In [None]:
# The plotting colour is not an importance value; remove it before averaging.
# errors='ignore' keeps this cell safe to re-run after the column is already gone.
feature_imp = feature_imp.drop(columns=['color'], errors='ignore')

# Mean importance of each feature category, per target
# (rows: targets, columns: categories).
feature_groups = {
    'player_perform': player_perform,
    'player_features': player_features,
    'team_perform': team_perform,
    'team_features': team_features,
    'time_features': time_features,
}
grouped_feature = pd.DataFrame({
    group_name: feature_imp.loc[members].mean(axis = 0)
    for group_name, members in feature_groups.items()
})

In [None]:
def _as_percentages(row):
    """Rescale one row so that its entries are percentages of the row total."""
    return row * 100 / sum(row)


# Share of each feature category per target, drawn as stacked horizontal bars.
stacked_data = grouped_feature.apply(_as_percentages, axis=1)

stacked_ax = stacked_data.plot(kind="barh", stacked=True, figsize = (12,4))
stacked_ax.set_title("Feature importance in percentages for different targets")
stacked_ax.set_xlabel("Percentage")
stacked_ax.set_ylabel("Target measurements")

Among the player performance features, we can see from the bar plots below that the defence scores are the most important and the offence scores rank second. 

In [None]:
# Sub-categories of the player performance features.
defence = ['FPCT', 'Out']
offence = ['HR', 'IBB', 'TB', 'RBI', 'AVG']
pitching = ['SV_percnt', 'HLD', 'BS', 'IR', 'IP']
advanced = ['ISO', 'BABIP', 'RC', 'PA/SO', 'K/BB']
performance_groups = {
    'defence': defence,
    'offence': offence,
    'pitching': pitching,
    'advanced': advanced,
}

# Importances of the player performance features (rows: targets, columns: features).
importance_by_target = feature_imp.T[player_perform]

# Mean importance per sub-category, transposed to rows: sub-categories, columns: targets.
pf = pd.DataFrame({
    group_name: importance_by_target[members].mean(axis = 1)
    for group_name, members in performance_groups.items()
}).T

# Order the sub-categories by their mean importance across targets, highest first.
pf = (
    pf.assign(mean=pf.mean(axis = 1))
    .sort_values(by = 'mean', ascending = False)
    .drop(columns=['mean'])
)

In [None]:
# Grouped bars: one group per performance sub-category, one bar per target.
pf.plot.bar()

### Distribution of Time features

Since time features are important for all the measurements, I plot their difference among 2061 players to investigate their distributions.

In this part, I run a random forest model on each player and extract the feature importances. Since the model files are too large, I uploaded the results I obtained to Kaggle, not the models.


In [None]:
# Pre-computed feature importances, one file per target; the first CSV column is the index.
importance_files = [
    '../input/feature-imp/features_y1.csv',
    '../input/feature-imp/features_y2.csv',
    '../input/feature-imp/features_y3.csv',
    '../input/feature-imp/features_y4.csv',
]
feature_imp_y1, feature_imp_y2, feature_imp_y3, feature_imp_y4 = [
    pd.read_csv(csv_path, index_col=[0]) for csv_path in importance_files
]

The plots below shows the distribution of the three time features for different target values. 

* The first row is the importance distribution for "year". For most players, the influence of "year" is under 0.02. The influence is slightly higher on average for target 2.

* The second row shows the importance of feature "month". The effect of "month" is much bigger than that of "year", since the distribution is centered around 0.2-0.3. As the plot for target 4 is leaning to the right, we can conclude that "month" has more influence on target 4.

* The feature "day" shows a different distribution among different target values. For target 1, the distribution is flat and ranges from 0.1 to 0.6. which means the influence is very different for various players; for target 2, 3 and 4, the distribution is more concentrated around 0.3-0.4.

In [None]:
warnings.filterwarnings("ignore", category=FutureWarning)

# Grid of distributions: one row per time feature, one column per target.
fig, axes = plt.subplots(len(time_features), 4, figsize = (16,6), sharey = True, sharex = True)
fea_list = [feature_imp_y1, feature_imp_y2, feature_imp_y3, feature_imp_y4]
name_list = ['1', '2', '3', '4']
for i, j in enumerate(time_features):
    for k in range(4):
        df_plot = fea_list[k]
        # sns.distplot is deprecated and scheduled for removal; histplot with a
        # KDE overlay on a density scale draws the equivalent histogram + curve.
        # .loc[j] selects the row of feature j (same values as .T[j]).
        sns.histplot(df_plot.loc[j], kde = True, stat = "density", ax = axes[i, k])
        axes[i, k].set_ylabel(j)
        axes[i, k].set_xlabel('')
        if i == 0:
            axes[i, k].set_title('target'+ ' ' + name_list[k])

To conclude, we find that:

* The 4 target values have very different trend. 

* Target 4 has shown a cyclic feature while other values do not.

* Time features and player performance related features are important for all measurements.

* Among the player performance measurement, defence and offence scores are the most influential ones.

* Team twitter followers has a big influence on target 2 but not on other target values.

* Time features influence the targets in different ways: "day" affects targets 3 and 4 more than target 1, and its influence on target 1 varies widely among players.