# Import data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the salary table from the baseball-databank input and display it.
salaries=pd.read_csv(filepath_or_buffer='/kaggle/input/baseball-databank/Salaries.csv')
salaries

Salaries for all MLB players from 1985 to 2015.

# Select data year

That is a lot of data to look at, so let's condense our dataset: we keep only the 2015 salaries here and will pair them with 2014 batting data to predict 2015 salaries.

In [None]:
# Keep only the 2015 salary rows.
# The year is compared as a number: isin(['2015']) tests against a *string*, which matches an
# integer year column only on pandas versions that coerce the values, and matches nothing otherwise.
# to_numeric also covers the case where the column was read as text (assumes yearID holds year numbers).
recent=salaries[pd.to_numeric(salaries['yearID'],errors='coerce')==2015]
recent

In [None]:
# Show the selected salary rows from highest to lowest salary.
recent.sort_values('salary', ascending=False)

Top paid players look familiar, salaries are accurate.

Get player names by calling player ID.

In [None]:
# Display every column when a frame is shown (notebook-wide pandas option).
pd.set_option('display.max_columns', None)

# Lookup table from player ID to display name:
#   1. read the player file,
#   2. build `name` as "<first> <last>",
#   3. keep only the ID and the combined name (the other columns are not needed).
names=(
    pd.read_csv('/kaggle/input/the-history-of-baseball/player.csv')
    .assign(name=lambda players_raw: players_raw['name_first']+' '+players_raw['name_last'])
    [['player_id','name']]
)
names

# Import batting statistics.

In [None]:
# Full batting table (all seasons).
batting=pd.read_csv('/kaggle/input/the-history-of-baseball/batting.csv')

# Batting rows for the 2014 season only.
# The year is compared as a number: isin(['2014']) tests against a *string*, which matches an
# integer year column only on pandas versions that coerce the values, and matches nothing otherwise.
hitting_2014=batting[pd.to_numeric(batting['year'],errors='coerce')==2014]
hitting_2014

In [None]:
# IDs of every player that has a row in `recent` (the salary rows selected above),
# as a plain list so it can be handed to isin().
players_2014=recent['playerID'].tolist()

# 2014 batting rows restricted to players that appear in that salary list.
has_salary_row=hitting_2014['player_id'].isin(players_2014)
hitting_2014_filtered=hitting_2014.loc[has_salary_row]
hitting_2014_filtered

In [None]:
# Number of salary rows per player; the length of the result is the number of distinct players.
recent.loc[:, 'playerID'].value_counts()

In [None]:
# Number of 2014 batting rows per player; values above 1 mean the player has several rows.
hitting_2014_filtered.loc[:, 'player_id'].value_counts()

We only had 802 players in our list of player salaries, but the batting data shows 877 entries. This is because players who were traded during the season have separate batting stats for each team they played for. For these duplicates, let's add their batting data into one entry and see if that matches up. It is also possible that either of the datasets is missing salary or batting data for some players.

In [None]:
# Collapse the 2014 batting rows to exactly one row per player.
#
# This replaces a per-player loop built on DataFrame.append, which was removed in pandas 2.0
# and copies the whole frame on every call. The grouping below produces the same table in one pass.

# Counting stats that can simply be added together across a player's rows.
numeric_stats=['g','ab','r','h','double','triple','hr','rbi','sb','cs','bb','so','ibb','hbp','sh','sf','g_idp']

# Player IDs ordered by how many 2014 batting rows they have (most rows first).
# Later cells reuse `players`, and the result below is returned in this order.
players=hitting_2014_filtered['player_id'].value_counts().index

stints_by_player=hitting_2014_filtered.groupby('player_id')
stint_counts=stints_by_player.size()
multi_stint_ids=stint_counts.index[stint_counts>1]

# Players with a single row keep that row exactly as it is.
single_stint_rows=hitting_2014_filtered[~hitting_2014_filtered['player_id'].isin(multi_stint_ids)]

# Players with several rows become one row: stats are summed, `stint` records how many rows
# were merged, and team/league are replaced by the marker 'trade'.
# `year` is taken from the player's own rows so the column keeps a single dtype
# (the loop version wrote the string '2014' next to the numeric years of the other rows).
combined_rows=(
    stints_by_player[numeric_stats].sum()
    .loc[multi_stint_ids]
    .assign(year=stints_by_player['year'].first(),
            stint=stint_counts,
            team_id='trade',
            league_id='trade')
    .reset_index()
)

# Stack both groups, order the rows like `players`, and restore the original column order.
# Columns that are not summed are left missing for the merged rows.
adjusted_2014_hitting=(
    pd.concat([combined_rows,single_stint_rows],ignore_index=True,sort=False)
    .set_index('player_id')
    .reindex(players)
    .rename_axis('player_id')
    .reset_index()
    [list(hitting_2014_filtered.columns)]
)
adjusted_2014_hitting

In [None]:
# Renumber the rows 0..n-1 and discard the old row labels.
adjusted_2014_hitting=adjusted_2014_hitting.reset_index(drop=True)

In [None]:
# Display the selected salary rows again for reference before salaries are attached to the batting table.
recent

# Add salary data

Map player ID to salary and add to dataframe.

In [None]:
# First listed salary for every player in `recent`
# (a player can have more than one salary row; the first one is used).
salary_by_player=recent.drop_duplicates(subset='playerID').set_index('playerID')['salary']

# One vectorised lookup instead of re-filtering `recent` once per batting row.
# A player ID with no salary row gets a missing value instead of raising.
adjusted_2014_hitting['salary']=adjusted_2014_hitting['player_id'].map(salary_by_player)

In [None]:
# Show the batting table ordered from highest to lowest salary.
adjusted_2014_hitting.sort_values('salary', ascending=False)

Pitchers should not be assessed based on their hitting statistics. Let's add in each player's position.

In [None]:
# Full fielding table (all seasons).
fielding=pd.read_csv('/kaggle/input/the-history-of-baseball/fielding.csv')

# Rows for players in the salary list ...
in_salary_list=fielding['player_id'].isin(players_2014)
# ... in the 2014 season. The year is compared as a number: isin(['2014']) tests against a
# *string*, which matches an integer year column only on pandas versions that coerce the values.
is_2014=pd.to_numeric(fielding['year'],errors='coerce')==2014

fielding_2014=fielding[in_salary_list&is_2014]

# Number of 2014 fielding rows per player.
fielding_2014['player_id'].value_counts()

In [None]:
# Inspect all 2014 fielding rows of one example player.
fielding_2014[fielding_2014['player_id']=='johnske05']

# Add position data

A lot of players will have multiple entries because they play multiple positions over the course of a season, sometimes for multiple teams. For each individual player, let's extract their most common position based on how many games were played at each position.

In [None]:
# Determine each player's primary position: the one with the most games played in 2014.
#
# This replaces two nested loops built on DataFrame.append, which was removed in pandas 2.0
# and copies the frame on every call.

# Total games per (player, position), summed over all of the player's 2014 fielding rows.
games_by_position=fielding_2014.groupby(['player_id','pos'],as_index=False)['g'].sum()

# Keep, for each player, the position with the highest game total. The sort is stable,
# so a tie goes to the position that comes first alphabetically.
primary_position=(
    games_by_position
    .sort_values('g',ascending=False,kind='mergesort')
    .drop_duplicates(subset='player_id')
    .set_index('player_id')['pos']
)

# One row per entry of `players`, in the same order. A player with no 2014 fielding rows
# gets a missing position instead of stopping the cell with an error.
positions_adjusted=(
    primary_position
    .reindex(players)
    .rename_axis('player_id')
    .reset_index()
)
positions_adjusted

In [None]:
# Player ID -> primary position (first entry per player, should an ID ever repeat).
position_by_player=positions_adjusted.drop_duplicates(subset='player_id').set_index('player_id')['pos']

# One vectorised lookup instead of re-filtering `positions_adjusted` once per row.
# A player ID without a position entry gets a missing value instead of raising.
adjusted_2014_hitting['pos']=adjusted_2014_hitting['player_id'].map(position_by_player)
adjusted_2014_hitting

Take out all of the pitchers.

In [None]:
# Rows whose primary position is pitcher ('P').
is_pitcher=adjusted_2014_hitting['pos'].isin(['P'])

# Keep everyone else and renumber the rows 0..n-1.
adjusted_2014_hitting=adjusted_2014_hitting[~is_pitcher].reset_index(drop=True)
adjusted_2014_hitting

# Add player names

Add in the actual player names now that we've used player ID to map to all the necessary data.

In [None]:
# Player ID -> display name (first entry per player, should an ID ever repeat).
name_by_player=names.drop_duplicates(subset='player_id').set_index('player_id')['name']

# One vectorised lookup instead of re-filtering `names` once per row.
# A player ID that is missing from `names` gets a missing value instead of raising.
adjusted_2014_hitting['name']=adjusted_2014_hitting['player_id'].map(name_by_player)
adjusted_2014_hitting

# Determining criteria for ML model

Let's analyze which stats are most positively/negatively correlated with salary.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Heatmap of pairwise correlations between the numeric columns.
# The text columns (IDs, team, position, name) are dropped explicitly first: pandas >= 2.0
# raises on them inside corr() instead of skipping them silently.
plt.figure(figsize=(20,10))
sns.heatmap(adjusted_2014_hitting.select_dtypes(include='number').corr(),annot=True,linewidths=0.5)

In [None]:
# Correlation of every numeric column with salary, as a two-column frame
# ('index' = column name, 'salary' = correlation coefficient).
# Text columns are dropped explicitly first: pandas >= 2.0 raises on them inside corr().
salary_corr=pd.DataFrame(
    adjusted_2014_hitting.select_dtypes(include='number').corr()['salary']
).reset_index()

# Flag the columns whose absolute correlation with salary exceeds 0.5.
# NOTE(review): the accompanying text talks about a 0.4 threshold - confirm which value is intended.
salary_corr['Beat Threshold']=salary_corr['salary'].abs()>0.5

# One point per column, coloured by whether it beats the threshold.
sns.lmplot(x='index', y="salary", data=salary_corr,hue='Beat Threshold',fit_reg=False,height=4,
           aspect=4).set_xticklabels(rotation=90)

If I set the 'correlation threshold' to be 0.4, then at-bats, runs, hits, doubles, RBIs, bases on balls, intentional bases on balls, and sacrifice flies are all directly correlated with salary. Let's use these as the features for our ML model.

In [None]:
# Batting columns used as model inputs.
features=[
    'ab',
    'r',
    'h',
    'double',
    'rbi',
    'bb',
    'sf',
]

In [None]:
def scatter(attribute):
    """Plot salary against one column of ``adjusted_2014_hitting`` and label each point.

    Parameters
    ----------
    attribute : str
        Name of the ``adjusted_2014_hitting`` column to put on the x-axis.

    Every point is annotated with the value of the ``name`` column for that row.
    Nothing is returned; the figure is drawn as a side effect.
    """
    p1=sns.lmplot(x=attribute, y="salary", data=adjusted_2014_hitting,fit_reg=False,height=8,aspect=4)
    ax = p1.axes[0,0]
    # Walk the three columns together by position. Looking rows up with range(len(...))
    # only works while the frame's index is exactly 0..n-1.
    for x_value, y_value, label in zip(adjusted_2014_hitting[attribute],
                                       adjusted_2014_hitting['salary'],
                                       adjusted_2014_hitting['name']):
        ax.text(x_value, y_value, label, fontsize='small',rotation=45)

In [None]:
# Salary against runs batted in.
scatter(attribute='rbi')

Salary shows a positive correlation with RBIs.

In [None]:
# Salary against bases on balls.
scatter(attribute='bb')

Salary shows positive correlation with base on balls.

# Building the ML model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Model inputs (the selected batting columns) and target (salary).
X=adjusted_2014_hitting[features]
y=adjusted_2014_hitting['salary']

# Hold out 40% of the players for validation; the fixed seed keeps the split repeatable.
train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.4, random_state=1)

# Fit a decision tree on the training part and predict salaries for the held-out players.
basic_model = DecisionTreeRegressor(random_state=1).fit(train_X, train_y)
predictions=basic_model.predict(val_X)

# Create dataframe with model data and results

In [None]:
# Start from a copy of the validation features. pd.DataFrame(val_X) does not copy by default,
# so adding columns to it could write back into val_X.
df=val_X.copy()
df['prediction']=predictions

# Player details for the same rows: df still carries the row labels of adjusted_2014_hitting,
# so the details can be fetched in one label-based selection and joined on the index.
player_info=(
    adjusted_2014_hitting
    .loc[df.index,['player_id','name','pos','salary']]
    .rename(columns={'player_id':'ID'})
)
df=df.join(player_info)

# Final column selection and order for display.
df=df[['name','ID','ab','r','h','double','rbi','bb','pos','salary','prediction']]
df

In [None]:
# Prediction minus actual salary: positive means the model predicts more than the player was paid.
df['excess']=df['prediction'].sub(df['salary'])

# Show the rows ordered from most negative to most positive excess.
df.sort_values('excess')

add commas, mae, show graphs for before

In [None]:
# Cast the counting stats to int.
df=df.astype({"ab": int,"r": int,"h": int,"double": int,"rbi": int,"bb":int})

# Styler.format returns a separate display object and leaves df untouched, so it must be the
# last expression of the cell to be rendered. Built earlier and discarded, it had no effect.
# df itself stays a plain DataFrame for the cells that follow.
df.style.format({'prediction': "{0:,.2f}",'salary': "{0:,.2f}",'excess': "{0:,.2f}"})

In [None]:
# Check what kind of object Styler.format hands back for the money columns.
type(
    df.style.format(dict.fromkeys(['prediction','salary','excess'], "{0:,.2f}"))
)

# Evaluate model

In [None]:
# Mean absolute error computed by hand from the excess column.
df['excess'].abs().mean()

In [None]:
from sklearn.metrics import mean_absolute_error
# Same metric via scikit-learn, as a cross-check of the manual calculation.
mean_absolute_error(y_true=df['salary'], y_pred=df['prediction'])

Average error of about $3.5 million.

In [None]:
# Average actual salary among the validation players.
df.loc[:, 'salary'].mean()

In [None]:
# Median actual salary among the validation players.
df.loc[:, 'salary'].median()

Average salary is about $4.5 million and median is $2 million...

In [None]:
# Save the prepared batting table as CSV text in a file named 'ML1' in the working directory.
adjusted_2014_hitting.to_csv(path_or_buf='ML1')