In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Show every column when a frame is displayed (no truncation).
pd.set_option('display.max_columns', None)

# Read the pitching data, then discard the 'Unnamed: 0' column.
pitchers = pd.read_csv('/kaggle/input/pitching/pitching2.csv')
pitchers = pitchers.drop(columns='Unnamed: 0')
pitchers

In [None]:
# Build a playerID -> "First Last" lookup from the player table.
names = (
    pd.read_csv('/kaggle/input/the-history-of-baseball/player.csv')
    .assign(name=lambda players: players['name_first'] + ' ' + players['name_last'])
    .loc[:, ['player_id', 'name']]
    .rename(columns={'player_id': 'playerID'})
)

# Keep only the players that appear in the pitching data.
names = names[names['playerID'].isin(pitchers['playerID'])]

# Attach the display name to each pitcher row, matching on playerID.
name_lookup = names.set_index('playerID')
pitchers = pitchers.join(name_lookup, on='playerID')
pitchers

# Heatmap for correlation

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Correlation is only defined for numeric data, and the frame also holds text
# columns (e.g. playerID, name). Recent pandas versions raise on those instead
# of silently dropping them, so restrict to numeric/boolean columns explicitly.
plt.figure(figsize=(20,10))
numeric_pitchers = pitchers.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_pitchers.corr(), annot=True, linewidth=0.5)

Heatmaps are great for analyzing how one variable relates to another, and for viewing all of these relationships in one figure. Notice that the white blocks along the diagonal outline an identity matrix, because a variable is always 100% correlated with itself. An example of two variables with a high correlation is hits and runs. The more hits a pitcher gives up, the more likely they are to give up runs. In certain instances one may avoid letting up runs, but over a long career they are going to be very closely correlated. An example of two variables with a low correlation is saves and innings pitched. Saves are typically reserved for a closing pitcher who comes in to pitch the very last inning of the game. Thus, someone who is a closer will not rack up many innings pitched, and vice versa.

In [None]:
# Correlation of every numeric stat with the HOF vote percentage.
# Text columns (e.g. playerID, name) are excluded up front because recent
# pandas versions raise on them instead of silently dropping them.
percent_corr = pitchers.select_dtypes(include=['number', 'bool']).corr()['percent']

# One row per stat: 'index' holds the stat name, 'percent' its correlation.
df = pd.DataFrame(percent_corr).reset_index()

# Flag stats whose absolute correlation with 'percent' exceeds 0.45.
df['Beat Threshold'] = df['percent'].abs() > 0.45
df.plot(x='index', y='percent', kind='scatter', rot=90, grid=True)

In [None]:
# Same correlations as a wide scatter, coloured by whether each stat beat the
# threshold; the stat names on the x axis are rotated so they stay readable.
threshold_grid = sns.lmplot(
    data=df,
    x='index',
    y="percent",
    hue='Beat Threshold',
    fit_reg=False,
    height=4,
    aspect=4,
)
threshold_grid.set_xticklabels(rotation=90)

I added a column declaring a threshold at +/- 0.45, which separates the correlations by color according to whether they fall inside or outside that band. This is an arbitrary threshold, but I figured it would be effective in showing the columns that are most correlated with percent.

W (wins), SHO (shutouts), H (hits), SO (strikeouts), BFP (batters faced by pitcher), IP (innings pitched)

# Correlation Plots

These should verify what we see in the heatmap and general correlation plot. SV, BAOpp, GF, and year should show no correlation with percent because their correlations are closest to zero.

Scatter plots are a great visualization tool to analyze how one variable influences another. For ease I created a function that, for each player, plots a given stat on the x axis and the HOF voting percentage on the y axis. The color of the point indicates whether or not they got inducted, and I used a for loop to add the player's name to each point.

These plots are a graphical representation of what we see in the heat map.

In [None]:
def scatter(attribute, show_annotations):
    """Scatter a pitching stat against HOF vote percentage, coloured by induction.

    Reads the module-level ``pitchers`` frame and draws one point per row with
    ``attribute`` on the x axis and ``percent`` on the y axis; the hue is the
    ``inducted`` column.

    Parameters
    ----------
    attribute : str
        Name of the ``pitchers`` column to put on the x axis.
    show_annotations : bool
        When true, label every point with the value from the ``name`` column.

    Returns
    -------
    None
        The figure is shown as a side effect.
    """
    # `height` is the current seaborn name for the facet size (formerly
    # `size`); it also matches the other lmplot call in this notebook.
    grid = sns.lmplot(x=attribute, y="percent", data=pitchers, hue='inducted',
                      fit_reg=False, height=8, aspect=2)

    if show_annotations:
        ax = grid.axes[0, 0]
        # Iterate by position rather than by label so this does not depend on
        # `pitchers` having a default 0..n-1 index.
        points = zip(pitchers[attribute], pitchers['percent'], pitchers['name'])
        for x_val, y_val, label in points:
            # Rows with a missing coordinate are not plotted, so skip their label.
            if pd.isna(x_val) or pd.isna(y_val):
                continue
            ax.text(x_val, y_val, label, fontsize='small', rotation=45)

    plt.show()

In [None]:
# Saves vs. vote percentage, with each point labelled.
scatter('SV', show_annotations=True)

Saves are an interesting case because Trevor Hoffman has almost twice as many saves as Rich Gossage, but Gossage is the only one who's gotten the nod to the hall. (Trevor Hoffman would get in a couple years later, but he still had missed the cut when this data was collected).

In [None]:
# Opponent batting average vs. vote percentage, with each point labelled.
scatter('BAOpp', show_annotations=True)

Clearly some outliers in the data are making this one hard to visualize. Let's manually condense the x axis to get a better look.

In [None]:
# Inspect the rows whose opponent batting average is above .400.
pitchers.loc[pitchers['BAOpp'].gt(.4)]

All of these are clearly errors in the data: an opponent batting average should not be this high (anything over 1 is statistically impossible). Let's replace these with null values.

In [None]:
# Replace implausible opponent batting averages (> .400) with nulls.
# Selecting the column by name avoids relying on a hard-coded column position,
# and the boolean mask works whatever the frame's index looks like.
# Existing nulls are left alone because NaN > 0.4 is False.
pitchers.loc[pitchers['BAOpp'] > 0.4, 'BAOpp'] = np.nan
pitchers

In [None]:
# Correlation of every numeric stat with BAOpp. Text columns (e.g. playerID,
# name) are excluded first because recent pandas versions raise on them
# instead of silently dropping them.
pitchers.select_dtypes(include=['number', 'bool']).corr()['BAOpp']

Now its correlation with percent of votes is -0.18, which shows a more negative relationship than its original -0.076, but this is still weak: the criterion I set was an absolute correlation of at least 0.45.

In [None]:
# Opponent batting average vs. vote percentage again, with each point labelled.
scatter('BAOpp', show_annotations=True)

In [None]:
# Wins vs. vote percentage, with each point labelled.
scatter('W', show_annotations=True)

Clearly a positive correlation here. There are some blue dots with a lot of wins, but in general once a player reaches a certain number of wins it is highly likely that they will get into the hall of fame.

On another note, Roger Clemens shows up a lot as a player with impressive stats who nevertheless did not get enough votes to get in. This is because he was linked to steroid use, and some believe he should not be allowed in because of it.

In [None]:
# Keep only the rows for playerID 'clemero02' (presumably Roger Clemens, per
# the surrounding narrative) and renumber them from zero.
is_selected_player = pitchers['playerID'].isin(['clemero02'])
pitchers_filtered = pitchers[is_selected_player].reset_index(drop=True)

In [None]:
# Write the filtered rows as CSV into the working directory.
# NOTE(review): the file name has no '.csv' extension — confirm that is intended.
pitchers_filtered.to_csv(path_or_buf='pitchers_filtered')