# The Golden Gloves
## Topic - Using historical data to predict MLB Hall of Fame status.
#### Data Source - https://www.seanlahman.com/baseball-archive/statistics/  
#### Group Hypothesis - Stats affect 
#### Approach to use for analysis - Aggregate stats for each player in regard to batting, fielding, and pitching. Using this data we can attempt to predict hall of fame status using a classificication machine learning method.

In [163]:
# Importing necessary packages
import pandas as pd


In [164]:
# Pandas setup
pd.set_option('display.max_columns', None)


# Hall of Fame Data
Providing example of how two tables could be combined to create a more readble hall of fame table.

In [165]:
# Importing data
people_df = pd.read_csv("Resources/baseballdatabank-master/core/People.csv")
hall_of_fame_df = pd.read_csv("Resources/baseballdatabank-master/core/HallOfFame.csv")

In [166]:
people_df.head()

Unnamed: 0,playerID,birthYear,birthMonth,birthDay,birthCountry,birthState,birthCity,deathYear,deathMonth,deathDay,deathCountry,deathState,deathCity,nameFirst,nameLast,nameGiven,weight,height,bats,throws,debut,finalGame,retroID,bbrefID
0,aardsda01,1981.0,12.0,27.0,USA,CO,Denver,,,,,,,David,Aardsma,David Allan,215.0,75.0,R,R,2004-04-06,2015-08-23,aardd001,aardsda01
1,aaronha01,1934.0,2.0,5.0,USA,AL,Mobile,2021.0,1.0,22.0,USA,GA,Atlanta,Hank,Aaron,Henry Louis,180.0,72.0,R,R,1954-04-13,1976-10-03,aaroh101,aaronha01
2,aaronto01,1939.0,8.0,5.0,USA,AL,Mobile,1984.0,8.0,16.0,USA,GA,Atlanta,Tommie,Aaron,Tommie Lee,190.0,75.0,R,R,1962-04-10,1971-09-26,aarot101,aaronto01
3,aasedo01,1954.0,9.0,8.0,USA,CA,Orange,,,,,,,Don,Aase,Donald William,190.0,75.0,R,R,1977-07-26,1990-10-03,aased001,aasedo01
4,abadan01,1972.0,8.0,25.0,USA,FL,Palm Beach,,,,,,,Andy,Abad,Fausto Andres,184.0,73.0,L,L,2001-09-10,2006-04-13,abada001,abadan01


In [167]:
hall_of_fame_df.head()

Unnamed: 0,playerID,yearID,votedBy,ballots,needed,votes,inducted,category,needed_note
0,cobbty01,1936,BBWAA,226.0,170.0,222.0,Y,Player,
1,ruthba01,1936,BBWAA,226.0,170.0,215.0,Y,Player,
2,wagneho01,1936,BBWAA,226.0,170.0,215.0,Y,Player,
3,mathech01,1936,BBWAA,226.0,170.0,205.0,Y,Player,
4,johnswa01,1936,BBWAA,226.0,170.0,189.0,Y,Player,


In [168]:
# Example merge and selecting necessary columns
hof_merged_df = pd.merge(hall_of_fame_df, people_df)
hof_merged_df = hof_merged_df[['playerID','nameLast', 'nameGiven', 'inducted', 'votes', 'yearID', 'needed']]

In [169]:
temp_df = hof_merged_df.loc[hof_merged_df['votes'] > 170]
temp_df.sort_values('votes', ascending=False)

Unnamed: 0,playerID,nameLast,nameGiven,inducted,votes,yearID,needed
4082,maddugr01,Maddux,Gregory Alan,Y,555.0,2014,429.0
3879,ripkeca01,Ripken,Calvin Edwin,Y,537.0,2007,409.0
4109,johnsra05,Johnson,Randall David,Y,534.0,2015,412.0
3880,gwynnto01,Gwynn,Anthony Keith,Y,532.0,2007,409.0
4083,glavito02,Glavine,Thomas Michael,Y,525.0,2014,429.0
...,...,...,...,...,...,...,...
2633,foxne01,Fox,Jacob Nelson,N,174.0,1979,324.0
2637,foxne01,Fox,Jacob Nelson,N,173.0,1983,281.0
3690,morrija02,Morris,John Scott,N,172.0,2005,387.0
3799,smithle02,Smith,Lee Arthur,N,171.0,2014,429.0


In [170]:
# Narrowing down dataset to those that barely made/missed HOF status
temp_df = hof_merged_df.loc[(hof_merged_df['votes'] > 140) & (hof_merged_df['votes'] < 200)] 
temp_df.sort_values('needed', ascending=False, inplace=True)
temp_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Unnamed: 0,playerID,nameLast,nameGiven,inducted,votes,yearID,needed
3948,martied01,Martinez,Edgar,N,191.0,2011,436.0
3769,trammal01,Trammell,Alan Stuart,N,141.0,2011,436.0
4049,bondsba01,Bonds,Barry Lamar,N,198.0,2014,429.0
3799,smithle02,Smith,Lee Arthur,N,171.0,2014,429.0
4037,schilcu01,Schilling,Curtis Montague,N,167.0,2014,429.0
...,...,...,...,...,...,...,...
250,chancfr01,Chance,Frank Leroy,N,144.0,1946,
482,gehrich01,Gehringer,Charles Leonard,Y,159.0,1949,
1504,ruffire01,Ruffing,Charles Herbert,N,184.0,1964,
1898,applilu01,Appling,Lucius Benjamin,Y,189.0,1964,


# Career Batting Data
Providing a general exmaple of how batting data could be used.

In [171]:
# Importing data
batting_df = pd.read_csv("Resources/baseballdatabank-master/core/batting.csv")

In [172]:
batting_df.drop(columns=['yearID', 'teamID', 'IBB', 'HBP', 'SH', 'SF', 'stint', 'lgID'], inplace=True)
batting_df.head()

Unnamed: 0,playerID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,GIDP
0,abercda01,1,4,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0
1,addybo01,25,118,30,32,6,0,0,13.0,8.0,1.0,4,0.0,0.0
2,allisar01,29,137,28,40,4,5,0,19.0,3.0,1.0,2,5.0,1.0
3,allisdo01,27,133,28,44,10,2,2,27.0,1.0,1.0,0,2.0,0.0
4,ansonca01,25,120,29,39,11,3,0,16.0,6.0,2.0,2,1.0,0.0


In [173]:
# Grouping by playerID and yearID, then sorting by home runs by season
career_batting_df = batting_df.groupby(['playerID']).sum()

In [174]:
# Sorting dataframe by most home runs
career_batting_df.sort_values('HR', ascending=False)

Unnamed: 0_level_0,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,GIDP
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
bondsba01,2986,9847,2227,2935,601,77,762,1996.0,514.0,141.0,2558,1539.0,165.0
aaronha01,3298,12364,2174,3771,624,98,755,2297.0,240.0,73.0,1402,1383.0,328.0
ruthba01,2503,8398,2174,2873,506,136,714,2217.0,123.0,117.0,2062,1330.0,2.0
rodrial01,2784,10566,2021,3115,548,31,696,2086.0,329.0,76.0,1338,2287.0,261.0
pujolal01,2862,10839,1843,3236,669,16,662,2100.0,114.0,41.0,1331,1304.0,399.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
howarch02,44,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0
howarda01,25,36,6,8,1,0,0,1.0,0.0,0.0,1,8.0,0.0
howarea01,1,0,0,0,0,0,0,1.0,0.0,0.0,0,0.0,0.0
howarfr02,28,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0


In [175]:
# Showing batting stats of specified player
career_batting_df.loc['bondsba01']

G       2986.0
AB      9847.0
R       2227.0
H       2935.0
2B       601.0
3B        77.0
HR       762.0
RBI     1996.0
SB       514.0
CS       141.0
BB      2558.0
SO      1539.0
GIDP     165.0
Name: bondsba01, dtype: float64

# Career Fielding Data

# Career Pitching Data

# Creating Full Dataset containing:
* Hall of Fame Players (playerID, inductedStatus)
* Career Batting Data
* Career Fielding Data
* Career Pitching Data

In [176]:
# Merging all relevant datasets
train_df = pd.merge(hall_of_fame_df, career_batting_df, left_on='playerID', right_on='playerID')

In [177]:
# Cleaning data table
train_df = train_df.drop(['needed_note'], axis=1)
train_df = train_df.dropna()

# Machine Learning
Using classification to predict inductedStatus based on all stats within dataset.

In [178]:
# Importing required packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [179]:
# assigning data
X = train_df[['inducted']]
y = train_df.drop(['playerID', 'inducted', 'votedBy', 'category'], axis=1)

# splitting data
X_train, X_test, y_train, y_test = train_test_split(y, X, test_size=0.25, random_state=0)

In [180]:
# Creating and fitting logistic regression
logisticRegr = LogisticRegression(solver='lbfgs', max_iter=10000)
result = logisticRegr.fit(X_train, y_train.values.ravel())

In [181]:
# Use score method to get accuracy of model
score = logisticRegr.score(X_test, y_test)
print(score)

0.9989561586638831


# Visualizing Model

In [182]:
import matplotlib.pyplot as plt

In [183]:
# https://scikit-learn.org/stable/auto_examples/linear_model/plot_logistic.html

# plt.plot(X_test, logisticRegr.coef_ * X_test + logisticRegr.intercept_, linewidth=1)