# The Golden Gloves
## Topic - Using historical data to predict MLB Hall of Fame status.
#### Data Source - https://www.seanlahman.com/baseball-archive/statistics/  
#### Group Hypothesis - What stats have the most impact on a player making the MLB hall of fame?  
#### Approach to use for analysis - Aggregate stats for each player in regard to batting, fielding, and pitching. Using this data, we can attempt to predict Hall of Fame status using a classification machine learning method.

In [1]:
# Importing necessary packages
import pandas as pd


In [2]:
# Pandas display setup: remove the column cap so wide tables show every column
pd.options.display.max_columns = None


# Hall of Fame Data
Providing an example of how two tables could be combined to create a more readable Hall of Fame table.

In [3]:
# Load the two source tables used in this section: the Hall of Fame voting
# records and the People table that carries each player's name details.
hall_of_fame_df = pd.read_csv(filepath_or_buffer="baseballdatabank-master/core/HallOfFame.csv")
people_df = pd.read_csv(filepath_or_buffer="baseballdatabank-master/core/People.csv")

In [4]:
# Preview the first five rows of the People table
people_df.head(n=5)

Unnamed: 0,playerID,birthYear,birthMonth,birthDay,birthCountry,birthState,birthCity,deathYear,deathMonth,deathDay,deathCountry,deathState,deathCity,nameFirst,nameLast,nameGiven,weight,height,bats,throws,debut,finalGame,retroID,bbrefID
0,aardsda01,1981.0,12.0,27.0,USA,CO,Denver,,,,,,,David,Aardsma,David Allan,215.0,75.0,R,R,2004-04-06,2015-08-23,aardd001,aardsda01
1,aaronha01,1934.0,2.0,5.0,USA,AL,Mobile,2021.0,1.0,22.0,USA,GA,Atlanta,Hank,Aaron,Henry Louis,180.0,72.0,R,R,1954-04-13,1976-10-03,aaroh101,aaronha01
2,aaronto01,1939.0,8.0,5.0,USA,AL,Mobile,1984.0,8.0,16.0,USA,GA,Atlanta,Tommie,Aaron,Tommie Lee,190.0,75.0,R,R,1962-04-10,1971-09-26,aarot101,aaronto01
3,aasedo01,1954.0,9.0,8.0,USA,CA,Orange,,,,,,,Don,Aase,Donald William,190.0,75.0,R,R,1977-07-26,1990-10-03,aased001,aasedo01
4,abadan01,1972.0,8.0,25.0,USA,FL,Palm Beach,,,,,,,Andy,Abad,Fausto Andres,184.0,73.0,L,L,2001-09-10,2006-04-13,abada001,abadan01


In [5]:
# Preview the first five rows of the Hall of Fame voting table
hall_of_fame_df.head(n=5)

Unnamed: 0,playerID,yearID,votedBy,ballots,needed,votes,inducted,category,needed_note
0,cobbty01,1936,BBWAA,226.0,170.0,222.0,Y,Player,
1,ruthba01,1936,BBWAA,226.0,170.0,215.0,Y,Player,
2,wagneho01,1936,BBWAA,226.0,170.0,215.0,Y,Player,
3,mathech01,1936,BBWAA,226.0,170.0,205.0,Y,Player,
4,johnswa01,1936,BBWAA,226.0,170.0,189.0,Y,Player,


In [6]:
# Example merge and selecting necessary columns.
# Attach each Hall of Fame voting row to the matching People row. The join key
# and join type are spelled out: without `on`, pd.merge joins on every column
# name the two tables share, so a shared column added to either table later
# would silently change the join condition.
hof_merged_df = pd.merge(hall_of_fame_df, people_df, on='playerID', how='inner')
# Keep only the identifying and voting columns. playerID is not unique in the
# result: a player has one row per voting record.
hof_merged_df = hof_merged_df[['playerID', 'nameLast', 'nameGiven', 'inducted', 'votes']]

In [7]:
# Preview the first five rows of the merged Hall of Fame table
hof_merged_df.head(n=5)

Unnamed: 0,playerID,nameLast,nameGiven,inducted,votes
0,cobbty01,Cobb,Tyrus Raymond,Y,222.0
1,ruthba01,Ruth,George Herman,Y,215.0
2,wagneho01,Wagner,John Peter,Y,215.0
3,wagneho01,Wagner,John Peter,N,5.0
4,mathech01,Mathewson,Christopher,Y,205.0


# Batting Data
Providing a general example of how batting data could be used.

In [8]:
# Importing data
# The file name is capitalised to match the other tables loaded from this
# folder (People.csv, HallOfFame.csv). The lowercase "batting.csv" only
# resolves on case-insensitive filesystems.
# NOTE(review): confirm the local copy keeps the upstream name "Batting.csv".
batting_df = pd.read_csv("baseballdatabank-master/core/Batting.csv")

In [9]:
# Preview the first five rows of the batting table
batting_df.head(n=5)

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
0,abercda01,1871,1,TRO,,1,4,0,0,0,0,0,0.0,0.0,0.0,0,0.0,,,,,0.0
1,addybo01,1871,1,RC1,,25,118,30,32,6,0,0,13.0,8.0,1.0,4,0.0,,,,,0.0
2,allisar01,1871,1,CL1,,29,137,28,40,4,5,0,19.0,3.0,1.0,2,5.0,,,,,1.0
3,allisdo01,1871,1,WS3,,27,133,28,44,10,2,2,27.0,1.0,1.0,0,2.0,,,,,0.0
4,ansonca01,1871,1,RC1,,25,120,29,39,11,3,0,16.0,6.0,2.0,2,1.0,,,,,0.0


In [10]:
# Order all batting rows by home runs (highest first), then group the ordered
# rows by playerID. Rows keep their sorted order inside each group, so every
# player's rows run from most to fewest home runs.
batting_grouped_df = (
    batting_df
    .sort_values(by=['HR'], ascending=False)
    .groupby(by=['playerID'])
)

In [11]:
# GroupBy.head keeps the first n rows of every group, so this shows up to
# five of each player's highest home-run rows
batting_grouped_df.head(n=5)

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
80767,bondsba01,2001,1,SFN,NL,153,476,129,156,32,2,73,137.0,13.0,3.0,177,93.0,35.0,9.0,0.0,2.0,5.0
77404,mcgwima01,1998,1,SLN,NL,155,509,130,152,21,0,70,147.0,1.0,0.0,162,155.0,28.0,6.0,0.0,4.0,8.0
77751,sosasa01,1998,1,CHN,NL,159,643,134,198,20,0,66,158.0,18.0,9.0,73,171.0,14.0,1.0,0.0,5.0,20.0
78720,mcgwima01,1999,1,SLN,NL,153,521,118,145,21,1,65,147.0,0.0,0.0,133,141.0,21.0,2.0,0.0,5.0,12.0
81785,sosasa01,2001,1,CHN,NL,160,577,146,189,34,5,64,160.0,0.0,2.0,116,153.0,37.0,6.0,0.0,12.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43648,dottega01,1964,1,MIN,AL,3,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
43644,dicksji01,1964,1,CIN,NL,4,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
43643,dickepa01,1964,1,CLE,AL,11,11,0,0,0,0,0,0.0,0.0,0.0,0,5.0,0.0,0.0,0.0,0.0,0.0
43640,deesch01,1964,1,LAA,AL,26,26,3,2,1,0,0,1.0,1.0,2.0,1,4.0,0.0,1.0,0.0,0.0,0.0


In [12]:
# Build a regular dataframe of batting rows ordered by home runs (highest first).
# This replaces the earlier `groupby(...).apply(lambda x: x)` round-trip, which
# ran a Python-level call per player only to hand the same rows back. Its result
# (index layout, and whether playerID survives as a column) also depends on the
# installed pandas version. Sorting the source table directly yields the same
# HR-descending rows with the original integer index.
season_bats_df = batting_df.sort_values(['HR'], ascending=False)

In [13]:
# Select every batting row that belongs to one player (here: bondsba01)
season_bats_df[season_bats_df['playerID'].eq('bondsba01')]

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
80767,bondsba01,2001,1,SFN,NL,153,476,129,156,32,2,73,137.0,13.0,3.0,177,93.0,35.0,9.0,0.0,2.0,5.0
79392,bondsba01,2000,1,SFN,NL,143,480,129,147,28,4,49,106.0,11.0,3.0,117,77.0,22.0,3.0,0.0,7.0,6.0
82096,bondsba01,2002,1,SFN,NL,143,403,117,149,31,2,46,110.0,9.0,2.0,198,47.0,68.0,9.0,0.0,2.0,4.0
70794,bondsba01,1993,1,SFN,NL,159,539,129,181,38,4,46,123.0,29.0,12.0,126,79.0,43.0,2.0,0.0,7.0,11.0
84776,bondsba01,2004,1,SFN,NL,147,373,129,135,27,3,45,101.0,6.0,1.0,232,41.0,120.0,9.0,0.0,3.0,5.0
83424,bondsba01,2003,1,SFN,NL,130,390,111,133,22,1,45,90.0,7.0,0.0,148,58.0,61.0,10.0,0.0,2.0,7.0
74270,bondsba01,1996,1,SFN,NL,158,517,122,159,27,3,42,129.0,40.0,7.0,151,76.0,30.0,1.0,0.0,6.0,11.0
75520,bondsba01,1997,1,SFN,NL,159,532,123,155,26,5,40,101.0,37.0,8.0,145,87.0,34.0,8.0,0.0,5.0,13.0
71952,bondsba01,1994,1,SFN,NL,112,391,89,122,18,1,37,81.0,29.0,9.0,74,43.0,18.0,6.0,0.0,3.0,3.0
76764,bondsba01,1998,1,SFN,NL,156,552,120,167,44,7,37,122.0,28.0,12.0,130,92.0,29.0,8.0,1.0,6.0,15.0
