In [1]:
import pandas as pd
import numpy as np
from scipy import stats

In [2]:
file = "baseball data/player_stats.csv"

In [3]:
#reading file and fixing load issues
df = pd.read_csv(file)
df = df.drop(columns ='Unnamed: 0')
df.head()

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,SO,IBB,HBP,SH,SF,GIDP,age,SLG,AVG,ISO
0,aaronha01,1955.0,1,ML1,NL,153,602.0,105,189.0,37.0,...,61.0,5.0,3.0,7.0,4.0,20.0,21.0,0.66113,0.313953,0.347176
1,aaronha01,1956.0,1,ML1,NL,153,609.0,106,200.0,34.0,...,54.0,6.0,2.0,5.0,7.0,21.0,22.0,0.679803,0.328407,0.351396
2,aaronha01,1957.0,1,ML1,NL,151,615.0,118,198.0,27.0,...,58.0,15.0,0.0,0.0,3.0,13.0,23.0,0.725203,0.321951,0.403252
3,aaronha01,1958.0,1,ML1,NL,153,601.0,109,196.0,34.0,...,49.0,16.0,1.0,0.0,3.0,21.0,24.0,0.658902,0.326123,0.332779
4,aaronha01,1959.0,1,ML1,NL,154,629.0,116,223.0,46.0,...,54.0,17.0,4.0,0.0,9.0,19.0,25.0,0.782194,0.354531,0.427663


In [4]:
#displaying column names
df.columns.values

array(['playerID', 'yearID', 'stint', 'teamID', 'lgID', 'G', 'AB', 'R',
       'H', '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'IBB', 'HBP',
       'SH', 'SF', 'GIDP', 'age', 'SLG', 'AVG', 'ISO'], dtype=object)

In [7]:
#grouping by year to grab statistical averages for each season
mean_df = df.groupby('yearID').mean()
mean_df['yearID'] = mean_df.index

#cleaning up and reordering column names after groupby
cols = ['yearID','age','AB','H','2B','3B','HR','RBI','SB','CS','BB','SO', 'IBB','HBP', 'SH','SF','SLG','AVG','ISO']
mean_df = mean_df[cols]
mean_df = mean_df.reset_index(drop=True)

mean_df

Unnamed: 0,yearID,age,AB,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,SLG,AVG,ISO
0,1955.0,28.796178,409.420382,111.324841,16.726115,3.719745,12.401274,54.592357,3.764331,2.910828,46.203822,45.636943,3.974522,2.566879,4.063694,3.643312,0.489581,0.267010,0.222571
1,1956.0,28.649351,419.454545,114.051948,17.928571,3.974026,13.084416,55.610390,4.129870,2.649351,46.629870,50.435065,4.350649,2.500000,4.357143,3.454545,0.500054,0.267969,0.232084
2,1957.0,29.256250,407.637500,111.031250,17.543750,3.450000,11.956250,51.725000,4.268750,2.931250,40.987500,49.268750,3.900000,2.600000,3.737500,3.550000,0.493365,0.269446,0.223919
3,1958.0,29.106250,400.400000,109.012500,17.550000,3.362500,11.825000,51.100000,4.075000,2.650000,40.437500,48.825000,3.662500,2.512500,3.387500,3.306250,0.494421,0.268069,0.226353
4,1959.0,28.904459,412.070064,112.082803,18.171975,3.165605,12.299363,53.229299,4.872611,2.866242,42.171975,53.121019,3.815287,2.547771,3.681529,3.235669,0.491840,0.267795,0.224045
5,1960.0,28.581699,416.241830,112.346405,18.352941,3.581699,11.967320,53.281046,5.235294,2.993464,42.666667,55.058824,3.928105,2.542484,3.908497,3.555556,0.494731,0.266741,0.227990
6,1961.0,28.497238,417.270718,113.994475,18.071823,3.662983,13.149171,55.574586,5.254144,2.928177,45.154696,56.143646,3.535912,2.618785,3.552486,3.508287,0.502038,0.269535,0.232503
7,1962.0,28.364103,445.420513,121.261538,18.666667,3.753846,13.533333,58.461538,6.317949,3.179487,45.092308,60.512821,3.646154,3.015385,3.892308,3.641026,0.496169,0.268445,0.227724
8,1963.0,28.094527,432.990050,112.507463,17.308458,3.358209,11.925373,50.502488,5.507463,3.169154,39.019900,64.333333,3.900498,2.955224,3.751244,3.213930,0.463864,0.255661,0.208203
9,1964.0,27.733645,419.570093,111.060748,17.275701,3.163551,11.813084,50.116822,5.112150,2.967290,37.901869,63.859813,4.149533,2.728972,3.588785,2.948598,0.470104,0.260222,0.209882


In [6]:
max_hit = df.loc[df['HR']==df['HR'].max(),:]
print(max_hit)

       playerID  yearID  stint teamID lgID    G     AB    R      H    2B  \
1588  bondsba01  2001.0      1    SFN   NL  153  476.0  129  156.0  32.0   

        ...       SO   IBB  HBP   SH   SF  GIDP   age       SLG       AVG  \
1588    ...     93.0  35.0  9.0  0.0  2.0   5.0  37.0  1.088235  0.327731   

           ISO  
1588  0.760504  

[1 rows x 26 columns]
