# Baseball Game Logs File I/O
* Data source: Retrosheet (http://www.retrosheet.org/)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

% matplotlib inline



I decided to use Retrosheet's game logs (http://www.retrosheet.org/). Retrosheet contains game logs of increasing complexity for games from years 1871 - 2016, in .txt files with no headers, and Retrosheet provides a data dictionary that describes the 161 columns of each game log (data dict: http://www.retrosheet.org/gamelogs/glfields.txt ). The columns, grouped by how I plan to treat them:
- DateTime object (keep): Time series optionality
 * 'date',
- Integer: (0,1,2) (keep): Hypothesize that the later in a series against a team, the more likely an injury.
 * 'num_game',
- Categorical String object: **Encode**: Hypothesize that certain days of the week related to final games of play before a break are correlated with higher injuries.
 * 'day',
- Categorical String object: **Encode**
 * 'v_team','v_league',
 * 'h_team', 'h_league',
- Integer: (keep) Hypothesize that the later the game in the season, the more likely an injury because of player wear.
 * 'v_team_game_num', 'h_team_game_num',
- Integer: Max 30, Mean = 4.5
 * 'v_team_score','h_team_score',
 * 'game_length_outs',
 
 * 'day_night': **Encode** ( 'N'    29485, 'D'    14242)
- These relate to games that weren't completed or were forfeited. The columns are almost completely null, so I'm going to remove them. `.isnull().sum()` gives: when_complete 43700, forfeit 43727, protest 43700
 * 'when_complete'(7 games),'forfeit','protest',
- Just in case playing in some parks is harder than others, keep park_id: **Encode** 
 * 'park_id', 
- If attendance is something to keep, then I would hypothesize that bigger crowds give players adrenaline, which makes them push harder and get injured. The collinearity risk is that crowd size is a proxy for popularity, which might itself be a function of high-scoring teams, so I have to be careful.
 *  'attendance',
- The length of the game might have an effect on injuries.
 * 'time_game_min', 
 * 'v_line_scores','h_line_scores',
 * 'v_at_bats','v_hits','v_doubles','v_triples','v_homeruns','v_RBI','v_sac_hits','v_sac_files','v_hit_pitch',
 * 'v_walks', 'v_int_walks', 'v_strikeouts', 'v_stol_base', 'v_caught_steal', 'v_grnd_dbl_plays', 
 * 'v_awd_fst_catch_intf', 'v_left_on_base','v_pitchers','v_ind_earn_runs','v_team_earn_runs',
 * 'v_wild_pitch','v_balks','v_putouts','v_assists','v_errors','v_pass_balls','v_dbl_plays', 
 * 'v_trp_plays',
 * 'h_at_bats','h_hits','h_doubles', 'h_triples','h_homeruns', 'h_RBI','h_sac_hits',
 * 'h_sac_files','h_hit_pitch','h_walks', 'h_int_walks','h_strikeouts', 'h_stol_base',
 * 'h_caught_steal','h_grnd_dbl_plays', 'h_awd_fst_catch_intf','h_left_on_base',
 * 'h_pitchers','h_ind_earn_runs','h_team_earn_runs','h_wild_pitch','h_balks',
 * 'h_putouts','h_assists','h_errors','h_pass_balls','h_dbl_plays','h_trp_plays',
- The next columns identify specific players and umpires. In the second, more complex analysis, as next steps, I would like to scrape team info like visiting team starting lineup median age, median years of major league play, past number of injuries:
- Delete umpire columns:
 * 'h_plate_ump_id','h_plate_ump_name','first_b_ump_id','first_b_ump_name',
 * 'sec_b_ump_id','sec_b_ump_name','third_b_ump_id','third_b_ump_name', 
 * 'lf_ump_id','lf_ump_name','rf_ump_id','rf_ump_name',
- Keep manager ids but not names: **Encode**
 * 'v_mgr_id','v_mgr_name','h_mgr_id','h_mgr_name',
- Players: Keep IDs, remove names? Move IDs to Injury tables?
 * 'w_pitch_id','w_pitch"name','l_pitch_id','l_pitch_name','sv_pitch_id','sv_pitch_name',
 * 'game_win_rbi_bat_id','game_win_rbi_bat_name',
 * 'v_start_pitch_id','v_start_pitch_name', 'h_start_pitch_id', 'h_start_pitch_name', 
 * 'v_pl_1_id','v_pl_1_name', 'v_pl_1_pos','v_pl_2_id','v_pl_2_name', 'v_pl_2_pos',
 * 'v_pl_3_id','v_pl_3_name', 'v_pl_3_pos','v_pl_4_id','v_pl_4_name', 'v_pl_4_pos',
 * 'v_pl_5_id','v_pl_5_name', 'v_pl_5_pos','v_pl_6_id','v_pl_6_name', 'v_pl_6_pos',
 * 'v_pl_7_id','v_pl_7_name', 'v_pl_7_pos', 'v_pl_8_id','v_pl_8_name', 'v_pl_8_pos',
 * 'v_pl_9_id','v_pl_9_name', 'v_pl_9_pos',
 * 'h_pl_1_id','h_pl_1_name', 'h_pl_1_pos','h_pl_2_id','h_pl_2_name', 'h_pl_2_pos',
 * 'h_pl_3_id','h_pl_3_name', 'h_pl_3_pos','h_pl_4_id','h_pl_4_name', 'h_pl_4_pos',
 * 'h_pl_5_id','h_pl_5_name', 'h_pl_5_pos','h_pl_6_id','h_pl_6_name', 'h_pl_6_pos',
 * 'h_pl_7_id','h_pl_7_name', 'h_pl_7_pos','h_pl_8_id','h_pl_8_name', 'h_pl_8_pos',
 * 'h_pl_9_id','h_pl_9_name', 'h_pl_9_pos',

Delete the following columns which don't provide much additional info:
* 'addl_info','table_acq_from'
 

In [3]:
# Read the Retrosheet game-log text files (one per season) and stack them into a
# single dataframe, game_data_all.
#
# NOTE(review): 'GL2000.TXT' is listed twice, so every game of the 2000 season is
# loaded twice. The entry is kept on purpose: removing it changes the row positions
# of all later seasons, which later cells appear to rely on, and the exact copies
# are removed further down by drop_duplicates(). Confirm before removing it.
files = ['GL2000.TXT', 'GL2001.TXT', 'GL2002.TXT', 'GL2003.TXT', 'GL2004.TXT', 'GL2005.TXT','GL2000.TXT',
        'GL2006.TXT','GL2007.TXT','GL2008.TXT','GL2009.TXT','GL2010.TXT','GL2011.TXT',
        'GL2012.TXT','GL2013.TXT','GL2014.TXT','GL2015.TXT','GL2016.TXT']

# The game-log files have no header row, so the column names are supplied here.
# The list is the same for every file and is therefore built once, outside the loop.
GAME_LOG_COLUMNS = [
    'date','num_game','day','v_team','v_league','v_team_game_num',
    'h_team', 'h_league','h_team_game_num','v_team_score','h_team_score','game_length_outs',
    'day_night','when_complete','forfeit','protest','park_id', 'attendance',
    'time_game_min', 'v_line_scores','h_line_scores','v_at_bats','v_hits','v_doubles',
    'v_triples','v_homeruns','v_RBI','v_sac_hits', 'v_sac_files','v_hit_pitch',
    'v_walks', 'v_int_walks', 'v_strikeouts', 'v_stol_base', 'v_caught_steal', 'v_grnd_dbl_plays',
    'v_awd_fst_catch_intf', 'v_left_on_base','v_pitchers','v_ind_earn_runs','v_team_earn_runs','v_wild_pitch',
    'v_balks','v_putouts','v_assists','v_errors','v_pass_balls','v_dbl_plays',
    'v_trp_plays','h_at_bats','h_hits','h_doubles', 'h_triples','h_homeruns',
    'h_RBI','h_sac_hits', 'h_sac_files','h_hit_pitch','h_walks', 'h_int_walks',
    'h_strikeouts', 'h_stol_base', 'h_caught_steal','h_grnd_dbl_plays', 'h_awd_fst_catch_intf','h_left_on_base',
    'h_pitchers','h_ind_earn_runs','h_team_earn_runs','h_wild_pitch','h_balks','h_putouts',
    'h_assists','h_errors','h_pass_balls','h_dbl_plays','h_trp_plays','h_plate_ump_id',
    'h_plate_ump_name','first_b_ump_id','first_b_ump_name','sec_b_ump_id','sec_b_ump_name','third_b_ump_id',
    'third_b_ump_name', 'lf_ump_id','lf_ump_name','rf_ump_id','rf_ump_name','v_mgr_id',
    'v_mgr_name','h_mgr_id','h_mgr_name','w_pitch_id','w_pitch"name','l_pitch_id',
    'l_pitch_name','sv_pitch_id','sv_pitch_name','game_win_rbi_bat_id','game_win_rbi_bat_name','v_start_pitch_id',
    'v_start_pitch_name', 'h_start_pitch_id', 'h_start_pitch_name', 'v_pl_1_id','v_pl_1_name', 'v_pl_1_pos',
    'v_pl_2_id','v_pl_2_name', 'v_pl_2_pos','v_pl_3_id','v_pl_3_name', 'v_pl_3_pos',
    'v_pl_4_id','v_pl_4_name', 'v_pl_4_pos','v_pl_5_id','v_pl_5_name', 'v_pl_5_pos',
    'v_pl_6_id','v_pl_6_name', 'v_pl_6_pos','v_pl_7_id','v_pl_7_name', 'v_pl_7_pos',
    'v_pl_8_id','v_pl_8_name', 'v_pl_8_pos','v_pl_9_id','v_pl_9_name', 'v_pl_9_pos',
    'h_pl_1_id','h_pl_1_name', 'h_pl_1_pos','h_pl_2_id','h_pl_2_name', 'h_pl_2_pos',
    'h_pl_3_id','h_pl_3_name', 'h_pl_3_pos','h_pl_4_id','h_pl_4_name', 'h_pl_4_pos',
    'h_pl_5_id','h_pl_5_name', 'h_pl_5_pos','h_pl_6_id','h_pl_6_name', 'h_pl_6_pos',
    'h_pl_7_id','h_pl_7_name', 'h_pl_7_pos','h_pl_8_id','h_pl_8_name', 'h_pl_8_pos',
    'h_pl_9_id','h_pl_9_name', 'h_pl_9_pos','addl_info','table_acq_from']

# Read every file first and concatenate once at the end: calling pd.concat inside
# the loop re-copies all previously loaded rows on every iteration.
season_frames = []
for filey in files:
    temp_df = pd.read_csv(filey, header=None)
    # Assigning the names after the read makes pandas raise if the number of
    # columns in the file differs from the length of GAME_LOG_COLUMNS.
    temp_df.columns = GAME_LOG_COLUMNS
    season_frames.append(temp_df)

# Each file keeps its own 0..n-1 row labels, exactly as before.
game_data_all = pd.concat(season_frames, axis=0)

game_data_all.shape

(43727, 161)

In [9]:
# # Temporarily put all game logs into csv
# game_data_all.to_csv('gamelogs-2000-2016.csv',index=False)

In [4]:
# Work on an independent copy of the raw game logs. reset_index() gives the rows a
# fresh 0..n-1 index and keeps the old per-file row labels as an 'index' column.
df = game_data_all.copy().reset_index()

In [5]:
# Number of missing values per column, as a one-column frame named 'nulls'
null_df = df.isnull().sum().to_frame(name='nulls')
# Show only the columns that actually contain missing values
null_df.loc[null_df['nulls'] > 0]

Unnamed: 0,nulls
when_complete,43700
forfeit,43727
protest,43700
attendance,2
sec_b_ump_id,73
lf_ump_id,43723
rf_ump_id,43723
w_pitch_id,7
"w_pitch""name",7
l_pitch_id,7


### Get Rid of Null and Unnecessary Columns

In [6]:
# Columns to remove because they are (almost) entirely null or not relevant to this
# project: 65 columns in total.
cols_to_drop = [
    'when_complete','protest','forfeit','addl_info','table_acq_from',
    'h_plate_ump_id','h_plate_ump_name','first_b_ump_id','first_b_ump_name',
    'sec_b_ump_id','sec_b_ump_name','third_b_ump_id','third_b_ump_name','lf_ump_id',
    'lf_ump_name','rf_ump_id','rf_ump_name','sv_pitch_id','sv_pitch_name',
    'v_line_scores','h_line_scores','v_mgr_name','h_mgr_name','w_pitch"name','l_pitch_name',
    'game_win_rbi_bat_id','game_win_rbi_bat_name','v_start_pitch_name','h_start_pitch_name',
]
# Name and position of the nine starting players of the visiting (v) and home (h)
# teams; the matching *_id columns are kept.
cols_to_drop += [
    '{}_pl_{}_{}'.format(side, slot, field)
    for side in ('v', 'h')
    for slot in range(1, 10)
    for field in ('name', 'pos')
]

# Everything not listed above is kept, in its original column order.
cols_to_keep = df.columns[~df.columns.isin(cols_to_drop)].tolist()

# Create a new, independent dataframe with only the columns needed for this project:
df_sm = df.loc[:, cols_to_keep].copy()

In [7]:
# Columns that still contain nulls: attendance has to be imputed for two games, and
# the seven rows whose pitcher ids are missing have to be dropped.
null_counts = df_sm.isnull().sum()
null_counts[null_counts > 0]

attendance    2
w_pitch_id    7
l_pitch_id    7
dtype: int64

In [8]:
# Show the games whose attendance is missing. There are only two, so they can be
# inspected directly. The mask comes from df_sm but is applied to df (same row
# index) so that all of the original columns are displayed.
df.loc[df_sm['attendance'].isnull()]

Unnamed: 0,index,date,num_game,day,v_team,v_league,v_team_game_num,h_team,h_league,h_team_game_num,...,h_pl_7_name,h_pl_7_pos,h_pl_8_id,h_pl_8_name,h_pl_8_pos,h_pl_9_id,h_pl_9_name,h_pl_9_pos,addl_info,table_acq_from
39174,304,20150429,0,Wed,CHA,AL,18,BAL,AL,20,...,Everth Cabrera,6,josec002,Caleb Joseph,2,navar001,Rey Navarro,4,,Y
40888,2018,20150905,0,Sat,ARI,NL,136,CHN,NL,134,...,David Ross,2,arrij001,Jake Arrieta,1,russa002,Addison Russell,6,,Y


In [9]:
# Mean attendance of each park during 2015, the season of the two games whose
# attendance is missing (kept as a reference table):
df_2015_park_attend = df_sm[(df_sm['date'] > 20150201) & (df_sm['date'] < 20151101)].loc[:,['park_id','attendance']]
park_attendance_mean = df_2015_park_attend.groupby('park_id').mean()

# Locate the games with missing attendance by value instead of by hard-coded row
# label, so the cell keeps working if the number or order of loaded rows changes.
missing_attendance = df_sm['attendance'].isnull()
print(df_sm.loc[missing_attendance, ['date', 'park_id', 'attendance']])

# Impute each missing value with the mean attendance of the same park in the same
# season. 'date' is a yyyymmdd integer, so integer division by 10000 yields the year;
# the mean ignores the missing values themselves. (These means were previously typed
# in by hand: 29626.0 for BAL12 and 36489.025 for CHI11.)
season = df_sm['date'] // 10000
park_season_mean = df_sm.groupby([season, 'park_id'])['attendance'].transform('mean')
df_sm.loc[missing_attendance, 'attendance'] = park_season_mean[missing_attendance]

# Drop every row that still contains a null in any column (e.g. the games without
# winning/losing pitcher ids, or a game whose park has no attendance at all that season):
df_sm = df_sm.dropna(axis=0)

nan
nan


In [10]:
# Verify that no column contains nulls any more (expected result: an empty Series)
remaining_nulls = df_sm.isnull().sum()
remaining_nulls[remaining_nulls > 0]

Series([], dtype: int64)

In [11]:
# Preview the first rows of the reduced, null-free game-log table
df_sm.head()

Unnamed: 0,index,date,num_game,day,v_team,v_league,v_team_game_num,h_team,h_league,h_team_game_num,...,v_pl_9_id,h_pl_1_id,h_pl_2_id,h_pl_3_id,h_pl_4_id,h_pl_5_id,h_pl_6_id,h_pl_7_id,h_pl_8_id,h_pl_9_id
0,0,20000329,0,Wed,CHN,NL,1,NYN,NL,1,...,liebj001,hendr001,hamid001,alfoe001,piazm001,ventr001,belld001,zeilt001,ordor001,hampm001
1,1,20000330,0,Thu,NYN,NL,2,CHN,NL,2,...,reedr002,youne001,bufod001,sosas001,gracm001,rodrh001,andrs001,husoj001,giraj001,farnk001
2,2,20000403,0,Mon,COL,NL,1,ATL,NL,1,...,astap001,veraq001,sandr002,jonec004,jordb001,galaa001,jonea002,peree002,weisw001,maddg002
4,4,20000403,0,Mon,SFN,NL,1,FLO,NL,1,...,hernl003,castl001,gonza002,floyc001,wilsp002,lowem001,millk005,browb003,redmm001,ferna001
5,5,20000403,0,Mon,LAN,NL,1,MON,NL,1,...,browk001,bergp001,vidrj001,whitr001,guerv001,stevl001,barrm003,widgc001,cabro001,hermd001


In [12]:
# List the columns that remain in df_sm after the column selection above
df_sm.columns

Index(['index', 'date', 'num_game', 'day', 'v_team', 'v_league',
       'v_team_game_num', 'h_team', 'h_league', 'h_team_game_num',
       'v_team_score', 'h_team_score', 'game_length_outs', 'day_night',
       'park_id', 'attendance', 'time_game_min', 'v_at_bats', 'v_hits',
       'v_doubles', 'v_triples', 'v_homeruns', 'v_RBI', 'v_sac_hits',
       'v_sac_files', 'v_hit_pitch', 'v_walks', 'v_int_walks', 'v_strikeouts',
       'v_stol_base', 'v_caught_steal', 'v_grnd_dbl_plays',
       'v_awd_fst_catch_intf', 'v_left_on_base', 'v_pitchers',
       'v_ind_earn_runs', 'v_team_earn_runs', 'v_wild_pitch', 'v_balks',
       'v_putouts', 'v_assists', 'v_errors', 'v_pass_balls', 'v_dbl_plays',
       'v_trp_plays', 'h_at_bats', 'h_hits', 'h_doubles', 'h_triples',
       'h_homeruns', 'h_RBI', 'h_sac_hits', 'h_sac_files', 'h_hit_pitch',
       'h_walks', 'h_int_walks', 'h_strikeouts', 'h_stol_base',
       'h_caught_steal', 'h_grnd_dbl_plays', 'h_awd_fst_catch_intf',
       'h_left_on_base', 

### Drop duplicate rows:

In [17]:
# Shape before de-duplication
print(df_sm.shape)
# Remove rows that are exact copies of an earlier row
df_sm = df_sm.drop_duplicates()
print('After removing duplicate rows:',df_sm.shape)
# Renumber the remaining rows 0..n-1
df_sm.index = np.arange(len(df_sm))

(41292, 97)
After removing duplicate rows: (41292, 97)


In [18]:
# Combine v_league and h_league into one feature, intra_league:
# 1 when the home and visiting teams belong to the same league, 0 otherwise.
# The comparison is done on whole columns at once instead of calling a Python
# lambda once per row.
df_sm['intra_league'] = (df_sm['h_league'] == df_sm['v_league']).astype('int64')
# The two league columns are no longer needed once the flag exists.
df_sm = df_sm.drop(['v_league','h_league'], axis=1)

In [19]:
# Make sure the game logs are sorted by date of game and renumber the rows 0..n-1
df_sm = df_sm.sort_values('date')
df_sm.index = np.arange(0,len(df_sm))

# Add a dt_date column to the df_sm table: 'date' holds yyyymmdd integers, so it is
# parsed in one vectorized call with an explicit format instead of being sliced and
# re-assembled row by row and then parsed with format inference.
df_sm['dt_date'] = pd.to_datetime(df_sm['date'].astype(str), format='%Y%m%d')

In [20]:
# Build a date-indexed view of the game logs.
# df_sm already has a column called 'index' (the per-file row labels), so
# reset_index() stores the current row positions in a new 'level_0' column, and the
# rename below applies to the older 'index' column, not to the current positions.
df_ixd = (
    df_sm.reset_index()
         .rename(columns={"index": "real_index"})
         .set_index('dt_date')
)
print(df_ixd.shape)
df_ixd.head()

(41292, 97)


Unnamed: 0_level_0,level_0,real_index,date,num_game,day,v_team,v_team_game_num,h_team,h_team_game_num,v_team_score,...,h_pl_1_id,h_pl_2_id,h_pl_3_id,h_pl_4_id,h_pl_5_id,h_pl_6_id,h_pl_7_id,h_pl_8_id,h_pl_9_id,intra_league
dt_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-03-29,0,0,20000329,0,Wed,CHN,1,NYN,1,5,...,hendr001,hamid001,alfoe001,piazm001,ventr001,belld001,zeilt001,ordor001,hampm001,1
2000-03-30,1,1,20000330,0,Thu,NYN,2,CHN,2,5,...,youne001,bufod001,sosas001,gracm001,rodrh001,andrs001,husoj001,giraj001,farnk001,1
2000-04-03,2,13,20000403,0,Mon,KCA,1,TOR,1,4,...,stews002,bushh001,mondr002,delgc001,fullb001,batit001,fletd001,cruzj004,gonza001,1
2000-04-03,3,12,20000403,0,Mon,CHA,1,TEX,1,4,...,clayr001,greer001,rodri001,palmr001,mater001,segud001,kaplg001,evant001,alicl001,1
2000-04-03,4,11,20000403,0,Mon,DET,1,OAK,1,7,...,beckr002,tejam001,giamj001,jahaj001,staim001,grieb001,hernr002,chave001,menef001,1


In [21]:
# Remove the 'level_0' helper column that reset_index() created
df_ixd = df_ixd.drop(['level_0'], axis=1)
df_ixd.head()

Unnamed: 0_level_0,real_index,date,num_game,day,v_team,v_team_game_num,h_team,h_team_game_num,v_team_score,h_team_score,...,h_pl_1_id,h_pl_2_id,h_pl_3_id,h_pl_4_id,h_pl_5_id,h_pl_6_id,h_pl_7_id,h_pl_8_id,h_pl_9_id,intra_league
dt_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-03-29,0,20000329,0,Wed,CHN,1,NYN,1,5,3,...,hendr001,hamid001,alfoe001,piazm001,ventr001,belld001,zeilt001,ordor001,hampm001,1
2000-03-30,1,20000330,0,Thu,NYN,2,CHN,2,5,1,...,youne001,bufod001,sosas001,gracm001,rodrh001,andrs001,husoj001,giraj001,farnk001,1
2000-04-03,13,20000403,0,Mon,KCA,1,TOR,1,4,5,...,stews002,bushh001,mondr002,delgc001,fullb001,batit001,fletd001,cruzj004,gonza001,1
2000-04-03,12,20000403,0,Mon,CHA,1,TEX,1,4,10,...,clayr001,greer001,rodri001,palmr001,mater001,segud001,kaplg001,evant001,alicl001,1
2000-04-03,11,20000403,0,Mon,DET,1,OAK,1,7,4,...,beckr002,tejam001,giamj001,jahaj001,staim001,grieb001,hernr002,chave001,menef001,1


In [26]:
# Rebuild real_index so that it holds the row position in date order.
# NOTE(review): this cell used to read `df_new` before any cell defined it, which
# raises NameError on a fresh kernel. It is now derived from df_ixd, which is
# consistent with the shapes printed by the notebook - confirm that no other
# transformation was expected in between.
df_new = (
    df_ixd.reset_index()                  # dt_date: index -> regular column
          .reset_index()                  # current row positions -> 'index' column
          .drop('real_index', axis=1)     # discard the stale per-file row labels
          .rename(columns={"index": "real_index"})
          .set_index('dt_date')
)
print(df_new.shape)
df_new.head()

(41292, 96)


Unnamed: 0_level_0,real_index,date,num_game,day,v_team,v_team_game_num,h_team,h_team_game_num,v_team_score,h_team_score,...,h_pl_1_id,h_pl_2_id,h_pl_3_id,h_pl_4_id,h_pl_5_id,h_pl_6_id,h_pl_7_id,h_pl_8_id,h_pl_9_id,intra_league
dt_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-03-29,0,20000329,0,Wed,CHN,1,NYN,1,5,3,...,hendr001,hamid001,alfoe001,piazm001,ventr001,belld001,zeilt001,ordor001,hampm001,1
2000-03-30,1,20000330,0,Thu,NYN,2,CHN,2,5,1,...,youne001,bufod001,sosas001,gracm001,rodrh001,andrs001,husoj001,giraj001,farnk001,1
2000-04-03,2,20000403,0,Mon,KCA,1,TOR,1,4,5,...,stews002,bushh001,mondr002,delgc001,fullb001,batit001,fletd001,cruzj004,gonza001,1
2000-04-03,3,20000403,0,Mon,CHA,1,TEX,1,4,10,...,clayr001,greer001,rodri001,palmr001,mater001,segud001,kaplg001,evant001,alicl001,1
2000-04-03,4,20000403,0,Mon,DET,1,OAK,1,7,4,...,beckr002,tejam001,giamj001,jahaj001,staim001,grieb001,hernr002,chave001,menef001,1


In [29]:
# Turn dt_date from the index back into a regular column (the index becomes 0..n-1);
# presumably so that dt_date is included when the frame is saved without its index.
df_new = df_new.reset_index()
print(df_new.shape)
df_new.head()

(41292, 97)


Unnamed: 0,dt_date,real_index,date,num_game,day,v_team,v_team_game_num,h_team,h_team_game_num,v_team_score,...,h_pl_1_id,h_pl_2_id,h_pl_3_id,h_pl_4_id,h_pl_5_id,h_pl_6_id,h_pl_7_id,h_pl_8_id,h_pl_9_id,intra_league
0,2000-03-29,0,20000329,0,Wed,CHN,1,NYN,1,5,...,hendr001,hamid001,alfoe001,piazm001,ventr001,belld001,zeilt001,ordor001,hampm001,1
1,2000-03-30,1,20000330,0,Thu,NYN,2,CHN,2,5,...,youne001,bufod001,sosas001,gracm001,rodrh001,andrs001,husoj001,giraj001,farnk001,1
2,2000-04-03,2,20000403,0,Mon,KCA,1,TOR,1,4,...,stews002,bushh001,mondr002,delgc001,fullb001,batit001,fletd001,cruzj004,gonza001,1
3,2000-04-03,3,20000403,0,Mon,CHA,1,TEX,1,4,...,clayr001,greer001,rodri001,palmr001,mater001,segud001,kaplg001,evant001,alicl001,1
4,2000-04-03,4,20000403,0,Mon,DET,1,OAK,1,7,...,beckr002,tejam001,giamj001,jahaj001,staim001,grieb001,hernr002,chave001,menef001,1


# Save to CSV:

In [30]:
# Write the cleaned game logs to gamelogs.csv; index=False leaves the row index out of the file
df_new.to_csv('gamelogs.csv',index=False)