# Loading Gamelog Data
In this notebook, we load the gamelogs that were downloaded from Retrosheet. Retrosheet provides one gamelog file per season, but for our purposes it is more convenient to have one file per team, so we split the season files and recombine them into a single gamelog for each team. This notebook should only need to be run once.

## Imports

In [1]:
import os
import numpy as np
import pandas as pd

## Gamelog for each team

In [2]:
# Input: folder holding the per-season gamelog files (e.g. GL2015.txt)
GLS_DIR = "../retrosheet_data/gls/"

# Output: folder that receives one combined gamelog CSV per team
TEAM_GLS_DIR = "../data/team_gls"

Get a list of all teams: open a single season's gamelog (2015) and take the set of distinct home teams.

In [3]:
# Load one season's gamelog (2015) to discover the team codes
gl_2015_path = os.path.join(GLS_DIR, "GL2015.txt")
gl_2015 = pd.read_csv(gl_2015_path, sep=",")

# Distinct home teams: the first occurrence of each code is kept, with its original row label
# NOTE(review): codes are taken from 2015 only -- teams that appear under a different
# code in other seasons would not be picked up; confirm this is intended.
teams = gl_2015["Home Team"].drop_duplicates()
assert len(teams) == 30

Get the home stats for each team when they are the home team

In [4]:
# Attributes to uniquely identify a game
game_id = ['Date',
           'Number of game',
           'Day',
           'Visiting Team',
           'Home Team',
           'Visiting Team Starting Pitcher ID',
           'Home Team Starting Pitcher ID',
           'Visiting Team Score',
           'Home Team Score']

# Home attributes to include
home_attr = ['Home Team At-bats',
             'Home Team Hits',
             'Home Team Doubles',
             'Home Team Triples',
             'Home Team Homeruns',
             'Home Team RBIs',
             'Home Team Sacrifice Hits',
             'Home Team Sacrifice Flies',
             'Home Team Hit-by-pitch',
             'Home Team Walks',
             'Home Team Intentional Walks',
             'Home Team Strikeouts',
             'Home Team Stolen Bases',
             'Home Team Caught Stealing',
             'Home Team Grounded into DP',
             'Home Team Awarded First Base due to CI',
             'Home Team Left on Base',
             'Home Team Pitchers Used',
             'Home Team Individual Earned Runs',
             'Home Team Earned Runs',
             'Home Team Wild Pitches',
             'Home Team Balks',
             'Home Team Putouts',
             'Home Team Assists',
             'Home Team Errors',
             'Home Team Passed Balls',
             'Home Team Double Plays',
             'Home Team Triple Plays']

# Per-team list of home-game slices, one slice per season file
home_frames = dict()

# os.listdir returns entries in arbitrary order, so sort for a reproducible result
for gl_file in sorted(os.listdir(GLS_DIR)):
    gl_path = os.path.join(GLS_DIR, gl_file)

    # Skip hidden files (e.g. .DS_Store) and sub-directories, which are not gamelogs
    if gl_file.startswith(".") or not os.path.isfile(gl_path):
        continue

    gl = pd.read_csv(gl_path, delimiter=",")
    for team in teams:
        # Rows where this team was the home team, restricted to the columns of interest
        team_gl = gl.loc[gl["Home Team"] == team, game_id + home_attr]
        home_frames.setdefault(team, []).append(team_gl)

# Dictionary for home games, keys are teams.
# Concatenate once per team instead of growing a DataFrame inside the loop.
home_gls = {team: pd.concat(frames, ignore_index=True)
            for team, frames in home_frames.items()}

In [5]:
# Away attributes to include
away_attr = ['Visiting Team At-bats',
             'Visiting Team Hits',
             'Visiting Team Doubles',
             'Visiting Team Triples',
             'Visiting Team Homeruns',
             'Visiting Team RBIs',
             'Visiting Team Sacrifice Hits',
             'Visiting Team Sacrifice Flies',
             'Visiting Team Hit-by-pitch',
             'Visiting Team Walks',
             'Visiting Team Intentional Walks',
             'Visiting Team Strikeouts',
             'Visiting Team Stolen Bases',
             'Visiting Team Caught Stealing',
             'Visiting Team Grounded into DP',
             'Visiting Team Awarded First Base due to CI',
             'Visiting Team Left on Base',
             'Visiting Team Pitchers Used',
             'Visiting Team Individual Earned Runs',
             'Visiting Team Earned Runs',
             'Visiting Team Wild Pitches',
             'Visiting Team Balks',
             'Visiting Team Putouts',
             'Visiting Team Assists',
             'Visiting Team Errors',
             'Visiting Team Passed Balls',
             'Visiting Team Double Plays',
             'Visiting Team Triple Plays']

# Per-team list of away-game slices, one slice per season file
away_frames = dict()

# os.listdir returns entries in arbitrary order, so sort for a reproducible result
for gl_file in sorted(os.listdir(GLS_DIR)):
    gl_path = os.path.join(GLS_DIR, gl_file)

    # Skip hidden files (e.g. .DS_Store) and sub-directories, which are not gamelogs
    if gl_file.startswith(".") or not os.path.isfile(gl_path):
        continue

    gl = pd.read_csv(gl_path, delimiter=",")
    for team in teams:
        # Rows where this team was the visiting team, restricted to the columns of interest
        team_gl = gl.loc[gl["Visiting Team"] == team, game_id + away_attr]
        away_frames.setdefault(team, []).append(team_gl)

# Dictionary for away games, keys are teams.
# Concatenate once per team instead of growing a DataFrame inside the loop.
away_gls = {team: pd.concat(frames, ignore_index=True)
            for team, frames in away_frames.items()}

Save dataframes to csvs for each team in appropriate folder.

In [6]:
# Make folder for team game logs (no-op if it already exists)
os.makedirs(TEAM_GLS_DIR, exist_ok=True)

# Clear files left over from a previous run; sub-directories are left alone
# because os.remove cannot delete them
for file in os.listdir(TEAM_GLS_DIR):
    file_path = os.path.join(TEAM_GLS_DIR, file)
    if os.path.isfile(file_path):
        os.remove(file_path)

# Loop through teams and save df
for team in teams:

    # Get home and away dfs for team
    home_gl = home_gls[team]
    away_gl = away_gls[team]

    # Rename columns so home and away stats share the same names ("Team Hits", ...).
    # Note: this renames the frames stored in home_gls / away_gls in place.
    home_gl.columns = game_id + [col.replace("Home ", "") for col in home_attr]
    away_gl.columns = game_id + [col.replace("Visiting ", "") for col in away_attr]

    # Join away and home gamelogs
    team_gl = pd.concat([home_gl, away_gl], ignore_index=True)

    # ID that retrosheet uses to identify games: Home Team + Year + Month + Day + Number of Game.
    # Built from temporary string views so the source columns are not overwritten.
    date_str = team_gl["Date"].astype("string")
    game_number_str = team_gl["Number of game"].astype("string")
    team_gl["Game ID"] = team_gl["Home Team"] + date_str + game_number_str

    # Change date to be datetime datatype
    team_gl["Date"] = pd.to_datetime(date_str, format="%Y%m%d")

    # Chronological order; "Number of game" breaks ties so doubleheaders stay in playing order
    team_gl = team_gl.sort_values(by=["Date", "Number of game"])

    # Save csv
    team_gl.to_csv(os.path.join(TEAM_GLS_DIR, f"GL{team}.csv"), index=False)