In [88]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as sp
import os

In [118]:
# Set the year range
year_range = [2010, 2022]

Below is a list of all columns:
'Date': Date of the game
'team1' : Home team
'team2' : Away team
'team1_rs' : Home team run scored in the game
'team2_rs' : Away team run scored in the game
'team1_gn' : Team1's total game number (number of games before this game +1)
'team2_gn' : Team2's total game number (number of games before this game +1)
'team1_home_gn' : Team1's home game number
'team2_away_gn' : Team2's away game number
'team1_wr' : Team1's win rate before the game
'team1_home_wr' : Team1's home win rate before the game
'team2_wr': Team2's win rate before the game
'team2_away_wr' : Team2's away win rate before the game
'team1_win': Team1 win = 1 otherwise = 0
'team1_total_rs': Team1's total run scored before the game
'team2_total_rs': Team2's total run scored before the game
'team1_avg_rs': Team1's average run scored before the game
'team2_avg_rs': Team2's average run scored before the game
'team1_home_rs': Team1's total run scored as home team before the game
'team2_away_rs': Team2's total run scored as away team before the game
'team1_avg_home_rs': Team1's average run scored as home team before the game
'team2_avg_away_rs': Team2's average run scored as away team before the game

In [119]:
years = np.arange(year_range[0], year_range[1]+1)

for year in years:

    ## Load row data in years
    fname = 'gl'+str(year)
    fpath = './unsorted_game_logs/' + fname + '.txt'
    colind = np.arange(0, 11)
    colind = np.append(colind, np.array([21, 22, 49, 50]))
    colnames =  ['Date', 'Ngame', 'DOW', 'VT', 'VTL', 'VTGN', 'HT', 'HTL', 'HTGN', 'VTS', 'HTS', 
                 'VT_at_bats', 'VT_hits', 'HT_at_bats', 'HT_hits']
    gl = pd.read_csv(fpath, usecols = colind, names = colnames, index_col = False)
    
    ## team list
    teams = np.unique(gl['HT'])

    ## Copy columns from raw data
    df1 = pd.DataFrame({'Date': gl['Date'],                   # Date of the game
                        'team1': gl['HT'],                    # Home team
                        'team2': gl['VT'],                    # Away team
                        'team1_rs': gl['HTS'],                # Home team run scored
                        'team2_rs': gl['VTS']})               # Away team run scored
    

    (team1_game_number , team2_game_number , team1_home_game_number , team2_away_game_number, 
    team1_total_rs , team2_total_rs , team1_home_rs , team2_away_rs , 
    team1_home_rs, team2_away_rs, team1_win_rate, team1_home_wr, team2_win_rate, team2_away_wr,
    team1_avg_rs, team2_avg_rs, team1_avg_home_rs, team2_avg_away_rs) = (np.zeros(len(df1)) for i in range(18)) 
    
    for i in range(len(df1)):
        df_current = df1.iloc[:i]
        team1 = df1.iloc[i]['team1']
        team2 = df1.iloc[i]['team2']

        team1_game_number[i] = sum(df_current['team1'] == team1) + sum(df_current['team2'] == team1) + 1
        team2_game_number[i] = sum(df_current['team1'] == team2) + sum(df_current['team2'] == team2) + 1
        team1_home_game_number[i] = sum(df_current['team1'] == team1) + 1
        team2_away_game_number[i] = sum(df_current['team2'] == team2) + 1
        team1_home_rs[i] = sum(df_current[df_current['team1'] == team1]['team1_rs'])
        team2_away_rs[i] = sum(df_current[df_current['team2'] == team2]['team2_rs'])
        team1_avg_home_rs[i] = team1_home_rs[i] / (team1_home_game_number[i]-1)
        team2_avg_away_rs[i] = team2_away_rs[i] / (team2_away_game_number[i]-1 )
        team1_total_rs[i] = team1_home_rs[i] + sum(df_current[df_current['team2'] == team1]['team2_rs'])
        team2_total_rs[i] = sum(df_current[df_current['team1'] == team2]['team1_rs']) + team2_away_rs[i]
        team1_avg_rs[i] = team1_total_rs[i] / (team1_game_number[i]-1)
        team2_avg_rs[i] = team2_total_rs[i] / (team2_game_number[i]-1)
        team1_home_wr[i] = sum((df_current['team1'] == team1) & (df_current['team1_rs']>df_current['team2_rs'])) / (team1_home_game_number[i]-1)
        team2_away_wr[i] = sum((df_current['team2'] == team2) & (df_current['team1_rs']<df_current['team2_rs'])) / (team2_away_game_number[i]-1)
        team1_win_rate[i] = (sum((df_current['team1'] == team1) & (df_current['team1_rs']>df_current['team2_rs']))
                            + sum((df_current['team2'] == team1) & (df_current['team1_rs']<df_current['team2_rs']))) / (team1_game_number[i]-1)
        team2_win_rate[i] = (sum((df_current['team1'] == team2) & (df_current['team1_rs']>df_current['team2_rs']))
                            + sum((df_current['team2'] == team2) & (df_current['team1_rs']<df_current['team2_rs']))) / (team2_game_number[i]-1)
        

    
    df1['team1_gn'] = team1_game_number                     # Team1's total game number (number of games before this game +1)
    df1['team2_gn'] = team2_game_number                     # Team2's total game number (number of games before this game +1)
    df1['team1_home_gn'] = team1_home_game_number           # Team1's home game number
    df1['team2_away_gn'] = team2_away_game_number           # Team2's away game number
    df1['team1_win'] = (df1['team1_rs'] > df1['team2_rs']) * 1    # Team1 win the game = 1 otherwise = 0
    df1['team1_wr'] = team1_win_rate                        # Team1's win rate before the game
    df1['team1_home_wr'] = team1_home_wr                    # Team1's home win rate before the game
    df1['team2_wr'] = team2_win_rate                        # Team2's win rate before the game
    df1['team2_away_wr'] = team2_away_wr                    # Team2's away win rate before the game
    df1['team1_total_rs'] = team1_total_rs                  # Team1's total run scored before the game
    df1['team2_total_rs'] = team2_total_rs                  # Team2's total run scored before the game
    df1['team1_avg_rs'] = team1_avg_rs                      # Team1's avg run scored before the game
    df1['team2_avg_rs'] = team2_avg_rs                      # Team2's avg run scored before the game
    df1['team1_home_rs'] = team1_home_rs                    # Team1's total run scored as home team before the game
    df1['team2_away_rs'] = team2_away_rs                    # Team2's total run scored as away team before the game
    df1['team1_avg_home_rs'] = team1_avg_home_rs                    # Team1's total run scored as home team before the game
    df1['team2_avg_away_rs'] = team2_avg_away_rs                    # Team2's total run scored as away team before the game

    ## Save the dataframe to .csv file   
    save_file = './game_logs/' + fname + '_sorted.csv'
    if not os.path.exists('./game_logs'):
        os.makedirs('./game_logs')
    df1.to_csv(save_file, index = False)

  team1_avg_home_rs[i] = team1_home_rs[i] / (team1_home_game_number[i]-1)
  team2_avg_away_rs[i] = team2_away_rs[i] / (team2_away_game_number[i]-1 )
  team1_avg_rs[i] = team1_total_rs[i] / (team1_game_number[i]-1)
  team2_avg_rs[i] = team2_total_rs[i] / (team2_game_number[i]-1)
  team1_home_wr[i] = sum((df_current['team1'] == team1) & (df_current['team1_rs']>df_current['team2_rs'])) / (team1_home_game_number[i]-1)
  team2_away_wr[i] = sum((df_current['team2'] == team2) & (df_current['team1_rs']<df_current['team2_rs'])) / (team2_away_game_number[i]-1)
  team1_win_rate[i] = (sum((df_current['team1'] == team1) & (df_current['team1_rs']>df_current['team2_rs']))
  team2_win_rate[i] = (sum((df_current['team1'] == team2) & (df_current['team1_rs']>df_current['team2_rs']))
  team1_avg_home_rs[i] = team1_home_rs[i] / (team1_home_game_number[i]-1)
  team2_avg_away_rs[i] = team2_away_rs[i] / (team2_away_game_number[i]-1 )
  team1_avg_rs[i] = team1_total_rs[i] / (team1_game_number[i]-1)
  team2_avg

In [114]:
team1_home_wr

array([   nan, 0.    ,    nan, ..., 0.4625, 0.55  , 0.625 ])