# Imports

In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from datetime import date
from functools import reduce
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D
from tqdm.notebook import tqdm

# Remove pandas' cap on displayed columns so wide frames render every column.
pd.set_option('display.max_columns', None)

# Data

In [2]:
PATH_TO_GAME_LOGS = './datasets/retro_sheet_gls/'

def get_game_logs(game_log_dir=PATH_TO_GAME_LOGS):
    """Load every game-log CSV in ``game_log_dir`` into one DataFrame.

    Files are read in sorted filename order so the result is reproducible
    across platforms (``os.listdir`` gives no ordering guarantee). Hidden
    entries and sub-directories (e.g. ``.ipynb_checkpoints``, ``.DS_Store``)
    are skipped instead of being handed to ``pd.read_csv``.

    Parameters
    ----------
    game_log_dir : str
        Directory holding the CSV files. Each file must have a ``Date``
        column whose values read as ``YYYYMMDD``.

    Returns
    -------
    pandas.DataFrame
        The concatenated logs with ``Date`` converted to ``datetime.date``.
        Each file keeps its own row index (the index is not reset).

    Raises
    ------
    ValueError
        If the directory contains no readable game-log files.
    """

    def generate_date(date_of_game):
        """Convert a ``YYYYMMDD`` value (int or str) into a ``datetime.date``."""
        digits = str(date_of_game)
        return date(int(digits[0:4]), int(digits[4:6]), int(digits[6:8]))

    dfs = []
    for file in sorted(os.listdir(game_log_dir)):
        path = os.path.join(game_log_dir, file)
        # Jupyter and the OS drop hidden entries next to the data; skip them.
        if file.startswith('.') or not os.path.isfile(path):
            continue
        df = pd.read_csv(path)
        # Column-wise map avoids building a full row object per game.
        df['Date'] = df['Date'].map(generate_date)
        dfs.append(df)

    if not dfs:
        raise ValueError(f'No game log files found in {game_log_dir!r}')
    return pd.concat(dfs)

In [3]:
# Load all game logs once; later cells reuse `game_logs`.
game_logs = get_game_logs()
game_logs

Unnamed: 0,Date,Number of game,Day,Visiting Team,Visiting Team League,Visiting Team Game Number,Home Team,Home Team League,Home Team Game Number,Visiting Team Score,Home Team Score,Length of Game (outs),Time of Game (Day/Night),Completion Information,Forfeit Information,Protest Information,Park ID,Attendance,Length of Game (minutes),Visiting Team Line Score,Home Team Line Score,Visiting Team At-bats,Visiting Team Hits,Visiting Team Doubles,Visiting Team Triples,Visiting Team Homeruns,Visiting Team RBIs,Visiting Team Sacrifice Hits,Visiting Team Sacrifice Flies,Visiting Team Hit-by-pitch,Visiting Team Walks,Visiting Team Intentional Walks,Visiting Team Strikeouts,Visiting Team Stolen Bases,Visiting Team Caught Stealing,Visiting Team Grounded into DP,Visiting Team Awarded First Base due to CI,Visiting Team Left on Base,Visiting Team Pitchers Used,Visiting Team Individual Earned Runs,Visiting Team Earned Runs,Visiting Team Wild Pitches,Visiting Team Balks,Visiting Team Putouts,Visiting Team Assists,Visiting Team Errors,Visiting Team Passed Balls,Visiting Team Double Plays,Visiting Team Triple Plays,Home Team At-bats,Home Team Hits,Home Team Doubles,Home Team Triples,Home Team Homeruns,Home Team RBIs,Home Team Sacrifice Hits,Home Team Sacrifice Flies,Home Team Hit-by-pitch,Home Team Walks,Home Team Intentional Walks,Home Team Strikeouts,Home Team Stolen Bases,Home Team Caught Stealing,Home Team Grounded into DP,Home Team Awarded First Base due to CI,Home Team Left on Base,Home Team Pitchers Used,Home Team Individual Earned Runs,Home Team Earned Runs,Home Team Wild Pitches,Home Team Balks,Home Team Putouts,Home Team Assists,Home Team Errors,Home Team Passed Balls,Home Team Double Plays,Home Team Triple Plays,Home Plate Umpire ID,Home Plate Umpire Name,1B Umpire ID,1B Umpire Name,2B Umpire ID,2B Umpire Name,3B Umpire ID,3B Umpire Name,LF Umpire ID,LF Umpire Name,RF Umpire ID,RF Umpire Name,Visiting Team Manager ID,Visiting Team Manager Name,Home Team Manager ID,Home Team 
Manager Name,Winning Pitcher ID,Winning Pitcher Name,Losing Pitcher ID,Losing Pitcher Name,Saving Pitcher ID,Saving Pitcher Name,Game Winning RBI Batter ID,Game WInning RBI Batter Name,Visiting Team Starting Pitcher ID,Visiting Team Starting Pitcher Name,Home Team Starting Pitcher ID,Home Team Starting Pitcher Name,Visiting Team Player 1 ID,Visiting Team Player 1 Name,Visiting Team Player 1 Defensive Position,Visiting Team Player 2 ID,Visiting Team Player 2 Name,Visiting Team Player 2 Defensive Position,Visiting Team Player 3 ID,Visiting Team Player 3 Name,Visiting Team Player 3 Defensive Position,Visiting Team Player 4 ID,Visiting Team Player 4 Name,Visiting Team Player 4 Defensive Position,Visiting Team Player 5 ID,Visiting Team Player 5 Name,Visiting Team Player 5 Defensive Position,Visiting Team Player 6 ID,Visiting Team Player 6 Name,Visiting Team Player 6 Defensive Position,Visiting Team Player 7 ID,Visiting Team Player 7 Name,Visiting Team Player 7 Defensive Position,Visiting Team Player 8 ID,Visiting Team Player 8 Name,Visiting Team Player 8 Defensive Position,Visiting Team Player 9 ID,Visiting Team Player 9 Name,Visiting Team Player 9 Defensive Position,Home Team Player 1 ID,Home Team Player 1 Name,Home Team Player 1 Defensive Position,Home Team Player 2 ID,Home Team Player 2 Name,Home Team Player 2 Defensive Position,Home Team Player 3 ID,Home Team Player 3 Name,Home Team Player 3 Defensive Position,Home Team Player 4 ID,Home Team Player 4 Name,Home Team Player 4 Defensive Position,Home Team Player 5 ID,Home Team Player 5 Name,Home Team Player 5 Defensive Position,Home Team Player 6 ID,Home Team Player 6 Name,Home Team Player 6 Defensive Position,Home Team Player 7 ID,Home Team Player 7 Name,Home Team Player 7 Defensive Position,Home Team Player 8 ID,Home Team Player 8 Name,Home Team Player 8 Defensive Position,Home Team Player 9 ID,Home Team Player 9 Name,Home Team Player 9 Defensive Position,Miscellaneous,Acquisition Information
0,2014-03-22,0,Sat,LAN,NL,1,ARI,NL,1,3,1,54,N,,,,SYD01,38266.0,169,10200000,000001000,33,5,2,0,1,3,0,0,1,3,0,11,0,0,0,0,7,4,1,1,1,0,27,13,1,0,0,0,33,5,1,0,0,1,0,0,0,2,0,10,0,0,0,0,7,5,3,3,1,0,27,10,1,0,0,0,welkt901,Tim Welke,scotd901,Dale Scott,diazl901,Laz Diaz,carlm901,Mark Carlson,,(none),,(none),mattd001,Don Mattingly,gibsk001,Kirk Gibson,kersc001,Clayton Kershaw,milew001,Wade Miley,jansk001,Kenley Jansen,ethia001,Andre Ethier,kersc001,Clayton Kershaw,milew001,Wade Miley,puigy001,Yasiel Puig,9,turnj001,Justin Turner,4,ramih003,Hanley Ramirez,6,gonza003,Adrian Gonzalez,3,vanss001,Scott Van Slyke,7,uribj002,Juan Uribe,5,ethia001,Andre Ethier,8,ellia001,A.J. Ellis,2,kersc001,Clayton Kershaw,1,polla001,A.J. Pollock,8,hilla001,Aaron Hill,4,goldp001,Paul Goldschmidt,3,pradm001,Martin Prado,5,trumm001,Mark Trumbo,7,montm001,Miguel Montero,2,owinc001,Chris Owings,6,parrg001,Gerardo Parra,9,milew001,Wade Miley,1,,Y
1,2014-03-23,0,Sun,LAN,NL,2,ARI,NL,2,7,5,54,D,,,,SYD01,38079.0,241,102021100,000000014,34,13,3,0,0,6,1,2,2,8,0,7,1,0,1,0,13,8,5,5,0,0,27,4,1,0,2,0,35,8,0,0,1,5,0,0,0,8,0,8,0,0,2,0,11,6,6,6,1,0,27,15,3,0,1,0,scotd901,Dale Scott,diazl901,Laz Diaz,carlm901,Mark Carlson,welkt901,Tim Welke,,(none),,(none),mattd001,Don Mattingly,gibsk001,Kirk Gibson,ryu-h001,Hyun-Jin Ryu,cahit001,Trevor Cahill,,(none),ethia001,Andre Ethier,ryu-h001,Hyun-Jin Ryu,cahit001,Trevor Cahill,gordd002,Dee Gordon,4,puigy001,Yasiel Puig,9,ramih003,Hanley Ramirez,6,gonza003,Adrian Gonzalez,3,ethia001,Andre Ethier,8,ellia001,A.J. Ellis,2,baxtm001,Mike Baxter,7,uribj002,Juan Uribe,5,ryu-h001,Hyun-Jin Ryu,1,polla001,A.J. Pollock,8,hilla001,Aaron Hill,4,goldp001,Paul Goldschmidt,3,pradm001,Martin Prado,5,montm001,Miguel Montero,2,trumm001,Mark Trumbo,7,parrg001,Gerardo Parra,9,gregd001,Didi Gregorius,6,cahit001,Trevor Cahill,1,,Y
2,2014-03-30,0,Sun,LAN,NL,3,SDN,NL,1,1,3,51,N,,,,SAN02,45567.0,169,10000,00000003x,31,4,0,0,0,1,0,0,0,3,0,9,0,0,0,0,6,4,2,2,0,0,24,12,2,0,2,0,27,5,0,0,1,3,2,0,0,4,0,10,1,0,2,0,6,5,1,1,1,0,27,10,0,0,0,0,culbf901,Fieldin Culbreth,gonzm901,Manny Gonzalez,reynj901,Jim Reynolds,barbs901,Sean Barber,,(none),,(none),mattd001,Don Mattingly,blacb001,Buddy Black,thayd001,Dale Thayer,wilsb001,Brian Wilson,streh001,Huston Street,denoc001,Chris Denorfia,ryu-h001,Hyun-Jin Ryu,casha001,Andrew Cashner,crawc002,Carl Crawford,7,puigy001,Yasiel Puig,9,ramih003,Hanley Ramirez,6,gonza003,Adrian Gonzalez,3,ethia001,Andre Ethier,8,uribj002,Juan Uribe,5,ellia001,A.J. Ellis,2,gordd002,Dee Gordon,4,ryu-h001,Hyun-Jin Ryu,1,cabre001,Everth Cabrera,6,denoc001,Chris Denorfia,9,headc001,Chase Headley,5,gyorj001,Jedd Gyorko,4,alony001,Yonder Alonso,3,medit001,Tommy Medica,7,venaw001,Will Venable,8,river003,Rene Rivera,2,casha001,Andrew Cashner,1,,Y
3,2014-03-31,0,Mon,SEA,AL,1,ANA,AL,1,10,3,54,N,,,,ANA01,44152.0,197,10001206,201000000,36,11,4,2,1,10,0,1,0,8,1,11,1,1,0,0,8,5,2,2,2,0,27,5,1,0,0,0,34,6,1,0,1,3,0,0,1,1,0,13,1,0,0,0,6,5,9,9,0,0,27,7,1,0,0,0,westj901,Joe West,fostm901,Marty Foster,drakr901,Rob Drake,porta901,Alan Porter,,(none),,(none),mccll001,Lloyd McClendon,sciom001,Mike Scioscia,hernf002,Felix Hernandez,weavj003,Jered Weaver,,(none),almoa001,Abraham Almonte,hernf002,Felix Hernandez,weavj003,Jered Weaver,almoa001,Abraham Almonte,8,millb002,Brad Miller,6,canor001,Robinson Cano,4,smoaj001,Justin Smoak,3,morrl001,Logan Morrison,10,seagk001,Kyle Seager,5,saunm001,Michael Saunders,9,ackld001,Dustin Ackley,7,zunim001,Mike Zunino,2,calhk001,Kole Calhoun,9,troum001,Mike Trout,8,pujoa001,Albert Pujols,3,hamij003,Josh Hamilton,7,freed001,David Freese,5,ibanr001,Raul Ibanez,10,kendh001,Howie Kendrick,4,iannc001,Chris Iannetta,2,aybae001,Erick Aybar,6,,Y
4,2014-03-31,0,Mon,BOS,AL,1,BAL,AL,1,1,2,51,D,,,,BAL12,46685.0,173,100000,01000010x,36,9,2,0,1,1,0,0,1,3,0,6,0,0,0,0,12,2,2,2,0,0,24,11,0,0,2,0,28,6,0,0,1,1,0,0,0,1,0,9,0,0,2,0,3,5,1,1,0,0,27,13,0,0,0,0,demud901,Dana DeMuth,kulpr901,Ron Kulpa,hicke901,Ed Hickox,barrl901,Lance Barrett,,(none),,(none),farrj001,John Farrell,showb801,Buck Showalter,britz001,Zack Britton,lestj001,Jon Lester,huntt002,Tommy Hunter,cruzn002,Nelson Cruz,lestj001,Jon Lester,tillc001,Chris Tillman,navad002,Daniel Nava,9,pedrd001,Dustin Pedroia,4,ortid001,David Ortiz,10,napom001,Mike Napoli,3,carpm001,Mike Carp,7,sizeg001,Grady Sizemore,8,bogax001,Xander Bogaerts,6,piera001,A.J. Pierzynski,2,middw001,Will Middlebrooks,5,markn001,Nick Markakis,9,hardj003,J.J. Hardy,6,jonea003,Adam Jones,8,davic003,Chris Davis,3,cruzn002,Nelson Cruz,7,wietm001,Matt Wieters,2,yound003,Delmon Young,10,flahr001,Ryan Flaherty,5,schoj001,Jonathan Schoop,4,,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2424,2019-09-29,0,Sun,DET,AL,161,CHA,AL,161,3,5,51,D,,,,CHI12,19534.0,157,101000001,10000400x,31,7,1,0,2,3,0,0,0,3,0,8,1,0,2,0,4,4,4,4,1,0,24,7,1,0,1,0,28,4,2,0,1,4,0,0,3,1,0,12,0,0,1,0,3,6,3,3,0,0,27,15,0,0,2,0,morag901,Gabe Morales,segac901,Chris Segal,mealj901,Jerry Meals,kulpr901,Ron Kulpa,,(none),,(none),gardr001,Ron Gardenhire,rentr001,Rich Renteria,cordj001,Jimmy Cordero,turns003,Spencer Turnbull,herrk001,Kelvin Herrera,jimee001,Eloy Jimenez,turns003,Spencer Turnbull,detwr001,Ross Detwiler,reyev001,Victor Reyes,8,mercj002,Jordy Mercer,4,cabrm001,Miguel Cabrera,10,hickj001,John Hicks,3,rodrr009,Ronny Rodriguez,5,stewc002,Christin Stewart,7,demet001,Travis Demeritte,9,greig001,Grayson Greiner,2,castw003,Willi Castro,6,sancc001,Yolmer Sanchez,4,andet001,Tim Anderson,6,abrej003,Jose Abreu,3,moncy001,Yoan Moncada,5,jimee001,Eloy Jimenez,7,collz001,Zack Collins,2,castw002,Welington Castillo,10,palkd001,Daniel Palka,9,engea001,Adam Engel,8,,Y
2425,2019-09-29,0,Sun,MIN,AL,162,KCA,AL,162,4,5,52,D,,,,KAN06,17875.0,160,300010000,100200011,32,6,1,0,3,4,0,0,1,0,0,8,0,0,0,0,2,6,5,5,0,0,25,8,0,0,0,0,33,9,3,1,1,5,0,1,0,3,0,7,0,0,0,0,7,4,4,4,0,0,27,4,0,0,1,0,libkj901,John Libka,hicke901,Ed Hickox,torrc901,Carlos Torres,nauep901,Paul Nauert,,(none),,(none),baldr001,Rocco Baldelli,yoste001,Ned Yost,kenni001,Ian Kennedy,gratb002,Brusdar Graterol,,(none),philb002,Brett Phillips,perem004,Martin Perez,lopej004,Jorge Lopez,wadel001,LaMonte Wade,7,polaj001,Jorge Polanco,6,sanom001,Miguel Sano,10,cronc002,C.J. Cron,3,cavej001,Jake Cave,8,schoj001,Jonathan Schoop,4,castj006,Jason Castro,2,torrr001,Ronald Torreyes,5,milli001,Ian Miller,9,merrw001,Whit Merrifield,4,solej001,Jorge Soler,10,dozih001,Hunter Dozier,5,gorda001,Alex Gordon,7,mcbrr001,Ryan McBroom,9,cuthc001,Cheslor Cuthbert,3,mejie001,Erick Mejia,8,arteh001,Humberto Arteaga,6,dinin001,Nick Dini,2,,Y
2426,2019-09-29,0,Sun,OAK,AL,162,SEA,AL,162,1,3,51,D,,,,SEA03,16819.0,172,000000001,20001000x,30,3,1,0,0,1,0,0,0,4,0,8,1,0,0,0,6,5,3,3,1,0,24,6,0,0,0,0,31,7,2,0,1,3,0,0,0,5,0,7,0,0,0,0,9,7,1,1,4,0,27,8,0,0,0,0,hudsm901,Marvin Hudson,herna901,Angel Hernandez,fairc901,Chad Fairchild,blakr901,Ryan Blakney,,(none),,(none),melvb001,Bob Melvin,servs002,Scott Servais,mcclr001,Reggie McClain,roart001,Tanner Roark,bassa001,Anthony Bass,seagk001,Kyle Seager,roart001,Tanner Roark,dunnj002,Justin Dunn,semim001,Marcus Semien,6,profj001,Jurickson Profar,7,piscs001,Stephen Piscotty,9,davik003,Khris Davis,10,brows003,Seth Brown,3,phegj001,Josh Phegley,2,neuss001,Sheldon Neuse,5,barrf001,Franklin Barreto,4,bolts001,Skye Bolt,8,longs001,Shed Long,7,crawj002,J.P. Crawford,6,nolaa002,Austin Nola,3,seagk001,Kyle Seager,5,lewik001,Kyle Lewis,9,narvo001,Omar Narvaez,2,voged001,Daniel Vogelbach,10,smitm007,Mallex Smith,8,gordd002,Dee Gordon,4,,Y
2427,2019-09-29,0,Sun,NYA,AL,162,TEX,AL,162,1,6,51,D,,,,ARL02,47144.0,167,001000000,10202100x,30,3,0,0,1,1,0,0,0,1,0,13,0,0,0,0,3,6,4,4,0,0,24,8,2,0,2,0,31,10,1,0,0,4,0,1,1,4,0,6,3,0,2,0,7,2,1,1,0,0,27,3,0,0,0,0,bakej902,Jordan Baker,tosia901,Alex Tosi,cedeg901,Gary Cederstrom,johna901,Adrian Johnson,,(none),,(none),boona001,Aaron Boone,woodc001,Chris Woodward,lynnl001,Lance Lynn,tanam001,Masahiro Tanaka,,(none),,(none),greec003,Chad Green,lynnl001,Lance Lynn,lemad001,DJ LeMahieu,3,judga001,Aaron Judge,9,gardb001,Brett Gardner,8,stanm004,Giancarlo Stanton,10,torrg001,Gleyber Torres,4,sancg002,Gary Sanchez,2,gregd001,Didi Gregorius,6,urshg001,Giovanny Urshela,5,maybc001,Cameron Maybin,7,choos001,Shin-Soo Choo,10,andre001,Elvis Andrus,6,calhw001,Willie Calhoun,7,santd001,Danny Santana,9,odorr001,Rougned Odor,4,solan001,Nick Solak,5,guzmr001,Ronald Guzman,3,deshd002,Delino DeShields,8,trevj001,Jose Trevino,2,,Y


In [4]:
PATH_TO_TEAM_GAMES = './datasets/team_stats/'
def get_team_games(teams_games_dir=PATH_TO_TEAM_GAMES):
    """Load one per-game stats frame per team from ``teams_games_dir``.

    Each regular file in the directory is read as a CSV and stored under the
    first three characters of its filename (the team code). Hidden entries
    and sub-directories (e.g. ``.ipynb_checkpoints``) are skipped so they can
    neither crash ``pd.read_csv`` nor create a bogus team key.

    Parameters
    ----------
    teams_games_dir : str
        Directory holding one CSV per team. Each file must have ``Date``
        (readable as ``YYYYMMDD``) and ``Number of game`` columns.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Team code -> frame sorted by ``Date`` then ``Number of game`` with a
        fresh 0..n-1 index. ``Date`` holds ``datetime.date`` objects and
        ``Earned Runs`` is renamed to ``Team Earned Runs``.
    """

    def generate_date(date_of_game):
        """Convert a ``YYYYMMDD`` value (int or str) into a ``datetime.date``."""
        digits = str(date_of_game)
        return date(int(digits[0:4]), int(digits[4:6]), int(digits[6:8]))

    team_stats = dict()

    for file in sorted(os.listdir(teams_games_dir)):
        path = os.path.join(teams_games_dir, file)
        if file.startswith('.') or not os.path.isfile(path):
            continue
        df = pd.read_csv(path)
        df['Date'] = df['Date'].map(generate_date)
        # NOTE(review): presumably renamed so the team total cannot be confused
        # with the pitcher-level 'Earned Runs' column used elsewhere - confirm.
        df = df.rename(columns={'Earned Runs': 'Team Earned Runs'})
        team_stats[file[0:3]] = df.sort_values(by=['Date', 'Number of game']).reset_index(drop=True)

    return team_stats

In [5]:
def get_team_rolling_sums(team_games, window=5):
    """Replace each team's per-game stats with trailing-window batting features.

    For every team frame the raw counting stats are summed over the previous
    ``window`` games (the current game is excluded via a one-row shift, so a
    row only ever sees information available before first pitch). Rate stats
    are then derived from those sums:

    * BA  = H / AB
    * SLG = (H + 2B + 2*3B + 3*HR) / AB, equal to (1B + 2*2B + 3*3B + 4*HR) / AB
    * OBP = (H + BB + HBP) / (AB + BB + HBP + SF)
    * OPS = OBP + SLG
    * ISO = SLG - BA

    Columns are selected by name rather than by position, so extra or
    reordered columns in the input cannot silently change which stats are
    used. The caller's DataFrames are not modified; the processed frame
    replaces the entry in ``team_games``.

    NOTE(review): the window is applied over each team's whole frame in row
    order, so if a frame spans several seasons the first games of a season
    draw on the end of the previous one - confirm this is intended.

    Parameters
    ----------
    team_games : dict[str, pandas.DataFrame]
        Team code -> per-game frame containing ``Date``, ``Number of game``,
        ``Team`` and the counting stats listed in ``summed_stats`` below.
    window : int, default 5
        Number of previous games to aggregate.

    Returns
    -------
    dict[str, pandas.DataFrame]
        The same dictionary object, with each value replaced by a frame of
        identifier columns plus ``'<stat> {window} Game Window'`` features.
        Rows without a full window are dropped and the index is reset.

    Raises
    ------
    KeyError
        If a team frame lacks one of the required columns.
    """
    suffix = f' {window} Game Window'
    id_cols = ['Date', 'Number of game', 'Team']
    # Counting stats needed either as features or as inputs to the rate stats.
    summed_stats = ['At-bats', 'Hits', 'Doubles', 'Triples', 'Homeruns', 'RBIs',
                    'Walks', 'Hit-by-pitch', 'Sacrifice Flies', 'Team Earned Runs']
    # Windowed sums that are kept as features in their own right.
    reported_sums = ['At-bats', 'Hits', 'RBIs', 'Team Earned Runs']

    for team, games in team_games.items():
        # Sum over the window, then shift so row i covers games i-window .. i-1.
        totals = games[summed_stats].rolling(window).sum().shift(1)

        at_bats = totals['At-bats']
        hits = totals['Hits']
        walks = totals['Walks']
        hit_by_pitch = totals['Hit-by-pitch']

        batting_avg = hits / at_bats
        slugging = (hits +
                    totals['Doubles'] +
                    2 * totals['Triples'] +
                    3 * totals['Homeruns']) / at_bats
        on_base = ((hits + walks + hit_by_pitch) /
                   (at_bats + walks + hit_by_pitch + totals['Sacrifice Flies']))

        features = games[id_cols].copy()
        for stat in reported_sums:
            features[stat + suffix] = totals[stat]
        features['BA' + suffix] = batting_avg
        features['SLG' + suffix] = slugging
        features['OBP' + suffix] = on_base
        features['OPS' + suffix] = on_base + slugging
        features['ISO' + suffix] = slugging - batting_avg

        # The first `window` rows have no complete history and are all-NaN.
        team_games[team] = features.dropna().reset_index(drop=True)

    return team_games

In [6]:
# Preview the default (5-game) rolling batting features for team key 'NYA'.
get_team_rolling_sums(get_team_games())['NYA']

Unnamed: 0,Date,Number of game,Team,At-bats 5 Game Window,Hits 5 Game Window,RBIs 5 Game Window,Team Earned Runs 5 Game Window,BA 5 Game Window,SLG 5 Game Window,OBP 5 Game Window,OPS 5 Game Window,ISO 5 Game Window
0,2014-04-06,0,NYA,169.0,43.0,13.0,17.0,0.254438,0.319527,0.335079,0.654605,0.065089
1,2014-04-07,0,NYA,172.0,46.0,17.0,15.0,0.267442,0.354651,0.345361,0.700012,0.087209
2,2014-04-08,0,NYA,169.0,47.0,20.0,14.0,0.278107,0.366864,0.365979,0.732843,0.088757
3,2014-04-09,0,NYA,177.0,53.0,21.0,26.0,0.299435,0.435028,0.376884,0.811913,0.135593
4,2014-04-10,0,NYA,169.0,47.0,18.0,29.0,0.278107,0.437870,0.352632,0.790501,0.159763
...,...,...,...,...,...,...,...,...,...,...,...,...
962,2019-09-24,0,NYA,161.0,38.0,29.0,11.0,0.236025,0.503106,0.295455,0.798560,0.267081
963,2019-09-25,0,NYA,171.0,39.0,28.0,11.0,0.228070,0.473684,0.284946,0.758630,0.245614
964,2019-09-27,0,NYA,166.0,32.0,19.0,14.0,0.192771,0.367470,0.258242,0.625712,0.174699
965,2019-09-28,0,NYA,176.0,44.0,30.0,16.0,0.250000,0.494318,0.317949,0.812267,0.244318


Next, we load the per-game pitcher data so that starting-pitcher form can be added alongside the team batting features.

In [7]:
PATH_TO_PITCHER_STATS = './datasets/pitcher_stats/'

def get_pitching_stats(directory=PATH_TO_PITCHER_STATS):
    """Load per-game pitching lines for every pitcher found in ``directory``.

    Each regular file is read as a CSV whose filename (minus extension) is the
    player ID. Hidden entries and sub-directories (e.g. ``.ipynb_checkpoints``)
    are skipped, as are files that contain no rows, so a stray or empty file
    cannot abort the whole load.

    Parameters
    ----------
    directory : str
        Directory holding one CSV per pitcher. Each file must have a
        ``Game ID`` column; the code reads characters 3-10 of it as the
        ``YYYYMMDD`` date and the last character as the game number.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Player ID -> frame sorted by ``Date`` then ``Game Number`` with a
        fresh 0..n-1 index. ``Date``, ``Game Number`` and ``ID`` columns are
        added, and a leftover ``Unnamed: 0`` index column is removed if present.
    """

    def generate_date(game_id):
        """Parse the ``YYYYMMDD`` portion of a game id into a ``datetime.date``."""
        return date(int(game_id[3:7]), int(game_id[7:9]), int(game_id[9:11]))

    def generate_game_number(game_id):
        """Return the trailing game-number digit of a game id as an int."""
        return int(game_id[-1])

    player_stats_dict = dict()

    # Sorted so the progress bar and dictionary order are reproducible.
    for file in tqdm(sorted(os.listdir(directory)), unit='players', desc='Fetching Pitcher Stats'):
        path = os.path.join(directory, file)
        if file.startswith('.') or not os.path.isfile(path):
            continue

        df = pd.read_csv(path)
        if df.empty:
            # A pitcher with no recorded appearances contributes no stats.
            continue

        # Take the ID from the filename rather than from the first data row.
        player_id = os.path.splitext(file)[0]
        df['Date'] = df['Game ID'].map(generate_date)
        df['Game Number'] = df['Game ID'].map(generate_game_number)
        df['ID'] = player_id
        # The saved index column is not always present; drop it only if it is.
        df = df.drop(columns=['Unnamed: 0'], errors='ignore')
        player_stats_dict[player_id] = df.sort_values(by=['Date', 'Game Number']).reset_index(drop=True)

    return player_stats_dict

In [8]:
# Preview the per-game stats loaded for pitcher ID 'kersc001'.
get_pitching_stats()['kersc001']

HBox(children=(HTML(value='Fetching Pitcher Stats'), FloatProgress(value=0.0, max=746.0), HTML(value='')))




Unnamed: 0,Game ID,ID,Balls,Strikes,Homeruns Allowed,Hits Allowed,Strikeouts,Pickoff Errors,Pickoffs,Wild Pitches,Balks,Walks,Intentional Walks,Hit by Pitch,Earned Runs,Innings Pitched,Date,Game Number
0,ARI201403220,kersc001,28,40,0,5,7,0,0,1,0,1,0,0,1,6.666667,2014-03-22,0
1,WAS201405060,kersc001,21,36,0,9,9,0,1,0,0,0,0,0,0,7.000000,2014-05-06,0
2,LAN201405110,kersc001,26,30,1,7,9,0,0,0,0,0,0,0,3,7.000000,2014-05-11,0
3,ARI201405170,kersc001,20,16,0,6,3,0,0,0,1,2,0,0,6,1.666667,2014-05-17,0
4,PHI201405230,kersc001,30,39,0,2,9,0,0,0,0,3,0,0,0,6.000000,2014-05-23,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157,ARI201908310,kersc001,32,33,2,5,6,0,0,0,0,2,0,0,5,5.000000,2019-08-31,0
158,LAN201909060,kersc001,30,33,1,7,6,0,0,0,0,3,0,0,1,4.000000,2019-09-06,0
159,NYN201909130,kersc001,40,32,1,4,5,0,0,0,0,3,0,0,1,6.333333,2019-09-13,0
160,LAN201909200,kersc001,32,37,3,8,5,0,0,0,0,1,0,0,4,6.000000,2019-09-20,0


In [9]:
def get_pitching_rolling_sums(dictionary, window=10, fip_constant=3.2):
    """Replace each pitcher's per-game lines with trailing-window features.

    Counting stats are summed over the pitcher's previous ``window``
    appearances (the current game is excluded via a one-row shift) and the
    following rate stats are derived from those sums:

    * ERA  = 9 * ER / IP
    * WHIP = (BB + H) / IP
    * FIP  = (13*HR + 3*(BB + HBP) - 2*K) / IP + ``fip_constant``

    A window in which the pitcher recorded zero innings has no meaningful
    rate stats; those rows are treated as missing and dropped, rather than
    emitting ``inf`` values that would survive ``dropna()``.

    Parameters
    ----------
    dictionary : dict[str, pandas.DataFrame]
        Player ID -> per-game frame containing ``Homeruns Allowed``,
        ``Hits Allowed``, ``Strikeouts``, ``Walks``, ``Hit by Pitch``,
        ``Earned Runs`` and ``Innings Pitched``, plus the identifier columns
        ``Game ID``, ``ID``, ``Date`` and ``Game Number``.
    window : int, default 10
        Number of previous appearances to aggregate.
    fip_constant : float, default 3.2
        Additive league constant used in the FIP formula.

    Returns
    -------
    dict[str, pandas.DataFrame]
        The same dictionary object, with each value replaced by a frame of
        identifier columns plus ``'<stat> {window} Game Window'`` features.
        Rows without a full window are dropped; the original row index is
        kept. The input frames themselves are not modified.
    """
    suffix = f' {window} Game Window'
    id_cols = ['Game ID', 'ID', 'Date', 'Game Number']
    # Only these counting stats feed the output, so only these are rolled.
    summed_stats = ['Homeruns Allowed', 'Hits Allowed', 'Strikeouts', 'Walks',
                    'Hit by Pitch', 'Earned Runs', 'Innings Pitched']
    # Windowed sums that are kept as features in their own right.
    reported_sums = ['Homeruns Allowed', 'Hits Allowed', 'Strikeouts', 'Walks', 'Earned Runs']

    for player_id, games in tqdm(dictionary.items(), unit='players', desc='Generating Rolling Pitcher Stats'):
        # Sum over the window, then shift so row i covers games i-window .. i-1.
        totals = games[summed_stats].rolling(window).sum().shift(1)

        # Mask zero-inning windows to NaN so the divisions below cannot yield inf.
        innings = totals['Innings Pitched'].where(totals['Innings Pitched'] > 0)

        features = games.filter(items=id_cols).copy()
        for stat in reported_sums:
            features[stat + suffix] = totals[stat]

        features['ERA' + suffix] = (totals['Earned Runs'] * 9) / innings
        features['WHIP' + suffix] = (totals['Hits Allowed'] + totals['Walks']) / innings
        features['FIP' + suffix] = (((13 * totals['Homeruns Allowed']) +
                                     (3 * (totals['Walks'] + totals['Hit by Pitch'])) -
                                     (2 * totals['Strikeouts'])) / innings) + fip_constant

        dictionary[player_id] = features.dropna()

    return dictionary

In [10]:
# Preview the 5-game rolling features for pitcher ID 'kersc001'.
get_pitching_rolling_sums(get_pitching_stats(), window=5)['kersc001']

HBox(children=(HTML(value='Fetching Pitcher Stats'), FloatProgress(value=0.0, max=746.0), HTML(value='')))




HBox(children=(HTML(value='Generating Rolling Pitcher Stats'), FloatProgress(value=0.0, max=746.0), HTML(value…




Unnamed: 0,Game ID,ID,Date,Game Number,Homeruns Allowed 5 Game Window,Hits Allowed 5 Game Window,Strikeouts 5 Game Window,Walks 5 Game Window,Earned Runs 5 Game Window,ERA 5 Game Window,WHIP 5 Game Window,FIP 5 Game Window
5,LAN201405280,kersc001,2014-05-28,0,1.0,29.0,37.0,6.0,10.0,3.176471,1.235294,1.682353
6,LAN201406020,kersc001,2014-06-02,0,2.0,31.0,39.0,6.0,12.0,3.767442,1.290698,2.013953
7,COL201406080,kersc001,2014-06-08,0,3.0,26.0,39.0,6.0,14.0,4.247191,1.078652,2.492135
8,LAN201406130,kersc001,2014-06-13,0,3.0,22.0,39.0,6.0,12.0,3.903614,1.012048,2.440964
9,LAN201406180,kersc001,2014-06-18,0,3.0,24.0,43.0,5.0,7.0,1.909091,0.878788,2.230303
...,...,...,...,...,...,...,...,...,...,...,...,...
157,ARI201908310,kersc001,2019-08-31,0,7.0,22.0,42.0,9.0,9.0,2.454545,0.939394,4.321212
158,LAN201909060,kersc001,2019-09-06,0,8.0,21.0,43.0,6.0,12.0,3.375000,0.843750,4.418750
159,NYN201909130,kersc001,2019-09-13,0,9.0,24.0,40.0,8.0,12.0,3.724138,1.103448,5.406897
160,LAN201909200,kersc001,2019-09-20,0,10.0,26.0,35.0,11.0,13.0,4.129412,1.305882,6.588235


In [11]:
# Keep only the columns needed to build the schedule: game identity, the two
# final scores, and the starting-pitcher IDs.
columns = ['Date',
           'Number of game',
           'Visiting Team',
           'Visiting Team Game Number',
           'Home Team',
           'Home Team Game Number',
           'Visiting Team Score',
           'Home Team Score',
           'Visiting Team Starting Pitcher ID',
           'Home Team Starting Pitcher ID']

# `filter(items=...)` selects these columns and silently skips any that are absent.
filtered_game_logs = game_logs.filter(items=columns)
filtered_game_logs

Unnamed: 0,Date,Number of game,Visiting Team,Visiting Team Game Number,Home Team,Home Team Game Number,Visiting Team Score,Home Team Score,Visiting Team Starting Pitcher ID,Home Team Starting Pitcher ID
0,2014-03-22,0,LAN,1,ARI,1,3,1,kersc001,milew001
1,2014-03-23,0,LAN,2,ARI,2,7,5,ryu-h001,cahit001
2,2014-03-30,0,LAN,3,SDN,1,1,3,ryu-h001,casha001
3,2014-03-31,0,SEA,1,ANA,1,10,3,hernf002,weavj003
4,2014-03-31,0,BOS,1,BAL,1,1,2,lestj001,tillc001
...,...,...,...,...,...,...,...,...,...,...
2424,2019-09-29,0,DET,161,CHA,161,3,5,turns003,detwr001
2425,2019-09-29,0,MIN,162,KCA,162,4,5,perem004,lopej004
2426,2019-09-29,0,OAK,162,SEA,162,1,3,roart001,dunnj002
2427,2019-09-29,0,NYA,162,TEX,162,1,6,greec003,lynnl001


In [12]:
def generate_training_data(schedule, team_data, pitching_data):
    """Build one row per game holding home-minus-away feature differences.

    For each game in ``schedule`` the home and visiting sides are each joined
    to their team's batting features and their starting pitcher's features;
    every feature is then reduced to ``home value - away value``.

    Parameters
    ----------
    schedule : pandas.DataFrame
        One row per game. Must contain ``Date``, ``Number of game``,
        ``Home Team``, ``Visiting Team``, ``Home Team Starting Pitcher ID``
        and ``Visiting Team Starting Pitcher ID``; all of its columns are
        carried through to the result.
    team_data : dict[str, pandas.DataFrame]
        Per-team feature frames keyed by team code, each with ``Date``,
        ``Number of game`` and ``Team`` columns plus feature columns.
    pitching_data : dict[str, pandas.DataFrame]
        Per-pitcher feature frames keyed by player ID, each with ``Game ID``,
        ``ID``, ``Date`` and ``Game Number`` columns plus feature columns.

    Returns
    -------
    pandas.DataFrame
        The schedule columns followed by one ``'Difference <feature>'``
        column per batting and pitching feature. Games missing any value
        (e.g. a side without a full rolling window) are dropped; the index
        is not reset.
    """
    schedule_cols = list(schedule.columns)

    # Stack the per-team / per-pitcher frames so each can be joined in one merge.
    batting = pd.concat(list(team_data.values()))
    pitching = pd.concat(list(pitching_data.values()))

    def _side_features(team_col, pitcher_col):
        """Return the schedule joined to one side's batting and pitching features."""
        side_batting = pd.merge(schedule,
                                batting,
                                how='left',
                                left_on=['Date', 'Number of game', team_col],
                                right_on=['Date', 'Number of game', 'Team']).drop(columns=['Team'])
        side_pitching = pd.merge(schedule,
                                 pitching,
                                 how='left',
                                 left_on=['Date', 'Number of game', pitcher_col],
                                 right_on=['Date', 'Game Number', 'ID']
                                 ).drop(columns=['Game Number', 'Game ID', 'ID'])
        return pd.merge(side_batting, side_pitching, how='left', on=schedule_cols)

    df_home = _side_features('Home Team', 'Home Team Starting Pitcher ID')
    df_away = _side_features('Visiting Team', 'Visiting Team Starting Pitcher ID')

    # Both sides carry identically named features, so every feature column gets
    # one of the explicit suffixes below.
    home_and_away = pd.merge(df_home,
                             df_away,
                             how='left',
                             on=schedule_cols,
                             suffixes=('_home', '_away'))

    # Everything after the schedule columns in df_home is a feature.
    feature_cols = list(df_home.columns[len(schedule_cols):])

    result = home_and_away[schedule_cols].copy()
    for feature in feature_cols:
        # Name the difference after the full feature name; cutting at the first
        # underscore would truncate features whose own names contain one.
        result[f'Difference {feature}'] = (home_and_away[f'{feature}_home'] -
                                           home_and_away[f'{feature}_away'])

    return result.dropna()

# Training a Random Forest Classifier

In [13]:
# Assemble the per-game feature differences from 5-game team and pitcher windows.
data = generate_training_data(
    schedule=filtered_game_logs,
    team_data=get_team_rolling_sums(get_team_games(), window=5),
    pitching_data=get_pitching_rolling_sums(get_pitching_stats(), window=5),
)

# Label each game (1 = home team scored more, 0 = otherwise), then drop the
# identifier and score columns so only the difference features and the label remain.
training_data = (
    data
    .dropna()
    .reset_index(drop=True)
    .assign(Winner=lambda games: np.where(games["Home Team Score"] >
                                          games["Visiting Team Score"], 1, 0))
    .drop(columns=['Date',
                   'Number of game',
                   'Visiting Team',
                   'Visiting Team Game Number',
                   'Home Team',
                   'Home Team Game Number',
                   'Home Team Score',
                   'Visiting Team Score',
                   'Visiting Team Starting Pitcher ID',
                   'Home Team Starting Pitcher ID'])
)

# Seeded split (default proportions) so the scores below are repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    training_data.drop(columns=['Winner']),
    training_data['Winner'],
    random_state=42,
)

HBox(children=(HTML(value='Fetching Pitcher Stats'), FloatProgress(value=0.0, max=746.0), HTML(value='')))




HBox(children=(HTML(value='Generating Rolling Pitcher Stats'), FloatProgress(value=0.0, max=746.0), HTML(value…




In [None]:
# Number of trees in random forest
n_estimators = list(range(100, 1100, 100))

# Number of features to consider at every split.
# 'auto' is no longer accepted by recent scikit-learn releases and was only an
# alias of 'sqrt' for classifiers, so it is replaced by a distinct option.
max_features = ['sqrt', 'log2']

# Maximum number of levels in tree (None = grow until the leaves are pure)
max_depth = list(range(1, 21)) + [None]

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_param_grid = {'n_estimators': n_estimators,
                      'max_features': max_features,
                      'max_depth': max_depth,
                      'min_samples_split': min_samples_split,
                      'min_samples_leaf': min_samples_leaf,
                      'bootstrap': bootstrap}

# Use the random grid to find the best hyperparameters
# First create the base model to tune; seed it so each candidate's
# cross-validation score is repeatable from run to run.
rf = RandomForestClassifier(random_state=42)

# Random search of parameters, using 3 fold cross validation,
# sampling 50 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf,
                               param_distributions = random_param_grid,
                               n_iter = 50,
                               cv = 3,
                               verbose = 2,
                               random_state = 42,
                               n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train, y_train)

In [None]:
# Show the best-scoring parameter combination found by the random search.
rf_random.best_params_

In [14]:
# Hyperparameters for the final forest, hard-coded so the notebook does not
# need to rerun the search.
# NOTE(review): presumably copied from an earlier `rf_random.best_params_`
# run - confirm before re-tuning. `random_state` pins the fit.
best_params = {'n_estimators': 800,
               'min_samples_split': 2,
               'min_samples_leaf': 4,
               'max_features': 'sqrt',
               'max_depth': 2,
               'bootstrap': False,
               'random_state': 42}

In [15]:
# Fit the tuned forest (fit() returns the estimator) and report training accuracy.
rf = RandomForestClassifier(**best_params).fit(X_train, y_train)
rf.score(X_train, y_train)

0.5633724176437744

In [16]:
# Mean accuracy of the forest on the held-out test split.
rf.score(X_test, y_test)

0.5457286432160804

In [17]:
# Predicted class probabilities for the training rows; columns follow
# `rf.classes_` (labels 0 and 1 of 'Winner', where 1 marks a home-team win).
rf.predict_proba(X_train)

array([[0.44063418, 0.55936582],
       [0.44002376, 0.55997624],
       [0.52121273, 0.47878727],
       ...,
       [0.52580111, 0.47419889],
       [0.46422273, 0.53577727],
       [0.47810787, 0.52189213]])

# Gaussian Naive Bayes Model

In [18]:
# Fit the Gaussian naive Bayes baseline (fit() returns the estimator) and
# report training accuracy.
gnb = GaussianNB().fit(X_train, y_train)
gnb.score(X_train, y_train)

0.5490787269681742

In [19]:
# Mean accuracy of the naive Bayes baseline on the held-out test split.
gnb.score(X_test, y_test)

0.5366834170854271