# Imports

In [1]:
import os
import shutil
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from datetime import date
from functools import reduce
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D
from tqdm.notebook import tqdm

# Show every column when a DataFrame is displayed (the game logs are very wide).
pd.set_option('display.max_columns', None)

# Input / output locations, all relative to the notebook's working directory.
PATH_TO_GAME_LOGS = './datasets/retro_sheet_gls/'            # read by get_game_logs()
PATH_TO_TEAM_GAMES = './datasets/team_stats/'                # read by get_team_games()
TEAM_ROLLING_SUMS_DIR = './datasets/team_rolling_14/'        # per-team rolling CSVs are written here
PATH_TO_PITCHER_STATS = './datasets/pitcher_stats/'          # read by get_pitching_stats()
PITCHING_ROLLING_SUMS_DIR = './datasets/pitching_rolling_10/'  # per-pitcher rolling CSVs are written here
# NOTE(review): the remaining paths are not referenced in the visible part of
# the notebook; presumably they are used by later cells -- confirm.
MODEL_SAVE_DIR = './datasets/models/'
DATA_2020 = './datasets/data_2020/'
LATEST_TEAM_DIR = os.path.join(DATA_2020, 'teams')
LATEST_PITCHER_DIR = os.path.join(DATA_2020, 'pitchers')

# Utils

In [2]:
def clean_dir(path):
    """Ensure ``path`` exists and is an empty directory.

    The directory is created when it is absent (including any missing parent
    directories); every file, symlink and sub-directory inside it is then
    removed.

    Args:
        path: Directory to create and/or empty.

    Deletion is best-effort: an entry that cannot be removed is reported on
    stdout and the remaining entries are still processed.
    """

    # makedirs rather than mkdir so a missing parent (e.g. './datasets') does
    # not raise FileNotFoundError; exist_ok makes the call a no-op when the
    # directory is already there.
    os.makedirs(path, exist_ok=True)

    for filename in os.listdir(path):
        file_path = os.path.join(path, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                # Plain files and symlinks (even symlinks to directories)
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print(f"Failed to delete {file_path}. Reason: {e}")

# Data

In [3]:
def get_game_logs(game_log_dir=PATH_TO_GAME_LOGS):
    """Load every game-log file in ``game_log_dir`` into one DataFrame.

    Args:
        game_log_dir: Directory whose entries are each read with
            ``pd.read_csv``.

    Returns:
        The concatenation of all files, in file-name order, with the ``Date``
        column converted to ``datetime.date`` objects. Each file keeps its own
        row index (the index is not reset).
    """

    def generate_date(date_of_game):
        """Convert a ``YYYYMMDD`` value (int or str) into a ``datetime.date``"""
        digits = str(date_of_game)
        return date(int(digits[0:4]), int(digits[4:6]), int(digits[6:8]))

    dfs = []
    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order of the result reproducible from run to run.
    for file in sorted(os.listdir(game_log_dir)):
        df = pd.read_csv(os.path.join(game_log_dir, file))
        # Map the single column directly instead of a row-wise
        # DataFrame.apply(axis=1), which is slow on wide frames and does not
        # return a Series for an empty frame.
        df['Date'] = df['Date'].map(generate_date)
        dfs.append(df)
    return pd.concat(dfs)

In [4]:
# Load all game logs once; `game_logs` is filtered further down the notebook.
game_logs = get_game_logs()
game_logs

Unnamed: 0,Date,Number of game,Day,Visiting Team,Visiting Team League,Visiting Team Game Number,Home Team,Home Team League,Home Team Game Number,Visiting Team Score,Home Team Score,Length of Game (outs),Time of Game (Day/Night),Completion Information,Forfeit Information,Protest Information,Park ID,Attendance,Length of Game (minutes),Visiting Team Line Score,Home Team Line Score,Visiting Team At-bats,Visiting Team Hits,Visiting Team Doubles,Visiting Team Triples,Visiting Team Homeruns,Visiting Team RBIs,Visiting Team Sacrifice Hits,Visiting Team Sacrifice Flies,Visiting Team Hit-by-pitch,Visiting Team Walks,Visiting Team Intentional Walks,Visiting Team Strikeouts,Visiting Team Stolen Bases,Visiting Team Caught Stealing,Visiting Team Grounded into DP,Visiting Team Awarded First Base due to CI,Visiting Team Left on Base,Visiting Team Pitchers Used,Visiting Team Individual Earned Runs,Visiting Team Earned Runs,Visiting Team Wild Pitches,Visiting Team Balks,Visiting Team Putouts,Visiting Team Assists,Visiting Team Errors,Visiting Team Passed Balls,Visiting Team Double Plays,Visiting Team Triple Plays,Home Team At-bats,Home Team Hits,Home Team Doubles,Home Team Triples,Home Team Homeruns,Home Team RBIs,Home Team Sacrifice Hits,Home Team Sacrifice Flies,Home Team Hit-by-pitch,Home Team Walks,Home Team Intentional Walks,Home Team Strikeouts,Home Team Stolen Bases,Home Team Caught Stealing,Home Team Grounded into DP,Home Team Awarded First Base due to CI,Home Team Left on Base,Home Team Pitchers Used,Home Team Individual Earned Runs,Home Team Earned Runs,Home Team Wild Pitches,Home Team Balks,Home Team Putouts,Home Team Assists,Home Team Errors,Home Team Passed Balls,Home Team Double Plays,Home Team Triple Plays,Home Plate Umpire ID,Home Plate Umpire Name,1B Umpire ID,1B Umpire Name,2B Umpire ID,2B Umpire Name,3B Umpire ID,3B Umpire Name,LF Umpire ID,LF Umpire Name,RF Umpire ID,RF Umpire Name,Visiting Team Manager ID,Visiting Team Manager Name,Home Team Manager ID,Home Team 
Manager Name,Winning Pitcher ID,Winning Pitcher Name,Losing Pitcher ID,Losing Pitcher Name,Saving Pitcher ID,Saving Pitcher Name,Game Winning RBI Batter ID,Game WInning RBI Batter Name,Visiting Team Starting Pitcher ID,Visiting Team Starting Pitcher Name,Home Team Starting Pitcher ID,Home Team Starting Pitcher Name,Visiting Team Player 1 ID,Visiting Team Player 1 Name,Visiting Team Player 1 Defensive Position,Visiting Team Player 2 ID,Visiting Team Player 2 Name,Visiting Team Player 2 Defensive Position,Visiting Team Player 3 ID,Visiting Team Player 3 Name,Visiting Team Player 3 Defensive Position,Visiting Team Player 4 ID,Visiting Team Player 4 Name,Visiting Team Player 4 Defensive Position,Visiting Team Player 5 ID,Visiting Team Player 5 Name,Visiting Team Player 5 Defensive Position,Visiting Team Player 6 ID,Visiting Team Player 6 Name,Visiting Team Player 6 Defensive Position,Visiting Team Player 7 ID,Visiting Team Player 7 Name,Visiting Team Player 7 Defensive Position,Visiting Team Player 8 ID,Visiting Team Player 8 Name,Visiting Team Player 8 Defensive Position,Visiting Team Player 9 ID,Visiting Team Player 9 Name,Visiting Team Player 9 Defensive Position,Home Team Player 1 ID,Home Team Player 1 Name,Home Team Player 1 Defensive Position,Home Team Player 2 ID,Home Team Player 2 Name,Home Team Player 2 Defensive Position,Home Team Player 3 ID,Home Team Player 3 Name,Home Team Player 3 Defensive Position,Home Team Player 4 ID,Home Team Player 4 Name,Home Team Player 4 Defensive Position,Home Team Player 5 ID,Home Team Player 5 Name,Home Team Player 5 Defensive Position,Home Team Player 6 ID,Home Team Player 6 Name,Home Team Player 6 Defensive Position,Home Team Player 7 ID,Home Team Player 7 Name,Home Team Player 7 Defensive Position,Home Team Player 8 ID,Home Team Player 8 Name,Home Team Player 8 Defensive Position,Home Team Player 9 ID,Home Team Player 9 Name,Home Team Player 9 Defensive Position,Miscellaneous,Acquisition Information
0,2014-03-22,0,Sat,LAN,NL,1,ARI,NL,1,3,1,54,N,,,,SYD01,38266.0,169,10200000,000001000,33,5,2,0,1,3,0,0,1,3,0,11,0,0,0,0,7,4,1,1,1,0,27,13,1,0,0,0,33,5,1,0,0,1,0,0,0,2,0,10,0,0,0,0,7,5,3,3,1,0,27,10,1,0,0,0,welkt901,Tim Welke,scotd901,Dale Scott,diazl901,Laz Diaz,carlm901,Mark Carlson,,(none),,(none),mattd001,Don Mattingly,gibsk001,Kirk Gibson,kersc001,Clayton Kershaw,milew001,Wade Miley,jansk001,Kenley Jansen,ethia001,Andre Ethier,kersc001,Clayton Kershaw,milew001,Wade Miley,puigy001,Yasiel Puig,9,turnj001,Justin Turner,4,ramih003,Hanley Ramirez,6,gonza003,Adrian Gonzalez,3,vanss001,Scott Van Slyke,7,uribj002,Juan Uribe,5,ethia001,Andre Ethier,8,ellia001,A.J. Ellis,2,kersc001,Clayton Kershaw,1,polla001,A.J. Pollock,8,hilla001,Aaron Hill,4,goldp001,Paul Goldschmidt,3,pradm001,Martin Prado,5,trumm001,Mark Trumbo,7,montm001,Miguel Montero,2,owinc001,Chris Owings,6,parrg001,Gerardo Parra,9,milew001,Wade Miley,1,,Y
1,2014-03-23,0,Sun,LAN,NL,2,ARI,NL,2,7,5,54,D,,,,SYD01,38079.0,241,102021100,000000014,34,13,3,0,0,6,1,2,2,8,0,7,1,0,1,0,13,8,5,5,0,0,27,4,1,0,2,0,35,8,0,0,1,5,0,0,0,8,0,8,0,0,2,0,11,6,6,6,1,0,27,15,3,0,1,0,scotd901,Dale Scott,diazl901,Laz Diaz,carlm901,Mark Carlson,welkt901,Tim Welke,,(none),,(none),mattd001,Don Mattingly,gibsk001,Kirk Gibson,ryu-h001,Hyun-Jin Ryu,cahit001,Trevor Cahill,,(none),ethia001,Andre Ethier,ryu-h001,Hyun-Jin Ryu,cahit001,Trevor Cahill,gordd002,Dee Gordon,4,puigy001,Yasiel Puig,9,ramih003,Hanley Ramirez,6,gonza003,Adrian Gonzalez,3,ethia001,Andre Ethier,8,ellia001,A.J. Ellis,2,baxtm001,Mike Baxter,7,uribj002,Juan Uribe,5,ryu-h001,Hyun-Jin Ryu,1,polla001,A.J. Pollock,8,hilla001,Aaron Hill,4,goldp001,Paul Goldschmidt,3,pradm001,Martin Prado,5,montm001,Miguel Montero,2,trumm001,Mark Trumbo,7,parrg001,Gerardo Parra,9,gregd001,Didi Gregorius,6,cahit001,Trevor Cahill,1,,Y
2,2014-03-30,0,Sun,LAN,NL,3,SDN,NL,1,1,3,51,N,,,,SAN02,45567.0,169,10000,00000003x,31,4,0,0,0,1,0,0,0,3,0,9,0,0,0,0,6,4,2,2,0,0,24,12,2,0,2,0,27,5,0,0,1,3,2,0,0,4,0,10,1,0,2,0,6,5,1,1,1,0,27,10,0,0,0,0,culbf901,Fieldin Culbreth,gonzm901,Manny Gonzalez,reynj901,Jim Reynolds,barbs901,Sean Barber,,(none),,(none),mattd001,Don Mattingly,blacb001,Buddy Black,thayd001,Dale Thayer,wilsb001,Brian Wilson,streh001,Huston Street,denoc001,Chris Denorfia,ryu-h001,Hyun-Jin Ryu,casha001,Andrew Cashner,crawc002,Carl Crawford,7,puigy001,Yasiel Puig,9,ramih003,Hanley Ramirez,6,gonza003,Adrian Gonzalez,3,ethia001,Andre Ethier,8,uribj002,Juan Uribe,5,ellia001,A.J. Ellis,2,gordd002,Dee Gordon,4,ryu-h001,Hyun-Jin Ryu,1,cabre001,Everth Cabrera,6,denoc001,Chris Denorfia,9,headc001,Chase Headley,5,gyorj001,Jedd Gyorko,4,alony001,Yonder Alonso,3,medit001,Tommy Medica,7,venaw001,Will Venable,8,river003,Rene Rivera,2,casha001,Andrew Cashner,1,,Y
3,2014-03-31,0,Mon,SEA,AL,1,ANA,AL,1,10,3,54,N,,,,ANA01,44152.0,197,10001206,201000000,36,11,4,2,1,10,0,1,0,8,1,11,1,1,0,0,8,5,2,2,2,0,27,5,1,0,0,0,34,6,1,0,1,3,0,0,1,1,0,13,1,0,0,0,6,5,9,9,0,0,27,7,1,0,0,0,westj901,Joe West,fostm901,Marty Foster,drakr901,Rob Drake,porta901,Alan Porter,,(none),,(none),mccll001,Lloyd McClendon,sciom001,Mike Scioscia,hernf002,Felix Hernandez,weavj003,Jered Weaver,,(none),almoa001,Abraham Almonte,hernf002,Felix Hernandez,weavj003,Jered Weaver,almoa001,Abraham Almonte,8,millb002,Brad Miller,6,canor001,Robinson Cano,4,smoaj001,Justin Smoak,3,morrl001,Logan Morrison,10,seagk001,Kyle Seager,5,saunm001,Michael Saunders,9,ackld001,Dustin Ackley,7,zunim001,Mike Zunino,2,calhk001,Kole Calhoun,9,troum001,Mike Trout,8,pujoa001,Albert Pujols,3,hamij003,Josh Hamilton,7,freed001,David Freese,5,ibanr001,Raul Ibanez,10,kendh001,Howie Kendrick,4,iannc001,Chris Iannetta,2,aybae001,Erick Aybar,6,,Y
4,2014-03-31,0,Mon,BOS,AL,1,BAL,AL,1,1,2,51,D,,,,BAL12,46685.0,173,100000,01000010x,36,9,2,0,1,1,0,0,1,3,0,6,0,0,0,0,12,2,2,2,0,0,24,11,0,0,2,0,28,6,0,0,1,1,0,0,0,1,0,9,0,0,2,0,3,5,1,1,0,0,27,13,0,0,0,0,demud901,Dana DeMuth,kulpr901,Ron Kulpa,hicke901,Ed Hickox,barrl901,Lance Barrett,,(none),,(none),farrj001,John Farrell,showb801,Buck Showalter,britz001,Zack Britton,lestj001,Jon Lester,huntt002,Tommy Hunter,cruzn002,Nelson Cruz,lestj001,Jon Lester,tillc001,Chris Tillman,navad002,Daniel Nava,9,pedrd001,Dustin Pedroia,4,ortid001,David Ortiz,10,napom001,Mike Napoli,3,carpm001,Mike Carp,7,sizeg001,Grady Sizemore,8,bogax001,Xander Bogaerts,6,piera001,A.J. Pierzynski,2,middw001,Will Middlebrooks,5,markn001,Nick Markakis,9,hardj003,J.J. Hardy,6,jonea003,Adam Jones,8,davic003,Chris Davis,3,cruzn002,Nelson Cruz,7,wietm001,Matt Wieters,2,yound003,Delmon Young,10,flahr001,Ryan Flaherty,5,schoj001,Jonathan Schoop,4,,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
893,2020-09-27,0,Sun,MIA,NL,60,NYA,AL,60,5,0,54,D,,,,NYC21,,185,030010010,000000000,35,9,1,0,2,5,0,0,1,4,0,7,0,0,1,0,8,3,0,0,0,1,27,14,0,0,2,0,31,6,0,0,0,0,0,0,0,4,0,4,1,0,2,0,8,3,5,5,1,0,27,13,0,0,1,0,buckc901,CB Bucknor,ortir901,Roberto Ortiz,tumpj901,John Tumpane,nelsj901,Jeff Nelson,,(none),,(none),mattd001,Don Mattingly,boona001,Aaron Boone,castd003,Daniel Castano,schmc002,Clarke Schmidt,,(none),chisj001,Jazz Chisholm,urenj001,Jose Urena,schmc002,Clarke Schmidt,dickc002,Corey Dickerson,10,marts002,Starling Marte,8,aguij001,Jesus Aguilar,3,andeb006,Brian Anderson,5,joycm001,Matt Joyce,9,bertj001,Jon Berti,4,chisj001,Jazz Chisholm,6,wallc001,Chad Wallach,2,sierm002,Magneuris Sierra,7,lemad001,DJ LeMahieu,3,judga001,Aaron Judge,9,hicka001,Aaron Hicks,8,voitl001,Luke Voit,10,urshg001,Giovanny Urshela,5,torrg001,Gleyber Torres,6,gardb001,Brett Gardner,7,sancg002,Gary Sanchez,2,wadet002,Tyler Wade,4,,Y
894,2020-09-27,0,Sun,SEA,AL,60,OAK,AL,60,2,6,51,D,,,,OAK01,,186,110000000,10001031x,32,4,0,0,0,2,0,0,1,2,0,16,1,1,0,0,6,5,6,6,0,0,24,7,0,0,0,0,34,10,4,0,1,6,0,0,1,6,1,9,0,0,0,0,11,4,0,0,0,0,27,5,2,0,0,0,johna901,Adrian Johnson,barrt901,Ted Barrett,rippm901,Mark Ripperger,wolcq901,Quinn Wolcott,,(none),,(none),servs002,Scott Servais,melvb001,Bob Melvin,petiy001,Yusmeiro Petit,hiray001,Yoshihisa Hirano,,(none),lambj001,Jake Lamb,gonzm005,Marco Gonzales,montf001,Frankie Montas,crawj002,J.P. Crawford,6,lewik001,Kyle Lewis,8,seagk001,Kyle Seager,5,frant002,Ty France,4,white005,Evan White,3,marmj001,Jose Marmolejos,10,lopet001,Tim Lopes,7,bishb001,Braden Bishop,9,odomj001,Joseph Odom,2,semim001,Marcus Semien,6,lastt001,Tommy La Stella,4,davik003,Khris Davis,10,canhm001,Mark Canha,7,olsom001,Matt Olson,3,laurr001,Ramon Laureano,8,piscs001,Stephen Piscotty,9,lambj001,Jake Lamb,5,murps001,Sean Murphy,2,,Y
895,2020-09-27,0,Sun,PHI,NL,60,TBA,AL,60,0,5,51,D,,,,STP01,,179,000000000,01200200x,32,6,1,0,0,0,0,0,0,1,0,5,0,0,2,0,6,4,5,5,1,0,24,11,0,0,2,0,32,9,1,1,0,4,0,0,1,4,0,10,0,0,2,0,8,4,0,0,0,0,27,18,1,0,2,0,rackd901,David Rackley,barkl901,Lance Barksdale,conrc901,Chris Conroy,vanol901,Larry Vanover,,(none),,(none),giraj001,Joe Girardi,cashk001,Kevin Cash,flemj001,Josh Fleming,nolaa001,Aaron Nola,,(none),smitk002,Kevan Smith,nolaa001,Aaron Nola,flemj001,Josh Fleming,mccua001,Andrew McCutchen,7,bohma001,Alec Bohm,5,harpb003,Bryce Harper,10,realj001,J.T. Realmuto,2,seguj002,Jean Segura,4,gregd001,Didi Gregorius,6,gossp001,Phil Gosselin,3,kings001,Scott Kingery,8,quinr003,Roman Quinn,9,tsuty001,Yoshi Tsutsugo,5,loweb001,Brandon Lowe,10,arozr001,Randy Arozarena,7,lowen001,Nate Lowe,3,wendj002,Joey Wendle,6,renfh001,Hunter Renfroe,9,philb002,Brett Phillips,8,brosm001,Michael Brosseau,4,perem005,Michael Perez,2,,Y
896,2020-09-27,0,Sun,HOU,AL,60,TEX,AL,60,4,8,51,D,,,,ARL02,,208,101100010,01051010x,35,8,0,0,3,4,0,0,0,5,0,7,0,0,0,0,9,3,8,8,1,0,24,6,1,0,2,0,33,11,0,0,2,7,0,0,2,5,0,11,3,0,1,0,8,7,4,4,0,0,27,7,0,0,0,0,hudsm901,Marvin Hudson,hamaa901,Adam Hamari,knigb901,Brian Knight,muchm901,Mike Muchlinski,,(none),,(none),baked002,Dusty Baker,woodc001,Chris Woodward,benjw001,Wes Benjamin,dejoc001,Chase De Jong,,(none),odorr001,Rougned Odor,dejoc001,Chase De Jong,lylej001,Jordan Lyles,sprig001,George Springer,8,altuj001,Jose Altuve,4,branm003,Michael Brantley,10,brega001,Alex Bregman,5,diaza003,Aledmys Diaz,7,gurry001,Yulieski Gurriel,3,corrc001,Carlos Correa,6,reddj001,Josh Reddick,9,garnd001,Dustin Garneau,2,choos001,Shin-Soo Choo,10,tavel001,Leody Taveras,8,kinei001,Isiah Kiner-Falefa,5,gallj002,Joey Gallo,9,solan001,Nick Solak,7,odorr001,Rougned Odor,4,huffs001,Sam Huff,2,guzmr001,Ronald Guzman,3,tejea001,Anderson Tejeda,6,,Y


In [5]:
def get_team_games(teams_games_dir=PATH_TO_TEAM_GAMES):
    """Load the per-game stats file of every team.

    Args:
        teams_games_dir: Directory whose entries are each read with
            ``pd.read_csv``.

    Returns:
        dict keyed by the first three characters of each file name
        (presumably the team code -- verify against the data directory).
        Each value is that file's DataFrame with ``Date`` converted to
        ``datetime.date``, ``Earned Runs`` renamed to ``Team Earned Runs``,
        and rows sorted by ``Date`` then ``Number of game``.
    """

    def generate_date(date_of_game):
        """Convert a ``YYYYMMDD`` value (int or str) into a ``datetime.date``"""
        digits = str(date_of_game)
        return date(int(digits[0:4]), int(digits[4:6]), int(digits[6:8]))

    team_stats = dict()

    # Sorted so the dict's insertion order does not depend on the arbitrary
    # order os.listdir() happens to return.
    for file in sorted(os.listdir(teams_games_dir)):
        df = pd.read_csv(os.path.join(teams_games_dir, file))
        # Map the single column directly instead of a row-wise
        # DataFrame.apply(axis=1).
        df['Date'] = df['Date'].map(generate_date)
        df = df.rename(columns={'Earned Runs': 'Team Earned Runs'})
        team_stats[file[0:3]] = df.sort_values(by=['Date', 'Number of game']).reset_index(drop=True)

    return team_stats

In [6]:
def get_team_rolling_sums(team_games, window=14):
    """Add rolling-window batting features to every team frame, in place.

    For each frame, every column from position 6 onward gets a companion
    ``'<col> <window> Game Window'`` column holding the sum of the previous
    ``window`` rows (the current row is excluded by a one-row shift). Rate
    stats (BA, SLG, OBP, OPS, ISO) are derived from those sums, a set of
    columns is dropped by position, and rows containing any NaN (which
    includes the first ``window`` rows of each frame) are removed.

    Args:
        team_games: dict of DataFrames -- presumably the output of
            ``get_team_games``; verify against caller. The frames themselves
            are mutated.
        window: number of preceding rows (games) summed for each feature.

    Returns:
        The same ``team_games`` dict that was passed in.

    NOTE(review): columns are selected by position (``[6::]``, ``[3:33]``,
    ``[5:8]``, ``[9:21]``, ``[22:30]``), so the result depends on the exact
    column order of the input files -- confirm against the CSV schema before
    changing anything here.
    """
    
    for _, df in team_games.items():
        for col in df.columns[6::]:
            # Sum over `window` rows, then shift down one row so a game's
            # features are built only from the games before it.
            df[f'{col} {window} Game Window'] = df[col].rolling(window).sum()
            df[f'{col} {window} Game Window'] = df[f'{col} {window} Game Window'].shift(1)
        
        # Drop the columns at positions 3..32 (presumably the raw per-game
        # stats now superseded by their rolling sums -- confirm).
        df.drop(columns=df.columns[3:33], inplace=True)
        
        # Add BA = Hits / At-bats
        df[f'BA {window} Game Window'] = (df[f'Hits {window} Game Window'] /
                                          df[f'At-bats {window} Game Window'])
        
        # Add SLG = (1B + 2Bx2 + 3Bx3 + HRx4) / AB.
        # Hits already counts every hit once, so adding 1x doubles, 2x triples
        # and 3x home runs yields the same total-bases numerator.
        df[f'SLG {window} Game Window'] = ((df[f'Hits {window} Game Window'] +
                                            df[f'Doubles {window} Game Window'] +
                                            2 * df[f'Triples {window} Game Window'] +
                                            3 * df[f'Homeruns {window} Game Window']) /
                                           df[f'At-bats {window} Game Window'])
        
        # Add OBP = (Hits + Walks + Hit by Pitch) / (At Bats + Walks + Hit by Pitch + Sacrifice Flies)
        df[f'OBP {window} Game Window'] = ((df[f'Hits {window} Game Window'] +
                                            df[f'Walks {window} Game Window'] +
                                            df[f'Hit-by-pitch {window} Game Window']) /
                                           (df[f'At-bats {window} Game Window'] + 
                                            df[f'Walks {window} Game Window'] +
                                            df[f'Hit-by-pitch {window} Game Window'] +
                                            df[f'Sacrifice Flies {window} Game Window']))
        
        # OPS = OBP + SLG
        df[f'OPS {window} Game Window'] = (df[f'OBP {window} Game Window'] +
                                           df[f'SLG {window} Game Window'])
        
        # ISO = SLG - BA
        df[f'ISO {window} Game Window'] = (df[f'SLG {window} Game Window'] -
                                           df[f'BA {window} Game Window'])
        
        # Drop stats that won't be needed (selected by position in the
        # current column order, i.e. after the derived columns were appended).
        df.drop(columns=(list(df.columns[5:8]) +
                         list(df.columns[9:21]) +
                         list(df.columns[22:30])),
                inplace=True)
        
        # Remove rows with any missing value, then renumber the rows from 0.
        df.dropna(inplace=True)
        df.reset_index(drop=True, inplace=True)
            
    return team_games

In [7]:
# Save the rolling stats, 14 games seems like the sweet spot
clean_dir(TEAM_ROLLING_SUMS_DIR)
for team, df in get_team_rolling_sums(get_team_games(), window=14).items():
    df.to_csv(os.path.join(TEAM_ROLLING_SUMS_DIR, f'{team}.csv'), index=False)

In [8]:
pd.read_csv(os.path.join(TEAM_ROLLING_SUMS_DIR, 'NYA.csv')) # sanity check: inspect one team's saved rolling stats

Unnamed: 0,Date,Number of game,Team,At-bats 14 Game Window,Hits 14 Game Window,RBIs 14 Game Window,Team Earned Runs 14 Game Window,BA 14 Game Window,SLG 14 Game Window,OBP 14 Game Window,OPS 14 Game Window,ISO 14 Game Window
0,2014-04-16,2,NYA,463.0,124.0,49.0,53.0,0.267819,0.423326,0.332031,0.755357,0.155508
1,2014-04-17,0,NYA,468.0,130.0,49.0,47.0,0.277778,0.431624,0.335283,0.766907,0.153846
2,2014-04-18,0,NYA,477.0,139.0,59.0,45.0,0.291405,0.475891,0.343570,0.819461,0.184486
3,2014-04-19,0,NYA,481.0,140.0,60.0,54.0,0.291060,0.471933,0.342256,0.814190,0.180873
4,2014-04-20,0,NYA,470.0,127.0,54.0,66.0,0.270213,0.444681,0.318898,0.763578,0.174468
...,...,...,...,...,...,...,...,...,...,...,...,...
1013,2020-09-23,0,NYA,474.0,136.0,102.0,46.0,0.286920,0.550633,0.378623,0.929256,0.263713
1014,2020-09-24,0,NYA,475.0,135.0,101.0,55.0,0.284211,0.545263,0.374545,0.919809,0.261053
1015,2020-09-25,0,NYA,474.0,135.0,95.0,57.0,0.284810,0.533755,0.372943,0.906699,0.248945
1016,2020-09-26,0,NYA,481.0,133.0,92.0,60.0,0.276507,0.505198,0.366906,0.872104,0.228690


Next, we need to add the pitcher data: each pitcher's per-game stats and the rolling-window features built from them.

In [9]:
def get_pitching_stats(directory=PATH_TO_PITCHER_STATS):
    """Load every pitcher's per-game stats file.

    Args:
        directory: Directory whose entries are each read with ``pd.read_csv``.
            The file name minus its last four characters (e.g. ``.csv``) is
            used as the player id.

    Returns:
        dict mapping player id -> DataFrame of that file's rows sorted by
        ``Date`` then ``Game Number``. ``Date`` and ``Game Number`` are parsed
        from the ``Game ID`` column and ``ID`` is set to the player id.
    """

    def generate_date(game_id):
        """Parse the date out of a game id such as ``ARI201403220``"""
        # Characters 3..10 hold the date as YYYYMMDD
        return date(int(game_id[3:7]), int(game_id[7:9]), int(game_id[9:11]))

    def generate_game_number(game_id):
        """Return the game number stored in the last character of a game id"""
        return int(game_id[-1])

    player_stats_dict = dict()

    # Sorted so the dict's insertion order does not depend on the arbitrary
    # order os.listdir() happens to return.
    for file in tqdm(sorted(os.listdir(directory)), unit='players', desc='Fetching Pitcher Stats'):
        # Take the id from the file name directly; reading it back from the
        # first row would raise IndexError on a file with no data rows.
        player_id = file[0:-4]

        df = pd.read_csv(os.path.join(directory, file))
        # Map the single column instead of a row-wise DataFrame.apply(axis=1)
        df['Date'] = df['Game ID'].map(generate_date)
        df['Game Number'] = df['Game ID'].map(generate_game_number)
        df['ID'] = player_id
        # The saved index column is only present when the CSV was written with
        # its index; do not fail when it is missing.
        df = df.drop(columns=['Unnamed: 0'], errors='ignore')
        player_stats_dict[player_id] = df.sort_values(by=['Date', 'Game Number']).reset_index(drop=True)

    return player_stats_dict

In [10]:
# Inspect one pitcher's game-by-game stats.
# NOTE(review): this call re-reads every pitcher file just to display one entry.
get_pitching_stats()['kersc001']

HBox(children=(HTML(value='Fetching Pitcher Stats'), FloatProgress(value=0.0, max=817.0), HTML(value='')))




Unnamed: 0,Game ID,ID,Balls,Strikes,Homeruns Allowed,Hits Allowed,Strikeouts,Pickoff Errors,Pickoffs,Wild Pitches,Balks,Walks,Intentional Walks,Hit by Pitch,Earned Runs,Innings Pitched,Date,Game Number
0,ARI201403220,kersc001,28,40,0,5,7,0,0,1,0,1,0,0,1,6.666667,2014-03-22,0
1,WAS201405060,kersc001,21,36,0,9,9,0,1,0,0,0,0,0,0,7.000000,2014-05-06,0
2,LAN201405110,kersc001,26,30,1,7,9,0,0,0,0,0,0,0,3,7.000000,2014-05-11,0
3,ARI201405170,kersc001,20,16,0,6,3,0,0,0,1,2,0,0,6,1.666667,2014-05-17,0
4,PHI201405230,kersc001,30,39,0,2,9,0,0,0,0,3,0,0,0,6.000000,2014-05-23,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167,LAN202009030,kersc001,28,35,0,1,8,0,0,0,0,2,0,1,0,6.000000,2020-09-03,0
168,ARI202009090,kersc001,28,32,0,4,3,0,0,0,0,2,0,0,3,5.000000,2020-09-09,0
169,SDN202009140,kersc001,26,42,1,5,9,0,0,0,0,0,0,0,1,6.333333,2020-09-14,0
170,COL202009190,kersc001,27,28,0,4,6,0,1,0,0,0,0,0,1,6.666667,2020-09-19,0


In [11]:
def get_pitching_rolling_sums(dictionary, window=10, fip_constant=3.2):
    """Generate rolling-window pitching statistics for every pitcher.

    For each pitcher, every stat column is summed over the previous ``window``
    appearances (the current appearance is excluded by a one-row shift), and
    ERA, WHIP and FIP are derived from those sums.

    Args:
        dictionary: dict mapping player id -> DataFrame of per-game stats.
            Each frame must contain ``Game ID``, ``ID``, ``Date``,
            ``Game Number`` plus the numeric stat columns used below. The
            input frames are not modified (each is copied), but the dict's
            values are replaced by the new feature frames.
        window: number of preceding appearances summed for each feature.
        fip_constant: additive constant of the FIP formula (default 3.2).

    Returns:
        The same ``dictionary``, now holding one feature frame per pitcher.
        Rows with any missing value are dropped: those without a full window
        of history, and those whose window contains zero innings pitched
        (ERA/WHIP/FIP would otherwise be infinite or undefined). The original
        row labels are kept.
    """

    id_cols = ['Game ID', 'ID', 'Date', 'Game Number']

    def windowed(stat):
        """Name of the rolling-sum column for ``stat``"""
        return f'{stat} {window} Game Window'

    for k, df in tqdm(dictionary.items(), unit='players', desc='Generating Rolling Pitcher Stats'):
        df = df.copy()

        # Snapshot the stat columns first so the columns added below are not
        # themselves rolled.
        stat_cols = [col for col in df.columns if col not in id_cols]
        for col in stat_cols:
            # Shift by one so a start's features only use earlier appearances.
            df[windowed(col)] = df[col].rolling(window).sum().shift(1)

        # Treat a window with no innings pitched as missing; dividing by zero
        # would yield +/-inf, which dropna() does not remove.
        innings = df[windowed('Innings Pitched')]
        innings = innings.where(innings > 0)

        # ERA = 9 * earned runs / innings pitched
        df[windowed('ERA')] = (df[windowed('Earned Runs')] * 9) / innings

        # WHIP = (walks + hits) / innings pitched
        df[windowed('WHIP')] = (df[windowed('Hits Allowed')] +
                                df[windowed('Walks')]) / innings

        # FIP = (13 * HR + 3 * (BB + HBP) - 2 * K) / IP + constant
        df[windowed('FIP')] = (((13 * df[windowed('Homeruns Allowed')]) +
                                (3 * (df[windowed('Walks')] +
                                      df[windowed('Hit by Pitch')])) -
                                (2 * df[windowed('Strikeouts')])) /
                               innings) + fip_constant

        df = df.filter(items=id_cols + [windowed('Homeruns Allowed'),
                                        windowed('Hits Allowed'),
                                        windowed('Strikeouts'),
                                        windowed('Walks'),
                                        windowed('Earned Runs'),
                                        windowed('ERA'),
                                        windowed('WHIP'),
                                        windowed('FIP')])

        # Replacing the value of an existing key is safe while iterating.
        dictionary[k] = df.dropna()

    return dictionary

In [12]:
# Save the rolling stats, 10 games seems like the sweet spot
clean_dir(PITCHING_ROLLING_SUMS_DIR)
for pitcher, df in get_pitching_rolling_sums(get_pitching_stats(), window=10).items():
    df.to_csv(os.path.join(PITCHING_ROLLING_SUMS_DIR, f'{pitcher}.csv'), index=False)

HBox(children=(HTML(value='Fetching Pitcher Stats'), FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(HTML(value='Generating Rolling Pitcher Stats'), FloatProgress(value=0.0, max=817.0), HTML(value…




In [13]:
# Sanity check: inspect one pitcher's saved rolling stats.
pd.read_csv(os.path.join(PITCHING_ROLLING_SUMS_DIR, 'kersc001.csv'))

Unnamed: 0,Game ID,ID,Date,Game Number,Homeruns Allowed 10 Game Window,Hits Allowed 10 Game Window,Strikeouts 10 Game Window,Walks 10 Game Window,Earned Runs 10 Game Window,ERA 10 Game Window,WHIP 10 Game Window,FIP 10 Game Window
0,KCA201406240,kersc001,2014-06-24,0,4.0,51.0,86.0,8.0,17.0,2.378238,0.917098,1.707772
1,LAN201406290,kersc001,2014-06-29,0,4.0,52.0,87.0,8.0,16.0,2.192893,0.913706,1.707614
2,COL201407040,kersc001,2014-07-04,0,4.0,48.0,91.0,10.0,16.0,2.192893,0.883249,1.677157
3,LAN201407100,kersc001,2014-07-10,0,3.0,43.0,90.0,11.0,13.0,1.755000,0.810000,1.580000
4,SLN201407200,kersc001,2014-07-20,0,4.0,40.0,98.0,10.0,8.0,0.972973,0.675676,1.659459
...,...,...,...,...,...,...,...,...,...,...,...,...
157,LAN202009030,kersc001,2020-09-03,0,12.0,45.0,62.0,14.0,17.0,2.668605,1.029070,4.490698
158,ARI202009090,kersc001,2020-09-09,0,10.0,41.0,64.0,14.0,12.0,1.851429,0.942857,4.005714
159,SDN202009140,kersc001,2020-09-14,0,9.0,38.0,61.0,13.0,14.0,2.123596,0.859551,3.823596
160,COL202009190,kersc001,2020-09-19,0,9.0,39.0,65.0,10.0,14.0,2.123596,0.825843,3.537079


In [14]:
# Keep only the fields that identify a game (date, game number, teams), the
# final scores and the two starting pitcher ids; every other game-log column
# is discarded.
columns = ['Date',
           'Number of game',
           'Visiting Team',
           'Visiting Team Game Number',
           'Home Team',
           'Home Team Game Number',
           'Visiting Team Score',
           'Home Team Score',
           'Visiting Team Starting Pitcher ID',
           'Home Team Starting Pitcher ID']

# DataFrame.filter(items=...) selects the listed columns that exist in the frame.
filtered_game_logs = game_logs.filter(items=columns)
filtered_game_logs

Unnamed: 0,Date,Number of game,Visiting Team,Visiting Team Game Number,Home Team,Home Team Game Number,Visiting Team Score,Home Team Score,Visiting Team Starting Pitcher ID,Home Team Starting Pitcher ID
0,2014-03-22,0,LAN,1,ARI,1,3,1,kersc001,milew001
1,2014-03-23,0,LAN,2,ARI,2,7,5,ryu-h001,cahit001
2,2014-03-30,0,LAN,3,SDN,1,1,3,ryu-h001,casha001
3,2014-03-31,0,SEA,1,ANA,1,10,3,hernf002,weavj003
4,2014-03-31,0,BOS,1,BAL,1,1,2,lestj001,tillc001
...,...,...,...,...,...,...,...,...,...,...
893,2020-09-27,0,MIA,60,NYA,60,5,0,urenj001,schmc002
894,2020-09-27,0,SEA,60,OAK,60,2,6,gonzm005,montf001
895,2020-09-27,0,PHI,60,TBA,60,0,5,nolaa001,flemj001
896,2020-09-27,0,HOU,60,TEX,60,4,8,dejoc001,lylej001


In [15]:
def generate_training_data(schedule, team_data, pitching_data):
    """Build one row of home-minus-away feature differences per scheduled game.

    Parameters:
        - schedule: DataFrame of games. Must contain 'Date', 'Number of game',
          'Home Team', 'Visiting Team', 'Home Team Starting Pitcher ID' and
          'Visiting Team Starting Pitcher ID'. Every schedule column is carried
          through to the result unchanged.
        - team_data: dict of per-team DataFrames (the keys are ignored). Each
          frame needs 'Date', 'Number of game' and 'Team'; every other column
          is treated as a numeric feature.
        - pitching_data: dict of per-pitcher DataFrames (the keys are ignored).
          Each frame needs 'Date', 'Game Number', 'Game ID' and 'ID'; every
          other column is treated as a numeric feature.

    Returns:
        DataFrame holding the schedule columns followed by one
        'Difference <feature>' column (home value minus away value) for every
        team and pitcher feature. Rows containing any missing value are
        dropped, so the index keeps gaps where games were removed.
    """

    schedule_cols = list(schedule.columns)

    # Stack the per-team / per-pitcher frames into one lookup table each
    batting = pd.concat(list(team_data.values()))
    pitching = pd.concat(list(pitching_data.values()))

    def _features_for_side(team_col, pitcher_col):
        """Attach the batting and starting-pitcher features of one side to the schedule."""
        side_batting = pd.merge(schedule,
                                batting,
                                how='left',
                                left_on=['Date', 'Number of game', team_col],
                                right_on=['Date', 'Number of game', 'Team']).drop(columns=['Team'])

        side_pitching = pd.merge(schedule,
                                 pitching,
                                 how='left',
                                 left_on=['Date', 'Number of game', pitcher_col],
                                 right_on=['Date', 'Game Number', 'ID']).drop(columns=['Game Number', 'Game ID', 'ID'])

        return pd.merge(side_batting, side_pitching, how='left', on=schedule_cols)

    df_home = _features_for_side('Home Team', 'Home Team Starting Pitcher ID')
    df_away = _features_for_side('Visiting Team', 'Visiting Team Starting Pitcher ID')

    # Everything that is not a schedule column is a feature. Selecting by name
    # (instead of by column position) keeps this correct if the column order changes.
    feature_cols = [col for col in df_home.columns if col not in schedule_cols]

    # Explicit suffixes make it unambiguous which side a feature belongs to
    home_and_away = pd.merge(df_home,
                             df_away,
                             how='left',
                             on=schedule_cols,
                             suffixes=('_home', '_away'))

    # Build the result from the schedule plus the differences only. The full
    # feature name is used for the output column, so names that themselves
    # contain an underscore are no longer truncated at the first '_'.
    differences = home_and_away[schedule_cols].copy()
    for feature in feature_cols:
        differences['Difference ' + feature] = (home_and_away[feature + '_home'] -
                                                home_and_away[feature + '_away'])

    return differences.dropna()

In [16]:
# Preview of the assembled feature table, built from 14-game team windows and
# 10-game starting-pitcher windows. The result is only displayed here; it is
# not bound to a name.
generate_training_data(filtered_game_logs,
                       get_team_rolling_sums(get_team_games(), window=14),
                       get_pitching_rolling_sums(get_pitching_stats(), window=10))

HBox(children=(HTML(value='Fetching Pitcher Stats'), FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(HTML(value='Generating Rolling Pitcher Stats'), FloatProgress(value=0.0, max=817.0), HTML(value…




Unnamed: 0,Date,Number of game,Visiting Team,Visiting Team Game Number,Home Team,Home Team Game Number,Visiting Team Score,Home Team Score,Visiting Team Starting Pitcher ID,Home Team Starting Pitcher ID,Difference At-bats 14 Game Window,Difference Hits 14 Game Window,Difference RBIs 14 Game Window,Difference Team Earned Runs 14 Game Window,Difference BA 14 Game Window,Difference SLG 14 Game Window,Difference OBP 14 Game Window,Difference OPS 14 Game Window,Difference ISO 14 Game Window,Difference Homeruns Allowed 10 Game Window,Difference Hits Allowed 10 Game Window,Difference Strikeouts 10 Game Window,Difference Walks 10 Game Window,Difference Earned Runs 10 Game Window,Difference ERA 10 Game Window,Difference WHIP 10 Game Window,Difference FIP 10 Game Window
751,2014-05-26,0,SDN,52,ARI,53,5,7,rosst001,mccab001,23.0,14.0,-7.0,1.0,0.018262,0.027589,-0.004663,0.022926,0.009327,3.0,11.0,-6.0,-16.0,13.0,2.019170,-0.020897,0.076484
754,2014-05-26,0,BAL,49,MIL,52,7,6,tillc001,lohsk001,-19.0,-7.0,-6.0,-14.0,-0.004130,0.001457,-0.002286,-0.000829,0.005587,0.0,0.0,10.0,-10.0,-6.0,-1.384536,-0.350169,-0.929291
763,2014-05-27,0,DET,48,OAK,52,6,5,schem001,grays001,-10.0,-9.0,20.0,-39.0,-0.013275,0.031719,0.009795,0.041514,0.044994,-2.0,-2.0,-23.0,2.0,-4.0,-0.618767,-0.038668,0.432741
764,2014-05-27,0,ANA,51,SEA,51,6,4,weavj003,eliar001,-10.0,-2.0,4.0,10.0,0.000965,0.001635,-0.011744,-0.010109,0.000670,-1.0,6.0,2.0,6.0,5.0,1.094026,0.335152,0.144589
766,2014-05-27,0,SDN,53,ARI,54,4,3,stule002,milew001,23.0,13.0,-1.0,0.0,0.016284,0.019151,0.003791,0.022942,0.002867,0.0,-9.0,19.0,15.0,6.0,-0.167401,-0.216072,-0.209665
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15464,2020-09-27,0,MIL,60,SLN,58,2,5,andeb004,gomba001,-12.0,-15.0,-16.0,6.0,-0.030145,-0.105942,-0.038774,-0.144716,-0.075797,-3.0,7.0,13.0,5.0,-2.0,-0.187606,0.322254,-1.187887
15465,2020-09-27,0,NYN,60,WAS,60,5,15,lugos001,votha001,-35.0,-3.0,10.0,-10.0,0.013476,0.014908,0.007725,0.022633,0.001432,1.0,0.0,-19.0,6.0,7.0,1.388756,0.125427,1.749487
15468,2020-09-27,0,DET,58,KCA,60,1,3,zimmj003,singb001,-6.0,7.0,14.0,-17.0,0.018250,0.088779,0.031292,0.120071,0.070529,-1.0,-15.0,10.0,16.0,-8.0,-1.845465,-0.085262,0.191742
15469,2020-09-27,0,CIN,60,MIN,60,5,3,grays001,hillr001,27.0,14.0,-3.0,0.0,0.018313,0.052218,-0.010839,0.041379,0.033905,-1.0,-13.0,-30.0,-2.0,-5.0,-0.192681,-0.016271,1.065997


# Training a Random Forest Classifier

In [17]:
# Assemble the home-minus-away feature table (14-game team windows,
# 10-game starting-pitcher windows)
data = generate_training_data(filtered_game_logs,
                              get_team_rolling_sums(get_team_games(), window=14),
                              get_pitching_rolling_sums(get_pitching_stats(), window=10))

# Label each game (1 = home team won, 0 = otherwise), then keep only the
# 'Difference ...' features and the label by dropping the schedule columns.
training_data = (
    data.dropna()
        .reset_index(drop=True)
        .assign(Winner=lambda games: np.where(games["Home Team Score"] >
                                              games["Visiting Team Score"], 1, 0))
        .drop(columns=['Date',
                       'Number of game',
                       'Visiting Team',
                       'Visiting Team Game Number',
                       'Home Team',
                       'Home Team Game Number',
                       'Home Team Score',
                       'Visiting Team Score',
                       'Visiting Team Starting Pitcher ID',
                       'Home Team Starting Pitcher ID'])
)

# Fixed seed so the train/test partition is the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    training_data.drop(columns=['Winner']),
    training_data['Winner'],
    random_state=42,
)

HBox(children=(HTML(value='Fetching Pitcher Stats'), FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(HTML(value='Generating Rolling Pitcher Stats'), FloatProgress(value=0.0, max=817.0), HTML(value…




In [None]:
# Number of trees in random forest
n_estimators = list(range(100, 1100, 100))

# Number of features to consider at every split.
# 'auto' was only an alias of 'sqrt' for classifiers (so it added no distinct
# candidate) and was removed in scikit-learn 1.3; 'log2' gives the search a
# genuinely different option instead.
max_features = ['sqrt', 'log2']

# Maximum number of levels in tree (None = no depth limit)
max_depth = list(range(1, 21)) + [None]

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_param_grid = {'n_estimators': n_estimators,
                     'max_features': max_features,
                     'max_depth': max_depth,
                     'min_samples_split': min_samples_split,
                     'min_samples_leaf': min_samples_leaf,
                     'bootstrap': bootstrap}

# Use the random grid to find the best hyperparameters
# First create the base model to tune (seeded so repeated searches are comparable)
rf = RandomForestClassifier(random_state=42)

# Random search of parameters, using 3 fold cross validation,
# search across 50 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator=rf,
                               param_distributions=random_param_grid,
                               n_iter=50,
                               cv=3,
                               verbose=2,
                               random_state=42,
                               n_jobs=-1)

# Fit the random search model
rf_random.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination with the best cross-validated score in the search
rf_random.best_params_

In [18]:
# Hyper-parameters used for the final forest.
# NOTE(review): presumably copied from an earlier run of the randomized search
# so the notebook does not have to repeat it - confirm. 'random_state' pins the
# forest's own randomness.
best_params = {'n_estimators': 800,
               'min_samples_split': 2,
               'min_samples_leaf': 4,
               'max_features': 'sqrt',
               'max_depth': 2,
               'bootstrap': False,
               'random_state': 42}

In [19]:
# fit() returns the estimator itself, so construction and training can be chained
rf = RandomForestClassifier(**best_params).fit(X_train, y_train)

# Mean accuracy on the data the forest was trained on
rf.score(X_train, y_train)

0.5719985039271911

In [20]:
# Mean accuracy on the held-out test split
rf.score(X_test, y_test)

0.5732984293193717

In [21]:
# Start from an empty model directory (clean_dir removes everything inside it)
clean_dir(MODEL_SAVE_DIR)

# Write through a context manager so the file is flushed and closed
# deterministically instead of whenever the handle is garbage-collected.
with open(os.path.join(MODEL_SAVE_DIR, 'rfcl.sav'), 'wb') as model_file:
    pickle.dump(rf, model_file)

# Gaussian Naive Bayes Model

In [22]:
# fit() returns the estimator itself, so construction and training can be chained
gnb = GaussianNB().fit(X_train, y_train)

# Mean accuracy on the training split
gnb.score(X_train, y_train)

0.5550430120932552

In [23]:
# Mean accuracy on the held-out test split
gnb.score(X_test, y_test)

0.5635751682872102

# Neural Network Model

In [None]:
# Feature table for the neural network.
# NOTE(review): this repeats the exact call used to build `data` for the random
# forest, so the whole fetch + rolling-window pipeline runs again - consider
# reusing that result if the two sections are always run together.
nn_data = generate_training_data(filtered_game_logs,
                                 get_team_rolling_sums(get_team_games(), window=14),
                                 get_pitching_rolling_sums(get_pitching_stats(), window=10))

In [None]:
# Peek at the first rows (schedule columns followed by the 'Difference ...' features)
nn_data.head()

In [None]:
# Label each game (1 = home team won, 0 = otherwise), then drop the schedule
# columns so only the 'Difference ...' features and the label remain.
nn_training_data = (
    nn_data.dropna()
           .reset_index(drop=True)
           .assign(Winner=lambda games: np.where(games["Home Team Score"] >
                                                 games["Visiting Team Score"], 1, 0))
           .drop(columns=['Date',
                          'Number of game',
                          'Visiting Team',
                          'Visiting Team Game Number',
                          'Home Team',
                          'Home Team Game Number',
                          'Home Team Score',
                          'Visiting Team Score',
                          'Visiting Team Starting Pitcher ID',
                          'Home Team Starting Pitcher ID'])
)

nn_training_data.head()

## Getting the Data Ready for Tensorflow
Now we have our training set and our test set. Before we can train a model, we must first get the dataframe objects properly ready and also correctly identify the feature columns we will be using.

First, we have to wrap the dataframes with `tf.data`, in order to shuffle and batch the data.

In [None]:
# Utility: wrap a pandas DataFrame in a batched tf.data.Dataset
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    """Turn a DataFrame with a 'Winner' column into a tf.data.Dataset.

    Parameters:
        - dataframe: DataFrame holding the feature columns plus 'Winner';
          it is copied, so the caller's frame is left untouched
        - shuffle: when True, shuffle with a buffer as large as the frame
        - batch_size: number of rows per batch (also used as the prefetch buffer size)

    Returns a dataset yielding (dict of feature name -> values, labels) batches.
    """
    features = dataframe.copy()
    labels = features.pop('Winner')

    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(features))

    return dataset.batch(batch_size).prefetch(batch_size)

In [None]:
# Split the data into training, validation and test sets.
# 25% is held out for testing, then 20% of the remainder for validation
# (60% / 15% / 25% overall). The seed matches the random-forest split so the
# partition is reproducible across kernel restarts.
train, test = train_test_split(nn_training_data, test_size=0.25, random_state=42)
train, val = train_test_split(train, test_size=0.2, random_state=42)

We have now created the input pipeline. Let's call it to see the format of the data it returns. For demonstration purposes, we will enter a small batch size to keep the output readable and also only show the first three features.

In [None]:
# Small batch size purely to keep this demonstration readable
train_ds = df_to_dataset(train, batch_size=5)

# Pull a single batch and show a sample of the feature names it contains
[(train_features, label_batch)] = train_ds.take(1)
# Only the first three names are printed, so the label says so
print('First three features:', list(train_features.keys())[:3])

In this dataset, all of our features are numeric. For each numeric feature, we will use a `Normalization()` layer so that the feature has a mean of 0 and a standard deviation of 1.

In [None]:
# Utility: build a Normalization layer fitted to a single feature
def get_normalization_layer(name, dataset):
    """Return a Normalization layer adapted to feature `name` of `dataset`.

    Parameters:
        - name: key of the feature inside each element's feature dict
        - dataset: tf.data dataset yielding (features dict, label) pairs
    """

    def select_feature(features, _label):
        # Keep only the requested feature; the label is not needed here
        return features[name]

    layer = preprocessing.Normalization()

    # adapt() learns the statistics of the data it is shown
    layer.adapt(dataset.map(select_feature))

    return layer

In [None]:
# Batch size used for actual training (the demonstration above used 5)
batch_size=64
# Only the training data is shuffled; validation and test keep their row order
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [None]:
all_inputs = []
encoded_features = []

# Take the feature names from the training frame itself (every column except
# the label) rather than from the batch fetched in the demonstration cell, so
# this cell does not depend on that demo having been run.
feature_names = [column for column in train.columns if column != 'Winner']

for header in tqdm(feature_names, unit="feature"):
    # One scalar Keras input per feature, named after the feature so the
    # dict-structured dataset elements are matched to inputs by name
    numeric_column = tf.keras.Input(shape=(1,), name=header)

    # Normalizer whose statistics are learned from the training data only
    normalization_layer = get_normalization_layer(header, train_ds)
    encoded_numeric_column = normalization_layer(numeric_column)

    all_inputs.append(numeric_column)
    encoded_features.append(encoded_numeric_column)

In [None]:
# Join the normalized features into a single input vector
all_features = layers.concatenate(encoded_features)

# Three ReLU hidden layers of decreasing width, then dropout
x = all_features
for units in (64, 32, 16):
    x = layers.Dense(units, activation="relu")(x)
x = layers.Dropout(0.5)(x)

# Single unit without activation: the model outputs a logit,
# which is why the loss below is built with from_logits=True
output = layers.Dense(1)(x)

model = tf.keras.Model(inputs=all_inputs, outputs=output)
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

In [None]:
# Train for 40 epochs, reporting loss/accuracy on the validation set after each one
model.fit(train_ds, epochs=40, validation_data=val_ds)

# Testing loading the Random Forest Model
In this section, we will see how to use the model by loading it from disk. We will also create a function that takes as input a home team, a home pitcher, an away team, and an away pitcher, and outputs the probability of each team winning. We will use the latest data for all input values.

## 2020 Data
Here, we simply take the last rolling sum for each pitcher and each team; this is what we will use on the website.

In [24]:
clean_dir(DATA_2020)  # Start from an empty output directory

# Last row of every team's rolling-sum file
# (presumably the most recent window - assumes the files are in date order)
recent_team_dfs = [
    pd.read_csv(os.path.join(TEAM_ROLLING_SUMS_DIR, teamfile)).tail(1).reset_index(drop=True)
    for teamfile in os.listdir(TEAM_ROLLING_SUMS_DIR)
]

# Last row of every pitcher's rolling-sum file
recent_pitcher_dfs = [
    pd.read_csv(os.path.join(PITCHING_ROLLING_SUMS_DIR, player)).tail(1).reset_index(drop=True)
    for player in os.listdir(PITCHING_ROLLING_SUMS_DIR)
]

# One table per kind, ordered by its identifier
recent_teams = pd.concat(recent_team_dfs, ignore_index=True).sort_values(by=['Team'])
recent_pitchers = pd.concat(recent_pitcher_dfs, ignore_index=True).sort_values(by=['ID'])

# (Re)create the sub directories the tables are written to
for latest_dir in (LATEST_TEAM_DIR, LATEST_PITCHER_DIR):
    clean_dir(latest_dir)

# Save the dataframes
recent_teams.to_csv(os.path.join(LATEST_TEAM_DIR, 'teams.csv'), index=False)
recent_pitchers.to_csv(os.path.join(LATEST_PITCHER_DIR, 'pitchers.csv'), index=False)

## Make a Prediction using 2020 Data

In [25]:
def make_prediction(home_team, away_team, home_pitcher, away_pitcher):
    """Predict the outcome of a game from the latest saved team and pitcher stats.

    Parameters:
        - home_team: String of the home team's identifying abbreviation
        - away_team: String of the away team's identifying abbreviation
        - home_pitcher: String of home pitcher's player id
        - away_pitcher: String of away pitcher's player id

    Returns:
        The first row of the classifier's predict_proba output: one probability
        per class, in the order of the classifier's classes. With the 0/1
        'Winner' label used for training this is
        [P(home team does not win), P(home team wins)].

    Raises:
        ValueError: if a team or pitcher is not present in the saved stats.
    """

    # NOTE: unpickling can execute arbitrary code - only load model files that
    # were written by this notebook. The context manager closes the handle.
    with open(os.path.join(MODEL_SAVE_DIR, 'rfcl.sav'), 'rb') as model_file:
        clf = pickle.load(model_file)

    teams = pd.read_csv(os.path.join(LATEST_TEAM_DIR, 'teams.csv'))
    pitchers = pd.read_csv(os.path.join(LATEST_PITCHER_DIR, 'pitchers.csv'))

    def _stats_for(table, key_column, key, non_feature_columns, description):
        """Return the feature columns of the row(s) matching `key`, indexed from 0."""
        rows = table.loc[table[key_column] == key]
        if rows.empty:
            # Fail early with a clear message instead of passing an empty
            # frame on to the classifier
            raise ValueError(f"Unknown {description}: {key!r}")
        # Reset the index so home and away rows align when subtracted
        return rows.drop(columns=non_feature_columns).reset_index(drop=True)

    # Columns to drop for the team stats
    team_dropping_columns = ['Date', 'Number of game', 'Team']

    home_team_df = _stats_for(teams, 'Team', home_team, team_dropping_columns, 'home team')
    away_team_df = _stats_for(teams, 'Team', away_team, team_dropping_columns, 'away team')

    # Get home team - away team
    team_differences_df = home_team_df - away_team_df

    # Columns to drop for the pitcher stats
    pitcher_dropping_columns = ['Game ID', 'Date', 'Game Number', 'ID']

    home_pitcher_df = _stats_for(pitchers, 'ID', home_pitcher, pitcher_dropping_columns, 'home pitcher')
    away_pitcher_df = _stats_for(pitchers, 'ID', away_pitcher, pitcher_dropping_columns, 'away pitcher')

    # Get home pitcher - away pitcher
    pitcher_differences_df = home_pitcher_df - away_pitcher_df

    # Datapoint as input for the model: team differences, then pitcher differences
    datapoint = pd.concat([team_differences_df, pitcher_differences_df], axis=1)

    # The model was trained on columns named 'Difference <stat>'; use the same
    # names here so scikit-learn versions that validate feature names accept the
    # input (older versions ignore column names, so this is harmless there).
    datapoint.columns = ['Difference ' + str(column) for column in datapoint.columns]

    # Return the prediction
    return clf.predict_proba(datapoint)[0]

In [26]:
# Example: NYA at home against BOS, with coleg001 starting for the home side and
# eovan001 for the visitors. With the 0/1 'Winner' label the second value should
# be the home team's win probability (class order follows clf.classes_).
make_prediction('NYA', 'BOS', 'coleg001', 'eovan001')

array([0.41235047, 0.58764953])