# MLB Betting Model Data Collection

In [1]:
import pybaseball as pb
import pandas as pd
import numpy as np
import requests
import json
import odds

## Importing data from FanGraphs using the pybaseball API

In [2]:
def game_logs(season, teams=None):
    """Collect per-game batting logs for a set of teams in one season.

    Parameters
    ----------
    season : int
        Season passed straight through to ``pb.team_game_logs``.
    teams : iterable of str, optional
        Team abbreviations to pull. When omitted, the 30 abbreviations
        listed below are used, which matches the previous behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per team-game with the columns 'Date', 'Home', 'Opp', 'R',
        'H', 'HR', 'BA', 'OPS', 'OppStart' and an added 'Team' column.

    Raises
    ------
    ValueError
        If ``teams`` is given but empty.

    Notes
    -----
    A null-count summary and the first rows of the result are printed as a
    quick sanity check before the frame is returned.
    """
    if teams is None:
        teams = ['ARI', 'ATL', 'BAL', 'BOS', 'CHC', 'CHW', 'CIN', 'CLE', 'COL', 'DET',
                 'MIA', 'HOU', 'KCR', 'LAA', 'LAD', 'MIL', 'MIN', 'NYM', 'NYY', 'OAK',
                 'PHI', 'PIT', 'SDP', 'SFG', 'SEA', 'STL', 'TBR', 'TEX', 'TOR', 'WSN']
    teams = list(teams)
    if not teams:
        # pd.concat would fail with a less helpful message on an empty list.
        raise ValueError('teams must contain at least one team abbreviation')

    wanted_columns = ['Date', 'Home', 'Opp', 'R', 'H', 'HR', 'BA', 'OPS', 'OppStart']
    frames = []
    for team in teams:
        batting_logs = pb.team_game_logs(season, team)  # One request per team
        # .assign returns a new frame, so the fetched logs are never mutated.
        frames.append(batting_logs[wanted_columns].assign(Team=team))

    season_logs = pd.concat(frames, ignore_index=True)
    print('Nulls:')
    print(season_logs.isnull().sum())
    print('='*100)
    print(season_logs.head())
    return season_logs

In [3]:
# Game logs for every team in the 2022 season.
df = game_logs(season=2022)

Nulls:
Date        0
Home        0
Opp         0
R           0
H           0
HR          0
BA          0
OPS         0
OppStart    0
Team        0
dtype: int64
     Date   Home  Opp  R  H  HR     BA    OPS        OppStart Team
0   Apr 7  False  SDP  4  3   1  0.115  0.554   Y.Darvish(71)  ARI
1   Apr 8  False  SDP  0  2   0  0.091  0.364    S.Manaea(83)  ARI
2   Apr 9  False  SDP  2  5   1  0.116  0.426  J.Musgrove(62)  ARI
3  Apr 10  False  SDP  5  5   2  0.129  0.516  N.Crismatt(58)  ARI
4  Apr 12  False  HOU  1  4   1  0.130  0.518    L.Garcia(57)  ARI


In [4]:
# Player-level pitching stats for 2022 (qual=0 is forwarded to pybaseball).
pitching = pb.pitching_stats(2022, qual=0)

In [5]:
# Preview the first five pitcher rows.
pitching.head(n=5)

Unnamed: 0,IDfg,Season,Name,Team,Age,W,L,WAR,ERA,G,...,LA,Barrels,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xERA
188,16137,2022,Carlos Rodon,SFG,29,11,6,4.4,2.95,23,...,17.9,19,0.057,112.5,133,0.402,331,0.165,0.303,2.76
82,18684,2022,Sandy Alcantara,MIA,26,10,5,4.4,2.01,23,...,4.4,22,0.05,114.8,171,0.385,444,0.153,0.27,2.76
228,14107,2022,Kevin Gausman,TOR,31,8,9,4.4,3.16,22,...,11.6,28,0.08,116.6,135,0.388,348,0.138,0.292,3.54
138,13743,2022,Max Fried,ATL,28,10,4,4.3,2.6,22,...,8.3,14,0.035,112.7,131,0.328,400,0.157,0.272,2.87
209,16149,2022,Aaron Nola,PHI,29,8,9,4.3,3.07,23,...,12.6,31,0.078,112.0,125,0.313,399,0.195,0.311,2.76


In [6]:
# List every available pitching column as a NumPy array.
pitching.columns.to_numpy()

array(['IDfg', 'Season', 'Name', 'Team', 'Age', 'W', 'L', 'WAR', 'ERA',
       'G', 'GS', 'CG', 'ShO', 'SV', 'BS', 'IP', 'TBF', 'H', 'R', 'ER',
       'HR', 'BB', 'IBB', 'HBP', 'WP', 'BK', 'SO', 'GB', 'FB', 'LD',
       'IFFB', 'Balls', 'Strikes', 'Pitches', 'RS', 'IFH', 'BU', 'BUH',
       'K/9', 'BB/9', 'K/BB', 'H/9', 'HR/9', 'AVG', 'WHIP', 'BABIP',
       'LOB%', 'FIP', 'GB/FB', 'LD%', 'GB%', 'FB%', 'IFFB%', 'HR/FB',
       'IFH%', 'BUH%', 'Starting', 'Start-IP', 'Relieving', 'Relief-IP',
       'RAR', 'Dollars', 'tERA', 'xFIP', 'WPA', '-WPA', '+WPA', 'RE24',
       'REW', 'pLI', 'inLI', 'gmLI', 'exLI', 'Pulls', 'WPA/LI', 'Clutch',
       'FB% 2', 'FBv', 'SL%', 'SLv', 'CT%', 'CTv', 'CB%', 'CBv', 'CH%',
       'CHv', 'SF%', 'SFv', 'KN%', 'KNv', 'XX%', 'PO%', 'wFB', 'wSL',
       'wCT', 'wCB', 'wCH', 'wSF', 'wKN', 'wFB/C', 'wSL/C', 'wCT/C',
       'wCB/C', 'wCH/C', 'wSF/C', 'wKN/C', 'O-Swing%', 'Z-Swing%',
       'Swing%', 'O-Contact%', 'Z-Contact%', 'Contact%', 'Zone%',
       'F-Str

In [7]:
# Keep only the pitcher name and the rate stats used later in the notebook.
pitchers = pitching.loc[
    :, ['Name', 'ERA', 'K/BB', 'HR/9', 'WHIP', 'xFIP', 'HardHit%']
].copy()

In [8]:
def name_abbrev(name):
    """Abbreviate a full name to ``first_initial.second_word`` form.

    Examples: ``'Carlos Rodon'`` becomes ``'C.Rodon'``.

    Parameters
    ----------
    name : str
        Full name; words may be separated by any amount of whitespace.

    Returns
    -------
    str
        The first character of the first word, a dot, and the second word.
        A name with fewer than two words cannot be abbreviated and is
        returned stripped of surrounding whitespace instead of raising.
    """
    # split() with no argument collapses repeated/leading/trailing whitespace,
    # so 'Max  Fried' no longer yields an empty second word.
    parts = name.split()
    if len(parts) < 2:
        return name.strip()
    # NOTE(review): only the second word is used, so names with three or more
    # words (e.g. suffixes or compound surnames) keep their previous result;
    # confirm this matches how the game-log source abbreviates such names.
    return '.'.join([parts[0][0], parts[1]])

In [9]:
# Add the abbreviated name, mark the hard-hit rate as the pitcher's own
# ('_P' suffix), and keep a single row per full name.
pitchers = (
    pitchers
    .assign(abbrev_name=pitchers['Name'].apply(name_abbrev))
    .rename(columns={'HardHit%': 'HardHit%_P'})
    .drop_duplicates(subset='Name')
)
pitchers.head()

Unnamed: 0,Name,ERA,K/BB,HR/9,WHIP,xFIP,HardHit%_P,abbrev_name
188,Carlos Rodon,2.95,4.31,0.6,1.06,3.09,0.402,C.Rodon
82,Sandy Alcantara,2.01,3.54,0.43,0.95,3.37,0.385,S.Alcantara
228,Kevin Gausman,3.16,6.76,0.51,1.27,2.84,0.388,K.Gausman
138,Max Fried,2.6,5.08,0.39,1.08,3.11,0.328,M.Fried
209,Aaron Nola,3.07,7.86,0.88,0.93,2.98,0.313,A.Nola


In [10]:
# Persist the trimmed pitcher table without the index column.
pitchers.to_csv(path_or_buf='../data/pitching_stats.csv', index=False)

In [11]:
def remove_parenth(name):
    """Remove every parenthesised group, including the parentheses, from a string.

    Example: ``'Y.Darvish(71)'`` becomes ``'Y.Darvish'``.

    Parameters
    ----------
    name : str
        Text that may contain one or more ``(...)`` groups.

    Returns
    -------
    str
        ``name`` with each ``(...)`` group deleted. The match is non-greedy,
        so separate groups are removed individually; text without a matching
        pair of parentheses is returned unchanged.
    """
    import re
    # Raw string: '\(' in a normal string literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    return re.sub(r'\(.*?\)', '', name)

In [12]:
# Strip the parenthesised suffix from each opposing starter to get the join key.
df['abbrev_name'] = df['OppStart'].map(remove_parenth)
df.head(n=5)

Unnamed: 0,Date,Home,Opp,R,H,HR,BA,OPS,OppStart,Team,abbrev_name
0,Apr 7,False,SDP,4,3,1,0.115,0.554,Y.Darvish(71),ARI,Y.Darvish
1,Apr 8,False,SDP,0,2,0,0.091,0.364,S.Manaea(83),ARI,S.Manaea
2,Apr 9,False,SDP,2,5,1,0.116,0.426,J.Musgrove(62),ARI,J.Musgrove
3,Apr 10,False,SDP,5,5,2,0.129,0.516,N.Crismatt(58),ARI,N.Crismatt
4,Apr 12,False,HOU,1,4,1,0.13,0.518,L.Garcia(57),ARI,L.Garcia


In [13]:
# Team-level batting stats for 2022.
batting = pb.team_batting(2022)
batting.head(n=5)

Unnamed: 0,teamIDfg,Season,Team,Age,G,AB,PA,H,1B,2B,...,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA
0,22,2022,LAD,29,1621,3851,4354,1006,597,237,...,0.097,112.5,1245,0.42,2961,0.16,0.263,,,
1,9,2022,NYY,29,1662,3861,4407,936,572,166,...,0.108,119.8,1249,0.427,2927,0.178,0.285,,,
2,14,2022,TOR,29,1692,3858,4264,1015,652,213,...,0.089,117.9,1337,0.443,3018,0.163,0.27,,,
3,16,2022,ATL,29,1600,3961,4366,994,591,217,...,0.112,116.8,1258,0.433,2908,0.144,0.276,,,
4,21,2022,HOU,29,1614,3858,4329,940,569,201,...,0.084,117.4,1203,0.395,3046,0.158,0.26,,,


In [14]:
# Restrict the team batting table to the columns used downstream.
batters = batting.loc[
    :, ['Team', 'wOBA', 'wRC+', 'OBP+', 'Barrel%', 'HardHit%']
].copy()
batters.head(n=5)

Unnamed: 0,Team,wOBA,wRC+,OBP+,Barrel%,HardHit%
0,LAD,0.341,122,107,0.097,0.42
1,NYY,0.333,119,106,0.108,0.427
2,TOR,0.33,113,105,0.089,0.443
3,ATL,0.329,108,100,0.112,0.433
4,HOU,0.324,114,103,0.084,0.395


In [15]:
# Attach season pitching stats to each game through the opposing starter's
# abbreviated name.
#
# Different pitchers can share one abbreviation (same first initial and second
# name word), and `pitchers` is only de-duplicated on the full 'Name'. Joining on
# a non-unique key would emit one copy of the game per matching pitcher, each
# carrying a different pitcher's stats. Ambiguous abbreviations are therefore
# excluded from the lookup (keep=False drops every member of a duplicate group),
# and validate='many_to_one' makes pandas raise if the key is still not unique.
# TODO: a team-aware join key would allow those games to be kept.
df_1 = df.merge(
    pitchers.drop_duplicates(subset='abbrev_name', keep=False),
    how='left',
    on='abbrev_name',
    validate='many_to_one',
)
# Rows with any missing value (including games whose starter found no
# unambiguous match) are removed.
df_1 = df_1.dropna()
df_1.head()

Unnamed: 0,Date,Home,Opp,R,H,HR,BA,OPS,OppStart,Team,abbrev_name,Name,ERA,K/BB,HR/9,WHIP,xFIP,HardHit%_P
0,Apr 7,False,SDP,4,3,1,0.115,0.554,Y.Darvish(71),ARI,Y.Darvish,Yu Darvish,3.4,5.22,1.03,1.01,3.55,0.381
1,Apr 8,False,SDP,0,2,0,0.091,0.364,S.Manaea(83),ARI,S.Manaea,Sean Manaea,4.76,2.77,1.44,1.33,3.93,0.411
2,Apr 9,False,SDP,2,5,1,0.116,0.426,J.Musgrove(62),ARI,J.Musgrove,Joe Musgrove,2.91,4.43,0.92,1.02,3.38,0.331
3,Apr 10,False,SDP,5,5,2,0.129,0.516,N.Crismatt(58),ARI,N.Crismatt,Nabil Crismatt,2.92,3.06,0.34,1.17,3.44,0.36
4,Apr 12,False,HOU,1,4,1,0.13,0.518,L.Garcia(57),ARI,L.Garcia,Luis Garcia,4.03,3.51,1.37,1.13,3.69,0.348


In [16]:
# Add each batting team's season stats to its game rows.
df_2 = pd.merge(df_1, batters, how='left', on='Team')
df_2.head(n=5)

Unnamed: 0,Date,Home,Opp,R,H,HR,BA,OPS,OppStart,Team,...,K/BB,HR/9,WHIP,xFIP,HardHit%_P,wOBA,wRC+,OBP+,Barrel%,HardHit%
0,Apr 7,False,SDP,4,3,1,0.115,0.554,Y.Darvish(71),ARI,...,5.22,1.03,1.01,3.55,0.381,0.305,92,97,0.071,0.36
1,Apr 8,False,SDP,0,2,0,0.091,0.364,S.Manaea(83),ARI,...,2.77,1.44,1.33,3.93,0.411,0.305,92,97,0.071,0.36
2,Apr 9,False,SDP,2,5,1,0.116,0.426,J.Musgrove(62),ARI,...,4.43,0.92,1.02,3.38,0.331,0.305,92,97,0.071,0.36
3,Apr 10,False,SDP,5,5,2,0.129,0.516,N.Crismatt(58),ARI,...,3.06,0.34,1.17,3.44,0.36,0.305,92,97,0.071,0.36
4,Apr 12,False,HOU,1,4,1,0.13,0.518,L.Garcia(57),ARI,...,3.51,1.37,1.13,3.69,0.348,0.305,92,97,0.071,0.36


In [17]:
# The raw starter string and the pitcher's full name are no longer needed.
df_2 = df_2.drop(columns=['OppStart', 'Name'])
df_2.head(n=5)

Unnamed: 0,Date,Home,Opp,R,H,HR,BA,OPS,Team,abbrev_name,...,K/BB,HR/9,WHIP,xFIP,HardHit%_P,wOBA,wRC+,OBP+,Barrel%,HardHit%
0,Apr 7,False,SDP,4,3,1,0.115,0.554,ARI,Y.Darvish,...,5.22,1.03,1.01,3.55,0.381,0.305,92,97,0.071,0.36
1,Apr 8,False,SDP,0,2,0,0.091,0.364,ARI,S.Manaea,...,2.77,1.44,1.33,3.93,0.411,0.305,92,97,0.071,0.36
2,Apr 9,False,SDP,2,5,1,0.116,0.426,ARI,J.Musgrove,...,4.43,0.92,1.02,3.38,0.331,0.305,92,97,0.071,0.36
3,Apr 10,False,SDP,5,5,2,0.129,0.516,ARI,N.Crismatt,...,3.06,0.34,1.17,3.44,0.36,0.305,92,97,0.071,0.36
4,Apr 12,False,HOU,1,4,1,0.13,0.518,ARI,L.Garcia,...,3.51,1.37,1.13,3.69,0.348,0.305,92,97,0.071,0.36


In [18]:
# Write the merged game logs to disk for the modeling step.
df_2.to_csv(path_or_buf='../data/game_logs.csv', index=False)

# Creating a CSV of season team batting stats for use in prediction modeling
---

In [19]:
# Count game-log rows per team and shape the counts into a two-column frame
# ('team', 'gp') that can be merged with the batting table.
games_played = (
    df['Team']
    .value_counts()
    .rename_axis('team')
    .reset_index(name='gp')
)
games_played.head()

Unnamed: 0,team,gp
0,COL,117
1,SDP,117
2,WSN,116
3,BOS,116
4,SEA,116


In [20]:
# Select the team batting columns of interest and rename AVG to BA.
team_batting = batting[
    ['Team', 'wOBA', 'wRC+', 'OBP+', 'Barrel%', 'HardHit%', 'H', 'HR', 'OPS', 'AVG']
].rename(columns={'AVG': 'BA'})

# Join the games-played counts, turn H and HR into per-game values, then
# discard the helper columns brought in by the join.
team_batting_df = (
    team_batting
    .merge(games_played, left_on='Team', right_on='team')
    .assign(
        H=lambda merged: merged['H'] / merged['gp'],
        HR=lambda merged: merged['HR'] / merged['gp'],
    )
    .drop(columns=['team', 'gp'])
)
team_batting_df.head()

Unnamed: 0,Team,wOBA,wRC+,OBP+,Barrel%,HardHit%,H,HR,OPS,BA
0,LAD,0.341,122,107,0.097,0.42,8.902655,1.327434,0.788,0.261
1,NYY,0.333,119,106,0.108,0.427,8.13913,1.669565,0.766,0.242
2,TOR,0.33,113,105,0.089,0.443,8.982301,1.283186,0.759,0.263
3,ATL,0.329,108,100,0.112,0.433,8.568966,1.525862,0.759,0.251
4,HOU,0.324,114,103,0.084,0.395,8.103448,1.37069,0.743,0.244


In [21]:
# Save the per-team batting table for the prediction step.
team_batting_df.to_csv(path_or_buf='../data/team_batting_stats.csv', index=False)