# Web scraping mlb.com for Today's games and starting pitchers
----

# Remember to change the date on the dataframe!

In [1]:
# Imports
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import requests
import pickle

from sklearn import metrics
np.random.seed(42)

In [2]:
url = 'https://www.mlb.com/probable-pitchers'

res = requests.get(url)

print(res.status_code)

soup = BeautifulSoup(res.content)

200


In [3]:
pp_game = soup.find_all(class_ = 'probable-pitchers__game')
teams = []
for row in pp_game:
    team = {}
    team['away_team'] = row.find(class_ = 'probable-pitchers__team-name probable-pitchers__team-name--away').text.strip('\n').strip()
    team['home_team'] = row.find(class_ = 'probable-pitchers__team-name probable-pitchers__team-name--home').text.strip('\n').strip()
    teams.append(team)

teams = pd.DataFrame(teams)
teams.head()

Unnamed: 0,away_team,home_team
0,Phillies,Reds
1,Padres,Marlins
2,Cubs,Nationals
3,Red Sox,Pirates
4,Rays,Yankees


In [4]:
pp_pitchers = soup.find_all(class_ = 'probable-pitchers__pitchers')
pitchers = []
for row in pp_pitchers:
    pitcher = {}
    pitcher['away_pitcher'] = row.find(class_ = 'probable-pitchers__pitcher-name').find_next().text
    pitcher['home_pitcher'] = row.find(class_ = 'probable-pitchers__pitcher-name').find_next().find_next('a').text
    pitchers.append(pitcher)
    
pitchers = pd.DataFrame(pitchers)
pitchers.head()

Unnamed: 0,away_pitcher,home_pitcher
0,Kyle Gibson,T.J. Zeuch
1,Sean Manaea,Edward Cabrera
2,Justin Steele,Patrick Corbin
3,Nick Pivetta,Mitch Keller
4,Jeffrey Springs,Nestor Cortes


In [5]:
todays_games = teams.join(pitchers)
todays_games.head()

Unnamed: 0,away_team,home_team,away_pitcher,home_pitcher
0,Phillies,Reds,Kyle Gibson,T.J. Zeuch
1,Padres,Marlins,Sean Manaea,Edward Cabrera
2,Cubs,Nationals,Justin Steele,Patrick Corbin
3,Red Sox,Pirates,Nick Pivetta,Mitch Keller
4,Rays,Yankees,Jeffrey Springs,Nestor Cortes


In [6]:
todays_games.tail()

Unnamed: 0,away_team,home_team,away_pitcher,home_pitcher
10,Athletics,Rangers,JP Sears,Kohei Arihara
11,Astros,White Sox,Justin Verlander,Dylan Cease
12,Dodgers,Brewers,Ryan Pepiot,Brandon Woodruff
13,Mariners,Angels,Robbie Ray,Jose Suarez
14,D-backs,Giants,Merrill Kelly,Jakob Junis


In [7]:
doub_games = pd.DataFrame(np.repeat(todays_games.values, 2, axis=0))
doub_games.head()

Unnamed: 0,0,1,2,3
0,Phillies,Reds,Kyle Gibson,T.J. Zeuch
1,Phillies,Reds,Kyle Gibson,T.J. Zeuch
2,Padres,Marlins,Sean Manaea,Edward Cabrera
3,Padres,Marlins,Sean Manaea,Edward Cabrera
4,Cubs,Nationals,Justin Steele,Patrick Corbin


In [8]:
doub_games.columns = ['away', 'home', 'away_pitcher', 'home_pitcher']

In [9]:
doub_games.head()

Unnamed: 0,away,home,away_pitcher,home_pitcher
0,Phillies,Reds,Kyle Gibson,T.J. Zeuch
1,Phillies,Reds,Kyle Gibson,T.J. Zeuch
2,Padres,Marlins,Sean Manaea,Edward Cabrera
3,Padres,Marlins,Sean Manaea,Edward Cabrera
4,Cubs,Nationals,Justin Steele,Patrick Corbin


In [10]:
doub_games['opp_pitcher'] = [doub_games['away_pitcher'][i] if i%2 == 0 else doub_games['home_pitcher'][i] for i in range(len(doub_games))]
doub_games['team'] = [doub_games['home'][i] if i%2 == 0 else doub_games['away'][i] for i in range(len(doub_games))]
doub_games['Home'] = [1 if i%2 == 0 else 0 for i in range(len(doub_games))]

In [11]:
doub_games.head()

Unnamed: 0,away,home,away_pitcher,home_pitcher,opp_pitcher,team,Home
0,Phillies,Reds,Kyle Gibson,T.J. Zeuch,Kyle Gibson,Reds,1
1,Phillies,Reds,Kyle Gibson,T.J. Zeuch,T.J. Zeuch,Phillies,0
2,Padres,Marlins,Sean Manaea,Edward Cabrera,Sean Manaea,Marlins,1
3,Padres,Marlins,Sean Manaea,Edward Cabrera,Edward Cabrera,Padres,0
4,Cubs,Nationals,Justin Steele,Patrick Corbin,Justin Steele,Nationals,1


In [12]:
games = doub_games.drop(columns = ['away', 'home', 'away_pitcher', 'home_pitcher'])
games.head()

Unnamed: 0,opp_pitcher,team,Home
0,Kyle Gibson,Reds,1
1,T.J. Zeuch,Phillies,0
2,Sean Manaea,Marlins,1
3,Edward Cabrera,Padres,0
4,Justin Steele,Nationals,1


In [13]:
games

Unnamed: 0,opp_pitcher,team,Home
0,Kyle Gibson,Reds,1
1,T.J. Zeuch,Phillies,0
2,Sean Manaea,Marlins,1
3,Edward Cabrera,Padres,0
4,Justin Steele,Nationals,1
5,Patrick Corbin,Cubs,0
6,Nick Pivetta,Pirates,1
7,Mitch Keller,Red Sox,0
8,Jeffrey Springs,Yankees,1
9,Nestor Cortes,Rays,0


In [14]:
 tri_code_dict = {'D-backs':'ARI', 'Braves':'ATL', 'Orioles':'BAL', 'Red Sox':'BOS', 'Cubs':'CHC', 'White Sox':'CHW', 'Reds':'CIN', 'Guardians':'CLE', 'Rockies':'COL', 'Tigers':'DET', 'Marlins':'MIA', 'Astros':'HOU', 'Royals':'KCR', 'Angels':'LAA', 'Dodgers':'LAD', 'Brewers':'MIL', 'Twins':'MIN', 'Mets':'NYM', 'Yankees':'NYY', 'Athletics':'OAK', 'Phillies':'PHI', 'Pirates':'PIT', 'Padres':'SDP', 'Giants':'SFG', 'Mariners':'SEA', 'Cardinals':'STL', 'Rays':'TBR', 'Rangers':'TEX', 'Blue Jays':'TOR', 'Nationals':'WSN'}

In [15]:
games['team'] = games['team'].map(tri_code_dict)
games.head()

Unnamed: 0,opp_pitcher,team,Home
0,Kyle Gibson,CIN,1
1,T.J. Zeuch,PHI,0
2,Sean Manaea,MIA,1
3,Edward Cabrera,SDP,0
4,Justin Steele,WSN,1


In [16]:
pitching = pd.read_csv('../data/pitching_stats.csv')
batting = pd.read_csv('../data/team_batting_stats.csv')

In [17]:
pitching.head()

Unnamed: 0,Name,ERA,K/BB,HR/9,WHIP,xFIP,HardHit%_P,abbrev_name
0,Carlos Rodon,2.95,4.31,0.6,1.06,3.09,0.402,C.Rodon
1,Sandy Alcantara,2.01,3.54,0.43,0.95,3.37,0.385,S.Alcantara
2,Kevin Gausman,3.16,6.76,0.51,1.27,2.84,0.388,K.Gausman
3,Max Fried,2.6,5.08,0.39,1.08,3.11,0.328,M.Fried
4,Aaron Nola,3.07,7.86,0.88,0.93,2.98,0.313,A.Nola


In [18]:
batting.head()

Unnamed: 0,Team,wOBA,wRC+,OBP+,Barrel%,HardHit%,H,HR,OPS,BA
0,LAD,0.341,122,107,0.097,0.42,8.902655,1.327434,0.788,0.261
1,NYY,0.333,119,106,0.108,0.427,8.13913,1.669565,0.766,0.242
2,TOR,0.33,113,105,0.089,0.443,8.982301,1.283186,0.759,0.263
3,ATL,0.329,108,100,0.112,0.433,8.568966,1.525862,0.759,0.251
4,HOU,0.324,114,103,0.084,0.395,8.103448,1.37069,0.743,0.244


In [19]:
df_1 = games.merge(pitching, how = 'left', left_on = 'opp_pitcher', right_on = 'Name')

In [20]:
df_1.head()

Unnamed: 0,opp_pitcher,team,Home,Name,ERA,K/BB,HR/9,WHIP,xFIP,HardHit%_P,abbrev_name
0,Kyle Gibson,CIN,1,Kyle Gibson,4.29,2.68,1.11,1.18,4.25,0.358,K.Gibson
1,T.J. Zeuch,PHI,0,T.J. Zeuch,13.5,2.0,2.25,2.0,5.23,0.385,T.Zeuch
2,Sean Manaea,MIA,1,Sean Manaea,4.76,2.77,1.44,1.33,3.93,0.411,S.Manaea
3,Edward Cabrera,SDP,0,Edward Cabrera,2.05,2.07,0.68,1.03,3.98,0.262,E.Cabrera
4,Justin Steele,WSN,1,Justin Steele,3.63,2.47,0.62,1.41,3.55,0.319,J.Steele


In [21]:
df_2 = df_1.merge(batting, how = 'left', left_on = 'team', right_on = 'Team')
df_2.head()

Unnamed: 0,opp_pitcher,team,Home,Name,ERA,K/BB,HR/9,WHIP,xFIP,HardHit%_P,...,Team,wOBA,wRC+,OBP+,Barrel%,HardHit%,H,HR,OPS,BA
0,Kyle Gibson,CIN,1,Kyle Gibson,4.29,2.68,1.11,1.18,4.25,0.358,...,CIN,0.303,88,98,0.058,0.356,7.929204,0.929204,0.685,0.24
1,T.J. Zeuch,PHI,0,T.J. Zeuch,13.5,2.0,2.25,2.0,5.23,0.385,...,PHI,0.318,102,99,0.087,0.407,8.421053,1.298246,0.73,0.249
2,Sean Manaea,MIA,1,Sean Manaea,4.76,2.77,1.44,1.33,3.93,0.411,...,MIA,0.294,91,94,0.07,0.373,7.834783,0.904348,0.667,0.234
3,Edward Cabrera,SDP,0,Edward Cabrera,2.05,2.07,0.68,1.03,3.98,0.262,...,SDP,0.309,101,101,0.065,0.365,8.196581,0.897436,0.702,0.243
4,Justin Steele,WSN,1,Justin Steele,3.63,2.47,0.62,1.41,3.55,0.319,...,WSN,0.304,92,100,0.062,0.351,8.275862,0.818966,0.689,0.248


In [22]:
X = df_2[['Home', 'H', 'HR', 'BA', 'OPS', 'ERA', 'K/BB', 'HR/9', 'WHIP', 'xFIP', 'HardHit%_P', 'wOBA', 'wRC+', 'OBP+', 'Barrel%', 'HardHit%']]

In [23]:
X['ERA'].fillna(pitching['ERA'].mean(), inplace = True)
X['K/BB'].fillna(pitching['K/BB'].mean(), inplace = True)
X['HR/9'].fillna(pitching['HR/9'].mean(), inplace = True)
X['WHIP'].fillna(pitching['WHIP'].mean(), inplace = True)
X['xFIP'].fillna(pitching['xFIP'].mean(), inplace = True)
X['HardHit%_P'].fillna(pitching['HardHit%_P'].mean(), inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['ERA'].fillna(pitching['ERA'].mean(), inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['K/BB'].fillna(pitching['K/BB'].mean(), inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['HR/9'].fillna(pitching['HR/9'].mean(), inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-cop

In [24]:
with open('../models/lr.pkl', 'rb') as f:
    model = pickle.load(f)

In [25]:
predictions = model.predict(X)

In [26]:
predictions

array([4.15753402, 7.23231432, 3.93660828, 4.13410687, 3.91234232,
       4.74774719, 3.70038361, 4.58419068, 4.29332533, 3.80716202,
       4.70960494, 4.0145316 , 4.40526553, 3.25812164, 4.92226929,
       4.80871717, 4.48215428, 4.06452737, 5.03738248, 4.60303995,
       4.46342194, 3.67705141, 3.76519172, 4.35945794, 4.38246237,
       5.15408793, 3.71856391, 4.11353535, 4.2866479 , 4.1789819 ])

In [27]:
preds = pd.DataFrame([df_2['team'], predictions]).T

In [28]:
preds.rename(columns = {'Unnamed 0': 'runs_scored'}, inplace = True)

In [29]:
preds

Unnamed: 0,team,runs_scored
0,CIN,4.157534
1,PHI,7.232314
2,MIA,3.936608
3,SDP,4.134107
4,WSN,3.912342
5,CHC,4.747747
6,PIT,3.700384
7,BOS,4.584191
8,NYY,4.293325
9,TBR,3.807162


In [30]:
preds['wp'] = [preds['runs_scored'][i]**2 / (preds['runs_scored'][i]**2 + preds['runs_scored'][i + 1]**2) if i%2 == 0 else preds['runs_scored'][i]**2 / (preds['runs_scored'][i]**2 + preds['runs_scored'][i - 1]**2) for i in range(len(preds))]


In [31]:
preds

Unnamed: 0,team,runs_scored,wp
0,CIN,4.157534,0.248379
1,PHI,7.232314,0.751621
2,MIA,3.936608,0.475544
3,SDP,4.134107,0.524456
4,WSN,3.912342,0.404423
5,CHC,4.747747,0.595577
6,PIT,3.700384,0.394519
7,BOS,4.584191,0.605481
8,NYY,4.293325,0.559801
9,TBR,3.807162,0.440199


In [32]:
def wp_to_ml(wp):
    if wp > .5:
        ml = (wp / (1 - wp))*-100
    else:
        ml = ((1 - wp) / wp) * 100
    return ml

In [33]:
preds['ml'] = preds['wp'].apply(wp_to_ml)

In [34]:
preds.to_csv('../data/preds_aug_16th.csv', index = False)