# Web scraping mlb.com for Today's games and starting pitchers
----

# Remember to change the date on the dataframe!

In [1]:
# Imports
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import requests
import pickle

from sklearn import metrics
np.random.seed(42)

In [2]:
url = 'https://www.mlb.com/probable-pitchers'

res = requests.get(url)

print(res.status_code)

soup = BeautifulSoup(res.content)

200


In [3]:
pp_game = soup.find_all(class_ = 'probable-pitchers__game')
teams = []
for row in pp_game:
    team = {}
    team['away_team'] = row.find(class_ = 'probable-pitchers__team-name probable-pitchers__team-name--away').text.strip('\n').strip()
    team['home_team'] = row.find(class_ = 'probable-pitchers__team-name probable-pitchers__team-name--home').text.strip('\n').strip()
    teams.append(team)

teams = pd.DataFrame(teams)
teams.head()

Unnamed: 0,away_team,home_team
0,Phillies,Reds
1,Cubs,Nationals
2,Royals,Twins
3,Orioles,Blue Jays
4,Mariners,Angels


In [4]:
pp_pitchers = soup.find_all(class_ = 'probable-pitchers__pitchers')
pitchers = []
for row in pp_pitchers:
    pitcher = {}
    pitcher['away_pitcher'] = row.find(class_ = 'probable-pitchers__pitcher-name').find_next().text
    pitcher['home_pitcher'] = row.find(class_ = 'probable-pitchers__pitcher-name').find_next().find_next('a').text
    pitchers.append(pitcher)
    
pitchers = pd.DataFrame(pitchers)
pitchers.head()

Unnamed: 0,away_pitcher,home_pitcher
0,Ranger Suarez,Nick Lodolo
1,Drew Smyly,Cory Abbott
2,Daniel Lynch,Tyler Mahle
3,Austin Voth,Ross Stripling
4,George Kirby,Touki Toussaint


In [5]:
todays_games = teams.join(pitchers)
todays_games.head()

Unnamed: 0,away_team,home_team,away_pitcher,home_pitcher
0,Phillies,Reds,Ranger Suarez,Nick Lodolo
1,Cubs,Nationals,Drew Smyly,Cory Abbott
2,Royals,Twins,Daniel Lynch,Tyler Mahle
3,Orioles,Blue Jays,Austin Voth,Ross Stripling
4,Mariners,Angels,George Kirby,Touki Toussaint


In [6]:
todays_games.tail()

Unnamed: 0,away_team,home_team,away_pitcher,home_pitcher
10,Rockies,Cardinals,German Marquez,Jordan Montgomery
11,Athletics,Rangers,Adam Oller,Cole Ragans
12,Astros,White Sox,Framber Valdez,Michael Kopech
13,Dodgers,Brewers,Tony Gonsolin,Eric Lauer
14,D-backs,Giants,Zach Davies,Carlos Rodon


In [7]:
doub_games = pd.DataFrame(np.repeat(todays_games.values, 2, axis=0))
doub_games.head()

Unnamed: 0,0,1,2,3
0,Phillies,Reds,Ranger Suarez,Nick Lodolo
1,Phillies,Reds,Ranger Suarez,Nick Lodolo
2,Cubs,Nationals,Drew Smyly,Cory Abbott
3,Cubs,Nationals,Drew Smyly,Cory Abbott
4,Royals,Twins,Daniel Lynch,Tyler Mahle


In [8]:
doub_games.columns = ['away', 'home', 'away_pitcher', 'home_pitcher']

In [9]:
doub_games.head()

Unnamed: 0,away,home,away_pitcher,home_pitcher
0,Phillies,Reds,Ranger Suarez,Nick Lodolo
1,Phillies,Reds,Ranger Suarez,Nick Lodolo
2,Cubs,Nationals,Drew Smyly,Cory Abbott
3,Cubs,Nationals,Drew Smyly,Cory Abbott
4,Royals,Twins,Daniel Lynch,Tyler Mahle


In [10]:
doub_games['opp_pitcher'] = [doub_games['away_pitcher'][i] if i%2 == 0 else doub_games['home_pitcher'][i] for i in range(len(doub_games))]
doub_games['team'] = [doub_games['home'][i] if i%2 == 0 else doub_games['away'][i] for i in range(len(doub_games))]
doub_games['Home'] = [1 if i%2 == 0 else 0 for i in range(len(doub_games))]

In [11]:
doub_games.head()

Unnamed: 0,away,home,away_pitcher,home_pitcher,opp_pitcher,team,Home
0,Phillies,Reds,Ranger Suarez,Nick Lodolo,Ranger Suarez,Reds,1
1,Phillies,Reds,Ranger Suarez,Nick Lodolo,Nick Lodolo,Phillies,0
2,Cubs,Nationals,Drew Smyly,Cory Abbott,Drew Smyly,Nationals,1
3,Cubs,Nationals,Drew Smyly,Cory Abbott,Cory Abbott,Cubs,0
4,Royals,Twins,Daniel Lynch,Tyler Mahle,Daniel Lynch,Twins,1


In [12]:
games = doub_games.drop(columns = ['away', 'home', 'away_pitcher', 'home_pitcher'])
games.head()

Unnamed: 0,opp_pitcher,team,Home
0,Ranger Suarez,Reds,1
1,Nick Lodolo,Phillies,0
2,Drew Smyly,Nationals,1
3,Cory Abbott,Cubs,0
4,Daniel Lynch,Twins,1


In [13]:
games

Unnamed: 0,opp_pitcher,team,Home
0,Ranger Suarez,Reds,1
1,Nick Lodolo,Phillies,0
2,Drew Smyly,Nationals,1
3,Cory Abbott,Cubs,0
4,Daniel Lynch,Twins,1
5,Tyler Mahle,Royals,0
6,Austin Voth,Blue Jays,1
7,Ross Stripling,Orioles,0
8,George Kirby,Angels,1
9,Touki Toussaint,Mariners,0


In [14]:
 tri_code_dict = {'D-backs':'ARI', 'Braves':'ATL', 'Orioles':'BAL', 'Red Sox':'BOS', 'Cubs':'CHC', 'White Sox':'CHW', 'Reds':'CIN', 'Guardians':'CLE', 'Rockies':'COL', 'Tigers':'DET', 'Marlins':'MIA', 'Astros':'HOU', 'Royals':'KCR', 'Angels':'LAA', 'Dodgers':'LAD', 'Brewers':'MIL', 'Twins':'MIN', 'Mets':'NYM', 'Yankees':'NYY', 'Athletics':'OAK', 'Phillies':'PHI', 'Pirates':'PIT', 'Padres':'SDP', 'Giants':'SFG', 'Mariners':'SEA', 'Cardinals':'STL', 'Rays':'TBR', 'Rangers':'TEX', 'Blue Jays':'TOR', 'Nationals':'WSN'}

In [15]:
games['team'] = games['team'].map(tri_code_dict)
games.head()

Unnamed: 0,opp_pitcher,team,Home
0,Ranger Suarez,CIN,1
1,Nick Lodolo,PHI,0
2,Drew Smyly,WSN,1
3,Cory Abbott,CHC,0
4,Daniel Lynch,MIN,1


In [16]:
pitching = pd.read_csv('../data/pitching_stats.csv')
batting = pd.read_csv('../data/team_batting_stats.csv')

In [17]:
pitching.head()

Unnamed: 0,Name,ERA,K/BB,HR/9,WHIP,xFIP,HardHit%_P,abbrev_name
0,Carlos Rodon,2.95,4.31,0.6,1.06,3.09,0.402,C.Rodon
1,Sandy Alcantara,2.01,3.54,0.43,0.95,3.37,0.385,S.Alcantara
2,Kevin Gausman,3.16,6.76,0.51,1.27,2.84,0.388,K.Gausman
3,Max Fried,2.6,5.08,0.39,1.08,3.11,0.328,M.Fried
4,Aaron Nola,3.07,7.86,0.88,0.93,2.98,0.313,A.Nola


In [18]:
batting.head()

Unnamed: 0,Team,wOBA,wRC+,OBP+,Barrel%,HardHit%,H,HR,OPS,BA
0,LAD,0.341,122,107,0.097,0.42,8.902655,1.327434,0.788,0.261
1,NYY,0.333,119,106,0.108,0.427,8.13913,1.669565,0.766,0.242
2,TOR,0.33,113,105,0.089,0.443,8.982301,1.283186,0.759,0.263
3,ATL,0.329,108,100,0.112,0.433,8.568966,1.525862,0.759,0.251
4,HOU,0.324,114,103,0.084,0.395,8.103448,1.37069,0.743,0.244


In [19]:
df_1 = games.merge(pitching, how = 'left', left_on = 'opp_pitcher', right_on = 'Name')

In [20]:
df_1.head()

Unnamed: 0,opp_pitcher,team,Home,Name,ERA,K/BB,HR/9,WHIP,xFIP,HardHit%_P,abbrev_name
0,Ranger Suarez,CIN,1,Ranger Suarez,3.52,2.28,0.84,1.32,3.75,0.309,R.Suarez
1,Nick Lodolo,PHI,0,Nick Lodolo,4.72,2.67,1.32,1.66,3.84,0.339,N.Lodolo
2,Drew Smyly,WSN,1,Drew Smyly,3.69,3.59,1.4,1.29,4.12,0.315,D.Smyly
3,Cory Abbott,CHC,0,Cory Abbott,5.94,1.45,2.7,1.56,6.47,0.3,C.Abbott
4,Daniel Lynch,MIN,1,Daniel Lynch,4.52,2.41,1.13,1.51,4.33,0.466,D.Lynch


In [21]:
df_2 = df_1.merge(batting, how = 'left', left_on = 'team', right_on = 'Team')
df_2.head()

Unnamed: 0,opp_pitcher,team,Home,Name,ERA,K/BB,HR/9,WHIP,xFIP,HardHit%_P,...,Team,wOBA,wRC+,OBP+,Barrel%,HardHit%,H,HR,OPS,BA
0,Ranger Suarez,CIN,1,Ranger Suarez,3.52,2.28,0.84,1.32,3.75,0.309,...,CIN,0.303,88,98,0.058,0.356,7.929204,0.929204,0.685,0.24
1,Nick Lodolo,PHI,0,Nick Lodolo,4.72,2.67,1.32,1.66,3.84,0.339,...,PHI,0.318,102,99,0.087,0.407,8.421053,1.298246,0.73,0.249
2,Drew Smyly,WSN,1,Drew Smyly,3.69,3.59,1.4,1.29,4.12,0.315,...,WSN,0.304,92,100,0.062,0.351,8.275862,0.818966,0.689,0.248
3,Cory Abbott,CHC,0,Cory Abbott,5.94,1.45,2.7,1.56,6.47,0.3,...,CHC,0.313,99,101,0.068,0.378,8.283186,1.00885,0.713,0.245
4,Daniel Lynch,MIN,1,Daniel Lynch,4.52,2.41,1.13,1.51,4.33,0.466,...,MIN,0.321,110,104,0.097,0.42,8.460177,1.238938,0.735,0.25


In [22]:
X = df_2[['Home', 'H', 'HR', 'BA', 'OPS', 'ERA', 'K/BB', 'HR/9', 'WHIP', 'xFIP', 'HardHit%_P', 'wOBA', 'wRC+', 'OBP+', 'Barrel%', 'HardHit%']]

In [23]:
X['ERA'].fillna(pitching['ERA'].mean(), inplace = True)
X['K/BB'].fillna(pitching['K/BB'].mean(), inplace = True)
X['HR/9'].fillna(pitching['HR/9'].mean(), inplace = True)
X['WHIP'].fillna(pitching['WHIP'].mean(), inplace = True)
X['xFIP'].fillna(pitching['xFIP'].mean(), inplace = True)
X['HardHit%_P'].fillna(pitching['HardHit%_P'].mean(), inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['ERA'].fillna(pitching['ERA'].mean(), inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['K/BB'].fillna(pitching['K/BB'].mean(), inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['HR/9'].fillna(pitching['HR/9'].mean(), inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-cop

In [24]:
with open('../models/lr.pkl', 'rb') as f:
    model = pickle.load(f)

In [25]:
predictions = model.predict(X)

In [26]:
predictions

array([3.89420796, 4.59419722, 3.64213239, 4.44559096, 4.39434342,
       4.26007861, 4.95468135, 4.1276219 , 3.19652051, 3.93950955,
       3.76501783, 4.23235356, 3.89094105, 4.04370461, 4.8786666 ,
       3.91015475, 4.5718601 , 3.20773384, 4.23778295, 4.92714625,
       4.8459682 , 4.50073035, 4.81569093, 3.31724781, 4.22457543,
       4.70405562, 4.35025095, 4.93127796, 4.43591169, 4.07836223])

In [27]:
preds = pd.DataFrame([df_2['team'], predictions]).T

In [28]:
preds.rename(columns = {'Unnamed 0': 'runs_scored'}, inplace = True)

In [29]:
preds

Unnamed: 0,team,runs_scored
0,CIN,3.894208
1,PHI,4.594197
2,WSN,3.642132
3,CHC,4.445591
4,MIN,4.394343
5,KCR,4.260079
6,TOR,4.954681
7,BAL,4.127622
8,LAA,3.196521
9,SEA,3.93951


In [30]:
preds['wp'] = [preds['runs_scored'][i]**2 / (preds['runs_scored'][i]**2 + preds['runs_scored'][i + 1]**2) if i%2 == 0 else preds['runs_scored'][i]**2 / (preds['runs_scored'][i]**2 + preds['runs_scored'][i - 1]**2) for i in range(len(preds))]


In [31]:
preds

Unnamed: 0,team,runs_scored,wp
0,CIN,3.894208,0.418093
1,PHI,4.594197,0.581907
2,WSN,3.642132,0.401628
3,CHC,4.445591,0.598372
4,MIN,4.394343,0.51551
5,KCR,4.260079,0.48449
6,TOR,4.954681,0.590314
7,BAL,4.127622,0.409686
8,LAA,3.196521,0.396999
9,SEA,3.93951,0.603001


In [32]:
def wp_to_ml(wp):
    if wp > .5:
        ml = (wp / (1 - wp))*-100
    else:
        ml = ((1 - wp) / wp) * 100
    return ml

In [33]:
preds['ml'] = preds['wp'].apply(wp_to_ml)

In [34]:
preds.to_csv('../data/preds_aug_17th.csv', index = False)