# Web scraping mlb.com for Today's games and starting pitchers
----

# Remember to change the date on the dataframe!

In [1]:
# Imports
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import requests
import pickle

from sklearn import metrics
np.random.seed(42)

In [2]:
url = 'https://www.mlb.com/probable-pitchers'

res = requests.get(url)

print(res.status_code)

soup = BeautifulSoup(res.content)

200


In [3]:
pp_game = soup.find_all(class_ = 'probable-pitchers__game')
teams = []
for row in pp_game:
    team = {}
    team['away_team'] = row.find(class_ = 'probable-pitchers__team-name probable-pitchers__team-name--away').text.strip('\n').strip()
    team['home_team'] = row.find(class_ = 'probable-pitchers__team-name probable-pitchers__team-name--home').text.strip('\n').strip()
    teams.append(team)

teams = pd.DataFrame(teams)
teams.head()

Unnamed: 0,away_team,home_team
0,Nationals,Braves
1,Mets,Brewers
2,Red Sox,Reds
3,Astros,Rays
4,Cubs,Marlins


In [4]:
pp_pitchers = soup.find_all(class_ = 'probable-pitchers__pitchers')
pitchers = []
for row in pp_pitchers:
    pitcher = {}
    pitcher['away_pitcher'] = row.find(class_ = 'probable-pitchers__pitcher-name').find_next().text
    pitcher['home_pitcher'] = row.find(class_ = 'probable-pitchers__pitcher-name').find_next().find_next('a').text
    pitchers.append(pitcher)
    
pitchers = pd.DataFrame(pitchers)
pitchers.head()

Unnamed: 0,away_pitcher,home_pitcher
0,Paolo Espino,Bryce Elder
1,Taijuan Walker,Adrian Houser
2,Connor Seabold,Chase Anderson
3,Lance McCullers Jr.,Corey Kluber
4,Marcus Stroman,Jesus Luzardo


In [5]:
todays_games = teams.join(pitchers)
todays_games.head()

Unnamed: 0,away_team,home_team,away_pitcher,home_pitcher
0,Nationals,Braves,Paolo Espino,Bryce Elder
1,Mets,Brewers,Taijuan Walker,Adrian Houser
2,Red Sox,Reds,Connor Seabold,Chase Anderson
3,Astros,Rays,Lance McCullers Jr.,Corey Kluber
4,Cubs,Marlins,Marcus Stroman,Jesus Luzardo


In [6]:
todays_games.tail()

Unnamed: 0,away_team,home_team,away_pitcher,home_pitcher
10,Guardians,White Sox,Triston McKenzie,Lance Lynn
11,Giants,Rockies,Logan Webb,German Marquez
12,Mariners,Athletics,Robbie Ray,James Kaprielian
13,Cardinals,Padres,Miles Mikolas,Blake Snell
14,D-backs,Dodgers,Madison Bumgarner,Dustin May


In [7]:
doub_games = pd.DataFrame(np.repeat(todays_games.values, 2, axis=0))
doub_games.head()

Unnamed: 0,0,1,2,3
0,Nationals,Braves,Paolo Espino,Bryce Elder
1,Nationals,Braves,Paolo Espino,Bryce Elder
2,Mets,Brewers,Taijuan Walker,Adrian Houser
3,Mets,Brewers,Taijuan Walker,Adrian Houser
4,Red Sox,Reds,Connor Seabold,Chase Anderson


In [8]:
doub_games.columns = ['away', 'home', 'away_pitcher', 'home_pitcher']

In [9]:
doub_games.head()

Unnamed: 0,away,home,away_pitcher,home_pitcher
0,Nationals,Braves,Paolo Espino,Bryce Elder
1,Nationals,Braves,Paolo Espino,Bryce Elder
2,Mets,Brewers,Taijuan Walker,Adrian Houser
3,Mets,Brewers,Taijuan Walker,Adrian Houser
4,Red Sox,Reds,Connor Seabold,Chase Anderson


In [10]:
doub_games['opp_pitcher'] = [doub_games['away_pitcher'][i] if i%2 == 0 else doub_games['home_pitcher'][i] for i in range(len(doub_games))]
doub_games['team'] = [doub_games['home'][i] if i%2 == 0 else doub_games['away'][i] for i in range(len(doub_games))]
doub_games['Home'] = [1 if i%2 == 0 else 0 for i in range(len(doub_games))]

In [11]:
doub_games.head()

Unnamed: 0,away,home,away_pitcher,home_pitcher,opp_pitcher,team,Home
0,Nationals,Braves,Paolo Espino,Bryce Elder,Paolo Espino,Braves,1
1,Nationals,Braves,Paolo Espino,Bryce Elder,Bryce Elder,Nationals,0
2,Mets,Brewers,Taijuan Walker,Adrian Houser,Taijuan Walker,Brewers,1
3,Mets,Brewers,Taijuan Walker,Adrian Houser,Adrian Houser,Mets,0
4,Red Sox,Reds,Connor Seabold,Chase Anderson,Connor Seabold,Reds,1


In [12]:
games = doub_games.drop(columns = ['away', 'home', 'away_pitcher', 'home_pitcher'])
games.head()

Unnamed: 0,opp_pitcher,team,Home
0,Paolo Espino,Braves,1
1,Bryce Elder,Nationals,0
2,Taijuan Walker,Brewers,1
3,Adrian Houser,Mets,0
4,Connor Seabold,Reds,1


In [13]:
games

Unnamed: 0,opp_pitcher,team,Home
0,Paolo Espino,Braves,1
1,Bryce Elder,Nationals,0
2,Taijuan Walker,Brewers,1
3,Adrian Houser,Mets,0
4,Connor Seabold,Reds,1
5,Chase Anderson,Red Sox,0
6,Lance McCullers Jr.,Rays,1
7,Corey Kluber,Astros,0
8,Marcus Stroman,Marlins,1
9,Jesus Luzardo,Cubs,0


In [14]:
 tri_code_dict = {'D-backs':'ARI', 'Braves':'ATL', 'Orioles':'BAL', 'Red Sox':'BOS', 'Cubs':'CHC', 'White Sox':'CHW', 'Reds':'CIN', 'Guardians':'CLE', 'Rockies':'COL', 'Tigers':'DET', 'Marlins':'MIA', 'Astros':'HOU', 'Royals':'KCR', 'Angels':'LAA', 'Dodgers':'LAD', 'Brewers':'MIL', 'Twins':'MIN', 'Mets':'NYM', 'Yankees':'NYY', 'Athletics':'OAK', 'Phillies':'PHI', 'Pirates':'PIT', 'Padres':'SDP', 'Giants':'SFG', 'Mariners':'SEA', 'Cardinals':'STL', 'Rays':'TBR', 'Rangers':'TEX', 'Blue Jays':'TOR', 'Nationals':'WSN'}

In [15]:
games['team'] = games['team'].map(tri_code_dict)
games.head()

Unnamed: 0,opp_pitcher,team,Home
0,Paolo Espino,ATL,1
1,Bryce Elder,WSN,0
2,Taijuan Walker,MIL,1
3,Adrian Houser,NYM,0
4,Connor Seabold,CIN,1


In [16]:
pitching = pd.read_csv('../data/pitching_stats.csv')
batting = pd.read_csv('../data/team_batting_stats.csv')

In [17]:
pitching.head()

Unnamed: 0,Name,ERA,K/BB,HR/9,WHIP,xFIP,HardHit%_P,abbrev_name
0,Carlos Rodon,2.84,4.49,0.59,1.04,2.95,0.399,C.Rodon
1,Aaron Nola,3.38,8.08,0.87,0.98,2.84,0.319,A.Nola
2,Kevin Gausman,3.45,7.44,0.79,1.24,2.79,0.387,K.Gausman
3,Justin Verlander,1.78,6.04,0.69,0.83,3.27,0.349,J.Verlander
4,Sandy Alcantara,2.37,3.84,0.63,1.01,3.37,0.396,S.Alcantara


In [18]:
batting.head()

Unnamed: 0,Team,wOBA,wRC+,OBP+,Barrel%,HardHit%,H,HR,OPS,BA
0,LAD,0.341,122,107,0.094,0.416,8.896552,1.358621,0.788,0.26
1,ATL,0.33,110,101,0.109,0.436,8.609589,1.506849,0.762,0.253
2,TOR,0.328,115,105,0.085,0.443,8.918367,1.204082,0.753,0.261
3,STL,0.328,115,104,0.075,0.372,8.628378,1.236486,0.752,0.254
4,NYY,0.326,115,104,0.101,0.412,8.068493,1.582192,0.749,0.24


In [19]:
df_1 = games.merge(pitching, how = 'left', left_on = 'opp_pitcher', right_on = 'Name')

In [20]:
df_1.head()

Unnamed: 0,opp_pitcher,team,Home,Name,ERA,K/BB,HR/9,WHIP,xFIP,HardHit%_P,abbrev_name
0,Paolo Espino,ATL,1,Paolo Espino,4.24,4.25,1.64,1.32,4.05,0.392,P.Espino
1,Bryce Elder,WSN,0,Bryce Elder,3.67,1.58,0.79,1.37,4.71,0.412,B.Elder
2,Taijuan Walker,MIL,1,Taijuan Walker,3.42,2.78,0.82,1.18,4.0,0.406,T.Walker
3,Adrian Houser,NYM,0,Adrian Houser,4.85,1.53,0.76,1.49,4.47,0.401,A.Houser
4,Connor Seabold,CIN,1,Connor Seabold,11.91,2.6,3.18,2.56,5.16,0.419,C.Seabold


In [21]:
df_2 = df_1.merge(batting, how = 'left', left_on = 'team', right_on = 'Team')
df_2.head()

Unnamed: 0,opp_pitcher,team,Home,Name,ERA,K/BB,HR/9,WHIP,xFIP,HardHit%_P,...,Team,wOBA,wRC+,OBP+,Barrel%,HardHit%,H,HR,OPS,BA
0,Paolo Espino,ATL,1,Paolo Espino,4.24,4.25,1.64,1.32,4.05,0.392,...,ATL,0.33,110,101,0.109,0.436,8.609589,1.506849,0.762,0.253
1,Bryce Elder,WSN,0,Bryce Elder,3.67,1.58,0.79,1.37,4.71,0.412,...,WSN,0.307,95,100,0.06,0.354,8.445205,0.863014,0.697,0.251
2,Taijuan Walker,MIL,1,Taijuan Walker,3.42,2.78,0.82,1.18,4.0,0.406,...,MIL,0.318,104,101,0.088,0.399,7.876712,1.383562,0.728,0.235
3,Adrian Houser,NYM,0,Adrian Houser,4.85,1.53,0.76,1.49,4.47,0.401,...,NYM,0.324,114,105,0.07,0.371,8.722973,1.027027,0.74,0.258
4,Connor Seabold,CIN,1,Connor Seabold,11.91,2.6,3.18,2.56,5.16,0.419,...,CIN,0.303,87,98,0.058,0.351,7.979592,0.993197,0.687,0.239


In [22]:
X = df_2[['Home', 'H', 'HR', 'BA', 'OPS', 'ERA', 'K/BB', 'HR/9', 'WHIP', 'xFIP', 'HardHit%_P', 'wOBA', 'wRC+', 'OBP+', 'Barrel%', 'HardHit%']]

In [23]:
X['ERA'].fillna(pitching['ERA'].mean(), inplace = True)
X['K/BB'].fillna(pitching['K/BB'].mean(), inplace = True)
X['HR/9'].fillna(pitching['HR/9'].mean(), inplace = True)
X['WHIP'].fillna(pitching['WHIP'].mean(), inplace = True)
X['xFIP'].fillna(pitching['xFIP'].mean(), inplace = True)
X['HardHit%_P'].fillna(pitching['HardHit%_P'].mean(), inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['ERA'].fillna(pitching['ERA'].mean(), inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['K/BB'].fillna(pitching['K/BB'].mean(), inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['HR/9'].fillna(pitching['HR/9'].mean(), inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-cop

In [24]:
with open('../models/lr.pkl', 'rb') as f:
    model = pickle.load(f)

In [25]:
predictions = model.predict(X)

In [26]:
predictions

array([4.51350732, 4.32243202, 4.43349931, 5.05907712, 4.56866308,
       5.28205166, 4.01352984, 4.56561882, 3.59733496, 4.51879197,
       4.07362904, 4.82696821, 4.0461965 , 3.45629266, 4.52295244,
       3.55734247, 4.92212679, 4.21014872, 3.87940998, 4.44509155,
       4.26291286, 4.06073452, 4.27110233, 4.28380233, 3.17236729,
       4.56928942, 4.10190214, 4.9617242 , 5.08157722, 4.4328684 ])

In [27]:
preds = pd.DataFrame([df_2['team'], predictions]).T

In [28]:
preds.rename(columns = {'Unnamed 0': 'runs_scored'}, inplace = True)

In [29]:
preds

Unnamed: 0,team,runs_scored
0,ATL,4.513507
1,WSN,4.322432
2,MIL,4.433499
3,NYM,5.059077
4,CIN,4.568663
5,BOS,5.282052
6,TBR,4.01353
7,HOU,4.565619
8,MIA,3.597335
9,CHC,4.518792


In [30]:
preds['wp'] = [preds['runs_scored'][i]**2 / (preds['runs_scored'][i]**2 + preds['runs_scored'][i + 1]**2) if i%2 == 0 else preds['runs_scored'][i]**2 / (preds['runs_scored'][i]**2 + preds['runs_scored'][i - 1]**2) for i in range(len(preds))]


In [31]:
preds

Unnamed: 0,team,runs_scored,wp
0,ATL,4.513507,0.521615
1,WSN,4.322432,0.478385
2,MIL,4.433499,0.434383
3,NYM,5.059077,0.565617
4,CIN,4.568663,0.427958
5,BOS,5.282052,0.572042
6,TBR,4.01353,0.435913
7,HOU,4.565619,0.564087
8,MIA,3.597335,0.387911
9,CHC,4.518792,0.612089


In [32]:
def wp_to_ml(wp):
    if wp > .5:
        ml = (wp / (1 - wp))*-100
    else:
        ml = ((1 - wp) / wp) * 100
    return ml

In [33]:
preds['ml'] = preds['wp'].apply(wp_to_ml)

In [34]:
from datetime import datetime

date = datetime.now().strftime("%Y_%m_%d")

In [35]:
preds.to_csv('../data/preds_' + f'{date}' + '.csv', index = False)