In [1]:
from bs4 import BeautifulSoup as BS
import requests
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.support.ui import WebDriverWait
import pandas as pd
import numpy as np
import datetime
from time import sleep
import os

In [2]:
def get_matchups(browser, day):
    """Scrape one day's probable-pitcher matchups into a DataFrame.

    Parameters
    ----------
    browser : selenium WebDriver
        An already-started driver; it is navigated to the page for `day`.
    day : datetime.date (or anything whose ``str()`` the site accepts)
        Appended to the module-level ``url_base`` to build the page URL and
        stored verbatim in the ``date`` column.

    Returns
    -------
    pandas.DataFrame
        One row per matchup with columns ``p_away``, ``p_home``, ``matchup``,
        ``p_away_LHP``, ``p_home_LHP``, ``t_away``, ``t_home`` and ``date``.
        An empty frame with those columns is returned when the page contains
        an element with class ``no_games``.

    Notes
    -----
    Relies on the global ``url_base`` being defined before the call.
    """
    # Pull up the page for the requested day.
    browser.get(url_base + str(day))
    # Wait *before* snapshotting the DOM; sleeping after parsing (as the code
    # previously did) cannot influence what ends up in `page_source`.
    sleep(2)
    soup = BS(browser.page_source, "lxml")

    if soup.find(attrs={'class': "no_games"}):
        return pd.DataFrame(columns=['p_away', 'p_home', 'matchup',
                                     'p_away_LHP', 'p_home_LHP', 't_away',
                                     't_home', 'date'])

    # Matchups are stored in <h4>; the division names share that tag and
    # must be filtered out.
    bad_headers = ['AL East', 'AL Central', 'AL West',
                   'NL East', 'NL Central', 'NL West']
    matchups = [header.text for header in soup.find_all('h4')
                if header.text not in bad_headers]

    # Pitchers are stored in <h5>, alternating away/home in DOM order.
    # Split by *position*: looking each name up with list.index() returns the
    # first occurrence, which mis-assigns any repeated text (e.g. the same
    # placeholder appearing for several games).
    pitchers = [header.text for header in soup.find_all('h5')]
    pitchers_away = pitchers[0::2]
    pitchers_home = pitchers[1::2]

    # Combine pitchers and matchups row-wise, relying on DOM order.
    # zip() stops at the shortest of the three lists.
    zipped_list = list(zip(pitchers_away, pitchers_home, matchups))

    # Convert to a frame and split the non-atomic cells.
    df = pd.DataFrame(zipped_list, columns=['p_away', 'p_home', 'matchup'])
    df['p_away_LHP'] = df.p_away.str.endswith('LHP')
    df['p_home_LHP'] = df.p_home.str.endswith('LHP')
    # Drop the trailing five characters of each pitcher string
    # (presumably the handedness suffix plus its separator -- TODO confirm
    # against the page markup).
    df['p_away'] = df.p_away.str.slice(0, -5)
    df['p_home'] = df.p_home.str.slice(0, -5)
    # "AWAY @ HOME" -> two team columns.
    matchups_split = (df.matchup.str.split(" @ ", expand=True)
                      .rename(columns={0: "t_away", 1: "t_home"}))
    df = pd.concat([df, matchups_split], axis=1)
    df['date'] = day
    return df

In [8]:
# Create list of days to iterate over (both endpoints inclusive).
# Built from an offset range so `start` is not mutated and the cell can be
# re-read without `start` having drifted past `end`.
start = datetime.date(year = 2016, month = 4, day = 3)
end = datetime.date(year = 2016, month = 10, day = 2)
date_range = [start + datetime.timedelta(days = offset)
              for offset in range((end - start).days + 1)]

# Empty frame that fixes the column order of the combined result.
master_df = pd.DataFrame(columns=['p_away', 'p_home', 'matchup', 
                           'p_away_LHP', 'p_home_LHP', 't_away',
                           't_home', 'date'])

# `url_base` is read as a global by get_matchups(), so it must be defined
# before the loop below runs.
url_base = "http://mlb.mlb.com/news/probable_pitchers/?c_id=mlb&date="
# NOTE(review): machine-specific absolute path; adjust for the local install.
binary = FirefoxBinary(r'C:\Program Files (x86)\Mozilla Firefox\firefox.exe')
browser = webdriver.Firefox(firefox_binary=binary)

# Collect one frame per day and concatenate once at the end: growing a
# DataFrame inside the loop re-copies it every iteration, and
# DataFrame.append no longer exists in pandas >= 2.0.
daily_frames = []
try:
    for day in date_range:
        daily_frames.append(get_matchups(browser = browser, day=day))
finally:
    # Always shut the Firefox session down, even if a page fails to scrape.
    browser.quit()

# Row indexes are kept as returned by get_matchups (they restart each day),
# matching what repeated append() calls produced.
master_df = pd.concat([master_df] + daily_frames)

In [9]:
# Write the combined season table to CSV in the working directory;
# the row index is written too (the pandas default, spelled out here).
master_df.to_csv(path_or_buf='matchups_2016.csv', index=True)