In [2]:
from bs4 import BeautifulSoup as BS
import requests
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.support.ui import WebDriverWait
import pandas as pd
import numpy as np
import datetime
from time import sleep
import os

In [53]:
# Columns of the frame returned when a day yields no games. The *_RHP columns
# are included so the empty frame has the same pitcher columns that populated
# days produce.
_MATCHUP_COLUMNS = ['p_away', 'p_home', 'matchup',
                    'p_away_LHP', 'p_away_RHP',
                    'p_home_LHP', 'p_home_RHP',
                    't_away', 't_home', 'date']


def _parse_pitcher(game, css_class, prefix):
    """Extract one pitcher's name and handedness flags from a game module.

    Parameters
    ----------
    game : bs4 element
        The element for a single game.
    css_class : str
        Value of the ``class`` attribute of the pitcher container to look up
        (``"pitcher first"`` or ``"pitcher last"``).
    prefix : str
        Key prefix for the returned entries (``'p_away'`` or ``'p_home'``).

    Returns
    -------
    dict
        ``{prefix: name, prefix + '_LHP': bool, prefix + '_RHP': bool}``.
        Keys whose markup is missing are simply left out, so pandas fills the
        corresponding cells with NaN when the frame is built.
    """
    parsed = {}
    container = game.find(attrs = {'class': css_class})
    heading = container.find('h5') if container is not None else None
    if heading is None:
        return parsed

    name_tag = heading.find('a')
    if name_tag is None:
        return parsed
    parsed[prefix] = name_tag.text

    hand_tag = heading.find('span')
    if hand_tag is None:
        # Name is known but handedness is not: keep the name only.
        return parsed
    is_lefty = hand_tag.text == 'LHP'
    parsed[prefix + '_LHP'] = is_lefty
    parsed[prefix + '_RHP'] = not is_lefty
    return parsed


def get_matchups(browser, day, url_base, wait_seconds=2):
    """Scrape the probable-pitcher matchups listed for one day.

    Parameters
    ----------
    browser : selenium webdriver
        An open driver; it is navigated to ``url_base + str(day)``.
    day : object
        The day to fetch. Its ``str()`` form is appended to ``url_base`` and
        the object itself is stored in the ``date`` column.
    url_base : str
        URL prefix to which the day is appended.
    wait_seconds : float, optional
        Seconds to pause after navigation and *before* the page source is
        read, giving the page time to finish rendering. Pass 0 to skip.

    Returns
    -------
    pandas.DataFrame
        One row per game with ``matchup``, ``t_away``, ``t_home``, ``date``
        and, where the markup is present, ``p_away``/``p_home`` plus their
        ``_LHP``/``_RHP`` boolean flags. An empty frame with the standard
        columns is returned when the page has a ``no_games`` element, has no
        ``mc`` container, or contains no game modules.
    """
    browser.get(url_base + str(day))
    # Wait before grabbing the HTML; sleeping after parsing would not change
    # what gets scraped.
    if wait_seconds:
        sleep(wait_seconds)
    soup = BS(browser.page_source, "lxml")

    if soup.find(attrs = {'class':"no_games"}):
        return pd.DataFrame(columns=_MATCHUP_COLUMNS)

    container = soup.find(attrs = {'id':"mc"})
    if container is None:
        return pd.DataFrame(columns=_MATCHUP_COLUMNS)

    games_data = []
    for game in container.find_all(attrs = {'class':"module"}):
        heading = game.find('h4')
        if heading is None:
            # A module without a matchup heading cannot be described as a
            # game row, so skip it instead of raising.
            continue
        game_data = {'matchup': heading.text}
        game_data.update(_parse_pitcher(game, "pitcher first", 'p_away'))  # away pitcher
        game_data.update(_parse_pitcher(game, "pitcher last", 'p_home'))   # home pitcher
        games_data.append(game_data)

    if not games_data:
        return pd.DataFrame(columns=_MATCHUP_COLUMNS)

    df = pd.DataFrame(games_data)
    # "Away @ Home" -> two team columns. n=1 keeps the result to two columns
    # and reindex guarantees both exist even if no separator was found.
    teams = (df['matchup'].str.split(" @ ", n=1, expand=True)
             .rename(columns = {0: "t_away", 1: "t_home"})
             .reindex(columns = ["t_away", "t_home"]))
    df = pd.concat([df, teams], axis=1)
    df['date'] = day
    return df

In [54]:
# Create list of days to iterate over (both endpoints inclusive).
# Built with a comprehension so `start` keeps its original value.
start = datetime.date(year = 2013, month = 4, day = 3)
end = datetime.date(year = 2013, month = 10, day = 1)
date_range = [start + datetime.timedelta(days = offset)
              for offset in range((end - start).days + 1)]

# Column layout used only if no day in the range yields any rows.
master_columns = ['p_away', 'p_home', 'matchup',
                  'p_away_LHP', 'p_away_RHP',
                  'p_home_LHP', 'p_home_RHP',
                  't_away', 't_home', 'date']

url_base = "http://mlb.mlb.com/news/probable_pitchers/?c_id=mlb&date="
#binary = FirefoxBinary(r'C:\Program Files (x86)\Mozilla Firefox\firefox.exe')
#browser = webdriver.Firefox(firefox_binary=binary)
browser = webdriver.Chrome()

# Collect the per-day frames and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside the
# loop copies all accumulated rows on every iteration.
daily_frames = []
try:
    for day in date_range:
        daily_frames.append(get_matchups(browser = browser, day=day, url_base=url_base))
finally:
    # Always release the browser session, even if a page fails mid-run.
    browser.quit()

# Skip empty days so they do not influence the dtypes of the combined frame.
# The per-day row index is kept as-is (it restarts at 0 for each day).
non_empty_frames = [frame for frame in daily_frames if not frame.empty]
if non_empty_frames:
    master_df = pd.concat(non_empty_frames, sort=False)
else:
    master_df = pd.DataFrame(columns=master_columns)

In [57]:
# Persist the scraped matchups. The frame's index is written as the first,
# unnamed CSV column; it is the per-day row number and therefore repeats
# across dates, so it is not a unique key.
master_df.to_csv('matchups_2013.csv')

In [55]:
# Sanity check: list the distinct away-team names that were scraped.
# Note that the recorded output contains 'American League All-Stars'
# in addition to regular club names.
master_df.t_away.unique()

array(['Kansas City Royals', 'Texas Rangers', 'Detroit Tigers',
       'Boston Red Sox', 'Chicago Cubs', 'Miami Marlins',
       'Cleveland Indians', 'Baltimore Orioles', 'Los Angeles Angels',
       'Philadelphia Phillies', 'San Diego Padres', 'Colorado Rockies',
       'St. Louis Cardinals', 'Seattle Mariners', 'San Francisco Giants',
       'New York Yankees', 'Minnesota Twins', 'Washington Nationals',
       'Arizona Diamondbacks', 'Oakland Athletics', 'Pittsburgh Pirates',
       'Milwaukee Brewers', 'Cincinnati Reds', 'New York Mets',
       'Atlanta Braves', 'Tampa Bay Rays', 'Houston Astros',
       'Toronto Blue Jays', 'Los Angeles Dodgers', 'Chicago White Sox',
       'American League All-Stars'], dtype=object)

In [56]:
# Preview the last 100 rows to eyeball the final days of the scraped range.
master_df.tail(100)

Unnamed: 0,date,matchup,p_away,p_away_LHP,p_away_RHP,p_home,p_home_LHP,p_home_RHP,t_away,t_home
14,2013-09-22,Los Angeles Dodgers @ San Diego Padres,Zack Greinke,False,True,Andrew Cashner,False,True,Los Angeles Dodgers,San Diego Padres
15,2013-09-22,St. Louis Cardinals @ Milwaukee Brewers,Joe Kelly,False,True,Wily Peralta,False,True,St. Louis Cardinals,Milwaukee Brewers
0,2013-09-23,Baltimore Orioles @ Tampa Bay Rays,Wei-Yin Chen,True,False,Chris Archer,False,True,Baltimore Orioles,Tampa Bay Rays
1,2013-09-23,Milwaukee Brewers @ Atlanta Braves,Marco Estrada,False,True,Mike Minor,True,False,Milwaukee Brewers,Atlanta Braves
2,2013-09-23,New York Mets @ Cincinnati Reds,Aaron Harang,False,True,Johnny Cueto,False,True,New York Mets,Cincinnati Reds
3,2013-09-23,Philadelphia Phillies @ Miami Marlins,Roy Halladay,False,True,Nathan Eovaldi,False,True,Philadelphia Phillies,Miami Marlins
4,2013-09-23,Houston Astros @ Texas Rangers,Jordan Lyles,False,True,Derek Holland,True,False,Houston Astros,Texas Rangers
5,2013-09-23,Pittsburgh Pirates @ Chicago Cubs,Charlie Morton,False,True,Jeff Samardzija,False,True,Pittsburgh Pirates,Chicago Cubs
6,2013-09-23,Detroit Tigers @ Minnesota Twins,Justin Verlander,False,True,Mike Pelfrey,False,True,Detroit Tigers,Minnesota Twins
7,2013-09-23,Toronto Blue Jays @ Chicago White Sox,J.A. Happ,True,False,Jose Quintana,True,False,Toronto Blue Jays,Chicago White Sox
