In [1]:
import re
import sys
import gc
import datetime
import numpy as np
import pandas as pd
import bs4
from bs4 import BeautifulSoup as soup
from bs4 import Comment
from urllib.request import urlopen as uReq
from references_dict import Team_Dictionary,TableColumns

In [2]:
# get soup object from link
def get_soup(link):
    uClient = uReq(link)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "lxml")
    return page_soup       

# utility method for parsing game page tables
def get_data(page_soup,id,commented=0):
    data = page_soup.find("div",{"id":id})
    if commented > 0:
        comment = data.find(string=lambda text:isinstance(text,Comment))
        data = soup(comment,"lxml")
    players = [tr.find("th",{"scope":"row"}) for tr in data.findAll("tr",{"class":None})]
    stats = [tr.findAll("td") for tr in data.findAll("tr",{"class":None})]
    players.pop(0)
    stats.pop(0)
    return players,stats

# for debugging
def get_single_gameinfo(link):
    page_soup = get_soup(link)
    link_comps = link.split('boxscores/')
    link_comps = link_comps[1].split('.')
    gameid = link_comps[0]
    date = gameid[:8]

    # get team names
    gameteams = page_soup.findAll("a",{"itemprop":"name"})
    team_home = gameteams[0].text
    team_away = gameteams[1].text

    # get vegas odds
    gameinfo = page_soup.find("div",{"id":"all_game_info"})
    comment = gameinfo.find(string=lambda text:isinstance(text,Comment))
    gameinfo = soup(comment,"lxml")
    gameinfo = [tr.findAll("td") for tr in gameinfo.findAll("tr",{"class":None})]
    vegasline = gameinfo[-2][0].text.strip()
    vegasline = re.split("\s-",vegasline)
    home_fav = 1
    if vegasline[0] == team_away:
        home_fav = 0
    vegasline = float(vegasline[1])
    overunder = float(gameinfo[-1][0].text.split(" ")[0].strip())
    print('home_fav = ',home_fav,', ats = ',vegasline,', ou = ',overunder)

    # get score
    score = page_soup.findAll("div",{"class":"score"})
    points_home = float(score[0].text.strip())
    points_away = float(score[1].text.strip())
    home_score_diff = points_home - points_away
    print('score_diff = ',home_score_diff)

    # calculate vegas results
    ats_result = 0
    if((home_score_diff > vegasline and home_fav == 1) or (home_score_diff < (vegasline*-1) and home_fav == 0)):
        ats_result = 1
    elif((home_score_diff < vegasline and home_fav == 1) or (home_score_diff > (vegasline*-1) and home_fav == 0)):
        ats_result = -1   

    ou_result = 0
    if(points_home+points_away > overunder):
        ou_result = 1
    elif(points_home+points_away < overunder):
        ou_result = -1

    # return metrics in numpy array
    stats = np.array([gameid,date,team_home,points_home,team_away,points_away,home_fav,vegasline,overunder,ats_result,ou_result])
    return stats

In [3]:
class PFRScraper():
    def __init__(self,season,week):
        self.season = season
        self.week = week
        self.link = "https://www.pro-football-reference.com/years/"+str(season)+"/week_"+str(week)+".htm"
        self.week_soup = get_soup(self.link)
    
    def get_game_summary(self):
        # get game links for week
        games = self.week_soup.findAll("td",{"class":"gamelink"})
        all_games = [['gameid','date','team_home','points_home','team_away','points_away','home_fav','vegasline','overunder','ats_result','ou_result']]
        
        # build dataframe for every game for that week
        for game in games:
            # get date, page soup for game link
            gameid = (str(game.a['href']))
            date = gameid[11:20]
            link = "https://www.pro-football-reference.com"+gameid
            gameinfo = np.array([gameid,date])
            page_soup = get_soup(link)

            # get team names
            gameteams = page_soup.findAll("a",{"itemprop":"name"})
            team_home = gameteams[0].text
            team_away = gameteams[1].text

            # get vegas odds
            gameinfo = page_soup.find("div",{"id":"all_game_info"})
            comment = gameinfo.find(string=lambda text:isinstance(text,Comment))
            gameinfo = soup(comment,"lxml")
            gameinfo = [tr.findAll("td") for tr in gameinfo.findAll("tr",{"class":None})]
            vegasline = gameinfo[-2][0].text.strip()
            vegasline = re.split("\s-",vegasline)
            home_fav = 1
            if vegasline[0] == team_away:
                home_fav = 0
            vegasline = float(vegasline[1])
            overunder = float(gameinfo[-1][0].text.split(" ")[0].strip())

            # get score
            score = page_soup.findAll("div",{"class":"score"})
            points_home = float(score[0].text.strip())
            points_away = float(score[1].text.strip())
            home_score_diff = points_home - points_away

            # calculate vegas results
            ats_result = 0
            if((home_score_diff > vegasline and home_fav == 1) or (home_score_diff < (vegasline*-1) and home_fav == 0)):
                ats_result = 1
            elif((home_score_diff < vegasline and home_fav == 1) or (home_score_diff > (vegasline*-1) and home_fav == 0)):
                ats_result = -1   

            ou_result = 0
            if(points_home+points_away > overunder):
                ou_result = 1
            elif(points_home+points_away < overunder):
                ou_result = -1

            # return metrics in numpy array
            stats = np.array([gameid,date,team_home,points_home,team_away,points_away,home_fav,vegasline,overunder,ats_result,ou_result])
            all_games.append(stats)
        df = np.vstack(all_games)
        return pd.DataFrame(data=df[1:,1:],index=df[1:,0],columns=df[0,1:])

In [5]:
season = 2018
week = 4
pf_scraper = PFRScraper(season,week)
df = pf_scraper.get_game_summary()

In [6]:
df

Unnamed: 0,date,team_home,points_home,team_away,points_away,home_fav,vegasline,overunder,ats_result,ou_result
/boxscores/201809270ram.htm,201809270,Los Angeles Rams,38.0,Minnesota Vikings,31.0,1,7.5,48.5,-1,1
/boxscores/201809300clt.htm,201809300,Indianapolis Colts,34.0,Houston Texans,37.0,0,1.0,48.0,1,1
/boxscores/201809300chi.htm,201809300,Chicago Bears,48.0,Tampa Bay Buccaneers,10.0,1,3.0,46.0,1,1
/boxscores/201809300atl.htm,201809300,Atlanta Falcons,36.0,Cincinnati Bengals,37.0,1,3.5,52.5,-1,1
/boxscores/201809300oti.htm,201809300,Tennessee Titans,26.0,Philadelphia Eagles,23.0,0,3.0,41.5,-1,1
/boxscores/201809300nwe.htm,201809300,New England Patriots,38.0,Miami Dolphins,7.0,1,6.5,50.5,1,-1
/boxscores/201809300jax.htm,201809300,Jacksonville Jaguars,31.0,New York Jets,12.0,1,7.5,40.5,1,1
/boxscores/201809300gnb.htm,201809300,Green Bay Packers,22.0,Buffalo Bills,0.0,1,8.5,43.5,1,-1
/boxscores/201809300dal.htm,201809300,Dallas Cowboys,26.0,Detroit Lions,24.0,1,2.5,43.5,-1,1
/boxscores/201809300rai.htm,201809300,Oakland Raiders,45.0,Cleveland Browns,42.0,1,2.5,44.5,1,1


In [28]:
def get_pfr_table(link,table):
    page_soup = get_soup(link)
    metrics = [TableColumns().football_ref[table]]
    players,stats = get_data(page_soup,table,1)
    for player,stat in zip(players,stats):
        mets = np.array([player.a['href']])
        for idx,metric in enumerate(stat):
            value = metric.text
            if not value:
                value = 0
            if idx > 0 and '%' not in metric.text:
                value = float(value)
            mets = np.append(mets,value)
        metrics.append(mets)
    df = np.vstack(metrics)
    return pd.DataFrame(data=df[1:,1:],index=df[1:,0],columns=df[0,1:])

link = 'https://www.pro-football-reference.com/boxscores/201809230ram.htm'
all_offense = get_pfr_table(link,'all_player_offense')
all_receiving = get_pfr_table(link,'all_targets_directions')
all_rushing = get_pfr_table(link,'all_rush_directions')
all_home_snapcounts = get_pfr_table(link,'all_home_snap_counts')
all_vis_snapcounts = get_pfr_table(link,'all_vis_snap_counts')


Unnamed: 0,pos,off_snaps,off_pct,def_snaps,def_pct,st_snaps,st_pct
/players/S/SaffRo20.htm,G,77.0,100%,0.0,0%,6.0,24%
/players/H/HaveRo00.htm,T,77.0,100%,0.0,0%,6.0,24%
/players/B/BlytAu00.htm,G,77.0,100%,0.0,0%,6.0,24%
/players/G/GoffJa00.htm,QB,77.0,100%,0.0,0%,0.0,0%
/players/S/SullJo24.htm,C,77.0,100%,0.0,0%,0.0,0%
/players/K/KuppCo00.htm,WR,75.0,97%,0.0,0%,0.0,0%
/players/W/WhitAn20.htm,T,74.0,96%,0.0,0%,6.0,24%
/players/W/WoodRo02.htm,WR,74.0,96%,0.0,0%,0.0,0%
/players/C/CookBr00.htm,WR,74.0,96%,0.0,0%,0.0,0%
/players/G/GurlTo01.htm,RB,64.0,83%,0.0,0%,0.0,0%
