In [1]:
import re
import sys
import gc
import datetime
import numpy as np
import pandas as pd
import bs4
from bs4 import BeautifulSoup as soup
from bs4 import Comment
from urllib.request import urlopen as uReq
from references_dict import Team_Dictionary,TableColumns

In [2]:
# get soup object from link
def get_soup(link):
    """Download ``link`` and parse the response body with BeautifulSoup.

    Parameters
    ----------
    link : str
        URL to fetch; handed unchanged to ``urllib.request.urlopen``.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the ``lxml`` parser.
    """
    # The context manager closes the response even when read() raises;
    # previously the connection was only closed on the success path.
    with uReq(link) as uClient:
        page_html = uClient.read()
    return soup(page_html, "lxml")

# utility method for parsing game page tables
def get_data(page_soup,id,commented=0):
    """Collect the row-header cells and data cells of one table container.

    Parameters
    ----------
    page_soup : bs4.BeautifulSoup
        Parsed page to search.
    id : str
        ``id`` attribute of the ``<div>`` that wraps the table.
    commented : int, optional
        When greater than 0, the table markup is taken from the first HTML
        comment inside the ``<div>`` and re-parsed before rows are collected.

    Returns
    -------
    tuple of (list, list)
        ``players`` holds, for each kept row, the result of looking up its
        ``<th scope="row">`` cell (``None`` when the row has none); ``stats``
        holds the list of ``<td>`` cells of the same row.  Only ``<tr>``
        elements matched by ``{"class": None}`` are considered, and the first
        of them (presumably the column-header row) is dropped.  A table with
        no matching rows yields two empty lists.

    Raises
    ------
    ValueError
        If no ``<div>`` with the given id exists, or ``commented`` is set and
        the ``<div>`` contains no comment node.
    """
    data = page_soup.find("div",{"id":id})
    if data is None:
        raise ValueError("no <div> with id %r found on the page" % (id,))
    if commented > 0:
        comment = data.find(string=lambda text:isinstance(text,Comment))
        if comment is None:
            raise ValueError("<div> with id %r contains no commented-out markup" % (id,))
        data = soup(comment,"lxml")
    # Search the rows once and drop the first one by slicing, so an empty
    # table no longer raises IndexError the way pop(0) did.
    rows = data.findAll("tr",{"class":None})[1:]
    players = [tr.find("th",{"scope":"row"}) for tr in rows]
    stats = [tr.findAll("td") for tr in rows]
    return players,stats

# utility method to dynamically scrape tables
def get_pfr_table(link,table):
    """Scrape one commented-out stats table of a box-score page into a DataFrame.

    Parameters
    ----------
    link : str
        Box-score URL.  It must contain ``boxscores/``; the game id is the
        text between ``boxscores/`` and the first ``.`` that follows it.
    table : str
        ``id`` of the ``<div>`` wrapping the table.  The same string is used
        as the key into ``TableColumns().football_ref`` to obtain the column
        names.

    Returns
    -------
    pandas.DataFrame
        One row per table row returned by ``get_data``.  The index is the
        game id (repeated on every row); the first column is the ``href`` of
        the row-header link and the remaining columns are the row's cells.
        Column labels are the looked-up names minus the first one, which
        labels the index position.  Numeric cells stay real floats: the
        previous implementation pushed them through a NumPy string array,
        which turned every value into text.
    """
    page_soup = get_soup(link)
    gameid = link.split('boxscores/')[1].split('.')[0]
    header = list(TableColumns().football_ref[table])
    players,stats = get_data(page_soup,table,1)

    # Build plain Python rows (linear time) instead of growing a NumPy
    # array with np.append inside the loop.
    rows = []
    for player,stat in zip(players,stats):
        row = [player.a['href']]
        for idx,metric in enumerate(stat):
            text = metric.text
            if idx > 0 and '%' not in text:
                # every cell after the first is numeric unless it is a percentage; blank -> 0.0
                value = float(text) if text else 0.0
            else:
                # first cell and percentage cells are kept as text; blank -> 0
                value = text if text else 0
            row.append(value)
        rows.append(row)

    return pd.DataFrame(data=rows,index=[gameid]*len(rows),columns=header[1:])

def get_total_offense(link):
    """Return the ``all_player_offense`` table of the box score at ``link``.

    Thin wrapper that forwards to ``get_pfr_table`` with a fixed table id.
    """
    table_id = 'all_player_offense'
    return get_pfr_table(link,table_id)

def get_receiving(link):
    """Return the ``all_targets_directions`` table of the box score at ``link``.

    Thin wrapper that forwards to ``get_pfr_table`` with a fixed table id.
    """
    table_id = 'all_targets_directions'
    return get_pfr_table(link,table_id)

def get_rushing(link):
    """Return the ``all_rush_directions`` table of the box score at ``link``.

    Thin wrapper that forwards to ``get_pfr_table`` with a fixed table id.
    """
    table_id = 'all_rush_directions'
    return get_pfr_table(link,table_id)

def get_defense(link):
    """Return the ``all_player_defense`` table of the box score at ``link``.

    Thin wrapper that forwards to ``get_pfr_table`` with a fixed table id.
    """
    table_id = 'all_player_defense'
    return get_pfr_table(link,table_id)

def get_returns(link):
    """Return the ``all_returns`` table of the box score at ``link``.

    Thin wrapper that forwards to ``get_pfr_table`` with a fixed table id.
    """
    table_id = 'all_returns'
    return get_pfr_table(link,table_id)

def get_home_snapcounts(link):
    """Return the ``all_home_snap_counts`` table of the box score at ``link``.

    Thin wrapper that forwards to ``get_pfr_table`` with a fixed table id.
    """
    table_id = 'all_home_snap_counts'
    return get_pfr_table(link,table_id)

def get_vis_snapcounts(link):
    """Return the ``all_vis_snap_counts`` table of the box score at ``link``.

    Thin wrapper that forwards to ``get_pfr_table`` with a fixed table id.
    """
    table_id = 'all_vis_snap_counts'
    return get_pfr_table(link,table_id)

# for debugging
def get_single_gameinfo(link):
    """Scrape the score and betting summary of one box-score page (debug helper).

    Prints the parsed favorite flag, spread, total and score margin, then
    returns the collected metrics.

    Parameters
    ----------
    link : str
        Box-score URL.  It must contain ``boxscores/``; the game id is the
        text between ``boxscores/`` and the first ``.`` that follows it.

    Returns
    -------
    numpy.ndarray
        1-D array ``[gameid, date, team_home, points_home, team_away,
        points_away, home_fav, vegasline, overunder, ats_result, ou_result]``.
        The array is built from mixed text and numbers, so NumPy stores every
        entry as a string.

        * ``date`` is the first 8 characters of the game id.
        * ``home_fav`` is 0 when the team named in the betting line equals
          ``team_away`` and 1 otherwise.
        * ``ats_result`` is 1 when the favored side's margin exceeds the
          spread, -1 when it falls short and 0 when it matches exactly.
        * ``ou_result`` is 1 / -1 / 0 for combined points above / below /
          equal to the over-under.
    """
    page_soup = get_soup(link)
    gameid = link.split('boxscores/')[1].split('.')[0]
    date = gameid[:8]

    # get team names
    # NOTE(review): assumes the first itemprop="name" anchor is the home team
    # and the second the away team -- confirm against the page layout.
    gameteams = page_soup.findAll("a",{"itemprop":"name"})
    team_home = gameteams[0].text
    team_away = gameteams[1].text

    # get vegas odds (the game-info table is read from an HTML comment)
    gameinfo = page_soup.find("div",{"id":"all_game_info"})
    comment = gameinfo.find(string=lambda text:isinstance(text,Comment))
    gameinfo = soup(comment,"lxml")
    gameinfo = [tr.findAll("td") for tr in gameinfo.findAll("tr",{"class":None})]
    # Second-to-last row reads "<team> -<spread>", last row starts with the total.
    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    line_parts = re.split(r"\s-",gameinfo[-2][0].text.strip())
    home_fav = 0 if line_parts[0] == team_away else 1
    # A line without a " -<spread>" part (presumably a pick'em) used to raise
    # IndexError; treat it as a spread of 0 with home_fav left at 1.
    vegasline = float(line_parts[1]) if len(line_parts) > 1 else 0.0
    overunder = float(gameinfo[-1][0].text.split(" ")[0].strip())
    print('home_fav = ',home_fav,', ats = ',vegasline,', ou = ',overunder)

    # get score
    score = page_soup.findAll("div",{"class":"score"})
    points_home = float(score[0].text.strip())
    points_away = float(score[1].text.strip())
    home_score_diff = points_home - points_away
    print('score_diff = ',home_score_diff)

    # calculate vegas results: compare the favored side's margin with the spread
    favorite_margin = home_score_diff if home_fav == 1 else -home_score_diff
    if favorite_margin > vegasline:
        ats_result = 1
    elif favorite_margin < vegasline:
        ats_result = -1
    else:
        ats_result = 0

    total_points = points_home+points_away
    if total_points > overunder:
        ou_result = 1
    elif total_points < overunder:
        ou_result = -1
    else:
        ou_result = 0

    # return metrics in numpy array
    return np.array([gameid,date,team_home,points_home,team_away,points_away,home_fav,vegasline,overunder,ats_result,ou_result])

def get_weekly_summary(link):
    """Build a score and betting summary for every game linked from a weekly page.

    Parameters
    ----------
    link : str
        URL of the weekly schedule page.  Each ``<td class="gamelink">`` on it
        is expected to hold an anchor whose ``href`` is a site-relative
        box-score path.

    Returns
    -------
    pandas.DataFrame
        One row per game, indexed by the game's ``href`` as found on the
        weekly page, with columns ``date, team_home, points_home, team_away,
        points_away, home_fav, vegasline, overunder, ats_result, ou_result``.
        Numeric columns hold real numbers; the previous implementation
        stacked everything into a NumPy string array, which turned every
        value into text.

        * ``date`` is the first 8 characters after ``boxscores/`` in the href.
        * ``home_fav`` is 0 when the team named in the betting line equals
          ``team_away`` and 1 otherwise.
        * ``ats_result`` is 1 when the favored side's margin exceeds the
          spread, -1 when it falls short and 0 when it matches exactly.
        * ``ou_result`` is 1 / -1 / 0 for combined points above / below /
          equal to the over-under.
    """
    week_soup = get_soup(link)
    # get game links for week
    games = week_soup.findAll("td",{"class":"gamelink"})
    columns = ['gameid','date','team_home','points_home','team_away','points_away','home_fav','vegasline','overunder','ats_result','ou_result']

    # build one record per game for that week
    index = []
    rows = []
    for game in games:
        # NOTE(review): the index keeps the raw href (e.g. "/boxscores/<id>.htm"),
        # not the bare game id used elsewhere in this notebook -- confirm intent.
        gameid = str(game.a['href'])
        # The date is 8 characters (YYYYMMDD); the old slice [11:20] took 9
        # and dragged in the first character after the date.
        date = gameid.split('boxscores/')[-1][:8]
        # Separate name so the weekly-page ``link`` argument is not overwritten.
        game_link = "https://www.pro-football-reference.com"+gameid
        page_soup = get_soup(game_link)

        # get team names
        # NOTE(review): assumes the first itemprop="name" anchor is the home
        # team and the second the away team -- confirm against the page layout.
        gameteams = page_soup.findAll("a",{"itemprop":"name"})
        team_home = gameteams[0].text
        team_away = gameteams[1].text

        # get vegas odds (the game-info table is read from an HTML comment)
        gameinfo = page_soup.find("div",{"id":"all_game_info"})
        comment = gameinfo.find(string=lambda text:isinstance(text,Comment))
        gameinfo = soup(comment,"lxml")
        gameinfo = [tr.findAll("td") for tr in gameinfo.findAll("tr",{"class":None})]
        # Raw string: "\s" in a normal literal is an invalid escape sequence.
        line_parts = re.split(r"\s-",gameinfo[-2][0].text.strip())
        home_fav = 0 if line_parts[0] == team_away else 1
        # A line without a " -<spread>" part (presumably a pick'em) used to
        # raise IndexError; treat it as a spread of 0 with home_fav left at 1.
        vegasline = float(line_parts[1]) if len(line_parts) > 1 else 0.0
        overunder = float(gameinfo[-1][0].text.split(" ")[0].strip())

        # get score
        score = page_soup.findAll("div",{"class":"score"})
        points_home = float(score[0].text.strip())
        points_away = float(score[1].text.strip())
        home_score_diff = points_home - points_away

        # calculate vegas results: compare the favored side's margin with the spread
        favorite_margin = home_score_diff if home_fav == 1 else -home_score_diff
        if favorite_margin > vegasline:
            ats_result = 1
        elif favorite_margin < vegasline:
            ats_result = -1
        else:
            ats_result = 0

        total_points = points_home+points_away
        if total_points > overunder:
            ou_result = 1
        elif total_points < overunder:
            ou_result = -1
        else:
            ou_result = 0

        index.append(gameid)
        rows.append([date,team_home,points_home,team_away,points_away,home_fav,vegasline,overunder,ats_result,ou_result])

    return pd.DataFrame(data=rows,index=index,columns=columns[1:])

In [5]:
class PFRScraper():
    """Scrape one week's summary and per-game tables on construction.

    Attributes
    ----------
    season, week
        The constructor arguments, stored unchanged.
    weekly_link : str
        URL of the weekly page built from ``season`` and ``week``.
    week_soup
        Parsed weekly page as returned by ``get_soup``.
    games
        The ``<td class="gamelink">`` cells found on the weekly page.
    tables : dict
        ``'weekly_summary'`` maps to the result of ``get_weekly_summary``;
        every game id maps to a dict holding the game ``'link'`` plus the
        ``'offense'``, ``'receiving'``, ``'rushing'``, ``'defense'``,
        ``'returns'``, ``'home_snaps'`` and ``'away_snaps'`` tables.
    """
    def __init__(self,season,week):
        """Fetch the weekly page, then scrape every game linked from it."""
        self.season = season
        self.week = week
        self.weekly_link = "https://www.pro-football-reference.com/years/"+str(season)+"/week_"+str(week)+".htm"
        self.week_soup = get_soup(self.weekly_link)
        self.games = self.week_soup.findAll("td",{"class":"gamelink"})
        self.tables = {'weekly_summary':get_weekly_summary(self.weekly_link)}

        # (key, scraper) pairs in the order the tables are stored for each game
        scrapers = (
            ('offense',get_total_offense),
            ('receiving',get_receiving),
            ('rushing',get_rushing),
            ('defense',get_defense),
            ('returns',get_returns),
            ('home_snaps',get_home_snapcounts),
            ('away_snaps',get_vis_snapcounts),
        )
        for cell in self.games:
            href = str(cell.a['href'])
            game_link = "https://www.pro-football-reference.com"+href
            # game id = text between 'boxscores/' and the first '.' after it
            game_key = game_link.split('boxscores/')[1].split('.')[0]
            per_game = {'link':game_link}
            for key,scrape in scrapers:
                per_game[key] = scrape(game_link)
            self.tables[game_key] = per_game
            
    
#     def get_game_summary(self):
#         # get game links for week
#         games = self.week_soup.findAll("td",{"class":"gamelink"})
#         all_games = [['gameid','date','team_home','points_home','team_away','points_away','home_fav','vegasline','overunder','ats_result','ou_result']]
        
#         # build dataframe for every game for that week
#         for game in games:
#             # get date, page soup for game link
#             gameid = (str(game.a['href']))
#             date = gameid[11:19]
#             link = "https://www.pro-football-reference.com"+gameid
#             gameinfo = np.array([gameid,date])
#             page_soup = get_soup(link)

#             # get team names
#             gameteams = page_soup.findAll("a",{"itemprop":"name"})
#             team_home = gameteams[0].text
#             team_away = gameteams[1].text

#             # get vegas odds
#             gameinfo = page_soup.find("div",{"id":"all_game_info"})
#             comment = gameinfo.find(string=lambda text:isinstance(text,Comment))
#             gameinfo = soup(comment,"lxml")
#             gameinfo = [tr.findAll("td") for tr in gameinfo.findAll("tr",{"class":None})]
#             vegasline = gameinfo[-2][0].text.strip()
#             vegasline = re.split("\s-",vegasline)
#             home_fav = 1
#             if vegasline[0] == team_away:
#                 home_fav = 0
#             vegasline = float(vegasline[1])
#             overunder = float(gameinfo[-1][0].text.split(" ")[0].strip())

#             # get score
#             score = page_soup.findAll("div",{"class":"score"})
#             points_home = float(score[0].text.strip())
#             points_away = float(score[1].text.strip())
#             home_score_diff = points_home - points_away

#             # calculate vegas results
#             ats_result = 0
#             if((home_score_diff > vegasline and home_fav == 1) or (home_score_diff < (vegasline*-1) and home_fav == 0)):
#                 ats_result = 1
#             elif((home_score_diff < vegasline and home_fav == 1) or (home_score_diff > (vegasline*-1) and home_fav == 0)):
#                 ats_result = -1   

#             ou_result = 0
#             if(points_home+points_away > overunder):
#                 ou_result = 1
#             elif(points_home+points_away < overunder):
#                 ou_result = -1

#             # return metrics in numpy array
#             stats = np.array([gameid,date,team_home,points_home,team_away,points_away,home_fav,vegasline,overunder,ats_result,ou_result])
#             all_games.append(stats)
#         df = np.vstack(all_games)
#         return pd.DataFrame(data=df[1:,1:],index=df[1:,0],columns=df[0,1:])

In [6]:
# Scrape the weekly page and all linked games for season 2018, week 4.
# Constructing PFRScraper performs every network request up front.
season, week = 2018, 4
pf_scraper = PFRScraper(season, week)

In [7]:
# Display the scraper instance; PFRScraper defines no __repr__, so the
# default object repr is what gets shown.
pf_scraper

<__main__.PFRScraper at 0x1b50aea4048>

In [12]:
# Preview the first rows of the 'offense' table stored under game id 201809270ram.
pf_scraper.tables['201809270ram']['offense'].head()

Unnamed: 0,player,team,pass_cmp,pass_att,pass_yds,pass_td,int,sack,sack_yds,lng,...,rush_yds,rush_tds,rush_lng,rec_tgts,rec,rec_yds,rec_tds,rec_lng,fmb,fl
201809270ram,/players/C/CousKi00.htm,MIN,36.0,50.0,422.0,3.0,0.0,4.0,30.0,45.0,...,28.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
201809270ram,/players/C/CookDa01.htm,MIN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,20.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
201809270ram,/players/T/ThomRo05.htm,MIN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
201809270ram,/players/M/MurrLa00.htm,MIN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,2.0,2.0,2.0,16.0,0.0,9.0,0.0,0.0
201809270ram,/players/T/ThieAd00.htm,MIN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,12.0,8.0,135.0,1.0,45.0,0.0,0.0
