In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Chrome launch options for the Selenium session.
options = Options()
# Uncomment the next line to run Chrome without a visible window.
# options.add_argument('--headless')

# Module-level browser session; scrape_spreads() drives it and a later cell
# shuts it down with driver.quit().
driver = webdriver.Chrome(options=options)

In [2]:
import time
from selenium.webdriver.common.by import By

# Team location names as they are expected to appear in a scraped row's text.
# scrape_spreads() uses membership in this set to decide which text line holds
# the home team.
# NOTE(review): entries are locations, not nicknames, so 'Los Angeles' and
# 'New York' each appear once. Locations used only in older seasons (e.g. a
# pre-relocation city) are not listed -- confirm how the site labels those games.
TEAMS = {
    'Arizona',
    'Atlanta',
    'Baltimore',
    'Buffalo',
    'Carolina',
    'Chicago',
    'Cincinnati',
    'Cleveland',
    'Dallas',
    'Denver',
    'Detroit',
    'Green Bay',
    'Houston',
    'Indianapolis',
    'Jacksonville',
    'Kansas City',
    'Las Vegas',
    'Los Angeles',
    'Miami',
    'Minnesota',
    'New England',
    'New Orleans',
    'New York',
    'Philadelphia',
    'Pittsburgh',
    'San Francisco',
    'Seattle',
    'Tampa Bay',
    'Tennessee',
    'Washington',
}

# (year, week) pages for which scrape_spreads() found no rows; a later cell
# retries them.
problematic_rows = []


def _parse_spread_row(text, year, week):
    """Convert one row's visible text into a result tuple, or None if it is too short."""
    t = text.split('\n')

    # Index 3 is read below, so anything shorter cannot be parsed and would
    # otherwise raise IndexError and abort the whole scrape.
    if len(t) < 4:
        return None

    # The home team is normally the fourth text line; when that line is not a
    # known location, the third line is used instead.
    # NOTE(review): presumably an optional extra line sits between the two
    # team names on some rows -- confirm against the live page.
    home_team = t[3]
    if home_team not in TEAMS:
        home_team = t[2]

    return (
        year,
        week,
        t[0],  # away team
        home_team,  # home team
        t[-4],  # away line
        t[-3],  # away odds
        t[-2],  # home line
        t[-1],  # home odds
    )


def scrape_spreads(year, week):
    """Scrape the spread rows for one season/week from bettingpros.com.

    Uses the module-level Selenium ``driver``. After loading the page, the
    function repeatedly scrolls to the last loaded row so that lazily loaded
    rows get a chance to appear, then parses the visible text of every row.

    Args:
        year: Season, inserted into the ``season`` query parameter.
        week: Week number, inserted into the ``week`` query parameter.

    Returns:
        A list of tuples ``(year, week, away team, home team, away line,
        away odds, home line, home odds)``. Everything except ``year`` and
        ``week`` is a raw string taken from the page. The list is empty when
        no rows were found.

    Side effects:
        Prints a warning and appends ``{'year': year, 'week': week}`` to
        ``problematic_rows`` when the page yields no rows. Rows whose text has
        fewer than four lines are skipped with a printed warning instead of
        raising ``IndexError``.
    """
    row_class = "grouped-items-with-sticky-footer__content"

    driver.get(url=f"https://www.bettingpros.com/nfl/odds/?season={year}&week={week}")
    time.sleep(10)

    # the page uses lazy loading
    # this loop helps all relevant elements load before scraping
    for _ in range(5):
        # find all loaded rows
        rows = driver.find_elements(by=By.CLASS_NAME, value=row_class)

        # scroll to the last loaded row, which should load subsequent rows if there are any
        if len(rows) > 0:
            driver.execute_script(f"window.scrollTo(0, {rows[-1].location['y']});")

        time.sleep(6)

    # Query once more: the final scroll may have loaded additional rows.
    rows = driver.find_elements(by=By.CLASS_NAME, value=row_class)

    if len(rows) == 0:
        print(f"Warning: No rows for {year} week {week}")
        problematic_rows.append({'year': year, 'week': week})

    data = []
    for row in rows:
        # Read the text once; each access is a round trip to the browser.
        text = row.text
        parsed = _parse_spread_row(text, year, week)
        if parsed is None:
            print(f"Warning: Skipping malformed row for {year} week {week}: {text!r}")
            continue
        data.append(parsed)

    return data

In [3]:
from itertools import product
from tqdm.autonotebook import tqdm

# One entry per scraped page: the list of row tuples returned by scrape_spreads().
d = []
season_week_pairs = list(product(range(2018, 2024), range(1, 18)))
for year, week in tqdm(season_week_pairs):
    # Week 17 is only scraped from the 2021 season onward.
    if week == 17 and year < 2021:
        continue
    # For the 2018 season, weeks 1-4 are skipped.
    if year == 2018 and week <= 4:
        continue
    d.append(scrape_spreads(year, week))

  0%|          | 0/102 [00:00<?, ?it/s]

In [4]:
# Show the (year, week) pages for which the main scrape found no rows.
problematic_rows

[]

In [5]:
# Retry the pages that returned no rows.
# scrape_spreads() appends to problematic_rows whenever a page is still empty,
# so looping over the live list would never finish for a page that keeps
# failing. Work from a snapshot and empty the list first; afterwards it holds
# only the pages that failed again.
retry_params = list(problematic_rows)
problematic_rows.clear()
for params in tqdm(retry_params):
    d.append(scrape_spreads(**params))

0it [00:00, ?it/s]

In [6]:
# Scraping is finished: shut down the Selenium browser session.
driver.quit()

In [7]:
# Inspect the raw scrape results: one list of row tuples per scraped page.
d

[]

In [8]:
import pandas as pd

# Column labels, in the same order as the tuples stored in `d`.
spread_columns = [
    'Season', 'Week', 'Away Team', 'Home Team',
    'Away Line', 'Away Odds', 'Home Line', 'Home Odds',
]

# Build one frame per scraped page, then stack them. Each frame keeps its own
# 0-based index, so index labels repeat across pages.
page_frames = []
for page_rows in d:
    non_empty_rows = [record for record in page_rows if len(record) > 0]
    page_frames.append(pd.DataFrame(non_empty_rows, columns=spread_columns))

df = pd.concat(page_frames)

df

ValueError: No objects to concatenate

In [9]:
# Pull the signed number out of the away line text (e.g. '+10.5' -> 10.5).
# Values without a numeric part become NaN.
away_line_number = df['Away Line'].str.extract(r'([+-]?\d*\.?\d+)', expand=False)
df['Spread'] = away_line_number.astype(float)

df

Unnamed: 0,Season,Week,Away Team,Home Team,Away Line,Away Odds,Home Line,Home Odds,Spread
0,2018,5,Indianapolis,New England,+10.5,(-110),-10.5,(-110),10.5
1,2018,5,Jacksonville,Kansas City,+3.0,(-115),-3.0,(-105),3.0
2,2018,5,Baltimore,Cleveland,-3.0,(-125),+3.0,(+105),-3.0
3,2018,5,Green Bay,Detroit,+1.5,(EVEN),-1.5,(-120),1.5
4,2018,5,Miami,Cincinnati,+6.5,(-120),-6.5,(EVEN),6.5
...,...,...,...,...,...,...,...,...,...
11,2023,17,San Francisco,Washington,-14.0,(-115),+14.0,(-105),-14.0
12,2023,17,Pittsburgh,Seattle,+4.5,(-110),-4.5,(-110),4.5
13,2023,17,Cincinnati,Kansas City,+7.0,(-105),-7.0,(-115),7.0
14,2023,17,Los Angeles,Denver,+3.5,(-110),-3.5,(-110),3.5


In [10]:
# Convert both odds columns from text such as '(-110)' or '(EVEN)' to integers.
for odds_column in ('Away Odds', 'Home Odds'):
    # '(EVEN)' has no digits, so map it to '100' before stripping parentheses.
    is_even = df[odds_column] == '(EVEN)'
    df.loc[is_even, odds_column] = '100'

    without_parens = df[odds_column].str.replace(r'\(|\)', '', regex=True)
    df[odds_column] = without_parens.astype(int)

df

Unnamed: 0,Season,Week,Away Team,Home Team,Away Line,Away Odds,Home Line,Home Odds,Spread
0,2018,5,Indianapolis,New England,+10.5,-110,-10.5,-110,10.5
1,2018,5,Jacksonville,Kansas City,+3.0,-115,-3.0,-105,3.0
2,2018,5,Baltimore,Cleveland,-3.0,-125,+3.0,105,-3.0
3,2018,5,Green Bay,Detroit,+1.5,100,-1.5,-120,1.5
4,2018,5,Miami,Cincinnati,+6.5,-120,-6.5,100,6.5
...,...,...,...,...,...,...,...,...,...
11,2023,17,San Francisco,Washington,-14.0,-115,+14.0,-105,-14.0
12,2023,17,Pittsburgh,Seattle,+4.5,-110,-4.5,-110,4.5
13,2023,17,Cincinnati,Kansas City,+7.0,-105,-7.0,-115,7.0
14,2023,17,Los Angeles,Denver,+3.5,-110,-3.5,-110,3.5


In [11]:
# The textual line columns are no longer needed once 'Spread' holds the
# numeric away line; remove them from df in place.
df.drop(['Away Line', 'Home Line'], axis='columns', inplace=True)

df

Unnamed: 0,Season,Week,Away Team,Home Team,Away Odds,Home Odds,Spread
0,2018,5,Indianapolis,New England,-110,-110,10.5
1,2018,5,Jacksonville,Kansas City,-115,-105,3.0
2,2018,5,Baltimore,Cleveland,-125,105,-3.0
3,2018,5,Green Bay,Detroit,100,-120,1.5
4,2018,5,Miami,Cincinnati,-120,100,6.5
...,...,...,...,...,...,...,...
11,2023,17,San Francisco,Washington,-115,-105,-14.0
12,2023,17,Pittsburgh,Seattle,-110,-110,4.5
13,2023,17,Cincinnati,Kansas City,-105,-115,7.0
14,2023,17,Los Angeles,Denver,-110,-110,3.5


In [12]:
# Move 'Spread' so that it sits immediately before 'Away Odds'.
# The target position is looked up before the column is removed, matching the
# order in which the original single-expression form evaluated its arguments.
spread_position = df.columns.get_loc('Away Odds')
spread_values = df.pop('Spread')
df.insert(spread_position, 'Spread', spread_values)

df

Unnamed: 0,Season,Week,Away Team,Home Team,Spread,Away Odds,Home Odds
0,2018,5,Indianapolis,New England,10.5,-110,-110
1,2018,5,Jacksonville,Kansas City,3.0,-115,-105
2,2018,5,Baltimore,Cleveland,-3.0,-125,105
3,2018,5,Green Bay,Detroit,1.5,100,-120
4,2018,5,Miami,Cincinnati,6.5,-120,100
...,...,...,...,...,...,...,...
11,2023,17,San Francisco,Washington,-14.0,-115,-105
12,2023,17,Pittsburgh,Seattle,4.5,-110,-110
13,2023,17,Cincinnati,Kansas City,7.0,-105,-115
14,2023,17,Los Angeles,Denver,3.5,-110,-110


In [13]:
from pathlib import Path

# Build the output path from components so the separator matches the host OS.
# A backslash-separated literal only resolves to nested directories on
# Windows; elsewhere it would name a single file in the working directory.
spreads_path = Path('..') / '..' / 'data' / 'betting_lines' / 'spreads.parquet'
df.to_parquet(spreads_path)

'Done'

'Done'