In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
# options.add_argument('--headless')

driver = webdriver.Chrome(options=options)

In [2]:
import time
from selenium.webdriver.common.by import By

TEAMS = {
    'Arizona',
    'Atlanta',
    'Baltimore',
    'Buffalo',
    'Carolina',
    'Chicago',
    'Cincinnati',
    'Cleveland',
    'Dallas',
    'Denver',
    'Detroit',
    'Green Bay',
    'Houston',
    'Indianapolis',
    'Jacksonville',
    'Kansas City',
    'Las Vegas',
    'Los Angeles',
    'Miami',
    'Minnesota',
    'New England',
    'New Orleans',
    'New York',
    'Philadelphia',
    'Pittsburgh',
    'San Francisco',
    'Seattle',
    'Tampa Bay',
    'Tennessee',
    'Washington',
}

problematic_rows = []
def scrape_totals(year, week):
    driver.get(url=f"https://www.bettingpros.com/nfl/odds/total/?season={year}&week={week}")
    time.sleep(10)

    # the page uses lazy loading
    # this loop helps all relevant elements load before scraping
    for _ in range(5):
        # find all loaded rows
        rows = driver.find_elements(
            by=By.CLASS_NAME,
            value="grouped-items-with-sticky-footer__content",
        )

        # scroll to the last loaded row, which should load subsequent rows if there are any
        if len(rows) > 0:
            driver.execute_script(f"window.scrollTo(0, {rows[-1].location['y']});")

        time.sleep(6)

    rows = driver.find_elements(
        by=By.CLASS_NAME,
        value="grouped-items-with-sticky-footer__content",
    )

    if len(rows) == 0:
        print(f"Warning: No rows for {year} week {week}")
        problematic_rows.append({'year': year, 'week': week})
    
    data = []
    for row in rows:
        t = row.text.split('\n')
        home_team = t[3]
        if home_team not in TEAMS:
            home_team = t[2]
        data.append((
            year,
            week,
            t[0],  # away team
            home_team,  # home team
            t[-4],  # over line
            t[-3],  # over odds
            t[-2],  # under line
            t[-1],  # under odds
        ))

    return data

In [3]:
from itertools import product
from tqdm.autonotebook import tqdm

d = []
for year, week in tqdm(list(product([year for year in range(2018, 2024)], [week for week in range(1, 18)]))):
    if ((week != 17) or (year >= 2021)) and ((year != 2018) or (week > 4)):
        d.append(scrape_totals(year, week))

  from tqdm.autonotebook import tqdm


  0%|          | 0/102 [00:00<?, ?it/s]

In [4]:
problematic_rows

[]

In [5]:
for params in tqdm(problematic_rows):
    d.append(scrape_totals(**params))

0it [00:00, ?it/s]

In [6]:
driver.quit()

In [7]:
d

[[(2018,
   5,
   'Indianapolis',
   'New England',
   'O 49.5',
   '(-110)',
   'U 49.5',
   '(-110)'),
  (2018,
   5,
   'Jacksonville',
   'Kansas City',
   'O 48.0',
   '(-110)',
   'U 48.0',
   '(-110)'),
  (2018, 5, 'Baltimore', 'Cleveland', 'O 44.5', '(-110)', 'U 44.5', '(-110)'),
  (2018, 5, 'Green Bay', 'Detroit', 'O 50.0', '(-110)', 'U 50.0', '(-110)'),
  (2018, 5, 'Miami', 'Cincinnati', 'O 47.5', '(-110)', 'U 47.5', '(-110)'),
  (2018, 5, 'Tennessee', 'Buffalo', 'O 39.0', '(-110)', 'U 39.0', '(-110)'),
  (2018, 5, 'New York', 'Carolina', 'O 43.5', '(-110)', 'U 43.5', '(-110)'),
  (2018, 5, 'Denver', 'New York', 'O 42.5', '(-105)', 'U 42.5', '(-115)'),
  (2018, 5, 'Atlanta', 'Pittsburgh', 'O 57.0', '(-110)', 'U 57.0', '(-110)'),
  (2018,
   5,
   'Las Vegas',
   'Los Angeles',
   'O 52.0',
   '(EVEN)',
   'U 52.0',
   '(-120)'),
  (2018,
   5,
   'Arizona',
   'San Francisco',
   'O 40.5',
   '(-110)',
   'U 40.5',
   '(-110)'),
  (2018, 5, 'Los Angeles', 'Seattle', 'O 50.5',

In [8]:
import pandas as pd

df = pd.concat(
    [
        pd.DataFrame(
            [i for i in j if len(i) > 0],
            columns=['Season', 'Week', 'Away Team', 'Home Team', 'Over Line', 'Over Odds', 'Under Line', 'Under Odds'],
        )
        for j in d
    ],
)

df

Unnamed: 0,Season,Week,Away Team,Home Team,Over Line,Over Odds,Under Line,Under Odds
0,2018,5,Indianapolis,New England,O 49.5,(-110),U 49.5,(-110)
1,2018,5,Jacksonville,Kansas City,O 48.0,(-110),U 48.0,(-110)
2,2018,5,Baltimore,Cleveland,O 44.5,(-110),U 44.5,(-110)
3,2018,5,Green Bay,Detroit,O 50.0,(-110),U 50.0,(-110)
4,2018,5,Miami,Cincinnati,O 47.5,(-110),U 47.5,(-110)
...,...,...,...,...,...,...,...,...
11,2023,17,San Francisco,Washington,O 48.5,(-110),U 48.5,(-110)
12,2023,17,Pittsburgh,Seattle,O 40.5,(-112),U 40.5,(-108)
13,2023,17,Cincinnati,Kansas City,O 46.0,(-110),U 46.0,(-110)
14,2023,17,Los Angeles,Denver,O 39.5,(-110),U 39.5,(-110)


In [9]:
df['Total'] = df['Over Line'].str.extract(r'(\d+\.\d+)').astype(float)

df

Unnamed: 0,Season,Week,Away Team,Home Team,Over Line,Over Odds,Under Line,Under Odds,Total
0,2018,5,Indianapolis,New England,O 49.5,(-110),U 49.5,(-110),49.5
1,2018,5,Jacksonville,Kansas City,O 48.0,(-110),U 48.0,(-110),48.0
2,2018,5,Baltimore,Cleveland,O 44.5,(-110),U 44.5,(-110),44.5
3,2018,5,Green Bay,Detroit,O 50.0,(-110),U 50.0,(-110),50.0
4,2018,5,Miami,Cincinnati,O 47.5,(-110),U 47.5,(-110),47.5
...,...,...,...,...,...,...,...,...,...
11,2023,17,San Francisco,Washington,O 48.5,(-110),U 48.5,(-110),48.5
12,2023,17,Pittsburgh,Seattle,O 40.5,(-112),U 40.5,(-108),40.5
13,2023,17,Cincinnati,Kansas City,O 46.0,(-110),U 46.0,(-110),46.0
14,2023,17,Los Angeles,Denver,O 39.5,(-110),U 39.5,(-110),39.5


In [10]:
df.loc[df['Over Odds'] == '(EVEN)', 'Over Odds'] = '100'
df['Over Odds'] = df['Over Odds'].str.replace(r'\(|\)', '', regex=True).astype(int)

df.loc[df['Under Odds'] == '(EVEN)', 'Under Odds'] = '100'
df['Under Odds'] = df['Under Odds'].str.replace(r'\(|\)', '', regex=True).astype(int)

df

Unnamed: 0,Season,Week,Away Team,Home Team,Over Line,Over Odds,Under Line,Under Odds,Total
0,2018,5,Indianapolis,New England,O 49.5,-110,U 49.5,-110,49.5
1,2018,5,Jacksonville,Kansas City,O 48.0,-110,U 48.0,-110,48.0
2,2018,5,Baltimore,Cleveland,O 44.5,-110,U 44.5,-110,44.5
3,2018,5,Green Bay,Detroit,O 50.0,-110,U 50.0,-110,50.0
4,2018,5,Miami,Cincinnati,O 47.5,-110,U 47.5,-110,47.5
...,...,...,...,...,...,...,...,...,...
11,2023,17,San Francisco,Washington,O 48.5,-110,U 48.5,-110,48.5
12,2023,17,Pittsburgh,Seattle,O 40.5,-112,U 40.5,-108,40.5
13,2023,17,Cincinnati,Kansas City,O 46.0,-110,U 46.0,-110,46.0
14,2023,17,Los Angeles,Denver,O 39.5,-110,U 39.5,-110,39.5


In [11]:
df.drop(columns=['Over Line', 'Under Line'], inplace=True)

df

Unnamed: 0,Season,Week,Away Team,Home Team,Over Odds,Under Odds,Total
0,2018,5,Indianapolis,New England,-110,-110,49.5
1,2018,5,Jacksonville,Kansas City,-110,-110,48.0
2,2018,5,Baltimore,Cleveland,-110,-110,44.5
3,2018,5,Green Bay,Detroit,-110,-110,50.0
4,2018,5,Miami,Cincinnati,-110,-110,47.5
...,...,...,...,...,...,...,...
11,2023,17,San Francisco,Washington,-110,-110,48.5
12,2023,17,Pittsburgh,Seattle,-112,-108,40.5
13,2023,17,Cincinnati,Kansas City,-110,-110,46.0
14,2023,17,Los Angeles,Denver,-110,-110,39.5


In [12]:
df.insert(df.columns.get_loc('Over Odds'), 'Total', df.pop('Total'))

df

Unnamed: 0,Season,Week,Away Team,Home Team,Total,Over Odds,Under Odds
0,2018,5,Indianapolis,New England,49.5,-110,-110
1,2018,5,Jacksonville,Kansas City,48.0,-110,-110
2,2018,5,Baltimore,Cleveland,44.5,-110,-110
3,2018,5,Green Bay,Detroit,50.0,-110,-110
4,2018,5,Miami,Cincinnati,47.5,-110,-110
...,...,...,...,...,...,...,...
11,2023,17,San Francisco,Washington,48.5,-110,-110
12,2023,17,Pittsburgh,Seattle,40.5,-112,-108
13,2023,17,Cincinnati,Kansas City,46.0,-110,-110
14,2023,17,Los Angeles,Denver,39.5,-110,-110


In [13]:
df.to_parquet(r'..\..\data\betting_lines\totals.parquet')

'Done'

'Done'