In [1]:
import io
import pickle
import re
import string

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
BASE_URL = 'http://racing-reference.info'
start_year = 2015
years = range(start_year, 2021)  # end is exclusive: seasons start_year..2020

# requests never times out on its own; bound every call so a stalled
# connection raises instead of hanging the notebook forever.
REQUEST_TIMEOUT = 30  # seconds

# One season-index page per year (the '/W' suffix is part of the site's URL scheme).
cup_results = [
    requests.get(BASE_URL + f'/raceyear/{year}/W', timeout=REQUEST_TIMEOUT)
    for year in years
]

# Cell output: the distinct HTTP status codes returned, as a quick success check.
{r.status_code for r in cup_results}

{200}

In [3]:
href_regex = re.compile('/race/.*/W')

# Parse every season page and gather, in page order, each element whose
# href matches the race-results URL pattern.
race_anchors = [
    anchor
    for season_page in cup_results
    for anchor in BeautifulSoup(season_page.text, 'lxml').find_all(href=href_regex)
]

# Cell output: the last five anchors collected.
race_anchors[-5:]

[<a aria-label="click here to learn more about Bass Pro Shops / NRA Night Race results" href="https://www.racing-reference.info/race/2020_Bass_Pro_Shops_NRA_Night_Race/W" title="Bass Pro Shops / NRA Night Race">29</a>,
 <a aria-label="click here to learn more about South Point 400 results" href="https://www.racing-reference.info/race/2020_South_Point_400/W" title="South Point 400">30</a>,
 <a aria-label="click here to learn more about YellaWood 500 results" href="https://www.racing-reference.info/race/2020_YellaWood_500/W" title="YellaWood 500">31</a>,
 <a aria-label="click here to learn more about Bank of America Roval 400 results" href="https://www.racing-reference.info/race/2020_Bank_of_America_Roval_400/W" title="Bank of America Roval 400">32</a>,
 <a aria-label="click here to learn more about Hollywood Casino 400 results" href="https://www.racing-reference.info/race/2020_Hollywood_Casino_400/W" title="Hollywood Casino 400">33</a>]

In [4]:
# Fetch every race page found above. requests has no default timeout, so each
# call is bounded to keep one stalled connection from hanging the whole batch.
RACE_REQUEST_TIMEOUT = 30  # seconds

races = [
    requests.get(a.attrs['href'], timeout=RACE_REQUEST_TIMEOUT)
    for a in race_anchors
]

# Cell output: the distinct HTTP status codes returned, as a quick success check.
{r.status_code for r in races}

{200}

In [5]:
def track_type(track_length, track_type):
    """Classify a track into a coarse category.

    Args:
        track_length: Track length; anything accepted by ``float()``
            (a number or a numeric string).
        track_type: Track description string. Only the exact value
            ``'road course'`` is treated specially.

    Returns:
        ``'road course'`` when ``track_type`` is exactly that string;
        otherwise ``'superspeedway'`` for lengths >= 2.0, ``'intermediate'``
        for lengths >= 1.0, and ``'short track'`` for anything shorter.

    Raises:
        Whatever ``float(track_length)`` raises for an unconvertible value
        (the conversion happens before the road-course check).
    """
    length = float(track_length)

    if track_type == 'road course':
        return 'road course'
    if length >= 2.0:
        return 'superspeedway'
    return 'intermediate' if length >= 1.0 else 'short track'

In [6]:
# Race summary sentence. Groups: (1) lap count, (2) track length in miles,
# (3) track description, (4) race length in miles.
r_details = re.compile(r'(\d+) laps\*? on a (\d?\.\d{3}) mile (.*) \((\d+\.\d+) miles\)')
# Race slug from the URL: four-digit year, underscore, race name.
r_race_id = re.compile(r'(\d{4})_(.*)')
# Any href pointing into the site's /tracks/ section.
r_track_name = re.compile('/tracks/.*')

race_data_frames = []

for r in races:
    # Passing literal HTML to read_html is deprecated in recent pandas; wrap it
    # in a file-like object. The last table matching 'Sponsor / Owner' is used.
    df = pd.read_html(io.StringIO(r.text), match='Sponsor / Owner', header=0)[-1]

    details_match = r_details.search(r.text)
    if details_match is None:
        # Fail with the offending page instead of an opaque
        # "'NoneType' object is not subscriptable".
        raise ValueError(f'No race details (laps / track length) found on {r.url}')
    df['race_length_laps'] = int(details_match[1])
    df['track_length_miles'] = float(details_match[2])
    df['track_type'] = details_match[3]
    df['track_type_detail'] = track_type(details_match[2], details_match[3])
    df['race_length_miles'] = float(details_match[4])

    # The race slug is the second-to-last path segment of the final URL.
    race_id = r.url.split('/')[-2]
    race_id_match = r_race_id.search(race_id)
    if race_id_match is None:
        raise ValueError(f'Cannot parse year and race name from URL {r.url}')
    df['year'] = int(race_id_match[1])
    df['race_name'] = race_id_match[2]

    # Text of the first link into /tracks/ is taken as the track name.
    track_anchor = BeautifulSoup(r.text, 'lxml').find(href=r_track_name)
    if track_anchor is None:
        raise ValueError(f'No track link found on {r.url}')
    df['track_name'] = track_anchor.text

    race_data_frames.append(df)

In [7]:
# Sanity check: show the first rows of the last race parsed, including the
# columns added by the parsing loop.
latest_race = race_data_frames[-1]
latest_race.head()

Unnamed: 0,Fin,St,#,Driver,Sponsor / Owner,Car,Laps,Status,Led,Pts,PPts,race_length_laps,track_length_miles,track_type,track_type_detail,race_length_miles,year,race_name,track_name
0,1,2,22,Joey Logano,Shell / Pennzoil (Roger Penske),Ford,267,running,47,42,5,267,1.5,paved track,intermediate,400.5,2020,Hollywood_Casino_400,Kansas Speedway
1,2,4,4,Kevin Harvick,Jimmy John's (Stewart Haas Racing),Ford,267,running,85,48,0,267,1.5,paved track,intermediate,400.5,2020,Hollywood_Casino_400,Kansas Speedway
2,3,6,88,Alex Bowman,ChevyGoods.com / Truck Hero (Rick Hendrick),Chevrolet,267,running,0,46,0,267,1.5,paved track,intermediate,400.5,2020,Hollywood_Casino_400,Kansas Speedway
3,4,8,2,Brad Keselowski,Discount Tire (Roger Penske),Ford,267,running,6,47,0,267,1.5,paved track,intermediate,400.5,2020,Hollywood_Casino_400,Kansas Speedway
4,5,20,18,Kyle Busch,M&M's Halloween Treat Town (Joe Gibbs),Toyota,267,running,4,33,0,267,1.5,paved track,intermediate,400.5,2020,Hollywood_Casino_400,Kansas Speedway


In [None]:
# Persist the list of per-race DataFrames so later work can reload it
# without scraping again.
with open("all_races.pkl", mode='wb') as pickle_file:
    pickle.dump(race_data_frames, pickle_file)