## Gathering Race Stats
Overall race statistics for each Indianapolis 500, scraped from [Racing Reference](https://www.racing-reference.info/indy500.htm).

In [1]:
import pandas as pd
import numpy as np
import requests
import time
from bs4 import BeautifulSoup

In [2]:
# Download the Indy 500 index page; fail loudly on a stalled connection or an
# HTTP error instead of silently parsing an error page into an empty list.
res = requests.get('https://www.racing-reference.info/indy500.htm', timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.text, 'lxml')
links = soup.find_all('a')

# `years` collects the href (a site-relative race path, not the year number)
# of every anchor whose visible text parses as a non-zero integer.
years = []
for link in links:
    try:
        is_year = bool(int(link.text))
    except ValueError:
        # Link text is not an integer, so this is not a per-year race link.
        continue
    href = link.attrs.get('href')
    # Anchors without an href are skipped, as before.
    if is_year and href is not None:
        years.append(href)

In [3]:
def get_stats(year):
    """Scrape the summary statistics shown on a single race page.

    Parameters
    ----------
    year : str
        Site-relative path of the race page (for example
        ``'/race/2019-06/O'``); it is appended to
        ``https://www.racing-reference.info``.

    Returns
    -------
    dict
        Maps each ``label: value`` line found in the page's fifth ``<table>``
        to its raw text value, plus a ``'date'`` key holding the first ten
        characters of the page ``<title>``.

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status.
    IndexError
        If the page has fewer than five tables or the stats table has fewer
        than four rows.
    """
    url = f'https://www.racing-reference.info{year}'
    # A timeout keeps one stalled connection from hanging the whole scrape.
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'lxml')
    date = soup.find('title').text[0:10]

    stats = soup.find_all('table')[4].text
    # Split the table text once; the first and last pieces are discarded.
    rows = stats.split('\n')[1:-1]
    # maxsplit=1 keeps values that themselves contain ': ' in one piece.
    stat_pairs = [row.split(': ', 1) for row in rows]
    # NOTE(review): the fourth row is dropped unconditionally - presumably it
    # is not a "label: value" line on these pages; confirm against the site.
    stat_pairs.pop(3)

    stat_dict = dict(stat_pairs)
    stat_dict['date'] = date

    return stat_dict

In [4]:
# Try the scraper on a single race page and display the resulting dict.
get_stats('/race/2019-06/O')

{'Time of race': '2:50:39',
 'Average speed': '175.794 mph',
 'Pole speed': '229.992 mph',
 'Cautions': '4 for 29 laps',
 'Margin of victory': '.209 sec',
 'Attendance': 'n/a',
 'Lead changes': '29',
 'date': '05/26/2019'}

In [5]:
# Seconds to wait between consecutive page requests (presumably to avoid
# hammering the site - confirm the required crawl delay).
REQUEST_DELAY_SECONDS = 11

# One stats dict per race, in the same order as `years`.
all_races = []
for position, year in enumerate(years):
    if position:
        # Pause only between requests; there is nothing to wait for after
        # the final page has been fetched.
        time.sleep(REQUEST_DELAY_SECONDS)
    all_races.append(get_stats(year))

In [6]:
# One row per race; a label missing from a race's dict becomes NaN in that row.
stats = pd.DataFrame(data=all_races)

In [7]:
# Preview the first rows of the assembled table.
stats.head()

Unnamed: 0,Time of race,Average speed,Pole speed,Cautions,Margin of victory,Attendance,Lead changes,date,Pole time
0,2:50:39,175.794 mph,229.992 mph,4 for 29 laps,.209 sec,,29,05/26/2019,
1,2:59:43,166.935 mph,229.618 mph,7 for 41 laps,3.159 sec,,30,05/27/2018,
2,3:13:03,155.395 mph,232.164 mph,10 for 50 laps,.201 sec,,35,05/28/2017,
3,3:00:02,166.634 mph,230.76 mph,6 for 46 laps,4.498 sec,,54,05/29/2016,
4,3:05:57,161.341 mph,226.76 mph,6 for 47 laps,.105 sec,,37,05/24/2015,


In [8]:
# (rows, columns) of the assembled table.
stats.shape

(103, 9)

In [9]:
# Write the table to disk without the integer row index.
stats.to_csv('./race_stats.csv', index=False)