In [1]:
# Importing needed packages

import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Setting up the scraping function
def _find_results_table(html):
    """Parse ``html`` and return the results table tag, or None if the page has none."""
    soup = BeautifulSoup(html, 'html.parser')
    return soup.find('table', class_='resultsarchive-table')


def _visible_cell_texts(cells):
    """Return the stripped text of each cell, skipping cells with the 'limiter' class."""
    # Cells with class 'limiter' would otherwise create an extra column.
    return [cell.text.strip() for cell in cells if 'limiter' not in cell.get('class', [])]


def scrape_formula1(sub_url, export_name, start_year=1950, end_year=2023, timeout=30):
    """Scrape one results table per year from formula1.com and export it as CSV.

    The column headers are read from the ``start_year`` page. Every data row is
    prefixed with the year it was scraped from, because the table itself does
    not carry it. The combined rows are written to ``{export_name}.csv``.

    A year whose page cannot be fetched, or that has no results table, is
    reported with ``print`` and skipped so the remaining years are still
    collected. If the header page itself cannot be used, a message is printed
    and no file is written.

    Parameters
    ----------
    sub_url : str
        Page name appended after the year, e.g. ``'drivers.html'``.
    export_name : str
        Output file name without the ``.csv`` extension.
    start_year, end_year : int, optional
        Inclusive range of years to scrape (defaults: 1950 to 2023).
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``start_year`` is greater than ``end_year``.
    """
    if start_year > end_year:
        raise ValueError(
            f'start_year ({start_year}) must not be greater than end_year ({end_year})'
        )

    base_url = 'https://www.formula1.com/en/results.html/'
    years = range(start_year, end_year + 1)

    # One session for all requests; the ``with`` block closes it when done.
    with requests.Session() as session:
        # Extracting headers from the first year's page
        header_url = f'{base_url}{start_year}/{sub_url}'
        try:
            response = session.get(header_url, timeout=timeout)
        except requests.RequestException as exc:
            print(f'Failed to retrieve the webpage. Error: {exc}')
            return

        if response.status_code != 200:
            print(f'Failed to retrieve the webpage. Status code: {response.status_code}')
            return

        table = _find_results_table(response.text)
        if table is None:
            # find() returns None when nothing matches; stop here instead of
            # crashing on table.find_all below.
            print(f'No results table found at {header_url}.')
            return

        # 'Year' goes first because each scraped row is prefixed with its year
        headers = ['Year'] + _visible_cell_texts(table.find_all('th'))

        # To store the rows as they are scraped
        all_rows = []

        # Looping through the years
        for year in years:
            print(f"Extracting for {year}")
            url = f'{base_url}{year}/{sub_url}'

            try:
                response = session.get(url, timeout=timeout)
            except requests.RequestException as exc:
                # Keep the years already collected instead of aborting the run.
                print(f'Failed to retrieve the webpage for year {year}. Error: {exc}')
                continue

            if response.status_code != 200:
                print(f'Failed to retrieve the webpage for year {year}. Status code: {response.status_code}')
                continue

            table = _find_results_table(response.text)
            if table is None:
                print(f'No results table found for year {year}.')
                continue

            skipped = 0
            for tr in table.find_all('tr'):
                cells = tr.find_all('td')
                if not cells:
                    continue  # rows without <td> cells carry no data

                # Start the row with the year because the original table doesn't have it
                row = [year] + _visible_cell_texts(cells)

                # Only keep rows that line up with the headers
                if len(row) == len(headers):
                    all_rows.append(row)
                else:
                    skipped += 1

            if skipped:
                print(f'Skipped {skipped} row(s) for year {year}: column count does not match the headers.')

    # Create a DataFrame from the headers and all rows
    final_export = pd.DataFrame(all_rows, columns=headers)

    # Export the DataFrame to a CSV file
    final_export.to_csv(f'{export_name}.csv', index=False)


In [None]:
# Driver standings: output CSV base name and the page to scrape
drivers_export = 'drivers_1950_2023'
drivers_sub_url = 'drivers.html'
scrape_formula1(sub_url=drivers_sub_url, export_name=drivers_export)


In [None]:
# Race results: output CSV base name and the page to scrape
results_export = 'races_1950_2023'
results_sub_url = 'races.html'
scrape_formula1(sub_url=results_sub_url, export_name=results_export)