### Pull the Circuit Data

In [1]:
import requests
import pandas as pd

# Seasons to download from the Ergast API
years = [2023, 2024]

# Accumulates one flat record per (season, circuit) pair
all_circuits = []

# NOTE(review): assumes the Ergast endpoint is still being served — confirm before re-running
for year in years:
    url = f"http://ergast.com/api/f1/{year}/circuits.json"
    try:
        # The timeout stops the cell from hanging indefinitely on a stalled connection
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # A network failure for one season should not discard the other season
        print(f"Failed to fetch data for year {year}. Error: {exc}")
        continue

    if response.status_code == 200:
        data = response.json()
        circuits = data.get("MRData", {}).get("CircuitTable", {}).get("Circuits", [])
        for circuit in circuits:
            # Ergast nests the coordinates under a capitalised "Location" key;
            # a lookup of "location" always misses and leaves these four columns empty.
            location = circuit.get("Location", {})
            circuit_info = {
                "Year": year,
                "Circuit ID": circuit.get("circuitId"),
                "Circuit Name": circuit.get("circuitName"),
                "URL": circuit.get("url"),
                "Latitude": location.get("lat"),
                "Longitude": location.get("long"),
                "Locality": location.get("locality"),
                "Country": location.get("country"),
            }
            all_circuits.append(circuit_info)
    else:
        print(f"Failed to fetch data for year {year}. HTTP Status Code: {response.status_code}")

# Build the DataFrame once from the collected records
circuit_df = pd.DataFrame(all_circuits)

# Save the DataFrame to a CSV file in the specified folder
file_path = r"C:\Users\Saffat\Downloads\Circuits_2023_2024.csv"
circuit_df.to_csv(file_path, index=False)

print(f"Data saved to {file_path}")


Data saved to C:\Users\Saffat\Downloads\Circuits_2023_2024.csv


### Pull the Driver Standings Data

In [2]:
# Seasons to download from the Ergast API
years = [2023, 2024]

# Accumulates one flat record per (season, driver) pair
all_standings = []

for year in years:
    url = f"http://ergast.com/api/f1/{year}/driverStandings.json"
    try:
        # The timeout stops the cell from hanging indefinitely on a stalled connection
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # A network failure for one season should not discard the other season
        print(f"Failed to fetch data for year {year}. Error: {exc}")
        continue

    if response.status_code == 200:
        data = response.json()
        standings_lists = data.get("MRData", {}).get("StandingsTable", {}).get("StandingsLists", [])
        for standings in standings_lists:
            season = standings.get("season")
            round_number = standings.get("round")
            driver_standings = standings.get("DriverStandings", [])
            for driver in driver_standings:
                driver_info = driver.get("Driver", {})
                # `or [{}]` also covers a present-but-empty list, which would
                # otherwise raise IndexError on the [0] lookup.
                # Only the first listed constructor is kept.
                constructors = driver.get("Constructors") or [{}]
                constructor_info = constructors[0]

                standings_data = {
                    "Year": season,
                    "Round": round_number,
                    "Position": driver.get("position"),
                    "Position Text": driver.get("positionText"),
                    "Points": driver.get("points"),
                    "Wins": driver.get("wins"),
                    "Driver ID": driver_info.get("driverId"),
                    "Driver Name": f"{driver_info.get('givenName')} {driver_info.get('familyName')}",
                    "Driver Nationality": driver_info.get("nationality"),
                    "Constructor Name": constructor_info.get("name"),
                    "Constructor Nationality": constructor_info.get("nationality"),
                }
                all_standings.append(standings_data)
    else:
        print(f"Failed to fetch data for year {year}. HTTP Status Code: {response.status_code}")

# Build the DataFrame once from the collected records
driver_standings_df = pd.DataFrame(all_standings)

# Save the DataFrame to a CSV file in the specified folder.
# In a raw string a backslash is literal, so separators are written singly.
file_path = r"C:\Users\Saffat\Downloads\Driver_Standings_2023_2024.csv"
driver_standings_df.to_csv(file_path, index=False)

print(f"Data saved to {file_path}")

Data saved to C:\\Users\\Saffat\\Downloads\\Driver_Standings_2023_2024.csv


### Pull the Qualifying Results

In [3]:
import requests
import pandas as pd

# Seasons and round numbers to request; a round a season did not have
# contributes no rows because its "Races" list is iterated as empty.
years = [2023, 2024]
rounds = range(1, 25)

# Accumulates one flat record per (season, round, driver) qualifying entry
all_qualifying_results = []

for year in years:
    for rnd in rounds:
        url = f"http://ergast.com/api/f1/{year}/{rnd}/qualifying.json"
        try:
            # The timeout stops one stalled request from blocking the remaining rounds
            response = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            # Skip this round only; the rounds already collected are kept
            print(f"Failed to fetch data for year {year}, round {rnd}. Error: {exc}")
            continue

        if response.status_code == 200:
            data = response.json()
            races = data.get("MRData", {}).get("RaceTable", {}).get("Races", [])
            for race in races:
                race_name = race.get("raceName")
                circuit_name = race.get("Circuit", {}).get("circuitName")
                race_date = race.get("date")
                qualifying_results = race.get("QualifyingResults", [])
                for result in qualifying_results:
                    driver_info = result.get("Driver", {})
                    constructor_info = result.get("Constructor", {})

                    # Q2/Q3 are read with .get, so a missing session time becomes None
                    qualifying_data = {
                        "Year": year,
                        "Round": rnd,
                        "Race Name": race_name,
                        "Circuit Name": circuit_name,
                        "Date": race_date,
                        "Position": result.get("position"),
                        "Driver Name": f"{driver_info.get('givenName')} {driver_info.get('familyName')}",
                        "Driver Nationality": driver_info.get("nationality"),
                        "Constructor Name": constructor_info.get("name"),
                        "Constructor Nationality": constructor_info.get("nationality"),
                        "Q1 Time": result.get("Q1"),
                        "Q2 Time": result.get("Q2"),
                        "Q3 Time": result.get("Q3"),
                    }
                    all_qualifying_results.append(qualifying_data)
        else:
            print(f"Failed to fetch data for year {year}, round {rnd}. HTTP Status Code: {response.status_code}")

# Build the DataFrame once from the collected records
qualifying_results_df = pd.DataFrame(all_qualifying_results)

# Save the DataFrame to a CSV file in the specified folder.
# In a raw string a backslash is literal, so separators are written singly.
file_path = r"C:\Users\Saffat\Downloads\Qualifying_Results_2023_2024.csv"
qualifying_results_df.to_csv(file_path, index=False)

print(f"Data saved to {file_path}")

Data saved to C:\\Users\\Saffat\\Downloads\\Qualifying_Results_2023_2024.csv


### Race Results

In [4]:
import requests
import pandas as pd

# Seasons and round numbers to request.
# The upper bound matches the qualifying download (rounds 1-24); stopping at 22
# would never request rounds 23 and 24. A round a season did not have
# contributes no rows because its "Races" list is iterated as empty.
years = [2023, 2024]
rounds = range(1, 25)

# Accumulates one flat record per (season, round, driver) race result
all_race_results = []

for year in years:
    for rnd in rounds:
        url = f"http://ergast.com/api/f1/{year}/{rnd}/results.json"
        try:
            # The timeout stops one stalled request from blocking the remaining rounds
            response = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            # Skip this round only; the rounds already collected are kept
            print(f"Failed to fetch data for year {year}, round {rnd}. Error: {exc}")
            continue

        if response.status_code == 200:
            data = response.json()
            races = data.get("MRData", {}).get("RaceTable", {}).get("Races", [])
            for race in races:
                race_name = race.get("raceName")
                circuit_name = race.get("Circuit", {}).get("circuitName")
                race_date = race.get("date")
                race_results = race.get("Results", [])
                for result in race_results:
                    driver_info = result.get("Driver", {})
                    constructor_info = result.get("Constructor", {})
                    # Missing "FastestLap" falls back to {} so the lookups below yield None
                    fastest_lap = result.get("FastestLap", {})

                    race_data = {
                        "Year": year,
                        "Round": rnd,
                        "Race Name": race_name,
                        "Circuit Name": circuit_name,
                        "Date": race_date,
                        "Position": result.get("position"),
                        "Position Text": result.get("positionText"),
                        "Points": result.get("points"),
                        "Driver Name": f"{driver_info.get('givenName')} {driver_info.get('familyName')}",
                        "Driver Nationality": driver_info.get("nationality"),
                        "Constructor Name": constructor_info.get("name"),
                        "Constructor Nationality": constructor_info.get("nationality"),
                        "Grid Position": result.get("grid"),
                        "Laps Completed": result.get("laps"),
                        "Status": result.get("status"),
                        "Race Time": result.get("Time", {}).get("time"),
                        "Fastest Lap Rank": fastest_lap.get("rank"),
                        "Fastest Lap Time": fastest_lap.get("Time", {}).get("time"),
                        "Fastest Lap Average Speed": fastest_lap.get("AverageSpeed", {}).get("speed"),
                    }
                    all_race_results.append(race_data)
        else:
            print(f"Failed to fetch data for year {year}, round {rnd}. HTTP Status Code: {response.status_code}")

# Build the DataFrame once from the collected records
race_results_df = pd.DataFrame(all_race_results)

# Save the DataFrame to a CSV file in the specified folder.
# In a raw string a backslash is literal, so separators are written singly.
file_path = r"C:\Users\Saffat\Downloads\Race_Results_2023_2024.csv"
race_results_df.to_csv(file_path, index=False)

print(f"Data saved to {file_path}")

Data saved to C:\\Users\\Saffat\\Downloads\\Race_Results_2023_2024.csv


### Track Narrative Text and Image Metadata (2024)

In [13]:

import requests
from bs4 import BeautifulSoup
import os
import json

def get_unstructured_data(base_url, output_dir):
    """Collect narrative text and image metadata for every round of one season.

    The season page at ``base_url`` is scanned for links to individual round
    pages. Each round page is fetched, all of its ``<p>`` text is joined into
    one string, and ``<img>`` tags whose ``alt`` text contains "carbon" are
    recorded. The result is written as a JSON list to
    ``f1_<season>_unstructured_data.json`` inside ``output_dir``.

    Args:
        base_url: URL of the season page, e.g.
            ``https://www.formula1.com/en/racing/2024``. Its last path segment
            is used as the season label for both the link filter and the
            output file name.
        output_dir: Directory that receives the JSON file; created if missing.

    Returns:
        None. If the season page cannot be fetched, a message is printed and
        no file is written. Round pages that fail are skipped with a message.
    """
    # Take the season from the URL instead of a literal so the link filter and
    # the file name always match the page that is actually being crawled.
    season = base_url.rstrip('/').rsplit('/', 1)[-1]
    site_root = base_url.split('/en/racing/')[0]

    try:
        # The timeout stops the crawl from hanging on a stalled connection
        response = requests.get(base_url, timeout=30)
    except requests.RequestException as e:
        print(f"Failed to fetch the main page: {base_url} ({e})")
        return
    if response.status_code != 200:
        print(f"Failed to fetch the main page: {base_url}")
        return

    # Parse the main page
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find links to all rounds, skipping the per-round circuit sub-pages
    round_marker = f'/racing/{season}/'
    unique_links = set()
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if round_marker in href and '/circuit' not in href:
            # Relative hrefs need the site root; absolute ones must not get it twice
            if href.startswith(('http://', 'https://')):
                unique_links.add(href)
            else:
                unique_links.add(site_root + href)
    # Sorted so repeated runs visit rounds and write entries in the same order
    round_links = sorted(unique_links)

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Prepare JSON file to store unstructured data
    json_file = os.path.join(output_dir, f"f1_{season}_unstructured_data.json")
    unstructured_data = []

    for round_link in round_links:
        print(f"Processing {round_link}")

        # Fetch round page; a failure here only skips this round
        try:
            round_response = requests.get(round_link, timeout=30)
        except requests.RequestException as e:
            print(f"Failed to fetch round page: {round_link} ({e})")
            continue
        if round_response.status_code != 200:
            print(f"Failed to fetch round page: {round_link}")
            continue

        round_soup = BeautifulSoup(round_response.content, 'html.parser')

        try:
            # Join every non-empty paragraph into one narrative string
            paragraphs = [p.text.strip() for p in round_soup.find_all('p')]
            narrative_text = " ".join(text for text in paragraphs if text)

            # Keep only images that have a src and whose alt text mentions 'carbon'
            image_metadata = [
                {"src": img['src'], "alt": img.get('alt', 'No description')}
                for img in round_soup.find_all('img')
                if 'src' in img.attrs and 'carbon' in img.get('alt', '').lower()
            ]

            # Store data
            unstructured_data.append({
                "round_link": round_link,
                "narrative_text": narrative_text,
                "image_metadata": image_metadata
            })

        except Exception as e:
            # Best effort: report the page that failed and continue with the rest
            print(f"Error processing {round_link}: {e}")

    # Save unstructured data to JSON file
    with open(json_file, 'w', encoding='utf-8') as file:
        json.dump(unstructured_data, file, ensure_ascii=False, indent=4)

if __name__ == "__main__":
    # Season page to crawl and the folder that receives the JSON dump
    BASE_URL = "https://www.formula1.com/en/racing/2024"
    OUTPUT_DIR = "C:\\Users\\Saffat\\Downloads"

    get_unstructured_data(
        base_url=BASE_URL,
        output_dir=OUTPUT_DIR,
    )
    print(f"Data collection complete. Check the output directory: {OUTPUT_DIR}")

Processing https://www.formula1.com/en/racing/2024/qatar
Processing https://www.formula1.com/en/racing/2024/australia
Processing https://www.formula1.com/en/racing/2024/china
Processing https://www.formula1.com/en/racing/2024/mexico
Processing https://www.formula1.com/en/racing/2024/emiliaromagna
Processing https://www.formula1.com/en/racing/2024/azerbaijan
Processing https://www.formula1.com/en/racing/2024/las-vegas
Processing https://www.formula1.com/en/racing/2024/singapore
Processing https://www.formula1.com/en/racing/2024/netherlands
Processing https://www.formula1.com/en/racing/2024/pre-season-testing
Processing https://www.formula1.com/en/racing/2024/great-britain
Processing https://www.formula1.com/en/racing/2024/united-arab-emirates
Processing https://www.formula1.com/en/racing/2024/canada
Processing https://www.formula1.com/en/racing/2024/monaco
Processing https://www.formula1.com/en/racing/2024/saudi-arabia
Processing https://www.formula1.com/en/racing/2024/brazil
Processing

### Track Narrative Text and Image Metadata (2023)

In [14]:

import requests
from bs4 import BeautifulSoup
import os
import json

def get_unstructured_data(base_url, output_dir):
    """Collect narrative text and image metadata for every round of one season.

    The season page at ``base_url`` is scanned for links to individual round
    pages. Each round page is fetched, all of its ``<p>`` text is joined into
    one string, and ``<img>`` tags whose ``alt`` text contains "carbon" are
    recorded. The result is written as a JSON list to
    ``f1_<season>_unstructured_data.json`` inside ``output_dir``.

    Args:
        base_url: URL of the season page, e.g.
            ``https://www.formula1.com/en/racing/2023``. Its last path segment
            is used as the season label for both the link filter and the
            output file name.
        output_dir: Directory that receives the JSON file; created if missing.

    Returns:
        None. If the season page cannot be fetched, a message is printed and
        no file is written. Round pages that fail are skipped with a message.
    """
    # Take the season from the URL instead of a literal so the link filter and
    # the file name always match the page that is actually being crawled.
    season = base_url.rstrip('/').rsplit('/', 1)[-1]
    site_root = base_url.split('/en/racing/')[0]

    try:
        # The timeout stops the crawl from hanging on a stalled connection
        response = requests.get(base_url, timeout=30)
    except requests.RequestException as e:
        print(f"Failed to fetch the main page: {base_url} ({e})")
        return
    if response.status_code != 200:
        print(f"Failed to fetch the main page: {base_url}")
        return

    # Parse the main page
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find links to all rounds, skipping the per-round circuit sub-pages
    round_marker = f'/racing/{season}/'
    unique_links = set()
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if round_marker in href and '/circuit' not in href:
            # Relative hrefs need the site root; absolute ones must not get it twice
            if href.startswith(('http://', 'https://')):
                unique_links.add(href)
            else:
                unique_links.add(site_root + href)
    # Sorted so repeated runs visit rounds and write entries in the same order
    round_links = sorted(unique_links)

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Prepare JSON file to store unstructured data
    json_file = os.path.join(output_dir, f"f1_{season}_unstructured_data.json")
    unstructured_data = []

    for round_link in round_links:
        print(f"Processing {round_link}")

        # Fetch round page; a failure here only skips this round
        try:
            round_response = requests.get(round_link, timeout=30)
        except requests.RequestException as e:
            print(f"Failed to fetch round page: {round_link} ({e})")
            continue
        if round_response.status_code != 200:
            print(f"Failed to fetch round page: {round_link}")
            continue

        round_soup = BeautifulSoup(round_response.content, 'html.parser')

        try:
            # Join every non-empty paragraph into one narrative string
            paragraphs = [p.text.strip() for p in round_soup.find_all('p')]
            narrative_text = " ".join(text for text in paragraphs if text)

            # Keep only images that have a src and whose alt text mentions 'carbon'
            image_metadata = [
                {"src": img['src'], "alt": img.get('alt', 'No description')}
                for img in round_soup.find_all('img')
                if 'src' in img.attrs and 'carbon' in img.get('alt', '').lower()
            ]

            # Store data
            unstructured_data.append({
                "round_link": round_link,
                "narrative_text": narrative_text,
                "image_metadata": image_metadata
            })

        except Exception as e:
            # Best effort: report the page that failed and continue with the rest
            print(f"Error processing {round_link}: {e}")

    # Save unstructured data to JSON file
    with open(json_file, 'w', encoding='utf-8') as file:
        json.dump(unstructured_data, file, ensure_ascii=False, indent=4)

if __name__ == "__main__":
    # Season page to crawl and the folder that receives the JSON dump
    BASE_URL = "https://www.formula1.com/en/racing/2023"
    OUTPUT_DIR = "C:\\Users\\Saffat\\Downloads"

    get_unstructured_data(
        base_url=BASE_URL,
        output_dir=OUTPUT_DIR,
    )
    print(f"Data collection complete. Check the output directory: {OUTPUT_DIR}")

Processing https://www.formula1.com/en/racing/2023/australia
Processing https://www.formula1.com/en/racing/2023/italy
Processing https://www.formula1.com/en/racing/2023/canada
Processing https://www.formula1.com/en/racing/2023/azerbaijan
Processing https://www.formula1.com/en/racing/2023/emiliaromagna
Processing https://www.formula1.com/en/racing/2023/qatar
Processing https://www.formula1.com/en/racing/2023/miami
Processing https://www.formula1.com/en/racing/2023/japan
Processing https://www.formula1.com/en/racing/2023/spain
Processing https://www.formula1.com/en/racing/2023/bahrain
Processing https://www.formula1.com/en/racing/2023/monaco
Processing https://www.formula1.com/en/racing/2023/netherlands
Processing https://www.formula1.com/en/racing/2023/hungary
Processing https://www.formula1.com/en/racing/2023/mexico
Processing https://www.formula1.com/en/racing/2023/las-vegas
Processing https://www.formula1.com/en/racing/2023/singapore
Processing https://www.formula1.com/en/racing/2023

### Combining both track image datasets

In [16]:
import json

# Function to load and combine JSON data from two files with a year column
def combine_json_files_with_year(file1, file2, output_file, year1=2023, year2=2024):
    """Merge two JSON list files into one, tagging every entry with its year.

    Args:
        file1: Path to the first JSON file; its top level is expected to be a
            list of objects.
        file2: Path to the second JSON file, same layout as ``file1``.
        output_file: Path the merged list is written to (overwritten if present).
        year1: Value stored under ``"year"`` for entries from ``file1``.
            Defaults to 2023.
        year2: Value stored under ``"year"`` for entries from ``file2``.
            Defaults to 2024.

    Returns:
        None. The merged list (``file1`` entries first, then ``file2``) is
        written to ``output_file`` as UTF-8 JSON and a confirmation is printed.
    """
    combined_data = []

    # Both inputs get the same treatment, so handle them in a single loop
    for path, year in ((file1, year1), (file2, year2)):
        with open(path, 'r', encoding='utf-8') as source:
            entries = json.load(source)

        # Tag each entry so its season survives the merge
        for entry in entries:
            entry["year"] = year

        combined_data.extend(entries)

    # Save combined data to the output file
    with open(output_file, 'w', encoding='utf-8') as output:
        json.dump(combined_data, output, ensure_ascii=False, indent=4)

    print(f"Combined data saved to {output_file}")

# Local inputs (one JSON dump per season) and the destination for the merged file
file_2023 = r"C:\Users\Saffat\Downloads\f1_2023_unstructured_data.json"
file_2024 = r"C:\Users\Saffat\Downloads\f1_2024_unstructured_data.json"
output_file = r"C:\Users\Saffat\Downloads\f1_2023_2024_combined_data.json"

# Merge both seasons, tagging each entry with its year
combine_json_files_with_year(
    file1=file_2023,
    file2=file_2024,
    output_file=output_file,
)


Combined data saved to C:\Users\Saffat\Downloads\f1_2023_2024_combined_data.json
