# Formula 1 Data Analysis with Plotly
**Student:** Somesh R. Rout


**ID:** GH1039569


**Project:** F1 World Championship Visualization

In [None]:
!pip install -q kagglehub

import kagglehub
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import os
import plotly.io as pio
# Render every figure in this notebook with Plotly's built-in dark template.
pio.templates.default = "plotly_dark"

# Download the Kaggle F1 dataset; `path` is the local directory that holds
# the CSV files read below.
path = kagglehub.dataset_download("rohanrao/formula-1-world-championship-1950-2020")


races = pd.read_csv(os.path.join(path, 'races.csv'))
results = pd.read_csv(os.path.join(path, 'results.csv'))
drivers = pd.read_csv(os.path.join(path, 'drivers.csv'))
constructors = pd.read_csv(os.path.join(path, 'constructors.csv'))
circuits = pd.read_csv(os.path.join(path, 'circuits.csv'))

# Give every lookup column its final name BEFORE merging.
# `races` and `constructors` both expose a `name` column; merging them with
# suffixes=('_driver', '_team') used to turn the race name into `name_driver`,
# and the later rename of 'name' -> 'GP_Name' silently matched nothing.
race_info = races[['raceId', 'year', 'name', 'circuitId']].rename(
    columns={'name': 'GP_Name'}
)
driver_info = drivers[['driverId', 'driverRef', 'nationality', 'code']].rename(
    columns={'driverRef': 'Driver', 'nationality': 'nationality_driver'}
)
team_info = constructors[['constructorId', 'name', 'nationality']].rename(
    columns={'name': 'Team', 'nationality': 'nationality_team'}
)

# One row per race result, enriched with race, driver and constructor details.
df = (
    results
    .merge(race_info, on='raceId', how='left')
    .merge(driver_info, on='driverId', how='left')
    .merge(team_info, on='constructorId', how='left')
)

# Left joins on id columns must not duplicate result rows.
assert len(df) == len(results), "merge duplicated result rows - lookup ids are not unique"

# Legacy alias: cells written against the old, mislabelled column name
# (`name_driver` actually holds the Grand Prix name) keep working.
df['name_driver'] = df['GP_Name']

print("Dataset Loaded and....")
print(" 'It's Hammertime' -  Sir Lewis Hamilton ")
df.head(5)

Using Colab cache for faster access to the 'formula-1-world-championship-1950-2020' dataset.
Dataset Loaded and....
 'It's Hammertime' -  Sir Lewis Hamilton 


Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,...,fastestLapSpeed,statusId,year,name_driver,circuitId,Driver,nationality_driver,code,Team,nationality_team
0,1,18,1,1,22,1,1,1,1,10.0,...,218.3,1,2008,Australian Grand Prix,1,hamilton,British,HAM,McLaren,British
1,2,18,2,2,3,5,2,2,2,8.0,...,217.586,1,2008,Australian Grand Prix,1,heidfeld,German,HEI,BMW Sauber,German
2,3,18,3,3,7,7,3,3,3,6.0,...,216.719,1,2008,Australian Grand Prix,1,rosberg,German,ROS,Williams,British
3,4,18,4,4,5,11,4,4,4,5.0,...,215.464,1,2008,Australian Grand Prix,1,alonso,Spanish,ALO,Renault,French
4,5,18,5,1,23,3,5,5,5,4.0,...,218.385,1,2008,Australian Grand Prix,1,kovalainen,Finnish,KOV,McLaren,British


In [None]:
# Sunburst of race wins: inner ring = constructor, outer ring = driver.
# Rows with positionOrder == 1 are treated as race wins.
winners = df[df['positionOrder'] == 1]
win_counts = winners.groupby(['Team', 'Driver']).size().reset_index(name='Wins')

# Keep the chart readable: only teams with more than 10 wins in total.
# transform('sum') broadcasts each team's total back onto its own
# (Team, Driver) rows, so the comparison yields a row-aligned boolean mask.
top_teams = win_counts.groupby('Team')['Wins'].transform('sum') > 10
win_counts_filtered = win_counts[top_teams]

fig1 = px.sunburst(
    win_counts_filtered,
    path=['Team', 'Driver'],
    values='Wins',
    color='Wins',
    color_continuous_scale='Magma',
    # The title previously ended in a stray </b> with no opening tag.
    title='<b>Hierarchy of Dominance: F1 Wins by Team & Driver</b>'
)

fig1.update_layout(
    font_family="Roboto",
    font_size=14,
    title_font_size=24,
    width=800,
    height=800,
    margin=dict(t=50, l=0, r=0, b=0)
)

fig1.show()

In [None]:
# Lookup table that maps statusId to its textual status.
status_df = pd.read_csv(os.path.join(path, 'status.csv'))

# Results from 2014 onwards with a grid slot above 0 and a finishing order
# of at most 20, joined with the status text for each row.
modern_era = (
    df.loc[df['year'].ge(2014) & df['grid'].gt(0) & df['positionOrder'].le(20)]
    .merge(status_df, on='statusId', how='left')
)

#### function to classify status
def categorize_status(status_str):
    """Collapse a raw status value into one of two labels.

    Returns 'Finished' when the value equals 'Finished' or its string form
    contains a '+' (e.g. '+1 Lap'); every other value, including missing
    ones, is reported as 'DNF / Issue'.
    """
    if status_str == 'Finished':
        return 'Finished'
    # str() lets non-string values (NaN, None) fall through to the DNF label.
    return 'Finished' if '+' in str(status_str) else 'DNF / Issue'

# Label every result row as 'Finished' or 'DNF / Issue'.
modern_era['Status_Type'] = modern_era['status'].apply(categorize_status)

# The selection has no upper year bound, so read the last season from the
# data instead of hard-coding it in the title.
last_season = int(modern_era['year'].max())

# Grid slot vs. finishing order, coloured by outcome, with one OLS trendline
# fitted across all points (trendline_scope="overall").
fig2 = px.scatter(
    modern_era,
    x='grid',
    y='positionOrder',
    color='Status_Type',
    color_discrete_map={'Finished': '#2ecc71', 'DNF / Issue': '#e74c3c'},
    trendline="ols", ### Ordinary Least Squares trendline
    trendline_scope="overall",
    opacity=0.6,
    hover_data=['Driver', 'Team', 'year', 'name_driver'],
    title=f'Qualifying vs. Race Pace (Hybrid Era 2014-{last_season})',
    labels={
        'grid': 'Starting Grid Position',
        'positionOrder': 'Finishing Position',
        # `name_driver` holds the Grand Prix name despite its column name.
        'name_driver': 'Grand Prix',
    }
)


fig2.update_traces(marker=dict(size=8, line=dict(width=1, color='White')))
fig2.update_layout(
    font_family="Montserrat",
    xaxis=dict(tickmode='linear', dtick=1),
    yaxis=dict(tickmode='linear', dtick=1, autorange="reversed"), #### Reverse Y axis (1st place at top)
    legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99),
    plot_bgcolor='rgba(0,0,0,0)',
    height=600
)

fig2.show()

In [None]:
# `df` has one row per driver result, so .size() would count entries rather
# than races. Count distinct raceId values to get the number of races held
# per circuit (and per Grand Prix name; `name_driver` holds the race name).
circuit_counts = (
    df.groupby(['circuitId', 'name_driver'])['raceId']
    .nunique()
    .reset_index(name='Race_Count')
)

# Attach the circuit coordinates (lat / lng) needed by the map.
circuit_locs = pd.merge(circuit_counts, circuits, on='circuitId')

fig3 = px.density_mapbox(
    circuit_locs,
    lat='lat',
    lon='lng',
    z='Race_Count',
    radius=25,
    center=dict(lat=48, lon=12), ##### Centered on Europe
    zoom=2.5,
    mapbox_style="carto-darkmatter",
    title='Global Grand Prix Density (1950-2024)'
)

fig3.update_layout(
    margin=dict(l=0, r=0, t=50, b=0),
    title_x=0.5,
    font_family="Roboto"
)

fig3.show()

In [None]:
# Points-per-race distribution for four selected constructors in 2023.
# TEAM_COLORS drives both the team filter and the trace colours, so the two
# can never drift apart.
TEAM_COLORS = {
    'Red Bull': '#0600EF',
    'Mercedes': '#00D2BE',
    'Ferrari': '#DC0000',
    'McLaren': '#FF8700'
}

season_2023 = df[(df['year'] == 2023) & (df['Team'].isin(list(TEAM_COLORS)))]

fig4 = px.violin(
    season_2023,
    y="points",
    x="Team",
    color="Team",
    box=True,
    points="all",
    hover_data=['Driver', 'name_driver'],
    color_discrete_map=TEAM_COLORS,
    # `name_driver` holds the Grand Prix name despite its column name.
    labels={'name_driver': 'Grand Prix'},
    title='2023 Points Distribution Analysis'
)

fig4.update_layout(
    yaxis_title="Points Scored per Race",
    xaxis_title="Constructor",
    showlegend=False,
    font_family="Montserrat",
    plot_bgcolor='rgba(255,255,255,0.05)'
)

fig4.show()