# Scuderia Ferrari

![Scuderia Ferrari](http://wallpapercave.com/wp/wp1880037.jpg)

## Hello everyone! This notebook looks at the dreadful decade (2010-2019) of Formula One's most iconic team and tries to find out how things went wrong.

As in my previous work, this notebook is divided into two parts:
<ol>
    <li>Building sqlite database</li>
    <li>Analysis</li>
</ol>

In [None]:
import csv
import sqlite3
from collections import Counter
from sqlite3 import Error

import pandas as pd
from IPython.display import Image

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole
# notebook; None means "do not warn and do not raise".
pd.options.mode.chained_assignment = None

To create the SQLite database, we define a few helper functions:
<ol>
    <li><b>create_connection: This function sets up a connection with db_file</b></li>
    <li><b>create_table: This function creates a table based on the create_table_sql statement</b></li>
    <li><b>insert_sql_statement: This function inserts the rows in insert_data into the table named "table_name"</b></li>
    <li><b>read_sql_query: This function runs an SQL query and returns a pandas dataframe containing the result</b></li>
</ol>

In [None]:
def create_connection(db_file, delete_db=False):
    """Open a SQLite connection to ``db_file`` with foreign keys enabled.

    Args:
        db_file: Path of the SQLite database file.
        delete_db: When true, an existing file at ``db_file`` is removed
            first, so the database starts out empty.

    Returns:
        The ``sqlite3`` connection, or ``None`` if ``sqlite3.connect``
        raised. SQLite errors are printed instead of being propagated.
    """
    import os

    if delete_db:
        if os.path.exists(db_file):
            os.remove(db_file)

    connection = None
    try:
        connection = sqlite3.connect(db_file)
        # Foreign-key enforcement is off by default in SQLite.
        connection.execute("PRAGMA foreign_keys = 1")
    except Error as exc:
        print(exc)

    return connection


def create_table(conn, create_table_sql, drop_table_name=None):
    """Create a table, optionally dropping an earlier version of it first.

    Args:
        conn: Open ``sqlite3`` connection.
        create_table_sql: A complete ``CREATE TABLE`` statement.
        drop_table_name: Name of the table to drop before creating. Passing
            ``True`` drops the table named in ``create_table_sql``. Any
            falsy value skips the drop. The name is interpolated into the
            SQL text (identifiers cannot be bound as parameters), so it
            must come from trusted code.

    SQLite errors are printed, not raised.
    """
    import re

    if drop_table_name is True:
        # A bare True used to be interpolated as the table name "True", so
        # the real table was never dropped and re-creating it failed.
        # Recover the intended name from the CREATE statement instead.
        found = re.search(
            r"create\s+(?:temp(?:orary)?\s+)?table\s+(?:if\s+not\s+exists\s+)?([^\s(]+)",
            create_table_sql,
            re.IGNORECASE,
        )
        drop_table_name = found.group(1) if found else None

    if drop_table_name:
        try:
            c = conn.cursor()
            c.execute("""DROP TABLE IF EXISTS %s""" % (drop_table_name))
        except Error as e:
            print(e)

    try:
        c = conn.cursor()
        c.execute(create_table_sql)
    except Error as e:
        print(e)
        
def insert_sql_statement(insert_data, conn, table_name):
    """Bulk-insert rows into ``table_name`` inside one transaction.

    Args:
        insert_data: Sequence of equally sized rows, one value per table
            column in column order. An empty sequence is a no-op.
        conn: Open ``sqlite3`` connection. It is used as a context manager,
            so the inserts are committed on success and rolled back if an
            exception is raised.
        table_name: Target table. Interpolated into the statement because
            identifiers cannot be bound as parameters.
    """
    # len() rather than truthiness so lists and NumPy arrays both work.
    if len(insert_data) == 0:
        return

    # One "?" placeholder per column of the first row.
    placeholders = ",".join("?" * len(insert_data[0]))

    with conn:
        cur = conn.cursor()
        cur.executemany(f"INSERT INTO {table_name} VALUES ({placeholders})", insert_data)
        
def read_sql_query(query, conn):
    """Run ``query`` on ``conn`` and return the rows as a pandas DataFrame."""
    return pd.read_sql_query(query, conn)

Before we call the functions above, we create the database file and set up a connection to it.

In [None]:
# Start from an empty database on every run: delete_db=True removes any
# existing f1_info.db before connecting. `conn` is the notebook-wide
# connection used by the table loaders and queries.
database_filename = 'f1_info.db'
conn = create_connection(database_filename, delete_db=True)

For this analysis, we use the following CSV files provided in the dataset:<br>
<ol>
    <li><b>constructors.csv</b></li>
    <li><b>constructor_standings.csv</b></li>
    <li><b>drivers.csv</b></li>
    <li><b>races.csv</b></li>
    <li><b>results.csv</b></li>
    <li><b>status.csv</b></li>
    <li><b>lap_times.csv</b></li>
    <li><b>pit_stops.csv</b></li>
</ol>

In [None]:
def create_constructors_table():
    """Load ``constructors.csv`` into a fresh ``constructors`` table.

    Only the ``constructorId`` and ``name`` columns are kept. Uses the
    notebook-level ``conn`` connection.
    """
    # csv.DictReader copes with quoted fields (including embedded commas),
    # which a plain split(",") does not.
    with open("/kaggle/input/formula-1-world-championship-1950-2020/constructors.csv", "r", encoding="utf-8", newline="") as f:
        insert_data = [
            (int(row["constructorId"]), row["name"])
            for row in csv.DictReader(f)
        ]

    sql_create_statement = """CREATE TABLE constructors
                        (
                            constructorId Integer not null Primary key,
                            name Text not null
                        )
                        """

    # Pass the real table name so that re-running the cell replaces the table.
    create_table(conn, sql_create_statement, drop_table_name="constructors")

    insert_sql_statement(insert_data, conn, "constructors")
    
def create_constructor_standings_table():
    """Load ``constructor_standings.csv`` into a fresh ``constructor_standings`` table.

    Keeps ``constructorStandingsId``, ``raceId``, ``constructorId``,
    ``points`` and ``position``. Uses the notebook-level ``conn`` connection.
    """
    # csv.DictReader copes with quoted fields, which a plain split(",") does not.
    with open("/kaggle/input/formula-1-world-championship-1950-2020/constructor_standings.csv", "r", encoding="utf-8", newline="") as f:
        insert_data = [
            (
                int(row["constructorStandingsId"]),
                int(row["raceId"]),
                int(row["constructorId"]),
                float(row["points"]),
                int(row["position"]),
            )
            for row in csv.DictReader(f)
        ]

    # constructorId now has an explicit type, and points is declared Real
    # because it is parsed as a float above (same as in the results table).
    sql_create_statement = """CREATE TABLE constructor_standings
                        (
                            constructorStandingsId Integer not null Primary key,
                            raceId Integer not null,
                            constructorId Integer not null,
                            points Real not null,
                            position Integer not null
                        )
                        """

    # Pass the real table name so that re-running the cell replaces the table.
    create_table(conn, sql_create_statement, drop_table_name="constructor_standings")

    insert_sql_statement(insert_data, conn, "constructor_standings")
    
def create_drivers_table():
    """Load ``drivers.csv`` into a fresh ``drivers`` table.

    Stores ``driverId`` and a ``name`` built as "<forename> <surname>".
    Uses the notebook-level ``conn`` connection.
    """
    # csv.DictReader copes with quoted fields (including embedded commas),
    # which a plain split(",") does not.
    with open("/kaggle/input/formula-1-world-championship-1950-2020/drivers.csv", "r", encoding="utf-8", newline="") as f:
        insert_data = [
            (int(row["driverId"]), " ".join([row["forename"], row["surname"]]))
            for row in csv.DictReader(f)
        ]

    sql_create_statement = """CREATE TABLE drivers
                        (
                            driverId Integer not null Primary key,
                            name Text not null
                        )
                        """

    # Pass the real table name so that re-running the cell replaces the table.
    create_table(conn, sql_create_statement, drop_table_name="drivers")

    insert_sql_statement(insert_data, conn, "drivers")
    
def create_races_table():
    """Load ``races.csv`` into a fresh ``races`` table.

    Keeps ``raceId``, ``year`` and ``round``. Uses the notebook-level
    ``conn`` connection.
    """
    # csv.DictReader copes with quoted fields (including embedded commas),
    # which a plain split(",") does not. Building the tuples directly also
    # avoids a local variable named `round` shadowing the builtin.
    with open("/kaggle/input/formula-1-world-championship-1950-2020/races.csv", "r", encoding="utf-8", newline="") as f:
        insert_data = [
            (int(row["raceId"]), int(row["year"]), int(row["round"]))
            for row in csv.DictReader(f)
        ]

    sql_create_statement = """CREATE TABLE races
                        (
                            raceId Integer not null Primary key,
                            year Integer not null,
                            round Integer not null
                        )
                        """

    # Pass the real table name so that re-running the cell replaces the table.
    create_table(conn, sql_create_statement, drop_table_name="races")

    insert_sql_statement(insert_data, conn, "races")
    
def create_results_table():
    """Load ``results.csv`` into a fresh ``results`` table.

    Keeps ``resultId``, ``raceId``, ``driverId``, ``constructorId``,
    ``grid``, ``position``, ``points`` and ``statusId``. A missing position
    (backslash-N in the CSV) is replaced by the number of result rows of
    the same race. Uses the notebook-level ``conn`` connection.
    """
    # csv.DictReader copes with quoted fields, which a plain split(",") does not.
    with open("/kaggle/input/formula-1-world-championship-1950-2020/results.csv", "r", encoding="utf-8", newline="") as f:
        rows = list(csv.DictReader(f))

    # Number of result rows per race: the stand-in for a missing position.
    # Computed in plain Python so the outcome does not depend on pandas'
    # chained-assignment semantics.
    entries_per_race = Counter(int(row["raceId"]) for row in rows)

    insert_data = []
    for row in rows:
        race_id = int(row["raceId"])
        raw_position = row["position"]
        if raw_position == "\\N":
            position = entries_per_race[race_id]
        else:
            position = int(raw_position)
        insert_data.append(
            (
                int(row["resultId"]),
                race_id,
                int(row["driverId"]),
                int(row["constructorId"]),
                int(row["grid"]),
                position,
                float(row["points"]),
                int(row["statusId"]),
            )
        )

    sql_create_statement = """CREATE TABLE results
                        (
                            resultId Integer not null Primary key,
                            raceId Integer not null,
                            driverId Integer not null,
                            constructorId Integer not null,
                            grid Integer not null,
                            position Integer not null,
                            points Real not null,
                            statusId Integer not null
                        )
                        """

    # Pass the real table name so that re-running the cell replaces the table.
    create_table(conn, sql_create_statement, drop_table_name="results")

    insert_sql_statement(insert_data, conn, "results")
    
def create_status_table():
    """Load ``status.csv`` into a fresh ``status`` table.

    Keeps ``statusId`` and ``status``. Uses the notebook-level ``conn``
    connection.
    """
    # csv.DictReader copes with quoted fields (including embedded commas),
    # which a plain split(",") does not.
    with open("/kaggle/input/formula-1-world-championship-1950-2020/status.csv", "r", encoding="utf-8", newline="") as f:
        insert_data = [
            (int(row["statusId"]), row["status"])
            for row in csv.DictReader(f)
        ]

    sql_create_statement = """CREATE TABLE status
                        (
                            statusId Integer not null Primary key,
                            status Text not null
                        )
                        """

    # Pass the real table name so that re-running the cell replaces the table.
    create_table(conn, sql_create_statement, drop_table_name="status")

    insert_sql_statement(insert_data, conn, "status")
    
def create_lap_times_table():
    """Load ``lap_times.csv`` into a fresh ``lap_times`` table.

    Keeps ``raceId``, ``driverId``, ``lap`` and ``milliseconds`` and adds a
    surrogate ``lapId`` key numbered from 1 in file order. Uses the
    notebook-level ``conn`` connection.
    """
    # csv.DictReader copes with quoted fields, which a plain split(",") does
    # not. Rows are converted while streaming so only the final tuples are
    # kept in memory.
    with open("/kaggle/input/formula-1-world-championship-1950-2020/lap_times.csv", "r", encoding="utf-8", newline="") as f:
        insert_data = [
            (
                lap_id,
                int(row["raceId"]),
                int(row["driverId"]),
                int(row["lap"]),
                int(row["milliseconds"]),
            )
            for lap_id, row in enumerate(csv.DictReader(f), start=1)
        ]

    sql_create_statement = """CREATE TABLE lap_times
                        (
                            lapId Integer not null Primary key,
                            raceId Integer not null,
                            driverId Integer not null,
                            lap Integer not null,
                            milliseconds Integer not null
                        )
                        """

    # Pass the real table name so that re-running the cell replaces the table.
    create_table(conn, sql_create_statement, drop_table_name="lap_times")

    insert_sql_statement(insert_data, conn, "lap_times")
    
def create_pit_stops_table():
    """Load ``pit_stops.csv`` into a fresh ``pit_stops`` table.

    Keeps ``raceId``, ``driverId``, ``stop``, ``lap`` and ``milliseconds``
    and adds a surrogate ``pit_stop_Id`` key numbered from 1 in file order.
    Uses the notebook-level ``conn`` connection.
    """
    # csv.DictReader copes with quoted fields, which a plain split(",") does not.
    with open("/kaggle/input/formula-1-world-championship-1950-2020/pit_stops.csv", "r", encoding="utf-8", newline="") as f:
        insert_data = [
            (
                pit_stop_id,
                int(row["raceId"]),
                int(row["driverId"]),
                int(row["stop"]),
                int(row["lap"]),
                int(row["milliseconds"]),
            )
            for pit_stop_id, row in enumerate(csv.DictReader(f), start=1)
        ]

    sql_create_statement = """CREATE TABLE pit_stops
                        (
                            pit_stop_Id Integer not null Primary key,
                            raceId Integer not null,
                            driverId Integer not null,
                            stop Integer not null,
                            lap Integer not null,
                            milliseconds Integer not null
                        )
                        """

    # Pass the real table name so that re-running the cell replaces the table.
    create_table(conn, sql_create_statement, drop_table_name="pit_stops")

    insert_sql_statement(insert_data, conn, "pit_stops")

In [None]:
# Build every table of the database, one loader per CSV file.
create_constructors_table()
create_constructor_standings_table()
create_drivers_table()
create_races_table()
create_results_table()
create_status_table()
create_lap_times_table()
create_pit_stops_table()

Now that our database is up and running, we start the second part of the notebook: the analysis.<br>
For numerical analysis and visualization we will use the numpy and matplotlib libraries.

In [None]:
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

import warnings

# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# later releases removed the old names, so use the new name when available
# and fall back to the old one on older Matplotlib versions.
plt.style.use('seaborn-v0_8-talk' if 'seaborn-v0_8-talk' in plt.style.available else 'seaborn-talk')

# Silence all warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

## First question we might ask ourselves is why we care about Ferrari?
![Charles Ferrari](https://besthqwallpapers.com/Uploads/4-3-2019/82565/thumb2-charles-leclerc-4k-ferrari-sf90-raceway-2019-f1-cars.jpg)

In [None]:
# Constructors' champion of every season: the team in position 1 of the
# standings attached to the season's last race.
# max(round) is selected explicitly so that SQLite takes the bare raceId
# column from the row holding the highest round of each year (documented
# bare-column behaviour); `having max(round)` only filtered on truthiness.
query_statement = """
                select year, constructors.name from constructor_standings
                
                    join

                        (
                        
                            select year, raceId, max(round) as last_round from races
                            group by year
                        
                        ) as last_race

                            on last_race.raceId = constructor_standings.raceId
                        
                    join constructors
                        on constructor_standings.constructorId=constructors.constructorId
                
                where position=1
                
                order by year
            """
result = read_sql_query(query_statement, conn)

In [None]:
# Bar chart of how many seasons each team in `result` appears as champion.
fig, ax = plt.subplots(figsize=(15,5))

# One Set1 colour per distinct team name.
cmap = plt.get_cmap('Set1')
colors=[cmap(i) for i in range(len(result.name.unique()))]

# value_counts() tallies the rows per team, i.e. the titles per team.
result.name.value_counts().plot.bar(ax=ax, color=colors)
ax.set_ylabel("Number of constructor championships")
ax.grid(False);

The answer is that Ferrari is by far the most successful F1 team in terms of constructors' championship wins.<br>
But let's take a look at which teams won the championship in the last decade (2010-2019).

In [None]:
# Keep only the champions of the 2010-2019 seasons (2020 is excluded).
result[(result.year>=2010) & (result.year<2020)]

And we can see that a giant like Ferrari has not won a single title in the last 10 years.<br>
Does that mean Ferrari is no longer competitive? Let's answer this question by looking at Ferrari's constructors' championship standings over those years.

In [None]:
# Ferrari's final constructors' standing for each season from 2010 to 2019.
# max(round) is selected explicitly so that SQLite takes the bare raceId
# column from the row holding the highest round of each year; string
# literals use single quotes, as standard SQL requires.
query_statement = """
                select last_race.year, constructors.name, position from constructor_standings
                
                    join

                        (
                        
                            select year, raceId, max(round) as last_round from races
                            where year between 2010 and 2019
                            group by year
                        
                        ) as last_race

                            on constructor_standings.raceId = last_race.raceId
                    
                    join constructors
                        on constructors.constructorId = constructor_standings.constructorId
                
                where constructors.name = 'Ferrari'
                
                order by last_race.year
            """
result = read_sql_query(query_statement, conn)

In [None]:
# Display the query result (one row per season).
result

As we can see, the constructor standing positions are not that bad. They are certainly not in the bottom half of the pile.

In [None]:
# Bar chart of how often each final standing position occurs in `result`.
fig, ax = plt.subplots(figsize=(15,5))

# One Set1 colour per distinct position.
cmap = plt.get_cmap('Set1')
colors=[cmap(i) for i in range(len(result.position.unique()))]

# value_counts() orders the bars by frequency, not by position.
result.position.value_counts().plot.bar(ax=ax, color=colors)
ax.grid(False)
ax.set_ylabel("Frequency")
ax.set_xlabel("Constructor standing positions");

In fact, the bar plot above shows that they finished as runner-up in 5 out of 10 seasons.<br><br>
But finishing 2nd five times without ever winning a championship makes us curious to know why, or rather how. Hence we will now look at the points scored by the team, using the following formula:


<br><br>
percentage points deficit = $\frac{\text{Total points scored by championship winning team of particular year - Ferrari's total points in the same year}}{\text{Total points scored by championship winning team of particular year}}$

In [None]:
# End-of-season points for Ferrari and for the champion of each season from
# 2010 to 2019, ordered so the champion's row comes first within a year.
# max(round) is selected explicitly so that SQLite takes the bare raceId
# column from the row holding the highest round of each year; string
# literals use single quotes, as standard SQL requires.
query_statement = """
                select last_race.year, constructors.name, constructor_standings.position, constructor_standings.points from constructor_standings
                
                    join

                        (
                        
                            select year, raceId, max(round) as last_round from races
                            where year between 2010 and 2019
                            group by year
                        
                        ) as last_race

                            on last_race.raceId = constructor_standings.raceId
                        
                    join constructors
                        on constructors.constructorId = constructor_standings.constructorId
                
                where constructors.name = 'Ferrari' or constructor_standings.position = 1
                
                order by year, position
            """
result = read_sql_query(query_statement, conn)

In [None]:
# Calculate Ferrari's percentage deficit over the decade
# NOTE(review): assumes `result` holds exactly two rows per year, the
# champion first and Ferrari second; .iloc[1] fails for a year with a
# single row (e.g. if Ferrari itself were the champion).
percent_deficit = []
years = result.year.unique()
for year in years:
    # pct_change() yields (second - first) / first; negating it turns the
    # shortfall into a positive fraction of the first row's points.
    percent_deficit.append(-1*(result[result.year==year].points.pct_change().iloc[1]))
    

# Calculate moving average
numbers = percent_deficit.copy()
window_size = 3
i = 0
moving_averages = []

# Slide a window of `window_size` seasons over the deficits; this produces
# len(numbers) - window_size + 1 averages.
while i < len(numbers) - window_size + 1:
    this_window = numbers[i : i + window_size]
    window_average = sum(this_window) / window_size
    moving_averages.append(window_average)
    i += 1

# Three stacked panels sharing the year axis.
fig, [ax1, ax2, ax3] = plt.subplots(nrows=3, ncols=1, figsize=(17,9), sharex=True)
fig.tight_layout()

cmap = plt.get_cmap('tab10')
colors=[cmap(i) for i in range(len(result.year.unique()))]

# Panel 1: deficit per season (plotted as a fraction, not multiplied by 100).
ax1.bar(years, percent_deficit, color=colors)
ax1.grid(False)

# Panel 2: Ferrari's final standing per season.
ax2.scatter(years, result[result.name=="Ferrari"].position.values)
ax2.plot(years, result[result.name=="Ferrari"].position.values, linestyle="--")
ax2.grid(False)

# Panel 3: each average is plotted at the last year of its window, hence
# years[2:] for a window of three.
ax3.scatter(years[2:], moving_averages)
ax3.plot(years[2:], moving_averages, c="gray", label="3-years moving average")
ax3.grid(False)

ax1.set_ylabel("Percentage points deficit")
ax2.set_ylabel("Constructor Standings")
ax3.set_ylabel("3 years percentage points\ndeficit average")

ax1.set_title("Ferrari last decade (2010-2019) performance")
ax3.set_xticks(years)
ax3.set_xticklabels(years, fontsize=13);

We can see from the plots above that considering only the championship standings can be misleading. For example, in 2010 and 2011 Ferrari's standing is the same (3rd in both years), but the percentage points deficit increased from just above 20% to just above 40%.

<b>Other key findings:</b>
<ul>
    <li>In 2012 the percentage points deficit is one of the lowest of the decade. It is followed by an increase in the following years, reaching a maximum of almost 70% in 2014.</li>
    <li>The gradual improvement in season performance after 2014 brings Ferrari to another of its lowest percentage points deficits in 2018.</li>
</ul>

Let's now compare Ferrari's race-by-race performance, in terms of points scored, with the championship-winning team of each year.

In [None]:
# Cumulative constructor points after every round, for Ferrari and for the
# champion of the respective season (2010-2019).
# In the innermost subquery max(round) is selected explicitly so that SQLite
# takes the bare raceId column from the row holding the highest round of
# each year; string literals use single quotes, as standard SQL requires.
query_statement = """
                select races.year, races.round, constructors.name, constructor_standings.points, season_winner.seasonWinner from races
                
                join
                
                    (
                    
                        select last_race.year, constructors.name as seasonWinner from constructor_standings

                        join

                            (

                                select year, raceId, max(round) as last_round from races
                                where year between 2010 and 2019
                                group by year

                            ) as last_race

                                on constructor_standings.raceId = last_race.raceId

                        join constructors
                            on constructors.constructorId = constructor_standings.constructorId

                        where constructor_standings.position=1
                    
                    ) as season_winner
                    
                        on races.year = season_winner.year
                
                join constructor_standings
                    on constructor_standings.raceId = races.raceId
                    
                join constructors
                    on constructors.constructorId = constructor_standings.constructorId
                    
                where constructors.name = 'Ferrari' or constructors.name = season_winner.seasonWinner
                    
                order by races.year, races.round, constructor_standings.position
            """
result = read_sql_query(query_statement, conn)

In [None]:
# One panel per season: points after each round for the teams in `result`.
fig, ax = plt.subplots(figsize=(15,15), nrows=5, ncols=2)
years = result.year.unique()

# ax.flat walks the 5x2 grid row by row, so seasons fill it left to right
# and top to bottom. zip() also copes with an odd number of seasons.
for year, season_ax in zip(years, ax.flat):
    season = result[result.year == year]

    # Plot points against the round explicitly. A grouped Series.plot()
    # ignores x="round" and uses the DataFrame's row index as the x axis.
    for team, team_rows in season.groupby("name"):
        season_ax.plot(team_rows["round"], team_rows["points"], label=team)

    season_ax.legend()
    season_ax.set_title(f"Season {year}")
    season_ax.grid(False)

# Lay out after drawing so titles and tick labels are taken into account.
fig.tight_layout()

<b>Key findings:</b>
<ul>
    <li>In seasons 2011, 2013, 2014, 2015 and 2016, Ferrari's gap to the title winners increases significantly and consistently over the course of the season.</li>
    <li>In season 2012, by contrast, Ferrari is consistent in its performance but not able to catch up with the title-winning Red Bull Racing team.</li>
    <li>In seasons 2010, 2017 and 2018 Ferrari is more competitive, especially in 2018 where Ferrari is a strong title contender, but it is not able to hold on to its lead and sustain the championship fight.</li>
    <li>Seasons 2013 - 2016 are the real nightmare years.</li>
</ul>

In the next plot we look at Ferrari's overall race performance per season, by comparing its race outcomes with those of the championship-winning team of each year.

In [None]:
# One row per race result of Ferrari and of the season's champion
# (2010-2019), classified as Podium / In points / Out of points / DNF.
# In the innermost subquery max(round) is selected explicitly so that SQLite
# takes the bare raceId column from the row holding the highest round of
# each year; string literals use single quotes, as standard SQL requires
# (double quotes are kept only for the resultType column alias).
query_statement = """
                select races.year, constructors.name,
                
                    (
                        case
                            when results.position<=3 then 'Podium'
                            when results.position between 4 and 10 then 'In points'
                            else
                                case
                                    when status.status like 'Finish%' or status.status like '+%' then 'Out of points'
                                    else 'DNF'
                                end 
                        end
                    ) as "resultType"
                
                from results
                
                join races
                    on races.raceId = results.raceId
                    
                join constructors
                    on constructors.constructorId = results.constructorId
                    
                join status
                    on results.statusId = status.statusId
                    
                join
                
                    (
                    
                        select last_race.year, constructors.name as seasonWinner from constructor_standings

                        join

                            (

                                select year, raceId, max(round) as last_round from races
                                where year between 2010 and 2019
                                group by year

                            ) as last_race

                                on constructor_standings.raceId = last_race.raceId

                        join constructors
                            on constructors.constructorId = constructor_standings.constructorId

                        where constructor_standings.position=1

                    
                    ) as season_winner
                    
                on season_winner.year = races.year
                
                where constructors.name = 'Ferrari' or constructors.name = season_winner.seasonWinner
            """
result = read_sql_query(query_statement, conn)

In [None]:
# One panel per season: number of results per result type, one bar per team.
fig, ax = plt.subplots(figsize=(17,17), nrows=5, ncols=2, sharey=True)
fig.tight_layout(pad=5)
years = result.year.unique()

# ax.flat walks the 5x2 grid row by row, so seasons fill it left to right
# and top to bottom. zip() also copes with an odd number of seasons.
for year, season_ax in zip(years, ax.flat):
    season = result[result.year == year]

    # Rows: result type, columns: team, values: number of results.
    # groupby().size() does not depend on how value_counts() names its
    # output column, which changed in pandas 2.0.
    counts = season.groupby(["resultType", "name"]).size().unstack("name")

    counts.plot(kind='bar', ax=season_ax, rot=35, xlabel="")
    season_ax.set_title(f"Season {year}")
    season_ax.grid(False)

The race outcomes are grouped in four different categories as follows:<br>
<ul>
    <li>Podium: Being able to finish a race in top three spots</li>
    <li>In points: Being able to finish a race between 4th and 10th spots.</li>
    <li>Out of points: Being able to finish a race out of top 10 spots.</li>
    <li>DNF: Did Not Finish.</li>
</ul>

<b>Key Findings:</b>
<ul>
    <li>Season 2012 is an outlier season, in which Ferrari scores more podiums than the title-winning team.</li>
    <li>Except in 2012 and 2018, the championship-winning team scores more podiums than Ferrari in every season.</li>
    <li>In 2018 Ferrari scores almost as many podiums as the title-winning Mercedes team.</li>
    <li>Again excepting 2012 and 2018, in every other season of the last decade Ferrari places its cars in the points (in the top 10 of a race) more often than the champion team, but fails to clinch as many podiums.</li>
    <li>Surprisingly, in three of the four seasons in which Red Bull wins the title (2010-2013), Red Bull has more DNFs than Ferrari.</li>
    <li>In contrast, in every season since 2015, all of which Mercedes wins, Ferrari has at least 5 DNFs, always more than Mercedes.</li>
</ul>

The obvious question is why Ferrari has more DNFs since 2015, when the team is actually improving season by season.<br><br>

To answer this fully we would need data that we don't have. But we can look at what those DNFs are about: are they mostly reliability issues, or mostly crashes?

In [None]:
# One row per Ferrari race result in 2010-2019, tagged with the half of the
# decade it falls in and either "Completed" or the recorded status text.
# String literals use single quotes, as standard SQL requires; double quotes
# are kept only for the yearPartition column alias.
query_statement = """
                select 
                
                    (

                        case
                            when races.year<2015 then '2010-2014'
                            else '2015-2019'
                        end

                    ) as "yearPartition",
                
                    (

                        case
                            when status.status like 'Finish%' or status.status like '+%' then 'Completed'
                            else status.status
                        end

                    ) as resultType
                
                from results
                
                join races
                    on races.raceId = results.raceId
                    
                join constructors
                    on constructors.constructorId = results.constructorId
                    
                join status
                    on status.statusId = results.statusId
                    
                where races.year between 2010 and 2019 and constructors.name = 'Ferrari'
                
            """
result = read_sql_query(query_statement, conn)

In [None]:
# Split Ferrari's non-completed results into accident-type and other DNFs
# for each half of the decade, counting rows directly. This does not depend
# on how value_counts() names its output column, which changed in pandas 2.0.
dnf = result[result.resultType != "Completed"]

# Statuses that are counted as an accident.
is_accident = dnf.resultType.isin(["Accident", "Collision", "Collision damage"])

periods = ["2010-2014", "2015-2019"]
accident_dnf = [int((is_accident & (dnf.yearPartition == period)).sum()) for period in periods]
non_accident_dnf = [int((~is_accident & (dnf.yearPartition == period)).sum()) for period in periods]

df = pd.DataFrame({"accident_dnf":accident_dnf, "non_accident_dnf":non_accident_dnf, "years":periods})

# Stacked bars: one bar per period, accident vs. non-accident DNFs.
fig, ax = plt.subplots(figsize=(15,5))
df[["accident_dnf","non_accident_dnf"]].plot(kind="bar", stacked=True, ax=ax)
ax.set_xticks(range(len(df.years.values.tolist())))
ax.set_xticklabels(df.years.values.tolist(), rotation=45)
ax.set_title("Ferrari's Accident type DNFs vs Non-accident type DNFs")
ax.grid(False);

<b>Continuing from the last plot, the key findings are listed below:</b>
<ul>
    <li>Coming from the last plot we have the idea of Ferrari having more DNFs in 2015-2019 period than 2010-2014 period. But from the plot above, we understand that the number of DNFs in 2015-2019 are almost twice as much as 2010-2014.</li>
    <li>We can also observe a significant rise in the non accident related DNFs (such as power loss, gear box issue, etc.) in those time periods.</li>
    <li>In the 2010-2014 period, non accidental DNFs proportion of 16 total DNFs is 37.5%. This is compared to 53.125% non accidental DNFs of total 32 DNFs in period 2015-2019</li>
</ul>

The next important information to look at is the drivers' data.

In [None]:
# Per-race position and points of every driver who drove, in 2010-2019, either
# for Ferrari or for the constructor ranked 1st in that season's standings.
# NOTE(review): the ``last_race`` subquery (``group by year having max(round)``)
# appears intended to pick each season's final race; it relies on SQLite taking
# bare columns from the row that holds max(round) -- confirm if the engine changes.
# String literals use single quotes: SQLite only accepts double-quoted string
# literals through a legacy fallback that can be disabled at build time.
query_statement = """
    select races.year, constructors.name team, drivers.name driver, results.position, results.points from results

    join races
        on races.raceId = results.raceId

    join constructors
        on constructors.constructorId = results.constructorId

    join drivers
        on drivers.driverId = results.driverId

    join
        (
            select last_race.year, constructors.name as seasonWinner from constructor_standings

            join
                (
                    select year, raceId from races
                    where year between 2010 and 2019
                    group by year
                    having max(round)
                ) as last_race

                    on constructor_standings.raceId = last_race.raceId

            join constructors
                on constructors.constructorId = constructor_standings.constructorId

            where constructor_standings.position = 1
        ) as season_winner

        on season_winner.year = races.year

    where races.year between 2010 and 2019 and (constructors.name = 'Ferrari' or constructors.name = season_winner.seasonWinner)
"""
result = read_sql_query(query_statement, conn)

In [None]:
def get_driver_contrib(ferrai, champ, year):
    """Return each driver's share of their team's points for one season.

    Reads the module-level ``result`` frame (columns ``year``, ``team``,
    ``driver``, ``points``).

    Args:
        ferrai: Team name whose drivers are listed first.
        champ: Team name whose drivers are listed second.
        year: Season to evaluate.

    Returns:
        ``(drivers, contrib)``: two parallel lists. Within each team the
        drivers are ordered by descending share of the team's total points;
        the ``ferrai`` drivers come before the ``champ`` drivers.
    """
    season = result[result.year == year]

    def ranked_shares(team):
        # (driver, share) pairs for one team, largest share first
        team_rows = season[season.team == team]
        shares = team_rows.groupby("driver").points.sum() / team_rows.points.sum()
        return sorted(shares.to_dict().items(), key=lambda item: item[1], reverse=True)

    ranked = ranked_shares(ferrai) + ranked_shares(champ)

    drivers = [driver for driver, _ in ranked]
    contrib = [share for _, share in ranked]

    return drivers, contrib

In [None]:
# 5x2 grid of bar charts, one season per panel, filled row by row.
fig, ax = plt.subplots(figsize=(17,17), nrows=5, ncols=2, sharey=True)
fig.tight_layout(pad=5)
years = result.year.unique()
for ind in range(0, len(years), 2):
    axes_row = ax[ind // 2]

    for add in range(2):

        year = years[ind + add]
        panel = axes_row[add]

        # Each season holds exactly two teams; put Ferrari first so its
        # drivers are drawn first and in red.
        t1, t2 = result[result.year == year].team.unique()
        ferrari_team, champ_team = (t1, t2) if t1 == "Ferrari" else (t2, t1)

        drivers, contrib = get_driver_contrib(ferrari_team, champ_team, year)

        legend_handles = [
            mpatches.Patch(color='tab:red', label=ferrari_team),
            mpatches.Patch(color='tab:blue', label=champ_team),
        ]

        # The colour list assumes two drivers per team
        panel.bar(drivers, contrib, color=['tab:red','tab:red','tab:blue','tab:blue'])
        panel.set_title(f"Season {year}")

        panel.legend(handles=legend_handles)
        panel.grid(False)

    # Only the left-hand panel of each row carries the y-axis label
    axes_row[0].set_ylabel("Percentage of teams'\ntotal points distributed\nover drivers");

The above plots show each driver's contribution to the team's points, season by season.
<ul>
    <li>From 2010 to 2015 Ferrari has a clear lead driver in its line-up: Fernando Alonso (2010-2014) and Sebastian Vettel (2015). In every season of this period the lead driver scores at least 60% of the team's total season points tally.</li>
    <li>But that dynamic changes from 2016, when the non-lead drivers score at least close to 40% of the team's total points.</li>
    <li>Season 2019 is an important year for Ferrari, as Sebastian Vettel (four-time drivers' champion) is no longer the lead driver.</li>
    <li>Since year 2010 except year 2018 and year 2019 the Ferrari's drivers' contribution gap is always greater than those of the championship winning team's drivers' contribution gap.</li>
</ul>

Now that we have introduced ourselves to Ferrari's driver line-up since 2010, we can get back to our previous question about accident vs non-accident (reliability related) DNFs, but this time grouped by Ferrari drivers.

In [None]:
# Every Ferrari race entry in 2015-2019 that was NOT completed, with the
# driver's name and the retirement status. Statuses starting with "Finish" or
# "+" are mapped to "Completed" and then filtered out in the WHERE clause.
# String literals use single quotes: SQLite only accepts double-quoted string
# literals through a legacy fallback that can be disabled at build time.
query_statement = """
    select races.year, drivers.name,

        (
            case
                when status.status like 'Finish%' or status.status like '+%' then 'Completed'
                else status.status
            end
        ) as resultType

    from results

    join constructors
        on constructors.constructorId = results.constructorId

    join races
        on races.raceId = results.raceId

    join drivers
        on drivers.driverId = results.driverId

    join status
        on status.statusId = results.statusId

    where constructors.name = 'Ferrari' and (races.year between 2015 and 2019) and resultType != 'Completed'

    order by races.year
"""
result = read_sql_query(query_statement, conn)

In [None]:
# Reduce every retirement status to one of two buckets: crash-like statuses
# become "Accident", everything else becomes "Non-accidental".
crash_statuses = ["Accident", "Collision", "Collision damage"]
result.resultType = result.resultType.isin(crash_statuses).map({True: "Accident", False: "Non-accidental"})

non_accidental = []
accident = []
names = []

for driver in result.name.unique():
    driver_dict = result[result.name == driver].resultType.value_counts().to_dict()

    # A driver may have no DNF of one kind; default to 0 explicitly instead of
    # hiding the missing key behind a bare ``except``.
    non_accidental.append(driver_dict.get("Non-accidental", 0))
    accident.append(driver_dict.get("Accident", 0))

    names.append(driver)

# Grouped bars: one pair (accident / non-accidental) per driver.
fig, ax = plt.subplots(figsize=(15,5))
pd.DataFrame({"Name":names, "accident":accident, "Non-accidental":non_accidental}).plot(x="Name", kind="bar", ax=ax)
ax.set_xlabel("")
ax.set_ylabel("Frequency", fontsize=13)
ax.set_title("Ferrari's DNF analysis by drivers")
ax.grid(False);

<b>Key findings:</b>
<ul>
    <li>We can see that Charles Leclerc has 3 accident type DNFs in only one year, compared to Sebastian's or Kimi's 6 accident type DNFs in 5 years.</li>
    <li>Also, Kimi is extremely unfortunate for retiring multiple times due to reliability issues.</li>
</ul>

Now that we have grasped some of the ideas where Ferrari loses some of their season points, we will focus on analyzing Ferrari's qualifying performances. <br><br>
Qualifying is second most important event of the race weekend. Results of a qualifying session dictate drivers grid formation for the start of the race event. <br><br> Let's look at these results.

In [None]:
# One row per race entry (2010-2019) of Ferrari or of the constructor ranked
# 1st in that season's standings, with the starting grid slot bucketed into
# first_row (1-2), second_row (3-4), third_row (5-6) or out_of_third_row.
# NOTE(review): the ``last_race`` subquery (``group by year having max(round)``)
# appears intended to pick each season's final race; it relies on SQLite taking
# bare columns from the row that holds max(round) -- confirm if the engine changes.
# String literals use single quotes: SQLite only accepts double-quoted string
# literals through a legacy fallback that can be disabled at build time.
query_statement = """
    select races.year, constructors.name,

        (
            case
                when results.grid in (1,2) then 'first_row'
                when results.grid in (3,4) then 'second_row'
                when results.grid in (5,6) then 'third_row'
                else 'out_of_third_row'
            end
        ) as gridType

    from results

    join constructors
        on constructors.constructorId = results.constructorId

    join races
        on races.raceId = results.raceId

    join
        (
            select last_race.year, constructors.name as seasonWinner from constructor_standings

            join
                (
                    select year, raceId from races
                    where year between 2010 and 2019
                    group by year
                    having max(round)
                ) as last_race

                    on constructor_standings.raceId = last_race.raceId

            join constructors
                on constructors.constructorId = constructor_standings.constructorId

            where constructor_standings.position = 1
        ) as season_winner

            on season_winner.year = races.year

    where (races.year between 2010 and 2019) and constructors.name in ('Ferrari', season_winner.seasonWinner)
"""
result = read_sql_query(query_statement, conn)

In [None]:
# 5x2 grid of grouped bar charts, one season per panel, filled row by row.
fig, ax = plt.subplots(figsize=(17,17), nrows=5, ncols=2, sharey=True)
fig.tight_layout(pad=5)
years = result.year.unique()
for ind in range(0,len(years),2):
    ax_ind = ind//2

    for add in range(2):
        year = years[ind+add]
        season = result[result.year==year]

        # Rows = grid category, columns = team, values = number of starts.
        # size().unstack() builds the same table the value_counts/pivot chain
        # produced (NaN where a team never started from a category) without
        # depending on the count column's name, which differs between pandas
        # versions.
        grid_counts = season.groupby(["gridType", "name"]).size().unstack("name")

        grid_counts.plot(kind='bar', ax=ax[ax_ind][add], rot=35, xlabel="")
        ax[ax_ind][add].set_title(f"Season {year}")
        ax[ax_ind][add].grid(False);

To analyze qualifying performance, we group qualifying results into four different categories.
<ul>
    <li><b>first_row: Qualifying 1st or 2nd</b></li>
    <li><b>second_row: Qualifying 3rd or 4th</b></li>
    <li><b>third_row: Qualifying 5th or 6th</b></li>
    <li><b>out_of_third_row: Qualifying below 6th</b></li>
</ul>

<br>

<b>Key Findings:</b>
<ul>
    <li>Except in seasons 2017, 2018, and 2019, Ferrari is nowhere near the championship winning teams in terms of qualifying performance.</li>
    <li>Recalling percentage points deficit plot, we know 2012 season is Ferrari's one of the best performing season of the decade. But the plots above reveal that championship winning Red Bull Racing team in the seasons 2010-2013 outperforms Ferrari in terms of clinching at least one of the top two rows for the race starts.</li>
    <li>As we concluded in one of the earlier plots, the years 2013 to 2016 are absolute nightmares for Ferrari. In line with that conclusion, we can see the dominance of the championship winning teams in those years, placing their superior, highly efficient cars on the front rows throughout the entire seasons.</li>
    <li>Also, Ferrari's comeback in qualifying sessions in 2017 is notable.</li>
    <li>Recalling the moving average plot for percentage points deficit, we know the 2016 season is a tipping point for Ferrari's performance. From that season onwards, for three straight years, the moving average plot shows a downward trend in percentage points deficit. This trend is also evident from the above bar charts, as the qualifying performances from 2017 improve compared to the previous years.</li>
</ul>

<br><br>
One element that we don't want to miss is race strategy and overall race performance, such as tire management, tire selection, spontaneous pit stop decisions, the car's overtaking ability, etc. But unfortunately we don't have that information either. Hence, we now look at Ferrari's potential to finish on the podium in a race when the qualifying result is outside the podium positions.

In [None]:
# Per season (2010-2019) and per team (Ferrari or the constructor ranked 1st
# in that season's standings):
#   * noPodiumStart: number of race entries that started from grid slot 4 or lower
#   * podiumGain: how many of those entries finished in position 1-3
# NOTE(review): the ``last_race`` subquery (``group by year having max(round)``)
# appears intended to pick each season's final race; it relies on SQLite taking
# bare columns from the row that holds max(round) -- confirm if the engine changes.
# String literals use single quotes: SQLite only accepts double-quoted string
# literals through a legacy fallback that can be disabled at build time.
query_statement = """
    select races.year, constructors.name,

        sum(case
                when results.grid > 3 and results.position < 4 then 1
                else 0
            end) as podiumGain,

        sum(case
                when results.grid > 3 then 1
                else 0
            end) as noPodiumStart

    from results

    join constructors
        on constructors.constructorId = results.constructorId

    join races
        on races.raceId = results.raceId

    join
        (
            select last_race.year, constructors.name as seasonWinner from constructor_standings

            join
                (
                    select year, raceId from races
                    where year between 2010 and 2019
                    group by year
                    having max(round)
                ) as last_race

                    on constructor_standings.raceId = last_race.raceId

            join constructors
                on constructors.constructorId = constructor_standings.constructorId

            where constructor_standings.position = 1
        ) as season_winner

            on season_winner.year = races.year

    where (races.year between 2010 and 2019) and constructors.name in ('Ferrari', season_winner.seasonWinner)

    group by races.year, constructors.name
"""
result = read_sql_query(query_statement, conn)

In [None]:
# Share of out-of-podium starts converted into a podium finish. Despite the
# column name this is a fraction (podiumGain / noPodiumStart), not scaled to 100.
result["percent_podiumGain"] = result["podiumGain"].div(result["noPodiumStart"])

In [None]:
# Two stacked panels sharing the year axis: conversion rate on top, number of
# out-of-podium starts below.
fig, [ax1, ax2] = plt.subplots(figsize=(16,9), nrows=2, ncols=1, sharex=True)

# Ferrari first, every other row (the season champion) second; the legends
# below label the lines positionally, so this order has to be kept.
ferrari_rows = result.name == "Ferrari"
team_frames = (result[ferrari_rows], result[~ferrari_rows])

for frame in team_frames:
    for kind in ("line", "scatter"):
        frame[["year","percent_podiumGain"]].plot(kind=kind, x="year",y="percent_podiumGain", ax=ax1)

ax1.legend(["Ferrari","Champion team"])
ax1.grid(False)

for frame in team_frames:
    for kind in ("line", "scatter"):
        frame[["year","noPodiumStart"]].plot(kind=kind, x="year",y="noPodiumStart", ax=ax2)

ax2.set_xticks(result.year.unique())

ax2.legend(["Ferrari","Champion team"])
ax2.grid(False);

The above plots try to capture the teams' ability to finish on the podium when the qualifying result is outside 1st, 2nd or 3rd.

The first of the two line charts above shows the proportion of each team's "out of podium starts" (starting grid position 4 or lower) that actually resulted in a podium finish (race result 1, 2, or 3) at the end of the race.<br>
The second plot shows the number of out-of-podium starts by each team.

<ul>
    <li>The second plot clearly shows that Ferrari never, in the last decade, has fewer "out of podium race starts" than the championship winning team of the respective year.</li>
    <li>First plot highlights the Ferrari's best of the decade years, 2012 and 2018, where the percentages of the podium finish gains after starting 4 or more on the grid positions are above 0.3</li>
</ul>

In the closing stages of the notebook, we would like to look at sheer car performance. Again, because we do not have enough data about engine and car performance, we can only approximate.<br><br> One key indicator of overall car performance is lap times during the race. Hence, in the next plot, we compare Ferrari's lap times with the championship winning team's lap times for the respective years.

In [None]:
# Per race and driver (2011-2019, Ferrari or the constructor ranked 1st in
# that season's standings): the sum of all lap times in milliseconds, plus a
# Retired flag that is 1 when the result status starts with neither "Finish"
# nor "+".
# NOTE(review): the ``last_race`` subquery (``group by year having max(round)``)
# appears intended to pick each season's final race; it relies on SQLite taking
# bare columns from the row that holds max(round) -- confirm if the engine changes.
# String literals use single quotes: SQLite only accepts double-quoted string
# literals through a legacy fallback that can be disabled at build time.
query_statement = """
    select races.year, races.raceId, constructors.name team, drivers.name driver, sum(lap_times.milliseconds) total_lap_time,

        (
            case
                when status.status not like 'Finish%' and status.status not like '+%' then 1
                else 0
            end
        ) Retired

    from lap_times

    join results
        on results.raceId = lap_times.raceId and results.driverId = lap_times.driverId

    join constructors
        on constructors.constructorId = results.constructorId

    join drivers
        on drivers.driverId = results.driverId

    join races
        on races.raceId = results.raceId

    join status
        on status.statusId = results.statusId

    join
        (
            select last_race.year, constructors.name as seasonWinner from constructor_standings

            join
                (
                    select year, raceId from races
                    where year between 2010 and 2019
                    group by year
                    having max(round)
                ) as last_race

                    on constructor_standings.raceId = last_race.raceId

            join constructors
                on constructors.constructorId = constructor_standings.constructorId

            where constructor_standings.position = 1
        ) as season_winner

            on season_winner.year = races.year

    where (races.year between 2011 and 2019) and constructors.name in ('Ferrari', season_winner.seasonWinner)

    group by races.year, races.raceId, drivers.name
"""
result_lap_time = read_sql_query(query_statement, conn)

# Per race and driver (2011-2019, Ferrari or the constructor ranked 1st in
# that season's standings): the sum of all pit stop durations in milliseconds.
# NOTE(review): the ``last_race`` subquery (``group by year having max(round)``)
# appears intended to pick each season's final race; it relies on SQLite taking
# bare columns from the row that holds max(round) -- confirm if the engine changes.
# String literals use single quotes: SQLite only accepts double-quoted string
# literals through a legacy fallback that can be disabled at build time.
query_statement = """
    select races.year, races.raceId, constructors.name team, drivers.name driver, sum(pit_stops.milliseconds) total_pit_stop_time from pit_stops

    join results
        on results.raceId = pit_stops.raceId and results.driverId = pit_stops.driverId

    join constructors
        on constructors.constructorId = results.constructorId

    join drivers
        on drivers.driverId = results.driverId

    join races
        on races.raceId = results.raceId

    join
        (
            select last_race.year, constructors.name as seasonWinner from constructor_standings

            join
                (
                    select year, raceId from races
                    where year between 2010 and 2019
                    group by year
                    having max(round)
                ) as last_race

                    on constructor_standings.raceId = last_race.raceId

            join constructors
                on constructors.constructorId = constructor_standings.constructorId

            where constructor_standings.position = 1
        ) as season_winner

            on season_winner.year = races.year

    where (races.year between 2011 and 2019) and constructors.name in ('Ferrari', season_winner.seasonWinner)

    group by races.year, races.raceId, drivers.name
"""
result_pit_stop_time = read_sql_query(query_statement, conn)

In [None]:
def _race_gap_ms(race_rows):
    """Return champion-minus-Ferrari net race time (ms) for one race, or None.

    ``race_rows`` holds the finishers of a single race; every non-Ferrari row
    belongs to that season's champion team because the queries only select
    those two teams.

    * Either team without a finisher: no comparison is possible -> None.
    * Same number of finishers on both sides: compare the summed times.
    * Unequal numbers: compare each team's fastest finisher.
    """
    is_ferrari = race_rows["team"] == "Ferrari"
    ferrari_times = race_rows.loc[is_ferrari, "net_lap_time"].values
    champ_times = race_rows.loc[~is_ferrari, "net_lap_time"].values

    if len(ferrari_times) == 0 or len(champ_times) == 0:
        return None

    if len(ferrari_times) == len(champ_times):
        return champ_times.sum() - ferrari_times.sum()

    return champ_times.min() - ferrari_times.min()


df = pd.merge(result_lap_time, result_pit_stop_time, on=["year", "raceId", "team", "driver"], how="left")

# Drivers with no recorded pit stop have no match in the left join; count that
# as 0 ms. Assign the result back: an in-place fillna on a column selection is
# not guaranteed to update the frame.
df["total_pit_stop_time"] = df["total_pit_stop_time"].fillna(0)

# Only finishers are comparable; copy so the new column is added to an
# independent frame rather than to a slice of the merged one.
df = df[df["Retired"] != 1].copy()

df["net_lap_time"] = df["total_lap_time"] - df["total_pit_stop_time"]

years = df["year"].unique()
raceId = df["raceId"].unique()

# Every season gets an entry, even when none of its races is comparable.
year_diff = {year: [] for year in years}

# sort=False keeps the races in the order they appear in the frame. The
# champion team is derived from each race's own rows, so nothing carries over
# from a previously processed race.
for (year, _), race_rows in df.groupby(["year", "raceId"], sort=False):
    gap_ms = _race_gap_ms(race_rows)
    if gap_ms is not None:
        year_diff[year].append(gap_ms / 1000)  # milliseconds -> seconds

In [None]:
# 3x3 grid: one panel per season with the per-race time difference
# (champion minus Ferrari, in seconds) drawn as both bars and a dashed line.
fig, ax0 = plt.subplots(figsize=(17,15), nrows=3, ncols=3)
fig.tight_layout()

year_mean_diff = {}

for ind in range(0, len(years) - 1, 3):

    for add, panel in enumerate(ax0[ind // 3]):
        year = years[ind + add]
        year_vals = year_diff[year]

        # Negative difference (drawn red) means the champion's time was lower
        colors = ['red' if val < 0 else 'lawngreen' for val in year_vals]
        panel.plot(year_vals, "--o")
        panel.bar(range(len(year_vals)), year_vals, color=colors)
        panel.set_title(f"Season {year}")
        panel.grid(False)

        # Log of the negated seasonal mean; NaN whenever that mean is positive
        year_mean_diff[year] = np.log(-np.mean(year_vals))

        # Only the first panel of each row carries the y-axis label
        if add == 0:
            panel.set_ylabel("Championship winning team's lap\n times - Ferrari's lap times (in seconds\n grouped by the teams per race)")

# Season-level summary of the values collected above.
fig, ax1 = plt.subplots(figsize=(20,5))
ax1.plot(list(year_mean_diff), list(year_mean_diff.values()), "--o")
ax1.set_ylabel("Negative log of average\nmean lap time");

Above first nine bar charts highlight the differences between championship winning teams' lap times and the Ferrari's lap times from 9 different years.<br><br>
Whereas the above line plot indicates the "average yearly lap time differences" between two teams over the decade.

<b>Key findings:</b>
<ul>
    <li>Plots indicating year/season 2017, 2018, and 2019 have more green bars than the previous plots.</li>
    <li>In seasons such as 2013, 2016, and especially 2018, Ferrari's lap time performance in the first half of the year is better than that of the respective championship winning team.</li>
    <li>Year 2019 is exceptional in that Ferrari's lap time performance is consistently good in the later portion of the season.</li>
    <li>From the negative average mean lap time plot, we can conclude, in year 2018 Ferrari's overall race performance is better than 2012 (their two best performing years of the decade).</li>
</ul>