<a href="https://colab.research.google.com/github/southerntw/f1-data-analysis/blob/main/f1predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# F1 Race Predictor



**Formula Satu**, disingkat F1 (nama lengkap: FIA Formula One World Championship), adalah kelas tertinggi balap mobil kursi tunggal yang diatur oleh Federasi Otomotif Internasional (FIA) dan dimiliki oleh Formula One Group. Kata *formula* dalam "Formula Satu" mengacu pada seperangkat peraturan dan regulasi yang harus dipatuhi semua peserta. Formula Satu terdiri atas sejumlah seri balapan yang dikenal dengan istilah Grand Prix. Balapan-balapan tersebut diselenggarakan di sirkuit yang dibangun khusus maupun di jalan raya yang ditutup untuk umum.

## Install Requirements and Import Modules

In [None]:
!pip install selenium
!pip install lazypredict

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
import requests
import bs4
from bs4 import BeautifulSoup
from selenium import webdriver
from dateutil.relativedelta import *

In [None]:
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn import svm
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.neighbors import KNeighborsClassifier

np.set_printoptions(precision=4)

In [None]:
import lazypredict
from lazypredict.Supervised import LazyClassifier
from lazypredict.Supervised import LazyRegressor

## Data Collecting / Data Mining

Dataset yang digunakan diambil dari API data Formula 1 bernama [Ergast API](https://ergast.com/mrd/). Dataset tersedia dalam berbagai format: CSV, JSON, dan lain-lain. Dalam kasus ini, saya mengambil data berformat JSON menggunakan `requests`, lalu menampungnya dalam pandas `DataFrame`.

### Mengambil data `races`

In [None]:
def _extract(record, *path, cast=None):
    """Walk *path* through nested dicts/lists and optionally convert the leaf.

    Returns None when a key is missing or the conversion fails, so a single
    incomplete API record never aborts the whole download.
    """
    try:
        for key in path:
            record = record[key]
        return record if cast is None else cast(record)
    except (KeyError, IndexError, TypeError, ValueError):
        return None


# One row per race of every season from 1950 up to and including 2021.
race_columns = ['season', 'round', 'circuit_id', 'lat', 'long', 'country', 'date', 'url']
race_rows = []

for year in range(1950, 2022):
    response = requests.get(f'https://ergast.com/api/f1/{year}.json', timeout=30)
    # Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
    response.raise_for_status()
    payload = response.json()

    for item in payload['MRData']['RaceTable']['Races']:
        race_rows.append({
            'season': _extract(item, 'season', cast=int),
            'round': _extract(item, 'round', cast=int),
            'circuit_id': _extract(item, 'Circuit', 'circuitId'),
            'lat': _extract(item, 'Circuit', 'Location', 'lat', cast=float),
            'long': _extract(item, 'Circuit', 'Location', 'long', cast=float),
            'country': _extract(item, 'Circuit', 'Location', 'country'),
            'date': _extract(item, 'date'),
            'url': _extract(item, 'url'),
        })

# Passing the column list keeps the expected schema even if nothing was downloaded.
races = pd.DataFrame(race_rows, columns=race_columns)

In [None]:
races

Unnamed: 0,season,round,circuit_id,lat,long,country,date,url
0,1950,1,silverstone,52.0786,-1.01694,UK,1950-05-13,http://en.wikipedia.org/wiki/1950_British_Gran...
1,1950,2,monaco,43.7347,7.42056,Monaco,1950-05-21,http://en.wikipedia.org/wiki/1950_Monaco_Grand...
2,1950,3,indianapolis,39.7950,-86.23470,USA,1950-05-30,http://en.wikipedia.org/wiki/1950_Indianapolis...
3,1950,4,bremgarten,46.9589,7.40194,Switzerland,1950-06-04,http://en.wikipedia.org/wiki/1950_Swiss_Grand_...
4,1950,5,spa,50.4372,5.97139,Belgium,1950-06-18,http://en.wikipedia.org/wiki/1950_Belgian_Gran...
...,...,...,...,...,...,...,...,...
1052,2021,18,rodriguez,19.4042,-99.09070,Mexico,2021-11-07,http://en.wikipedia.org/wiki/2021_Mexican_Gran...
1053,2021,19,interlagos,-23.7036,-46.69970,Brazil,2021-11-14,http://en.wikipedia.org/wiki/2021_S%C3%A3o_Pau...
1054,2021,20,losail,25.4900,51.45420,Qatar,2021-11-21,http://en.wikipedia.org/wiki/2021_Qatar_Grand_...
1055,2021,21,jeddah,21.6319,39.10440,Saudi Arabia,2021-12-05,http://en.wikipedia.org/wiki/2021_Saudi_Arabia...


In [None]:
races.to_csv('f1races.csv', index=False)

### Mengambil data `results`

In [None]:
def _extract(record, *path, cast=None):
    """Walk *path* through nested dicts/lists and optionally convert the leaf.

    Returns None when a key is missing or the conversion fails, so a single
    incomplete API record never aborts the whole download.
    """
    try:
        for key in path:
            record = record[key]
        return record if cast is None else cast(record)
    except (KeyError, IndexError, TypeError, ValueError):
        return None


# Every season paired with the list of its round numbers: [[1950, [1, 2, ...]], ...]
rounds = [[year, list(races[races.season == year]['round'])]
          for year in races.season.unique()]

result_columns = ['season', 'round', 'circuit_id', 'driver', 'date_of_birth',
                  'nationality', 'constructor', 'grid', 'time', 'status',
                  'points', 'podium']
result_rows = []

for season, season_rounds in rounds:
    for race_round in season_rounds:
        response = requests.get(
            f'https://ergast.com/api/f1/{season}/{race_round}/results.json',
            # The Ergast API pages its responses (documented default: 30 rows);
            # ask for enough rows to receive every classified entry of a race.
            params={'limit': 100},
            timeout=30,
        )
        response.raise_for_status()
        found_races = response.json()['MRData']['RaceTable']['Races']
        if not found_races:
            # No result published for this round: skip it instead of raising IndexError.
            continue
        race = found_races[0]

        for item in race['Results']:
            result_rows.append({
                'season': _extract(race, 'season', cast=int),
                'round': _extract(race, 'round', cast=int),
                'circuit_id': _extract(race, 'Circuit', 'circuitId'),
                'driver': _extract(item, 'Driver', 'driverId'),
                'date_of_birth': _extract(item, 'Driver', 'dateOfBirth'),
                'nationality': _extract(item, 'Driver', 'nationality'),
                'constructor': _extract(item, 'Constructor', 'constructorId'),
                'grid': _extract(item, 'grid', cast=int),
                # finishing time in milliseconds; absent for drivers without a timed finish
                'time': _extract(item, 'Time', 'millis', cast=int),
                'status': _extract(item, 'status'),
                # Points are parsed as float: int() fails on fractional strings such
                # as '0.5', which used to be stored silently as None.
                'points': _extract(item, 'points', cast=float),
                'podium': _extract(item, 'position', cast=int),
            })

results = pd.DataFrame(result_rows, columns=result_columns)

In [None]:
results

Unnamed: 0,season,round,circuit_id,driver,date_of_birth,nationality,constructor,grid,time,status,points,podium
0,1950,1,silverstone,farina,1906-10-30,Italian,alfa,1,8003600.0,Finished,9.0,1
1,1950,1,silverstone,fagioli,1898-06-09,Italian,alfa,2,8006200.0,Finished,6.0,2
2,1950,1,silverstone,reg_parnell,1911-07-02,British,alfa,4,8055600.0,Finished,4.0,3
3,1950,1,silverstone,cabantous,1904-10-08,French,lago,6,,+2 Laps,3.0,4
4,1950,1,silverstone,rosier,1905-11-05,French,lago,9,,+2 Laps,2.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...
24942,2021,22,yas_marina,latifi,1995-06-29,Canadian,williams,16,,Accident,0.0,16
24943,2021,22,yas_marina,giovinazzi,1993-12-14,Italian,alfa,14,,Gearbox,0.0,17
24944,2021,22,yas_marina,russell,1998-02-15,British,williams,17,,Gearbox,0.0,18
24945,2021,22,yas_marina,raikkonen,1979-10-17,Finnish,alfa,18,,Brakes,0.0,19


In [None]:
results.to_csv('f1results.csv', index=False)

### Mengambil Data `driver_standings`

In [None]:
def _extract(record, *path, cast=None):
    """Walk *path* through nested dicts/lists and optionally convert the leaf.

    Returns None when a key is missing or the conversion fails, so a single
    incomplete API record never aborts the whole download.
    """
    try:
        for key in path:
            record = record[key]
        return record if cast is None else cast(record)
    except (KeyError, IndexError, TypeError, ValueError):
        return None


driver_standing_columns = ['season', 'round', 'driver', 'driver_points',
                           'driver_wins', 'driver_standings_pos']
driver_standing_rows = []

# query API: championship table as it stood after every round of every season
for season, season_rounds in rounds:
    for race_round in season_rounds:
        response = requests.get(
            f'https://ergast.com/api/f1/{season}/{race_round}/driverStandings.json',
            # The Ergast API pages its responses (documented default: 30 rows);
            # ask for enough rows to receive the complete standings table.
            params={'limit': 100},
            timeout=30,
        )
        response.raise_for_status()
        standings_lists = response.json()['MRData']['StandingsTable']['StandingsLists']
        if not standings_lists:
            # No standings published for this round: skip it instead of raising IndexError.
            continue
        standings = standings_lists[0]

        for item in standings['DriverStandings']:
            driver_standing_rows.append({
                'season': _extract(standings, 'season', cast=int),
                'round': _extract(standings, 'round', cast=int),
                'driver': _extract(item, 'Driver', 'driverId'),
                # Points are parsed as float: int() fails on fractional strings such
                # as '12.5'; the resulting None was later turned into 0 by fillna.
                'driver_points': _extract(item, 'points', cast=float),
                'driver_wins': _extract(item, 'wins', cast=int),
                'driver_standings_pos': _extract(item, 'position', cast=int),
            })

driver_standings = pd.DataFrame(driver_standing_rows, columns=driver_standing_columns)

# define lookup function to shift points and number of wins from previous rounds

def lookup(df, team, points):
    """Shift a cumulative standings column back by one round.

    For every row, the value of ``points`` recorded for the same season and
    the same ``team`` entity in the previous round (``round - 1``) is looked
    up, so the returned column describes the situation *before* the race.

    Args:
        df: standings table with the columns ``season``, ``round``, ``team``
            and ``points``. It is not modified.
        team: name of the column identifying the entity (e.g. ``'driver'``).
        points: name of the cumulative column to shift.

    Returns:
        A new DataFrame with the columns of ``df`` in their original order,
        where the original column is renamed to ``<points>_after_race`` and a
        new trailing column ``points`` holds the previous round's value
        (0 when there is no previous round for that entity).
    """
    keys = ['season', team, 'round']

    # A row recorded after round r is what a round r + 1 row needs to look up.
    previous = df[keys + [points]].copy()
    previous['round'] = previous['round'] + 1

    # Joining on the real key columns (rather than on concatenated strings)
    # keeps the keys unambiguous and leaves the caller's frame untouched.
    new_df = df.merge(previous, how='left', on=keys)
    new_df = new_df.rename(columns={points + '_x': points + '_after_race',
                                    points + '_y': points})

    # Assign the result back: a chained fillna(inplace=True) on a column
    # selection is not guaranteed to write into new_df.
    new_df[points] = new_df[points].fillna(0)
    return new_df
  
# Replace each standings column with the value held before the race
# (the previous round's value, as looked up by lookup()).
for column in ('driver_points', 'driver_wins', 'driver_standings_pos'):
    driver_standings = lookup(driver_standings, 'driver', column)

# The '<column>_after_race' columns produced by lookup() are not used further.
driver_standings.drop(
    columns=['driver_points_after_race',
             'driver_wins_after_race',
             'driver_standings_pos_after_race'],
    inplace=True,
)

In [None]:
driver_standings

Unnamed: 0,season,round,driver,driver_points,driver_wins,driver_standings_pos
0,1950,1,farina,0.0,0.0,0.0
1,1950,1,fagioli,0.0,0.0,0.0
2,1950,1,reg_parnell,0.0,0.0,0.0
3,1950,1,cabantous,0.0,0.0,0.0
4,1950,1,rosier,0.0,0.0,0.0
...,...,...,...,...,...,...
27108,2021,22,latifi,7.0,0.0,17.0
27109,2021,22,giovinazzi,3.0,0.0,18.0
27110,2021,22,mick_schumacher,0.0,0.0,19.0
27111,2021,22,kubica,0.0,0.0,20.0


In [None]:
driver_standings.to_csv('f1driver_standings.csv', index=False)

### Mengambil Data `constructor_standings`

In [None]:
def _extract(record, *path, cast=None):
    """Walk *path* through nested dicts/lists and optionally convert the leaf.

    Returns None when a key is missing or the conversion fails, so a single
    incomplete API record never aborts the whole download.
    """
    try:
        for key in path:
            record = record[key]
        return record if cast is None else cast(record)
    except (KeyError, IndexError, TypeError, ValueError):
        return None


# start from year 1958
# Selecting by season value (instead of the positional slice rounds[8:]) stays
# correct even if `rounds` does not begin in 1950 or has a missing season.
constructor_rounds = [entry for entry in rounds if entry[0] >= 1958]

constructor_standing_columns = ['season', 'round', 'constructor', 'constructor_points',
                                'constructor_wins', 'constructor_standings_pos']
constructor_standing_rows = []

# query API: constructors' table as it stood after every round
for season, season_rounds in constructor_rounds:
    for race_round in season_rounds:
        response = requests.get(
            f'https://ergast.com/api/f1/{season}/{race_round}/constructorStandings.json',
            # The Ergast API pages its responses (documented default: 30 rows);
            # ask for enough rows to receive the complete standings table.
            params={'limit': 100},
            timeout=30,
        )
        response.raise_for_status()
        standings_lists = response.json()['MRData']['StandingsTable']['StandingsLists']
        if not standings_lists:
            # No standings published for this round: skip it instead of raising IndexError.
            continue
        standings = standings_lists[0]

        for item in standings['ConstructorStandings']:
            constructor_standing_rows.append({
                'season': _extract(standings, 'season', cast=int),
                'round': _extract(standings, 'round', cast=int),
                'constructor': _extract(item, 'Constructor', 'constructorId'),
                # Points are parsed as float: int() fails on fractional strings such
                # as '12.5'; the resulting None was later turned into 0 by fillna.
                'constructor_points': _extract(item, 'points', cast=float),
                'constructor_wins': _extract(item, 'wins', cast=int),
                'constructor_standings_pos': _extract(item, 'position', cast=int),
            })

constructor_standings = pd.DataFrame(constructor_standing_rows,
                                     columns=constructor_standing_columns)

# Replace each standings column with the value held before the race
# (the previous round's value, as looked up by lookup()).
for column in ('constructor_points', 'constructor_wins', 'constructor_standings_pos'):
    constructor_standings = lookup(constructor_standings, 'constructor', column)

# The '<column>_after_race' columns produced by lookup() are not used further.
constructor_standings.drop(
    columns=['constructor_points_after_race',
             'constructor_wins_after_race',
             'constructor_standings_pos_after_race'],
    inplace=True,
)

In [None]:
constructor_standings

Unnamed: 0,season,round,constructor,constructor_points,constructor_wins,constructor_standings_pos
0,1958,1,cooper,0.0,0.0,0.0
1,1958,1,ferrari,0.0,0.0,0.0
2,1958,1,maserati,0.0,0.0,0.0
3,1958,2,cooper,8.0,1.0,1.0
4,1958,2,ferrari,6.0,0.0,2.0
...,...,...,...,...,...,...
12706,2021,22,alphatauri,120.0,0.0,6.0
12707,2021,22,aston_martin,77.0,0.0,7.0
12708,2021,22,williams,23.0,0.0,8.0
12709,2021,22,alfa,13.0,0.0,9.0


In [None]:
constructor_standings.to_csv('f1constructor.csv')

### Mengambil Data `qualifying_results`

Kode di bawah ini tidak lagi berfungsi karena situs formula1.com sudah dirombak. Oleh karena itu, data kualifikasi langsung diimpor dari hasil *scraping* orang lain.

In [None]:
# Qualifying times are only available from 1983

# Starting-grid tables are collected in a list and concatenated once at the
# end, instead of growing a DataFrame (starting from an empty one) per race.
grid_frames = []

for year in range(1983, 2022):
    r = requests.get(f'https://www.formula1.com/en/results.html/{year}/races.html',
                     timeout=30)
    soup = BeautifulSoup(r.text, 'html.parser')

    # Links of the season's individual race pages, in page order.
    race_prefix = f'/en/results.html/{year}/races/'
    year_links = []
    for page in soup.find_all('a', attrs={'class': "resultsarchive-filter-item-link FilterTrigger"}):
        link = page.get('href')
        # an anchor without href yields None; skip it rather than raise TypeError
        if link and race_prefix in link:
            year_links.append(link)

    for n, link in enumerate(year_links):
        # the starting grid lives next to the race result page
        grid_url = 'https://www.formula1.com' + link.replace('race-result.html', 'starting-grid.html')
        df = pd.read_html(grid_url)[0]
        df['season'] = year
        # round number is taken from the position of the link on the page
        df['round'] = n + 1
        # unnamed columns of the parsed table carry no data of interest
        df = df.drop(columns=[col for col in df.columns if 'Unnamed' in col])
        grid_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame
qualifying_results = pd.concat(grid_frames) if grid_frames else pd.DataFrame()

print(qualifying_results.shape)

(0, 0)


In [None]:
qualifying_results = pd.read_csv('/content/f1qualifying.csv')

In [None]:
qualifying_results

Unnamed: 0,grid_position,driver_name,car,qualifying_time,season,round
0,1,Keke Rosberg ROS,Williams Honda,1:34.526,1983,1
1,2,Alain Prost PRO,Renault,1:34.672,1983,1
2,3,Patrick Tambay TAM,Ferrari,1:34.758,1983,1
3,4,Nelson Piquet PIQ,Brabham BMW,1:35.114,1983,1
4,5,Derek Warwick WAR,Toleman Hart,1:35.206,1983,1
...,...,...,...,...,...,...
14554,16,Antonio Giovinazzi GIO,Alfa Romeo Racing Ferrari,1:38.114,2019,21
14555,17,Kimi Räikkönen RAI,Alfa Romeo Racing Ferrari,1:38.383,2019,21
14556,18,George Russell RUS,Williams Mercedes,1:38.717,2019,21
14557,19,Robert Kubica KUB,Williams Mercedes,1:39.236,2019,21


### Mengambil Data `weather`

In [None]:
import re

from selenium.webdriver.common.by import By


def _weather_from_tables(tables):
    """Return the 'Weather' cell of the first of the leading four tables that has one.

    The label is searched in the first column and the value read from the
    second column; None is returned when no table carries the label.
    """
    for table in tables[:4]:
        labels = list(table.iloc[:, 0])
        if 'Weather' in labels:
            return table.iloc[labels.index('Weather'), 1]
    return None


def _weather_from_italian_page(link):
    """Open *link* in Chrome, switch to the Italian article and read its weather cell."""
    driver = webdriver.Chrome()
    try:
        driver.get(link)
        # click language button
        driver.find_element(By.LINK_TEXT, 'Italiano').click()
        # find weather in italian with selenium
        return driver.find_element(
            By.XPATH, '//*[@id="mw-content-text"]/div/table[1]/tbody/tr[9]/td').text
    finally:
        # always release the browser, otherwise one process leaks per fallback
        driver.quit()


def _has_keyword(description, keywords):
    """Return 1 if any word of *description* is in *keywords*, else 0.

    Words are extracted without punctuation, so 'Sunny, mild, dry.' yields
    'sunny', 'mild' and 'dry' (a plain split() would keep 'sunny,' and 'dry.',
    which match nothing). Non-string values are converted with str() first.
    """
    words = re.findall(r'[a-z]+', str(description).lower())
    return 1 if any(word in keywords for word in words) else 0


# .copy() so that the 'weather' column added below is written to an independent
# frame instead of a slice of `races` (avoids the SettingWithCopy warning).
weather = races.iloc[:, [0, 1, 2]].copy()

info = []

# read wikipedia tables
for link in races.url:
    try:
        # download and parse the page once, then inspect its leading tables
        found = _weather_from_tables(pd.read_html(link))
        if found is None:
            found = _weather_from_italian_page(link)
        info.append(found)
    except Exception:
        # best effort: a page that cannot be fetched or parsed is marked, not fatal
        info.append('not found')

# append column with weather information to dataframe
weather['weather'] = info

# set up a dictionary to convert weather information into keywords

weather_dict = {'weather_warm': ['soleggiato', 'clear', 'warm', 'hot', 'sunny', 'fine', 'mild', 'sereno'],
               'weather_cold': ['cold', 'fresh', 'chilly', 'cool'],
               'weather_dry': ['dry', 'asciutto'],
               'weather_wet': ['showers', 'wet', 'rain', 'pioggia', 'damp', 'thunderstorms', 'rainy'],
               'weather_cloudy': ['overcast', 'nuvoloso', 'clouds', 'cloudy', 'grey', 'coperto']}

# map new df according to weather dictionary: one 0/1 column per category
weather_df = pd.DataFrame(
    {col: weather['weather'].map(lambda text, kw=keywords: _has_keyword(text, kw))
     for col, keywords in weather_dict.items()},
    columns=list(weather_dict),
)

weather_info = pd.concat([weather, weather_df], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
weather

Unnamed: 0,season,round,circuit_id,weather
0,1950,1,silverstone,"Sunny, mild, dry."
1,1950,2,monaco,not found
2,1950,3,indianapolis,Rainy
3,1950,4,bremgarten,"Warm, dry and sunny"
4,1950,5,spa,"Warm, dry and sunny"
...,...,...,...,...
1052,2021,18,rodriguez,Sunny
1053,2021,19,interlagos,Sunny
1054,2021,20,losail,Clear
1055,2021,21,jeddah,Clear


In [None]:
weather_info.to_csv("f1weather.csv")

## Data Preparation

In [None]:
races = pd.read_csv('f1races.csv')
results = pd.read_csv('f1results.csv')
qualifying = pd.read_csv('f1qualifying.csv')
driver_standings = pd.read_csv('f1driver_standings.csv')
constructor_standings = pd.read_csv('f1constructor.csv')
weather = pd.read_csv('f1weather.csv')

In [None]:
# races
print(races.shape)
races.head()

(1057, 8)


Unnamed: 0,season,round,circuit_id,lat,long,country,date,url
0,1950,1,silverstone,52.08,-1.02,UK,1950-05-13,http://en.wikipedia.org/wiki/1950_British_Gran...
1,1950,2,monaco,43.73,7.42,Monaco,1950-05-21,http://en.wikipedia.org/wiki/1950_Monaco_Grand...
2,1950,3,indianapolis,39.8,-86.23,USA,1950-05-30,http://en.wikipedia.org/wiki/1950_Indianapolis...
3,1950,4,bremgarten,46.96,7.4,Switzerland,1950-06-04,http://en.wikipedia.org/wiki/1950_Swiss_Grand_...
4,1950,5,spa,50.44,5.97,Belgium,1950-06-18,http://en.wikipedia.org/wiki/1950_Belgian_Gran...


In [None]:
# qualifying
print(qualifying.shape)
qualifying.rename(columns = {'grid_position': 'grid'}, inplace = True)
qualifying.head()

(14559, 6)


Unnamed: 0,grid,driver_name,car,qualifying_time,season,round
0,1,Keke Rosberg ROS,Williams Honda,1:34.526,1983,1
1,2,Alain Prost PRO,Renault,1:34.672,1983,1
2,3,Patrick Tambay TAM,Ferrari,1:34.758,1983,1
3,4,Nelson Piquet PIQ,Brabham BMW,1:35.114,1983,1
4,5,Derek Warwick WAR,Toleman Hart,1:35.206,1983,1


In [None]:
# driver standings
print(driver_standings.shape)
driver_standings.head()

(27113, 6)


Unnamed: 0,season,round,driver,driver_points,driver_wins,driver_standings_pos
0,1950,1,farina,0.0,0.0,0.0
1,1950,1,fagioli,0.0,0.0,0.0
2,1950,1,reg_parnell,0.0,0.0,0.0
3,1950,1,cabantous,0.0,0.0,0.0
4,1950,1,rosier,0.0,0.0,0.0


In [None]:
# constructor standings
constructor_standings.drop(['Unnamed: 0' ],axis = 1, inplace = True)
print(constructor_standings.shape)
constructor_standings.head()

(12711, 6)


Unnamed: 0,season,round,constructor,constructor_points,constructor_wins,constructor_standings_pos
0,1958,1,cooper,0.0,0.0,0.0
1,1958,1,ferrari,0.0,0.0,0.0
2,1958,1,maserati,0.0,0.0,0.0
3,1958,2,cooper,8.0,1.0,1.0
4,1958,2,ferrari,6.0,0.0,2.0


In [None]:
# weather
weather.drop(['Unnamed: 0'], axis=1, inplace=True)
print(weather.shape)
weather.head()

(1057, 9)


Unnamed: 0,season,round,circuit_id,weather,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy
0,1950,1,silverstone,"Sunny, mild, dry.",0,0,0,0,0
1,1950,2,monaco,not found,0,0,0,0,0
2,1950,3,indianapolis,Rainy,0,0,0,1,0
3,1950,4,bremgarten,"Warm, dry and sunny",1,0,1,0,0
4,1950,5,spa,"Warm, dry and sunny",1,0,1,0,0


In [None]:
race_keys = ['season', 'round', 'circuit_id']

# race calendar + weather flags (location and free-text weather columns dropped)
df1 = races.merge(weather, how='inner', on=race_keys)
df1 = df1.drop(columns=['lat', 'long', 'country', 'weather'])

# one row per driver and race (points, status and time dropped)
df2 = df1.merge(results, how='inner', on=race_keys)
df2 = df2.drop(columns=['points', 'status', 'time'])

# standings are optional: left joins keep races without standings rows
df3 = df2.merge(driver_standings, how='left', on=['season', 'round', 'driver'])
df4 = df3.merge(constructor_standings, how='left', on=['season', 'round', 'constructor'])

# qualifying rows are matched to results through the grid position
final_df = df4.merge(qualifying, how='inner', on=['season', 'round', 'grid'])
final_df = final_df.drop(columns=['driver_name', 'car'])

In [None]:
final_df.drop(['url'], axis=1, inplace=True)
final_df.head()

Unnamed: 0,season,round,circuit_id,date,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,driver,...,constructor,grid,podium,driver_points,driver_wins,driver_standings_pos,constructor_points,constructor_wins,constructor_standings_pos,qualifying_time
0,1983,1,jacarepagua,1983-03-13,0,0,1,0,0,piquet,...,brabham,4,1,0.0,0.0,0.0,0.0,0.0,0.0,1:35.114
1,1983,1,jacarepagua,1983-03-13,0,0,1,0,0,lauda,...,mclaren,9,2,0.0,0.0,0.0,0.0,0.0,0.0,1:36.054
2,1983,1,jacarepagua,1983-03-13,0,0,1,0,0,laffite,...,williams,18,3,0.0,0.0,0.0,0.0,0.0,0.0,1:38.234
3,1983,1,jacarepagua,1983-03-13,0,0,1,0,0,tambay,...,ferrari,3,4,0.0,0.0,0.0,0.0,0.0,0.0,1:34.758
4,1983,1,jacarepagua,1983-03-13,0,0,1,0,0,surer,...,arrows,20,5,0.0,0.0,0.0,0.0,0.0,0.0,1:38.468


In [None]:
# calculate age of drivers: completed years between birth date and race date
final_df['date'] = pd.to_datetime(final_df['date'])
final_df['date_of_birth'] = pd.to_datetime(final_df['date_of_birth'])


def _age_in_years(row):
    """Return the whole years elapsed from the row's birth date to its race date."""
    return relativedelta(row['date'], row['date_of_birth']).years


final_df['driver_age'] = final_df.apply(_age_in_years, axis=1)

# the raw dates are no longer needed once the age is known
final_df.drop(columns=['date', 'date_of_birth'], inplace=True)

In [None]:
# fill/drop nulls

standings_columns = ['driver_points', 'driver_wins', 'driver_standings_pos',
                     'constructor_points', 'constructor_wins', 'constructor_standings_pos']

for col in standings_columns:
    # Missing standings mean "nothing scored yet" -> 0. The filled column is
    # assigned back explicitly: a chained fillna(inplace=True) on a column
    # selection is not guaranteed to modify final_df, and the integer
    # conversion would then fail on the remaining NaN values.
    final_df[col] = final_df[col].fillna(0).astype(int)

# rows still containing a null in any other column are discarded
final_df.dropna(inplace=True)

In [None]:
# convert to boolean: the weather flags become True/False columns

weather_flags = ('weather_warm', 'weather_cold', 'weather_dry', 'weather_wet', 'weather_cloudy')
for flag in weather_flags:
    final_df[flag] = final_df[flag].map(bool)

In [None]:
# calculate difference in qualifying times

def _lap_time_to_seconds(value):
    """Convert a lap time such as '1:34.526' to seconds.

    Returns 0 for the placeholders 0 and '00.000' (those rows are filtered out
    below). A value without a minutes part (e.g. '59.123') is read as plain
    seconds instead of raising IndexError.
    """
    text = str(value)
    if text == '00.000' or value == 0:
        return 0
    parts = text.split(':')
    if len(parts) == 1:
        return float(parts[0])
    return float(parts[1]) + 60 * float(parts[0])


final_df['qualifying_time'] = final_df['qualifying_time'].map(_lap_time_to_seconds)

# .copy() makes the filtered frame independent, so the in-place sort and the
# column assignment below do not operate on a slice of the previous frame.
final_df = final_df[final_df['qualifying_time'] != 0].copy()
final_df.sort_values(['season', 'round', 'grid'], inplace=True)

# Gap of every driver to the first row of its race, i.e. the lowest grid
# position with a valid time (that row itself gets 0). This equals the
# cumulative sum of consecutive differences, without the intermediate column.
reference_time = final_df.groupby(['season', 'round'])['qualifying_time'].transform('first')
final_df['qualifying_time'] = final_df['qualifying_time'] - reference_time

In [None]:
# get dummies

df_dum = pd.get_dummies(final_df, columns=['circuit_id', 'nationality', 'constructor'])

# Minimum column sum required to keep a column whose name contains the given text.
minimum_sums = (('nationality', 140), ('constructor', 140), ('circuit_id', 70))

# Collect the rarely used columns first, then remove them in one call.
rare_columns = [
    name for name in df_dum.columns
    if any(text in name and df_dum[name].sum() < minimum
           for text, minimum in minimum_sums)
]
df_dum.drop(columns=rare_columns, inplace=True)

In [None]:
df_dum.shape

(14272, 99)

In [None]:
df_dum.to_csv('final_df.csv')

## Modelling & Fitting

In [None]:
data = pd.read_csv("final_df.csv")
data.drop(['Unnamed: 0'], axis=1, inplace=True)

In [None]:
final_df = data.copy()
print(final_df.shape)
final_df.head()

(14272, 99)


Unnamed: 0,season,round,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,driver,grid,podium,...,constructor_minardi,constructor_prost,constructor_red_bull,constructor_renault,constructor_sauber,constructor_team_lotus,constructor_toro_rosso,constructor_toyota,constructor_tyrrell,constructor_williams
0,1983,1,False,False,True,False,False,keke_rosberg,1,15,...,0,0,0,0,0,0,0,0,0,1
1,1983,1,False,False,True,False,False,prost,2,6,...,0,0,0,1,0,0,0,0,0,0
2,1983,1,False,False,True,False,False,tambay,3,4,...,0,0,0,0,0,0,0,0,0,0
3,1983,1,False,False,True,False,False,piquet,4,1,...,0,0,0,0,0,0,0,0,0,0
4,1983,1,False,False,True,False,False,warwick,5,7,...,0,0,0,0,0,0,0,0,0,0


In [None]:
final_df.columns

Index(['season', 'round', 'weather_warm', 'weather_cold', 'weather_dry',
       'weather_wet', 'weather_cloudy', 'driver', 'grid', 'podium',
       'driver_points', 'driver_wins', 'driver_standings_pos',
       'constructor_points', 'constructor_wins', 'constructor_standings_pos',
       'qualifying_time', 'driver_age', 'circuit_id_adelaide',
       'circuit_id_albert_park', 'circuit_id_americas', 'circuit_id_bahrain',
       'circuit_id_baku', 'circuit_id_brands_hatch', 'circuit_id_catalunya',
       'circuit_id_detroit', 'circuit_id_estoril', 'circuit_id_galvez',
       'circuit_id_hockenheimring', 'circuit_id_hungaroring',
       'circuit_id_imola', 'circuit_id_indianapolis', 'circuit_id_interlagos',
       'circuit_id_istanbul', 'circuit_id_jacarepagua', 'circuit_id_jerez',
       'circuit_id_kyalami', 'circuit_id_magny_cours', 'circuit_id_marina_bay',
       'circuit_id_monaco', 'circuit_id_monza', 'circuit_id_nurburgring',
       'circuit_id_phoenix', 'circuit_id_red_bull_ring', 

In [None]:
df = data.copy()
# binary target: 1 for the race winner, 0 for every other finishing position
df['podium'] = (df['podium'] == 1).astype(int)

# every season before 2019 is used for training
train = df[df.season < 2019]
y_train = train['podium']
X_train = train.drop(columns=['driver', 'podium'])

# standardise the features; the fitted scaler is reused for the test data
scaler = StandardScaler()
feature_names = X_train.columns
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=feature_names)

In [None]:
# round to predict
# NOTE(review): this name shadows the built-in round(); it is kept unchanged
# because other cells may refer to it.
round = 9

# the single 2019 race selected above forms the test set
is_target_race = (df['season'] == 2019) & (df['round'] == round)
test = df.loc[is_target_race]
y_test = test['podium']
X_test = test.drop(columns=['driver', 'podium'])

# apply the scaling fitted on the training data (no re-fitting)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

### Pemilihan Model

Untuk mengevaluasi model klasifikasi, dapat digunakan `LazyClassifier` dari modul `lazypredict`. Kelas tersebut melatih berbagai model classifier sekaligus dan menampilkan peringkatnya, mulai dari yang tertinggi.

In [None]:
reg = LazyClassifier(verbose=0,ignore_warnings=False, custom_metric=None )
models,predictions = reg.fit(X_train, X_test, y_train, y_test)

 14%|█▍        | 4/29 [00:13<01:35,  3.83s/it]

CategoricalNB model failed to execute
Negative values in data passed to CategoricalNB (input X)


 66%|██████▌   | 19/29 [00:39<00:10,  1.05s/it]

NuSVC model failed to execute
specified nu is infeasible


 90%|████████▉ | 26/29 [00:46<00:04,  1.63s/it]

StackingClassifier model failed to execute
__init__() missing 1 required positional argument: 'estimators'


100%|██████████| 29/29 [00:52<00:00,  1.80s/it]


In [None]:
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AdaBoostClassifier,1.0,1.0,1.0,1.0,1.3
KNeighborsClassifier,1.0,1.0,1.0,1.0,0.08
XGBClassifier,1.0,1.0,1.0,1.0,4.37
RidgeClassifierCV,1.0,1.0,1.0,1.0,0.32
RidgeClassifier,1.0,1.0,1.0,1.0,0.15
RandomForestClassifier,1.0,1.0,1.0,1.0,1.53
LogisticRegression,1.0,1.0,1.0,1.0,0.58
LinearDiscriminantAnalysis,1.0,1.0,1.0,1.0,0.47
LinearSVC,1.0,1.0,1.0,1.0,3.18
LGBMClassifier,1.0,1.0,1.0,1.0,1.21


### Prediksi Menggunakan `Neural Network Classifier`

In [None]:
# Hyperparameters of the multi-layer perceptron used to predict the winner.
# Four hidden layers with 75, 25, 50 and 10 units.
hidden_layer_sizes = (75, 25, 50, 10)
# NOTE(review): 'identity' is a no-op activation, so the hidden layers apply
# no non-linearity -- confirm this is intended.
activation = 'identity'
solver = 'lbfgs'
# regularisation strength; presumably the result of a hyperparameter search -- TODO confirm
alpha = 0.01623776739188721

model = MLPClassifier(hidden_layer_sizes = hidden_layer_sizes,
                              activation = activation, 
                              solver = solver, 
                              alpha = alpha, 
                              random_state = 1)

# train on the scaled training features and the binary winner label
model.fit(X_train, y_train)

MLPClassifier(activation='identity', alpha=0.01623776739188721,
              hidden_layer_sizes=(75, 25, 50, 10), random_state=1,
              solver='lbfgs')

In [None]:
# Build the 'prediction_df' table: class probabilities next to the true label
prediction_df = pd.DataFrame(model.predict_proba(X_test), columns=['proba_0', 'proba_1'])
prediction_df['actual'] = y_test.reset_index(drop=True)

# The predicted winner is the driver with the highest win probability.
# (The flag used to be derived from the row position -- row 1 was always
# marked -- which ignored the probabilities entirely.)
most_likely_row = prediction_df['proba_1'].idxmax()
prediction_df['predicted'] = (prediction_df.index == most_likely_row).astype(int)

# driver names of the test race, re-indexed to line up with prediction_df
something = test[['driver']].reset_index(drop=True)

result = prediction_df.copy()
combined = pd.concat([something, result], axis=1)
print(combined)

predicted_winner = combined['driver'].loc[combined['predicted'].idxmax()]
actual_winner = combined['driver'].loc[combined['actual'].idxmax()]

print("-"*50)
print("Pemenang (Prediksi):", predicted_winner)
print("Pemenang (Asli):", actual_winner)

             driver  proba_0  proba_1  actual  predicted
0           leclerc     0.88     0.12       0          0
1    max_verstappen     0.67     0.33       1          1
2            bottas     0.84     0.16       0          0
3          hamilton     0.71     0.29       0          0
4            norris     0.97     0.03       0          0
5         raikkonen     0.98     0.02       0          0
6        giovinazzi     1.00     0.00       0          0
7             gasly     0.99     0.01       0          0
8   kevin_magnussen     1.00     0.00       0          0
9          grosjean     1.00     0.00       0          0
10        ricciardo     1.00     0.00       0          0
11            perez     1.00     0.00       0          0
12           stroll     1.00     0.00       0          0
13       hulkenberg     1.00     0.00       0          0
14            kvyat     1.00     0.00       0          0
15           kubica     1.00     0.00       0          0
16            albon     1.00   

### Prediksi Menggunakan `KNeighborsClassifier`

In [None]:
# k-nearest-neighbours classifier with the library's default settings.
neigh = KNeighborsClassifier()
# Last expression of the cell: fits the model and displays its repr.
neigh.fit(X=X_train, y=y_train)

KNeighborsClassifier()

In [None]:
# Build 'prediction_df': one row per driver of the test race, holding the
# class probabilities produced by the fitted k-nearest-neighbours model.
# Assumes a binary target so that predict_proba yields exactly two columns.
prediction_df = pd.DataFrame(neigh.predict_proba(X_test), columns=['proba_0', 'proba_1'])
prediction_df['actual'] = y_test.reset_index(drop=True)

# Flag the driver with the highest win probability as the predicted winner.
# The flag used to be derived from the row position (index == 1), which
# always selected the second row regardless of what the model predicted.
prediction_df['predicted'] = (
    prediction_df.index == prediction_df['proba_1'].idxmax()
).astype(int)

# Driver names in the same row order as X_test / y_test.
something = test[['driver']].reset_index(drop=True)

result = prediction_df.copy()
combined = pd.concat([something, result], axis=1)
print(combined)

# idxmax returns the first row holding the maximum, i.e. the flagged row.
predicted_winner = combined['driver'].loc[combined['predicted'].idxmax()]
actual_winner = combined['driver'].loc[combined['actual'].idxmax()]

print("-"*50)
print("Pemenang (Prediksi):", predicted_winner)
print("Pemenang (Asli):", actual_winner)

             driver  proba_0  proba_1  actual  predicted
0           leclerc     1.00     0.00       0          0
1    max_verstappen     0.80     0.20       1          1
2            bottas     0.40     0.60       0          0
3          hamilton     0.80     0.20       0          0
4            norris     0.80     0.20       0          0
5         raikkonen     1.00     0.00       0          0
6        giovinazzi     1.00     0.00       0          0
7             gasly     1.00     0.00       0          0
8   kevin_magnussen     1.00     0.00       0          0
9          grosjean     1.00     0.00       0          0
10        ricciardo     1.00     0.00       0          0
11            perez     1.00     0.00       0          0
12           stroll     0.80     0.20       0          0
13       hulkenberg     1.00     0.00       0          0
14            kvyat     1.00     0.00       0          0
15           kubica     1.00     0.00       0          0
16            albon     1.00   

In [None]:
# Export the k-nearest-neighbours predictions for rounds 17-21 of the 2019
# season, one text file per round (KNround<N>.txt).

# Accumulators kept from the original cell; nothing in this cell fills them.
actual = []
predict_neural = []
predict_kn = []

# The training data is the same for every round, so fit the model once
# instead of refitting it inside the loop.
neigh = KNeighborsClassifier()
neigh.fit(X_train, y_train)

# NOTE: to export the neural-network predictions as well, repeat the loop
# body with `model.predict_proba` and an `NNround<N>.txt` file name.
# `round_no` (not `round`) so the builtin round() is not shadowed.
for round_no in range(17, 22):
    test = df[(df.season == 2019) & (df['round'] == round_no)]
    X_test = test.drop(['driver', 'podium'], axis=1)
    y_test = test.podium

    # Reuse the scaler fitted on the training data.
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

    # Assumes a binary target so that predict_proba yields two columns.
    prediction_df = pd.DataFrame(neigh.predict_proba(X_test), columns=['proba_0', 'proba_1'])
    prediction_df['actual'] = y_test.reset_index(drop=True)

    # Flag the driver with the highest win probability as predicted winner.
    # The flag used to be derived from the row position (index == 1), which
    # ignored the model output entirely.
    prediction_df['predicted'] = (
        prediction_df.index == prediction_df['proba_1'].idxmax()
    ).astype(int)

    # Driver names in the same row order as X_test / y_test.
    something = test[['driver']].reset_index(drop=True)

    result = prediction_df.copy()
    combined = pd.concat([something, result], axis=1)

    # One whitespace-separated line per driver.
    name2 = f'KNround{round_no}.txt'
    np.savetxt(name2, combined.values, fmt='%s')
    print(name2, "saved")

KNround17.txt saved
KNround18.txt saved
KNround19.txt saved
KNround20.txt saved
KNround21.txt saved


## Junk

In [None]:
# Work on a copy so the raw `data` frame stays untouched.
df = data.copy()
# Binary target: 1 where `podium` equals 1, 0 for every other value.
df['podium'] = df['podium'].eq(1).astype('int64')

# Every season before 2019 is used for training.
train = df.loc[df['season'] < 2019]
X_train = train.drop(columns=['driver', 'podium'])
y_train = train['podium']

# Standardise the features; the fitted scaler is reused for the test races.
scaler = StandardScaler()
scaled_train = scaler.fit_transform(X_train)
X_train = pd.DataFrame(scaled_train, columns=X_train.columns)

In [None]:
# Display the working DataFrame for a visual check.
df

Unnamed: 0,season,round,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,driver,grid,podium,...,constructor_minardi,constructor_prost,constructor_red_bull,constructor_renault,constructor_sauber,constructor_team_lotus,constructor_toro_rosso,constructor_toyota,constructor_tyrrell,constructor_williams
0,1983,1,False,False,True,False,False,keke_rosberg,1,0,...,0,0,0,0,0,0,0,0,0,1
1,1983,1,False,False,True,False,False,prost,2,0,...,0,0,0,1,0,0,0,0,0,0
2,1983,1,False,False,True,False,False,tambay,3,0,...,0,0,0,0,0,0,0,0,0,0
3,1983,1,False,False,True,False,False,piquet,4,1,...,0,0,0,0,0,0,0,0,0,0
4,1983,1,False,False,True,False,False,warwick,5,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14267,2019,21,True,False,False,False,False,giovinazzi,16,0,...,0,0,0,0,0,0,0,0,0,0
14268,2019,21,True,False,False,False,False,raikkonen,17,0,...,0,0,0,0,0,0,0,0,0,0
14269,2019,21,True,False,False,False,False,russell,18,0,...,0,0,0,0,0,0,0,0,0,1
14270,2019,21,True,False,False,False,False,kubica,19,0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# Walk through every 2019 round and build its scaled test set. Only the
# variables from the last round remain once the loop has finished.
season_2019 = df[df.season == 2019]
for circuit in season_2019['round'].unique():
    test = season_2019[season_2019['round'] == circuit]
    y_test = test.podium
    X_test = test.drop(columns=['driver', 'podium'])

    # Apply the scaler that was fitted on the training data.
    scaled_test = scaler.transform(X_test)
    X_test = pd.DataFrame(scaled_test, columns=X_test.columns)

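Untuk melakukan evaluasi model regresi, data disiapkan ulang tanpa mengubah kolom `podium` menjadi biner, lalu setiap model dinilai pada seluruh balapan musim 2019 dengan fungsi `score_regression`.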

In [None]:
# Fresh copy of the raw data; `podium` is left as-is for the regressors.
df = data.copy()

# Every season before 2019 is used for training.
train = df.loc[df['season'] < 2019]
X_train = train.drop(columns=['driver', 'podium'])
y_train = train['podium']

# Standardise the features; the fitted scaler is reused for the test races.
scaler = StandardScaler()
scaled_train = scaler.fit_transform(X_train)
X_train = pd.DataFrame(scaled_train, columns=X_train.columns)

In [None]:
def score_regression(model, season=2019):
    """Return the share of a season's races whose winner `model` predicts.

    For every round of `season` the model predicts a value per driver; the
    driver with the lowest predicted value is taken as the predicted winner
    and compared with the driver whose `podium` equals 1.

    Relies on the notebook-level `df` (race data) and `scaler` (fitted on
    the training features).

    Args:
        model: fitted estimator exposing `predict`.
        season: season to evaluate on (defaults to 2019).

    Returns:
        Mean per-race precision of the predicted winner, between 0 and 1.

    Raises:
        ValueError: if `df` holds no races for `season`.
    """
    # Filter the season once instead of on every iteration.
    season_df = df[df.season == season]
    rounds = season_df['round'].unique()
    if len(rounds) == 0:
        raise ValueError(f"no races found for season {season}")

    score = 0
    for circuit in rounds:
        test = season_df[season_df['round'] == circuit]
        X_test = test.drop(['driver', 'podium'], axis=1)
        y_test = test.podium

        # Scale with the scaler fitted on the training data.
        X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

        # Predict, then rank drivers from lowest to highest predicted value.
        prediction_df = pd.DataFrame(model.predict(X_test), columns=['results'])
        prediction_df['podium'] = y_test.reset_index(drop=True)
        prediction_df['actual'] = prediction_df.podium.map(lambda x: 1 if x == 1 else 0)
        prediction_df.sort_values('results', ascending=True, inplace=True)
        prediction_df.reset_index(inplace=True, drop=True)
        # After sorting, row 0 holds the predicted winner.
        prediction_df['predicted'] = prediction_df.index
        prediction_df['predicted'] = prediction_df.predicted.map(lambda x: 1 if x == 0 else 0)

        score += precision_score(prediction_df.actual, prediction_df.predicted)

    # Average over the races actually scored. Dividing by the highest round
    # number would understate the score whenever a round is missing.
    return score / len(rounds)

# Results table filled by the model-comparison cells: one entry per fitted
# model in each of the three parallel lists.
comparison_dict = {column: [] for column in ('model', 'params', 'score')}

### Regresi

In [None]:
# Linear Regression

# Real booleans: the strings 'True' and 'False' are both truthy, so the
# intercept was never actually switched off (and newer scikit-learn
# versions reject non-boolean values outright).
params = {'fit_intercept': [True, False]}

for fit_intercept in params['fit_intercept']:
    # One-element tuple, matching the tuple format of the other grids.
    model_params = (fit_intercept,)
    model = LinearRegression(fit_intercept=fit_intercept)
    model.fit(X_train, y_train)

    model_score = score_regression(model)

    comparison_dict['model'].append('linear_regression')
    comparison_dict['params'].append(model_params)
    comparison_dict['score'].append(model_score)

In [None]:
# Random Forest Regressor

params = {
    # 'squared_error' is the current name of the former 'mse' criterion
    # (requires scikit-learn >= 1.0).
    'criterion': ['squared_error'],
    # 1.0 (all features) stands in for the removed 'auto' option, which
    # meant the same thing for regressors.
    'max_features': [0.8, 1.0, None],
    # Depths must be integers: 5, 7, ..., 55, plus None for unlimited depth.
    # np.linspace produced floats, which scikit-learn does not accept.
    'max_depth': list(range(5, 56, 2)) + [None],
}

for criterion in params['criterion']:
    for max_features in params['max_features']:
        for max_depth in params['max_depth']:
            model_params = (criterion, max_features, max_depth)
            # Fixed random_state keeps the forests reproducible.
            model = RandomForestRegressor(
                criterion=criterion,
                max_features=max_features,
                max_depth=max_depth,
                random_state=1,
            )

            model.fit(X_train, y_train)

            model_score = score_regression(model)

            comparison_dict['model'].append('random_forest_regressor')
            comparison_dict['params'].append(model_params)
            comparison_dict['score'].append(model_score)

In [None]:
## Support Vector Machines

params = {
    'gamma': np.logspace(-4, -1, 10),
    'C': np.logspace(-2, 1, 10),
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
}

# Every (gamma, C, kernel) combination; gamma varies slowest, kernel fastest.
_svr_grid = [
    (gamma, c, kernel)
    for gamma in params['gamma']
    for c in params['C']
    for kernel in params['kernel']
]

for model_params in _svr_grid:
    gamma, c, kernel = model_params

    model = svm.SVR(gamma=gamma, C=c, kernel=kernel)
    model.fit(X_train, y_train)
    model_score = score_regression(model)

    # Record the outcome of this configuration.
    comparison_dict['model'].append('svm_regressor')
    comparison_dict['params'].append(model_params)
    comparison_dict['score'].append(model_score)

In [None]:
# Neural network

params = {
    'hidden_layer_sizes': [(80, 20, 40, 5), (75, 30, 50, 10, 3)],
    'activation': ['identity', 'relu', 'logistic', 'tanh'],
    'solver': ['lbfgs', 'sgd', 'adam'],
    'alpha': np.logspace(-4, 1, 20),
}

# Every combination; layer sizes vary slowest, alpha fastest.
_mlp_grid = [
    (hidden_layer_sizes, activation, solver, alpha)
    for hidden_layer_sizes in params['hidden_layer_sizes']
    for activation in params['activation']
    for solver in params['solver']
    for alpha in params['alpha']
]

for model_params in _mlp_grid:
    hidden_layer_sizes, activation, solver, alpha = model_params

    # Fixed random_state keeps the weight initialisation reproducible.
    model = MLPRegressor(
        hidden_layer_sizes=hidden_layer_sizes,
        activation=activation,
        solver=solver,
        alpha=alpha,
        random_state=1,
    )
    model.fit(X_train, y_train)
    model_score = score_regression(model)

    # Record the outcome of this configuration.
    comparison_dict['model'].append('nn_regressor')
    comparison_dict['params'].append(model_params)
    comparison_dict['score'].append(model_score)