## Library imports

In [1]:
import datetime
import os
import pandas as pd
import numpy as np
from pprint import pprint
import requests
# Scraping tools used to download the pages
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time

In [2]:
import warnings
# Suppress all Python warnings so they do not clutter the cell outputs.
warnings.filterwarnings("ignore")

### Start a timer
We use it to measure the total execution time of the notebook.

In [3]:
# Wall-clock reference point; a later cell subtracts it from `end` to report the elapsed time.
start = time.time()

### Initialize Selenium webdriver for Chrome
This notebook runs on a remote server, where there is no local browser for Selenium to drive. Selenium's **Remote WebDriver** solves this: the notebook sends its commands to a browser that runs on another machine.
The remote browsers are managed by a **Selenium Grid**, whose web console is available <a target="_blank" href="http://141.145.214.205:4444">here</a>.


In [15]:
# Setting options for the web driver
# Chrome is started without a visible window.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
# Address of the Selenium Grid hub used by every webdriver.Remote(...) call in
# this notebook. Set the SELENIUM_HUB_URL environment variable to point the
# notebook at another grid without editing the code.
hub_url = os.environ.get("SELENIUM_HUB_URL", "http://selenium-hub:4444/wd/hub")

## 1. Function that returns the GP links by year
Always open the Selenium driver in a **with** statement, as in the next lines. It is a best practice: the browser session is closed and its memory released even if an error occurs after the script has started.

In [153]:

def get_links_by_year(first_year=1950, last_year=2023):
    """Collect the result-page link of every round, season by season.

    A single remote Chrome session visits the races overview page of each
    season and reads the ``href`` of every ``meetingKey`` link found on it.
    The link that points back to the overview page itself is dropped.

    Parameters
    ----------
    first_year : int, default 1950
        First season to scrape (inclusive).
    last_year : int, default 2023
        Last season to scrape (inclusive).

    Returns
    -------
    list of dict
        One ``{"year": int, "rounds": [url, ...]}`` entry per season, in
        chronological order, so index 0 holds ``first_year``.
    """
    round_link_by_year = []

    # The consent banner only has to be dismissed once per browser session.
    # The flag lives outside the loop and is set only after a successful
    # click, so a banner that shows up late is still handled on a later page.
    cookies_accepted = False

    # Calling remote driver for selenium; leaving the `with` block ends the
    # session, so no explicit close() is needed.
    with webdriver.Remote(command_executor=hub_url, options=chrome_options) as driver:
        for year in range(first_year, last_year + 1):
            base_url = "https://www.formula1.com/en/results.html/%s/races.html" % year

            # Open the web page
            driver.get(base_url)

            if not cookies_accepted:
                try:
                    accept_cookies_btn = driver.find_elements(By.XPATH, "//button[@id='truste-consent-button']")
                    if accept_cookies_btn:
                        accept_cookies_btn[0].click()
                        cookies_accepted = True
                except Exception:
                    # Best effort: scraping can continue without the click.
                    print("Error: There is an error with cookie acceptation")

            # Grab all the round links by their XPATH, skipping the
            # "all races" entry, which links to the overview page itself.
            round_values = []
            for round_link in driver.find_elements(By.XPATH, '//a[@data-name="meetingKey"]'):
                href = round_link.get_attribute("href")
                if href != base_url:
                    round_values.append(href)

            round_link_by_year.append({"year": year, "rounds": round_values})

    return round_link_by_year

## 2. Function that returns the qualifying URL of a round

In [224]:
def get_qualifiying_url(round_url, diff = False):
    """Return the qualifying-results link found on a round's result page.

    Parameters
    ----------
    round_url : str
        URL of the round result page to open.
    diff : bool, default False
        Selects which link is looked up: the anchor whose ``data-value`` is
        ``"qualifying"`` when True, ``"qualifying-0"`` otherwise.

    Returns
    -------
    str
        The ``href`` of the matching link, or ``''`` when the link cannot be
        found or has no ``href``.
    """
    xpath = '//a[@data-value="qualifying"]' if diff else '//a[@data-value="qualifying-0"]'
    qualifiying_url = ''

    # Leaving the `with` block ends the remote session, so no explicit
    # close() is needed.
    with webdriver.Remote(command_executor=hub_url, options=chrome_options) as driver:
        driver.get(round_url)

        try:
            qualifiying_url = driver.find_element(By.XPATH, xpath).get_attribute("href")
        except Exception:
            # `Exception` rather than a bare except, so that interrupting the
            # kernel during a long scrape is not silently swallowed.
            print("There is no element correponding to this XPATH")

    # Callers test the result against '', so never hand back None.
    return qualifiying_url or ''

## 3. Function that retrieves the data for a single round (params: year, round and url)

In [244]:
def _read_cell_text(driver, row, column, label):
    """Return the text of cell ``td[column]`` in table row ``tr[row]``.

    Both positions are 1-based XPath indexes. If the cell cannot be read, the
    failure is reported using ``label`` and ``''`` is returned, so the caller
    always gets exactly one value per cell.
    """
    try:
        cell = driver.find_element(By.XPATH, '//table/tbody/tr[%d]/td[%d]' % (row, column))
        return cell.text
    except Exception:
        print("Row n°%d - No %s" % (row, label))
        return ''


def get_table_data(year, round, url, diff = False):
    """Scrape the results table of one qualifying page into a DataFrame.

    Parameters
    ----------
    year : int
        Season, copied into the ``year`` column of every row.
    round : int
        Round number, copied into the ``round`` column of every row.
    url : str
        Page to open.
    diff : bool, default False
        Column layout to read after the team cell. False reads ``time`` then
        ``laps`` and leaves ``Q1``/``Q2``/``Q3`` blank; True reads ``Q1``,
        ``Q2``, ``Q3`` then ``laps`` and leaves ``time`` blank.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` of the table, with the columns position, number,
        driver, team, time, Q1, Q2, Q3, laps, year, round. A cell that cannot
        be read is stored as ``''``, which keeps every column the same length.
        The frame is empty when no table row is found.
    """
    columns = ['position', 'number', 'driver', 'team', 'time',
               'Q1', 'Q2', 'Q3', 'laps', 'year', 'round']
    data = {name: [] for name in columns}

    # (column key, label used in the failure message), in on-page order.
    scraped = [('position', 'position'), ('number', 'number'),
               ('driver', 'driver'), ('team', 'Race team')]
    if diff:
        scraped += [('Q1', 'Q1 time'), ('Q2', 'Q2 time'), ('Q3', 'Q3 time'), ('laps', 'Lap')]
        blank = ['time']
    else:
        scraped += [('time', 'time'), ('laps', 'Lap')]
        blank = ['Q1', 'Q2', 'Q3']

    # Leaving the `with` block ends the remote session, so no explicit
    # close() is needed.
    with webdriver.Remote(command_executor=hub_url, options=chrome_options) as driver:
        driver.get(url)
        # The default size is (800, 400 ), we need to set the size of the window otherwise
        # the value of race team will be empty
        driver.set_window_size(width=1200, height=800)

        try:
            row_numbers = len(driver.find_elements(By.XPATH, '//table/tbody/tr'))

            for row in range(1, row_numbers + 1):
                # The scraped values start at td[2] and are contiguous.
                for column_number, (key, label) in enumerate(scraped, start=2):
                    data[key].append(_read_cell_text(driver, row, column_number, label))
                # Fill the fields that this layout does not provide
                for key in blank:
                    data[key].append('')

                data["year"].append(year)
                data["round"].append(round)
        except Exception:
            print("There is no data table here")

    return pd.DataFrame(data)

## 4. Function that retrieves the data for a single year (params: year_code, the integer index of the season in `round_link_by_year`, where 0 is 1950)

In [225]:
def get_one_year_data(year_code, diff = False):
    """Scrape the qualifying tables of every round of one season.

    Reads the module-level ``round_link_by_year`` list, so that list must have
    been built before this function is called.

    Parameters
    ----------
    year_code : int
        Index of the season inside ``round_link_by_year``.
    diff : bool, default False
        Forwarded unchanged to ``get_qualifiying_url`` and ``get_table_data``.

    Returns
    -------
    pandas.DataFrame
        The per-round frames concatenated in round order, or an empty
        DataFrame when no round of the season has a qualifying page.
    """
    season = round_link_by_year[year_code]
    list_df = []

    # The round number is the position of the link in the season's list, so a
    # round without a qualifying page still consumes its number.
    for round_nber, round_url in enumerate(season['rounds'], start=1):
        url = get_qualifiying_url(round_url, diff)
        print(url)
        if not url:
            continue
        list_df.append(get_table_data(season["year"], round_nber, url, diff))

    if not list_df:
        # pd.concat raises ValueError when given an empty list.
        return pd.DataFrame()
    return pd.concat(list_df)
    

In [None]:
# Scrape the round links of every season once; get_one_year_data and the cells below index this list by position.
round_link_by_year = get_links_by_year()

In [None]:
# Peek at the round links stored at index 34 of the list.
round_link_by_year[34]['rounds']

In [190]:
# Check which season sits at index 56 (2006 in the recorded output).
round_link_by_year[56]["year"]

2006

In [None]:
# Try the scraper on a single season (index 17 of round_link_by_year) and display the result.
df = get_one_year_data(17)
df


## Loop over the seasons from **1950** to **1982**


In [148]:
# One DataFrame per season for indexes 0-32 of round_link_by_year, scraped with the default diff=False.
list_of_qualifiying_part_1 = [get_one_year_data(year) for year in range(0, 33)]

In [178]:
# Stack the per-season frames into a single DataFrame and display it.
pd.concat(list_of_qualifiying_part_1)

Unnamed: 0,position,number,driver,team,time,year,round
0,1,2,Nino Farina,ALFA ROMEO,1:50.800,1950,1
0,1,34,Juan Manuel Fangio,ALFA ROMEO,1:50.200,1950,2
0,1,98,Walt Faulkner,KURTIS KRAFT OFFENHAUSER,1:06.992,1950,3
0,1,14,Juan Manuel Fangio,ALFA ROMEO,2:42.100,1950,4
0,1,8,Nino Farina,ALFA ROMEO,4:37.000,1950,5
...,...,...,...,...,...,...,...
0,1,28,Didier Pironi,FERRARI,1:47.947,1982,12
0,1,1,Nelson Piquet,BRABHAM FORD,1:27.612,1982,13
0,1,15,Alain Prost,RENAULT,1:01.380,1982,14
0,1,28,Mario Andretti,FERRARI,1:28.473,1982,15


## Loop over the seasons from **1983** to **2005**

In [209]:
# Index 56 is the first season left out of the range(33, 56) loop below (2006 in the recorded output).
round_link_by_year[56]["year"]

2006

In [None]:
# One DataFrame per season for indexes 33-55 of round_link_by_year, scraped with the default diff=False.
list_of_qualifiying_part_2 = [get_one_year_data(year) for year in range(33, 56)]

In [213]:
# Stack the per-season frames of the second batch into a single DataFrame.
df_partt_2 = pd.concat(list_of_qualifiying_part_2) 

In [226]:
# Display the combined frame.
df_partt_2

Unnamed: 0,position,number,driver,team,time,year,round
0,1,1,Keke Rosberg,WILLIAMS FORD,1:34.526,1983,1
1,2,15,Alain Prost,RENAULT,1:34.672,1983,1
2,3,27,Patrick Tambay,FERRARI,1:34.758,1983,1
3,4,5,Nelson Piquet,BRABHAM BMW,1:35.114,1983,1
4,5,35,Derek Warwick,TOLEMAN HART,1:35.206,1983,1
...,...,...,...,...,...,...,...
15,16,11,Jacques Villeneuve,SAUBER PETRONAS,1:36.788,2005,19
16,17,4,Takuma Sato,BAR HONDA,1:37.083,2005,19
17,18,21,Christijan Albers,MINARDI COSWORTH,1:39.105,2005,19
18,19,18,Tiago Monteiro,JORDAN TOYOTA,1:39.233,2005,19


In [246]:
# Scrape a single season (index 33); get_one_year_data prints each qualifying URL it visits.
df_data = get_one_year_data(33)

https://www.formula1.com/en/results.html/1983/races/9/brazil/qualifying-0.html
https://www.formula1.com/en/results.html/1983/races/453/usa-west/qualifying-0.html
https://www.formula1.com/en/results.html/1983/races/454/france/qualifying-0.html
https://www.formula1.com/en/results.html/1983/races/455/san-marino/qualifying-0.html
https://www.formula1.com/en/results.html/1983/races/456/monaco/qualifying-0.html
https://www.formula1.com/en/results.html/1983/races/457/belgium/qualifying-0.html
https://www.formula1.com/en/results.html/1983/races/8/detroit/qualifying-0.html
https://www.formula1.com/en/results.html/1983/races/458/canada/qualifying-0.html
https://www.formula1.com/en/results.html/1983/races/459/great-britain/qualifying-0.html
https://www.formula1.com/en/results.html/1983/races/460/germany/qualifying-0.html
https://www.formula1.com/en/results.html/1983/races/461/austria/qualifying-0.html
https://www.formula1.com/en/results.html/1983/races/462/netherlands/qualifying-0.html
https://ww

In [248]:
# Show the last 22 rows of df_data.
df_data.tail(22)

Unnamed: 0,position,number,driver,team,time,Q1,Q2,Q3,laps,year,round
4,5,15,Alain Prost,RENAULT,1:07.186,,,,,1983,15
5,6,1,Keke Rosberg,WILLIAMS HONDA,1:07.256,,,,,1983,15
6,7,12,Nigel Mansell,LOTUS RENAULT,1:07.643,,,,,1983,15
7,8,9,Manfred Winkelhock,ATS BMW,1:07.682,,,,,1983,15
8,9,22,Andrea de Cesaris,ALFA ROMEO,1:07.759,,,,,1983,15
9,10,2,Jacques Laffite,WILLIAMS HONDA,1:07.931,,,,,1983,15
10,11,11,Elio de Angelis,LOTUS RENAULT,1:07.937,,,,,1983,15
11,12,8,Niki Lauda,MCLAREN TAG,1:07.974,,,,,1983,15
12,13,35,Derek Warwick,TOLEMAN HART,1:08.061,,,,,1983,15
13,14,16,Eddie Cheever,RENAULT,1:08.069,,,,,1983,15


In [168]:
# Stop the timer here: `end` must be set in this cell, otherwise a fresh
# Restart & Run All fails with a NameError. `start` comes from the timer cell
# at the top of the notebook. The result is shown as H:MM:SS.
end = time.time()
str(datetime.timedelta(seconds=(end - start)))

'0:37:18.330823'

In [175]:
# Display the raw list of per-season results.
# NOTE(review): the recorded output below is [None], which get_one_year_data as
# defined in this notebook cannot produce — the output looks stale; re-run to confirm.
list_of_qualifiying_part_2

[None]

In [None]:
# //a[@data-name="year"] # Years
# //a[@data-name="meetingKey"] # round

# The starting grid begins in 1960

# # Qualifying data starts in 1983 (Overall Qualifying - Qualifying 1 - Qualifying 2)
# //a[@data-value="qualifying-0"] # Overall qualifying (unique)
# //a[@data-value="qualifying-2"] # Qualifying 2
# //a[@data-value="qualifying-1"] # Qualifying 1

# # Practice data starts in 1988: 2 practices and 2 qualifying sessions, with a single overall qualifying stage
# //a[@data-value="qualifying-0"] # Overall qualifying
# //a[@data-value="qualifying-2"] # Qualifying 2
# //a[@data-value="qualifying-1"] # Qualifying 1
# //a[@data-value="practice-2"] # Practice 2
# //a[@data-value="practice-1"] # Practice 1

# # The format changed in 1996: they switched from 2 qualifying sessions to a single qualifying session and 2 practices
# //a[@data-value="qualifying-0"] # Qualifying
# //a[@data-value="practice-2"] # Practice 2
# //a[@data-value="practice-1"] # Practice 1

# # The format changed again in 2003: they came back to 2 qualifying sessions and 2 practices
# //a[@data-value="qualifying-0"] # Overall qualifying (unique)
# //a[@data-value="qualifying-2"] # Qualifying 2
# //a[@data-value="qualifying-1"] # Qualifying 1
# //a[@data-value="practice-2"] # Practice 2
# //a[@data-value="practice-1"] # Practice 1

# # In 2004, they added 2 more practice sessions and stayed with 2 qualifying sessions
# //a[@data-value="qualifying-0"] # Overall qualifying (unique)
# //a[@data-value="qualifying-2"] # Qualifying 2
# //a[@data-value="qualifying-1"] # Qualifying 1
# //a[@data-value="practice-4"] # Practice 4
# //a[@data-value="practice-3"] # Practice 3
# //a[@data-value="practice-2"] # Practice 2
# //a[@data-value="practice-1"] # Practice 1

# # In 2006, they removed one practice session and kept just one qualifying session
# //a[@data-value="qualifying"] # Qualifying (unique) (Q1-Q2-Q3)
# //a[@data-value="practice-3"] # Practice 3
# //a[@data-value="practice-2"] # Practice 2
# //a[@data-value="practice-1"] # Practice 1



# //p[@class="note"]