## Library importation 

In [1]:
import datetime
import os
import pandas as pd
import numpy as np
from pprint import pprint
import requests
# Scrappings tools for downloading
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time

In [2]:
import warnings
warnings.filterwarnings("ignore")

### Set a time tracker 
We use it to count a global time for our script execution

In [3]:
start = time.time()

### Initialize Selenium webdriver for Chrome
As we are using Selenium in a remote Notebook, we don't have a real OS for testing purpose. So Selenium has a feature that help us to make web scrapping from a remote server.
The feature is a Remote driver.
The url of our remote driver online tool manager (**Selenium grid**) is <a target="_blank" href="http://141.145.214.205:4444">here</a>


In [4]:
# Setting options for the web driver 
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
hub_url = "http://selenium-hub:4444/wd/hub"

## 1. Creation of a function that return GP's links by year 
It is important to run Selenium driver with **with** expression as in the next lines, is a best practice that help us to preserve our memory by closing the browser usage in memory, even if we have an error that occurs when the scripts has started

In [5]:

def get_links_by_year():
    round_link_by_year = []

    # Calling remote driver for selenium
    with webdriver.Remote(command_executor=hub_url, options=chrome_options) as driver:
        for year in range(1950, 2024):
            i = 1
            round_values = []
            base_url = "https://www.formula1.com/en/results.html/%s/races.html" % year
        
            # Open the web page
            driver.get(base_url)

            # We first have to accept cookies
            if i == 1:
                try:
                    accept_cookies_btn = driver.find_elements(By.XPATH, "//button[@id='truste-consent-button']")
                    if(len(accept_cookies_btn) > 0):
                        accept_cookies_btn[0].click()
                except:
                    print("Error: There is an error with cookie acceptation")

            # Now grab the all rounds html link elements by their XPATH
            round_list = driver.find_elements(By.XPATH,'//a[@data-name="meetingKey"]')

            if(len(round_list) > 0):
                for round in round_list:
                    # Remove the link for all races because we don't need it
                    url_all = 'https://www.formula1.com/en/results.html/%s/races.html' % year
                    if(round.get_attribute("href") != url_all):
                        round_values.append(round.get_attribute("href"))

            # Create a dictionnary that contains all the rounds for a year
            value = {
                "year": year,
                "rounds": round_values
            }
        
            round_link_by_year.append(value)

            # As we are in the same navigation session, we need to change it after the 
            # first iteration to avoid to try to accept the cookie again. 
            i += 1 
    
        driver.close()
        
    return round_link_by_year

In [6]:
round_link_by_year = get_links_by_year()

## 2. Creation of a function that returns qualifiying URL

In [7]:
def get_qualifiying_url(round_url, diff = False):
    qualifiying_url = ''
    with webdriver.Remote(command_executor=hub_url, options=chrome_options) as driver:
        driver.get(round_url)

        try:
            if diff:
                qualifiying_elt = driver.find_element(By.XPATH,'//a[@data-value="qualifying"]')
            else:
                qualifiying_elt = driver.find_element(By.XPATH,'//a[@data-value="qualifying-0"]')
            qualifiying_url = qualifiying_elt.get_attribute("href")
        except:
            print("There is no element correponding to this XPATH")
            
        
        driver.close()
    return qualifiying_url

## 3. Function that retrives the data for just only one round (Params: year, round and url)

In [8]:
def get_table_data(year, round, url, diff = False):
    data = {'position': [],
           'number':[],
           'driver':[],
           'team': [],
           'time': [],
           'Q1': [],
           'Q2': [],
           'Q3': [],
           'laps': [],
           'year': [],
           'round': []}
    
    with webdriver.Remote(command_executor=hub_url, options=chrome_options) as driver:
        driver.get(url)
        # The default size is (800, 400 ), we need to set the size of the window otherwise 
        # the value of race team will be empty 
        driver.set_window_size(width=1200, height=800)

        try: 
            table_rows = driver.find_elements(By.XPATH,'//table/tbody/tr')
            row_numbers = len(table_rows)

            if(row_numbers > 0):
                for row in range(1, row_numbers + 1):
                    column_number = 2
                    try:
                        position = driver.find_element(By.XPATH,('//table/tbody/tr[%d]/td[%d]' % (row, column_number)))
                        data['position'].append(position.text)
                    except:
                        print("Row n°%d - No position" % row)
                    column_number += 1
                    try:
                        number = driver.find_element(By.XPATH,('//table/tbody/tr[%d]/td[%d]' % (row, column_number)))
                        data['number'].append(number.text)
                    except:
                        print("Row n°%d - No number" % row)
                    column_number += 1
                    try:
                        driver_name = driver.find_element(By.XPATH,('//table/tbody/tr[%d]/td[%d]' % (row, column_number)))
                        data['driver'].append(driver_name.text)
                    except:
                        print("Row n°%d - No driver" % row)
                    column_number += 1
                    try:
                        race_team = driver.find_element(By.XPATH,('//table/tbody/tr[%d]/td[%d]' % (row, column_number)))
                        data['team'].append(race_team.text)
                    except:
                        print("Row n°%d - No Race team" % row)
                    column_number += 1

                    if not diff:
                        try:
                            time = driver.find_element(By.XPATH,('//table/tbody/tr[%d]/td[%d]' % (row, column_number)))
                            data['time'].append(time.text)
                        except:
                            print("Row n°%d - No time" % row)
                        column_number += 1
                        try:
                            lap = driver.find_element(By.XPATH,('//table/tbody/tr[%d]/td[%d]' % (row, column_number)))
                            data['laps'].append(lap.text)
                        except:
                            print("Row n°%d - No Lap" % row)
                        # Fill the others fields presents in the row
                        data['Q1'].append('')
                        data['Q2'].append('')
                        data['Q3'].append('')
                    else:
                        # TODO Process the retirving of Q1, Q2 and Q3 and Laps
                        try:
                            q1 = driver.find_element(By.XPATH,('//table/tbody/tr[%d]/td[%d]' % (row, column_number)))
                            data['Q1'].append(q1.text)
                        except:
                            print("Row n°%d - No Q1 time" % row)
                        column_number += 1
                        try:
                            q2 = driver.find_element(By.XPATH,('//table/tbody/tr[%d]/td[%d]' % (row, column_number)))
                            data['Q2'].append(q2.text)
                        except:
                            print("Row n°%d - No Q2 time" % row)
                        column_number += 1
                        try:
                            q3 = driver.find_element(By.XPATH,('//table/tbody/tr[%d]/td[%d]' % (row, column_number)))
                            data['Q3'].append(q3.text)
                        except:
                            print("Row n°%d - No Q3 time" % row)
                        column_number += 1
                        try:
                            lap = driver.find_element(By.XPATH,('//table/tbody/tr[%d]/td[%d]' % (row, column_number)))
                            data['laps'].append(lap.text)
                        except:
                            print("Row n°%d - No Lap" % row)
                        # Fill the others fields presents in the row
                        data['time'].append('')

                    data["year"].append(year)
                    data["round"].append(round)
        except:
            print("There is no data table here")
        
        driver.close()

    results = pd.DataFrame(data)
    return results

## 4. Create a function that retrieves data for only one year (params: year::string)

In [9]:
def get_one_year_data(year_code, diff = False):
    # Generate the round number by counting each row line 
    round_nber = 1
    list_df = []
    for round_url in round_link_by_year[year_code]['rounds']:
        url = get_qualifiying_url(round_url, diff)
        print(url)
        if url == '':
            round_nber += 1
            pass
        else:
            data = get_table_data(round_link_by_year[year_code]["year"], round_nber, url, diff)
            list_df.append(data)
            round_nber += 1

    return pd.concat(list_df)
    


## Create an iteration for getting the data from **1950** to  **2023**


In [None]:
qualifiying_list = [get_one_year_data(year, year > 55) for year in range(0, 74)]

In [29]:
df = pd.concat(qualifiying_list)

In [31]:
df

Unnamed: 0,position,number,driver,team,time,Q1,Q2,Q3,laps,year,round
0,1,2,Nino Farina,ALFA ROMEO,1:50.800,,,,,1950,1
0,1,34,Juan Manuel Fangio,ALFA ROMEO,1:50.200,,,,,1950,2
0,1,98,Walt Faulkner,KURTIS KRAFT OFFENHAUSER,1:06.992,,,,,1950,3
0,1,14,Juan Manuel Fangio,ALFA ROMEO,2:42.100,,,,,1950,4
0,1,8,Nino Farina,ALFA ROMEO,4:37.000,,,,,1950,5
...,...,...,...,...,...,...,...,...,...,...,...
15,16,23,Alexander Albon,WILLIAMS MERCEDES,,2:00.314,,,8,2023,13
16,17,24,Zhou Guanyu,ALFA ROMEO FERRARI,,2:00.832,,,9,2023,13
17,18,2,Logan Sargeant,WILLIAMS MERCEDES,,2:01.535,,,6,2023,13
18,19,3,Daniel Ricciardo,ALPHATAURI HONDA RBPT,,2:02.159,,,8,2023,13


### Export de data as csv file

In [34]:
df.to_csv('./data/qualifiying_results.csv', index = False)

In [20]:
end = time.time()

In [21]:
str(datetime.timedelta(seconds=(end - start)))

'1:38:29.555796'

In [22]:
# list_of_qualifiying_part_2

In [23]:
# //a[@data-name="year"] # Years
# //a[@data-name="meetingKey"] # round

# The starting Grid begin at 1960

# # Qualification data start in 1983 (Overall Qualifiying - Qualifiying 1 - Qualifiying 2)
# //a[@data-value="qualifying-0"] # Overrall qualifiying (unique)
# //a[@data-value="qualifying-2"] # Qualifiying 2 
# //a[@data-value="qualifying-1"] # Qualifiying 1 

# # Practices data starts in 1988, 2 Practices and 2 qualifiying with a single overall qualifiying stage
# //a[@data-value="qualifying-0"] # Overrall qualifiying 
# //a[@data-value="qualifying-2"] # Qualifiying 2 
# //a[@data-value="qualifying-1"] # Qualifiying 1 
# //a[@data-value="practice-2"] # Practice 2
# //a[@data-value="practice-1"] # Practice 1

# # The format changed in 1996, they switched from 2 qualifiying to a single qualifiying and 2 pratices
# //a[@data-value="qualifying-0"] # Qualifiying 
# //a[@data-value="practice-2"] # Practice 2
# //a[@data-value="practice-1"] # Practice 1

# # The format changed again in 2003, they came back to 2 qualifiying and 2 pratices
# //a[@data-value="qualifying-0"] # Overrall qualifiying (unique)
# //a[@data-value="qualifying-2"] # Qualifiying 2 
# //a[@data-value="qualifying-1"] # Qualifiying 1 
# //a[@data-value="practice-2"] # Practice 2
# //a[@data-value="practice-1"] # Practice 1

# # In 2004, They added 2 others practices steps and stayed with 2 qualifiying 
# //a[@data-value="qualifying-0"] # Overrall qualifiying (unique)
# //a[@data-value="qualifying-2"] # Qualifiying 2 
# //a[@data-value="qualifying-1"] # Qualifiying 1 
# //a[@data-value="practice-4"] # Practice 4
# //a[@data-value="practice-3"] # Practice 3
# //a[@data-value="practice-2"] # Practice 2
# //a[@data-value="practice-1"] # Practice 1

# # In 2006, They removed one practice steps and keeped just one qualifiying 
# //a[@data-value="qualifying"] # Qualifiying (unique) (Q1-Q2-Q3)
# //a[@data-value="practice-3"] # Practice 3
# //a[@data-value="practice-2"] # Practice 2
# //a[@data-value="practice-1"] # Practice 1



# //p[@class="note"]