## Library imports

In [1]:
import datetime
import os
import pandas as pd
import numpy as np
from pprint import pprint
import requests
# Scraping tools for downloading
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
import re

In [2]:
import warnings
# Install a global "ignore" filter: with no category or module given, every
# warning raised after this cell runs is suppressed.
warnings.filterwarnings("ignore")

### Set a time tracker
We use it to measure the total execution time of the script.

In [3]:
# Wall-clock timestamp taken before any scraping; the `end` cell near the
# bottom of the notebook is subtracted from it to report the run time.
start = time.time()

### Initialize Selenium webdriver for Chrome
Because we are using Selenium from a remote notebook, we do not have a real OS to test on. Selenium provides a feature that lets us do web scraping from a remote server:
the Remote driver.
The URL of our remote driver's online management tool (**Selenium Grid**) is <a target="_blank" href="http://141.145.214.205:4444">here</a>.


In [4]:
# Address of the Selenium hub that the remote driver sends its commands to
hub_url = "http://selenium-hub:4444/wd/hub"

# Browser options: run Chrome without a visible window
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')

## 1. Create a function that returns the qualifying links
It is important to run the Selenium driver inside a **with** statement, as in the next lines. This is a best practice that preserves memory by closing the browser session, even if an error occurs after the script has started.

In [5]:
def get_round_urls(first_page=1, last_page=11):
    """Collect article links whose URL mentions "qualifying".

    Walks the paginated "f1-qualifying-times" tag listing through the remote
    Selenium driver and keeps every post-title link whose ``href`` contains a
    word that includes "qualifying".

    Parameters
    ----------
    first_page : int, optional
        First listing page to visit (inclusive). Defaults to 1.
    last_page : int, optional
        Last listing page to visit (inclusive). Defaults to 11, so the default
        call visits the same pages as the former hard-coded ``range(1, 12)``.

    Returns
    -------
    list of str
        Matching hrefs in the order they were encountered.
    """
    listing_url = "https://www.f1-fansite.com/tag/f1-qualifying-times/page/%s/"
    link_xpath = '//div[@class="post_content pc_full"]/h2/a'
    qualifying_word = re.compile(r"\b\w*qualifying\w*\b")

    all_round_urls = []

    # The `with` block ends the remote browser session on exit, even when an
    # error is raised, so no explicit driver.close() is needed.
    with webdriver.Remote(command_executor=hub_url, options=chrome_options) as driver:
        for page in range(first_page, last_page + 1):
            # Open the listing page
            driver.get(listing_url % page)

            # find_elements returns an empty list when nothing matches, so the
            # loop simply does not run for pages without post links.
            for link in driver.find_elements(By.XPATH, link_xpath):
                # Read the attribute once; it is None when the anchor carries
                # no href, which would make the regex call raise TypeError.
                href = link.get_attribute("href")
                if href and qualifying_word.search(href):
                    all_round_urls.append(href)

    return all_round_urls

In [6]:
def get_data_list(urls):
    """Scrape title, weather block, round name and year from each article.

    Parameters
    ----------
    urls : iterable of str
        Article URLs to visit with the remote Selenium driver.

    Returns
    -------
    dict
        Column-oriented data with the keys ``'gp_title'``, ``'weather'``,
        ``'year'``, ``'title'`` and ``'url'``. Each key maps to a list with
        one entry per successfully scraped page. ``'year'`` is the first
        standalone four-digit number in the article title, or ``''`` when the
        title has none.

    Notes
    -----
    Scraping is best-effort: a page on which any of the three elements cannot
    be read is reported on stdout and skipped, so the returned lists can be
    shorter than ``urls``.
    """
    # Imported here so the function does not depend on an extra top-of-file import.
    from selenium.common.exceptions import NoSuchElementException

    title_xpath = '//div[@id="content"]/div[1]/div[1]/h1'
    weather_xpath = '/html/body/div[1]/div/div[4]/div/div[1]/div[1]/div[2]/p[2]'
    round_xpath = '/html/body/div[1]/div/div[4]/div/div[1]/div[1]/div[2]/p[1]/a[1]'
    year_pattern = re.compile(r'\b\d{4}\b')

    data = {'gp_title': [],
            'weather': [],
            'year': [],
            'title': [],
            'url': []}

    # The `with` block ends the remote browser session on exit, even when an
    # error is raised, so no explicit driver.close() is needed.
    with webdriver.Remote(command_executor=hub_url, options=chrome_options) as driver:
        for url in urls:
            # Open the web page
            driver.get(url)

            try:
                # Read all three fields before storing anything, so a page
                # that fails half-way never leaves the column lists unaligned.
                article_title = driver.find_element(By.XPATH, title_xpath).text
                weather_block_text = driver.find_element(By.XPATH, weather_xpath).text
                round_name = driver.find_element(By.XPATH, round_xpath).text
            except NoSuchElementException:
                # Expected for pages that do not follow the results layout.
                print("No Xptah found")
                print(url)
                continue
            except Exception as exc:
                # Stay best-effort, but name the real failure instead of
                # reporting it as a missing XPath.
                print("Scraping failed (%s)" % type(exc).__name__)
                print(url)
                continue

            year_match = year_pattern.search(article_title)
            year = year_match.group(0) if year_match else ''

            data['year'].append(year)
            data['gp_title'].append(round_name)
            data['weather'].append(weather_block_text)
            data['title'].append(article_title)
            data['url'].append(url)

    return data

In [7]:
# Collect the candidate article links from the tag listing pages
# (opens a remote browser session).
urls = get_round_urls()

In [8]:
# Visit every collected link and extract its fields; pages where the expected
# elements are missing are printed below and left out of the result.
data = get_data_list(urls) 

No Xptah found
https://www.f1-fansite.com/f1-news/steiner-q4-qualifying-could-favour-big-teams/
No Xptah found
https://www.f1-fansite.com/f1-news/brawn-no-q4-qualifying-for-2019/
No Xptah found
https://www.f1-fansite.com/f1-news/liberty-considers-f1-qualifying-race/


In [9]:
# Build a table from the dict of column lists (one row per scraped article).
df = pd.DataFrame(data)

In [10]:
# Display the scraped table to inspect it before cleaning.
df

Unnamed: 0,gp_title,weather,year,title,url
0,Belgian Grand Prix,Weather: dry 18°C\nTarmac: dry 25°C\nHumidit...,2023,Qualifying Results 2023 Belgian F1 Sprint Shoo...,https://www.f1-fansite.com/f1-result/qualifyin...
1,Hungarian F1 Grand Prix,Weather: dry 26°C\nTarmac: dry 42°C\nHumidit...,2023,F1 Qualifying Results & Report 2023 Hungarian GP,https://www.f1-fansite.com/f1-result/f1-qualif...
2,British F1 Grand Prix,Weather: wet/dry 21°C\nTarmac: wet/dry 22°C\...,2023,Qualifying Results 2023 British F1 Grand Prix,https://www.f1-fansite.com/f1-result/qualifyin...
3,Austrian F1 Grand Prix,Weather: dry 16°C\nTarmac: dry 21°C\nHumidit...,2023,Qualifying Results 2023 Austrian F1 Sprint,https://www.f1-fansite.com/f1-result/qualifyin...
4,Austrian F1 GP,Weather: dry 27°C\nTarmac: dry 41°C\nHumidity...,2023,Qualifying Results & Report 2023 Austrian F1 GP,https://www.f1-fansite.com/f1-result/qualifyin...
...,...,...,...,...,...
183,Singapore Grand Prix,On a the dry Singapore street circuit Lewis Ha...,2014,Qualifying Results 2014 Singapore F1 Grand Prix,https://www.f1-fansite.com/f1-result/qualifyin...
184,Italian Grand Prix,On a dry Monza circuit Lewis Hamilton scored h...,2014,Qualifying Results 2014 Italian F1 Grand Prix,https://www.f1-fansite.com/f1-result/qualifyin...
185,US F1 Grand Prix,Weather: Dry\nTrack temperature: 31°C\nAir tem...,2012,Qualifying results 2012 United States F1 Grand...,https://www.f1-fansite.com/f1-result/qualifyin...
186,Abu Dhabi Grand Prix,,2012,Qualifying results 2012 Formula 1 Grand Prix o...,https://www.f1-fansite.com/f1-result/qualifyin...


In [44]:
# Show the rows whose weather text came back empty
df.loc[df['weather'].eq('')]

Unnamed: 0,gp_title,weather,year,title,url
149,German Grand Prix,,2016,Qualifying results 2016 German F1 Grand Prix,https://www.f1-fansite.com/f1-result/qualifyin...
173,Spanish Grand Prix,,2015,Qualifying Results 2015 Spanish F1 Grand Prix,https://www.f1-fansite.com/f1-result/qualifyin...
180,Grand Prix of United States,,2014,Qualifying Results 2014 USA F1 Grand Prix,https://www.f1-fansite.com/f1-result/qualifyin...
186,Abu Dhabi Grand Prix,,2012,Qualifying results 2012 Formula 1 Grand Prix o...,https://www.f1-fansite.com/f1-result/qualifyin...


### Delete the null entries and bad data values

In [64]:
# Remove, in place, every row whose weather text is an empty string
df.drop(index=df.loc[df['weather'].eq('')].index, inplace=True)

In [83]:
# Remove, in place, every row whose weather text does not begin with
# 'Weather' (str.match anchors the pattern at the start of the string)
df.drop(index=df[~df['weather'].str.match('Weather')].index, inplace=True)

In [84]:
# Make sure the output folder exists first: to_csv does not create missing
# directories and would fail on a fresh checkout without ./data.
os.makedirs('./data', exist_ok=True)
# Write the cleaned table without the index column.
df.to_csv('./data/qualifiying_weather.csv', index = False)

In [12]:
# Wall-clock timestamp taken after the work is done; paired with `start`.
end = time.time()

In [13]:
# Show the elapsed run time formatted as H:MM:SS.ffffff
str(datetime.timedelta(seconds=end - start))

'0:03:06.917944'