In [15]:
import os
import datetime
import pandas as pd
import numpy as np

# Scraping tools for downloading
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
import re

In [2]:
import warnings

# Install a blanket "ignore" filter so that no warning of any category is
# shown for the remainder of the session.
warnings.simplefilter("ignore")

### Set a time tracker
We use it to measure the total execution time of the script.

In [3]:
# Wall-clock timestamp taken before any work; compared with `end` further down
# to report the total run time.
start = time.time()

### Initialize Selenium webdriver for Chrome
As we are using Selenium in a remote notebook, we don't have a real OS available for testing purposes. Selenium therefore provides a feature that lets us do web scraping from a remote server.
That feature is the Remote WebDriver.
The URL of our remote driver's online management tool (**Selenium Grid**) is <a target="_blank" href="http://141.145.214.205:4444">here</a>.

In [4]:
# Endpoint passed as `command_executor` when get_weather_data opens a remote session
hub_url = "http://selenium-hub:4444/wd/hub"

# Chrome settings for that session: run without opening a browser window
chrome_options = Options()
chrome_options.add_argument('--headless')

In [5]:
# Load the race list. get_weather_data reads the season, round, circuit_id,
# lat, long, country, date and url columns of this frame.
races = pd.read_csv('./data/races.csv')

In [6]:
# Add a placeholder "weather" column filled with empty strings.
# NOTE(review): get_weather_data does not read this column; it collects the
# scraped text in its own list.
races["weather"] = ''

In [7]:
# Preview the first rows to check the load and the new empty "weather" column.
races.head()

Unnamed: 0,season,round,circuit_id,lat,long,country,date,url,weather
0,1950,1,silverstone,52.0786,-1.01694,UK,1950-05-13,http://en.wikipedia.org/wiki/1950_British_Gran...,
1,1950,2,monaco,43.7347,7.42056,Monaco,1950-05-21,http://en.wikipedia.org/wiki/1950_Monaco_Grand...,
2,1950,3,indianapolis,39.795,-86.2347,USA,1950-05-30,http://en.wikipedia.org/wiki/1950_Indianapolis...,
3,1950,4,bremgarten,46.9589,7.40194,Switzerland,1950-06-04,http://en.wikipedia.org/wiki/1950_Swiss_Grand_...,
4,1950,5,spa,50.4372,5.97139,Belgium,1950-06-18,http://en.wikipedia.org/wiki/1950_Belgian_Gran...,


## Weather data scraping function

In [8]:
def _fetch_infobox_weather(driver, url):
    """Return the infobox text of the page at ``url`` if it mentions "Weather".

    Returns '' when the infobox text contains no word with "Weather" in it.
    Any exception raised while loading the page, locating the infobox table
    or reading its text is reported with an ``error url: ...`` line and also
    yields '', so a single bad page does not abort the whole scrape.
    """
    try:
        driver.get(url)
        infobox = driver.find_element(By.XPATH, '//table[@class="infobox vevent"]')
        infobox_text = infobox.text
        # re.search stops at the first hit; only the presence of a match matters.
        mentions_weather = re.search(r"\b\w*Weather\w*\b", infobox_text) is not None
    except Exception:
        # Deliberately broad: scraping is best-effort and every row must still
        # get exactly one weather entry.
        print(f"error url: {url}")
        return ''

    return infobox_text if mentions_weather else ''


def get_weather_data(races):
    """Scrape the infobox text of every race page listed in ``races``.

    Parameters
    ----------
    races : pandas.DataFrame
        One row per race, with the columns ``season``, ``round``,
        ``circuit_id``, ``lat``, ``long``, ``country``, ``date`` and ``url``.

    Returns
    -------
    dict
        Maps ``season``, ``round``, ``circuit_id``, ``lat``, ``long``,
        ``country``, ``date`` and ``weather`` to lists with one entry per row
        of ``races``, in row order. ``weather`` holds the full text of the
        page's ``infobox vevent`` table when that text mentions "Weather",
        and '' otherwise (including when the page could not be scraped).

    Notes
    -----
    A single remote browser session (``hub_url`` / ``chrome_options``) is
    opened and reused for every page instead of one session per row. The
    ``with`` block ends the session when the loop finishes or fails. No
    session is opened when ``races`` has no rows.
    """
    copied_columns = ('season', 'round', 'circuit_id', 'lat', 'long', 'country', 'date')
    data = {column: [] for column in copied_columns}
    data['weather'] = []

    # Nothing to scrape: do not open a remote browser session at all.
    if len(races) == 0:
        return data

    with webdriver.Remote(command_executor=hub_url, options=chrome_options) as driver:
        for _, row in races.iterrows():
            for column in copied_columns:
                data[column].append(row[column])
            data['weather'].append(_fetch_infobox_weather(driver, row["url"]))

    return data

In [9]:
# Run the scrape over every race. Despite the `_list` suffix, the result is a
# dict of column lists. Pages that could not be scraped are printed below as
# "error url: ..." lines.
race_weather_list = get_weather_data(races)

error url: https://en.wikipedia.org/wiki/2023_Dutch_Grand_Prix
error url: https://en.wikipedia.org/wiki/2023_Italian_Grand_Prix
error url: https://en.wikipedia.org/wiki/2023_Singapore_Grand_Prix
error url: https://en.wikipedia.org/wiki/2023_Japanese_Grand_Prix
error url: https://en.wikipedia.org/wiki/2023_Qatar_Grand_Prix
error url: https://en.wikipedia.org/wiki/2023_United_States_Grand_Prix
error url: https://en.wikipedia.org/wiki/2023_Mexico_City_Grand_Prix
error url: https://en.wikipedia.org/wiki/2023_S%C3%A3o_Paulo_Grand_Prix
error url: https://en.wikipedia.org/wiki/2023_Las_Vegas_Grand_Prix
error url: https://en.wikipedia.org/wiki/2023_Abu_Dhabi_Grand_Prix


In [10]:
# Each key of the scraped dict becomes a column, one row per race.
df = pd.DataFrame(race_weather_list)

In [11]:
# Wall-clock timestamp taken after the scrape; paired with `start`.
end = time.time()

In [12]:
# Elapsed seconds between `start` and `end`, rendered as H:MM:SS.ffffff.
str(datetime.timedelta(seconds=(end - start)))

'0:15:03.760384'

In [13]:
# Preview the scraped frame; "weather" holds the infobox text, or '' when none was kept.
df.head()

Unnamed: 0,season,round,circuit_id,lat,long,country,date,weather
0,1950,1,silverstone,52.0786,-1.01694,UK,1950-05-13,1950 British Grand Prix\nNext race →\nSilverst...
1,1950,2,monaco,43.7347,7.42056,Monaco,1950-05-21,
2,1950,3,indianapolis,39.795,-86.2347,USA,1950-05-30,1950 Indianapolis 500\n← Previous race Next ra...
3,1950,4,bremgarten,46.9589,7.40194,Switzerland,1950-06-04,1950 Swiss Grand Prix\n← Previous race Next ra...
4,1950,5,spa,50.4372,5.97139,Belgium,1950-06-18,1950 Belgian Grand Prix\n← Previous race Next ...


In [16]:
# Make sure the output directory exists. makedirs(exist_ok=True) does the
# check and the creation in one call, avoiding the gap between
# os.path.exists() and os.mkdir().
os.makedirs('./data', exist_ok=True)

# index=False keeps the DataFrame's row index out of the CSV file.
df.to_csv('./data/races_weather.csv', index=False)