# Import packages

In [2]:
import selenium
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager
import lxml

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Scrape the source codes

In [3]:
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get('http://idojarasbudapest.hu/archivalt-idojaras')

time.sleep(5)

sourcesDictionary={}
years = ['2017', '2018', '2019', '2020', '2021', '2022']
months = ['Január', 'Február', 'Március', 'Április', 'Május', 'Június',
              'Július', 'Augusztus', 'Szeptember', 'Október', 'November', 'December']
for year in years:
    
    selectYear = Select(driver.find_element(By.ID, 'ev'))
    selectYear.select_by_visible_text(year)
    
    for month in months:     
        selectMonth = Select(driver.find_element(By.ID, 'ho'))
        selectMonth.select_by_visible_text(month)
        
        button = driver.find_element(By.ID, 'button')
        button.click()
        time.sleep(5)
        sourcesDictionary[year+month]=driver.page_source

[WDM] - Downloading: 100%|██████████| 8.41M/8.41M [00:00<00:00, 17.9MB/s]
  driver = webdriver.Chrome(ChromeDriverManager().install())


KeyboardInterrupt: 

# Scrape the tables from the sources and put them into a pandas DataFrame

In [None]:
sourcesDictionary.keys()

In [None]:
# soup = BeautifulSoup(sourcesDictionary['2017Február'])
# soup
# tempDf = pd.read_html(str(soup.find('table')))[0]
# tempDf

In [None]:
bpweatherdf = pd.DataFrame()
listOfMonthsNotToIncludeFor2017 = ['Január', 'Február', 'Március', 'Április', 'Május', 'Június']
for year in years:
    for month in months:
        if (year == '2017' and month in listOfMonthsNotToIncludeFor2017):
            continue
        if (year == '2022' and month == 'November'):
            break
        key = year+month
        soup = BeautifulSoup(sourcesDictionary[key]) # soup: html soup
        #soup
        tempDf = pd.read_html(str(soup.find('table')))[0]
        bpweatherdf = pd.concat([bpweatherdf, tempDf], axis=0, ignore_index=True)

In [None]:
bpweatherdf.head(10)

In [None]:
bpweatherdf.rename(columns={'Nap': 'Date'}, inplace=True)
bpweatherdf.rename(columns={'Tmax  [°C]': 'Max temperature (°C)'}, inplace=True)
bpweatherdf.rename(columns={'Tmin  [°C]': 'Min temperature (°C)'}, inplace=True)
bpweatherdf.rename(columns={'Szél \t[m/s]': 'Wind (m/s)'}, inplace=True)
bpweatherdf.rename(columns={'Csapadék \t[mm]': 'Precipitation (mm)'}, inplace=True)

bpweatherdf['Date'] = bpweatherdf['Date'].apply(lambda x: x[0:10])

In [None]:
bpweatherdf.head(10)

In [None]:
bpweatherdf.to_csv('bp_weather_data.csv', index = False)

# Visualizations

## Max and min temperatures from July 2017 to October 2022

In [None]:
temperatures = bpweatherdf[['Date', 'Max temperature (°C)', 'Min temperature (°C)']]
sns.lineplot(data=temperatures)

## Wind speeds from July 2017 to October 2022

In [None]:
sns.lineplot(data=bpweatherdf, x='Date', y='Wind (m/s)')

## Precipitation from July 2017 to October 2022

In [None]:
sns.lineplot(data=bpweatherdf, x='Date', y='Precipitation (mm)')

## Max and min temperatures in each year from July 2017 to October 2022

In [None]:
figure, axes = plt.subplots(2, 3, sharex=True, figsize=(16, 8))
figure.suptitle('Max and min temperatures in each year')

years = ['2017', '2018', '2019', '2020', '2021', '2022']
for year, ax in zip(years, np.ravel(axes)):
    ax.set_title(year)
    temperatures_in_year = temperatures[temperatures['Date'].str.startswith(year)].reset_index(drop=True)
    sns.lineplot(data=temperatures_in_year, ax=ax)