In [1]:
# URL of the PSE page opened by set_driver; per its slug this is the daily
# report on the market price of electricity ("rynkowa cena energii elektrycznej", RCE).
link = 'https://www.pse.pl/dane-systemowe/funkcjonowanie-rb/raporty-dobowe-z-funkcjonowania-rb/podstawowe-wskazniki-cenowe-i-kosztowe/rynkowa-cena-energii-elektrycznej-rce'
# CSS selector of the element clicked by click_params.
params_sel = 'span[class=nav-item-label]'
# CSS selector of the input that send_keys clears and types the date into.
date_input_sel = 'input[name=data]'
# CSS selector of the button clicked by confirm.
confirm_sel = 'button[class*=apply]'
# CSS selector of the table body whose innerHTML get_table_html returns.
table_sel = 'tbody[id*=yui_patched]'

In [2]:
import contextlib
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.remote.webelement import WebElement
from selenium.common.exceptions import TimeoutException, NoSuchWindowException
from typing import Union
from bs4 import BeautifulSoup as Bs
import pandas as pd
from datetime import datetime, timedelta
from app.data_managers.readers.driver_factory import DriverFactory
import time


class LimitExceededException(Exception):
    """Error whose message records the date at which a limit was exceeded."""

    def __init__(self, date: str, *args: object) -> None:
        # The formatted message is always the first argument; any extra
        # positional arguments are forwarded after it unchanged.
        super().__init__(f'limit exceeded at {date}', *args)


# Column labels used by read_table, transform_table and scrap_prices.
DATE = 'Date'
PRICE = 'Price'
HOUR = 'Hour'
# Default timeout handed to WebDriverWait in get_element.
TIMEOUT = 10
# Pause handed to time.sleep after page interactions.
WAIT = 0.3

def set_driver(headless: bool = True, driver = None):
    """Open ``link`` in a browser and return the driver that was used.

    A new driver is requested from ``DriverFactory`` only when the caller does
    not pass one; ``headless`` is forwarded to the factory in that case and is
    ignored otherwise.
    """
    browser = driver if driver is not None else DriverFactory.get_driver(headless=headless)
    browser.get(link)
    return browser

def read_table(table_html: Union[str, Bs]) -> pd.DataFrame:
    """Parse the table markup into a frame with ``HOUR`` and ``PRICE`` columns.

    Args:
        table_html: HTML markup of the table. ``get_prices`` passes the
            ``innerHTML`` string returned by ``get_table_html``; any object is
            converted with ``str()`` before parsing.

    Returns:
        The first table found in the markup, with rows containing missing
        values dropped and the two columns renamed to ``[HOUR, PRICE]``.

    Raises:
        ValueError: propagated from ``pd.read_html`` when the markup cannot be
            parsed into a table.
    """
    # Local import so the cell does not depend on an extra top-level import.
    from io import StringIO

    # Newer pandas versions deprecate passing literal HTML to read_html, so the
    # markup is always handed over as a file-like object.
    markup = str(table_html)

    # First pass only discovers the label of the second column, which is the
    # key the converter has to be registered under.
    price_column = pd.read_html(StringIO(markup))[0].columns[1]

    def strip_nbsp(value):
        # Remove non-breaking spaces from the raw cell text.
        return str(value.replace(u'\xa0', u''))

    # Second pass parses with a comma as decimal mark and a dot as thousands
    # separator. A StringIO is consumed by reading, hence a fresh one.
    table = pd.read_html(StringIO(markup),
                         converters={price_column: strip_nbsp},
                         decimal=',',
                         thousands='.')[0].dropna()
    table.columns = [HOUR, PRICE]
    return table

def transform_table(data: pd.DataFrame, date: datetime) -> pd.DataFrame:
    """Convert prices to numbers and replace the hour column with a timestamp.

    Args:
        data: frame with ``HOUR`` and ``PRICE`` columns; it is not modified.
        date: the day the rows belong to. It must support ``date + timedelta``
            (a ``datetime`` or ``pd.Timestamp``); a plain string raises
            ``TypeError``.

    Returns:
        A copy of ``data`` with ``PRICE`` converted by ``pd.to_numeric``, a new
        ``DATE`` column equal to ``date`` plus the row's hour, and the ``HOUR``
        column removed.
    """
    result = data.copy()
    result[PRICE] = pd.to_numeric(result[PRICE])
    # The hour is added as an offset, so hour 1 becomes 01:00 of `date` and
    # hour 24 becomes midnight of the following day.
    result[DATE] = result[HOUR].apply(lambda hour: date + timedelta(hours=hour))
    return result.drop(columns=HOUR)

def get_element(driver, selector: str, timeout: Union[int, float] = TIMEOUT) -> WebElement:
    """Wait for the element matching the CSS ``selector`` to be clickable and return it.

    The wait is bounded by ``timeout``; what happens when it expires is decided
    by ``WebDriverWait.until``.
    """
    waiter = WebDriverWait(driver=driver, timeout=timeout)
    clickable = EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
    return waiter.until(clickable)


def scrap_prices(dates: pd.DatetimeIndex, headless: bool = True):
    """Scrape the price table for every date and return one combined frame.

    Scraping is best-effort: a ``ValueError``, ``TimeoutException``,
    ``NoSuchWindowException`` or ``KeyboardInterrupt`` raised while iterating
    stops the loop, and whatever was collected up to that point is returned.

    Args:
        dates: the days to request, processed in order.
        headless: forwarded to ``set_driver``.

    Returns:
        A frame with the columns ``[DATE, PRICE]`` sorted by ``DATE``. It is
        empty (but still has both columns) when nothing could be collected.
    """
    driver = set_driver(headless=headless)
    daily_frames = []
    try:
        # Deliberately swallowed so an interrupted or failed run still yields
        # the days gathered so far.
        with contextlib.suppress(
            ValueError,
            TimeoutException,
            NoSuchWindowException,
            KeyboardInterrupt
        ):
            for date in dates:
                daily_frames.append(get_prices(date=date, driver=driver))
    finally:
        # The driver is local to this function, so it is released here; a
        # failure while quitting (e.g. window already closed) must not hide
        # the collected data.
        with contextlib.suppress(Exception):
            driver.quit()

    if not daily_frames:
        # Nothing scraped: selecting the columns of an empty frame would raise KeyError.
        return pd.DataFrame(columns=[DATE, PRICE])

    # Concatenate once instead of growing a frame inside the loop.
    prices = pd.concat(daily_frames, ignore_index=True)
    return prices[[DATE, PRICE]].sort_values(DATE)

def click_params(driver):
    """Click the element located by ``params_sel``, then pause for ``WAIT``."""
    get_element(driver=driver, selector=params_sel).click()
    time.sleep(WAIT)
    
def send_keys(driver, date: datetime):
    """Clear the date input and type ``date`` formatted as ``YYYY-MM-DD``.

    Args:
        driver: the browser driver showing the page.
        date: the day to request; it must provide ``strftime`` (a ``datetime``
            or ``pd.Timestamp``), not an already formatted string.
    """
    date_input = get_element(driver=driver, selector=date_input_sel)
    # Remove whatever the field currently holds before typing the new value.
    date_input.clear()
    date_input.send_keys(date.strftime('%Y-%m-%d'))
    
def confirm(driver):
    """Click the element located by ``confirm_sel``, then pause for ``WAIT``."""
    apply_button = get_element(driver=driver, selector=confirm_sel)
    apply_button.click()
    time.sleep(WAIT)

def get_table_html(driver) -> str:
    """Return the ``innerHTML`` of the element located by ``table_sel``."""
    return get_element(driver=driver, selector=table_sel).get_attribute('innerHTML')
    
def get_prices(date: datetime, driver) -> pd.DataFrame:
    """Request the table for ``date`` on the open page and return it as a frame.

    The steps run in a fixed order: pause, open the parameters, type the date,
    confirm, read the table markup, parse it and convert it.
    """
    time.sleep(WAIT)
    click_params(driver=driver)
    send_keys(driver=driver, date=date)
    confirm(driver=driver)
    # NOTE(review): the table is read right after confirm()'s short pause;
    # whether the page has already switched to the new date by then is not
    # checked here - confirm against the site.
    parsed = read_table(table_html=get_table_html(driver=driver))
    return transform_table(data=parsed, date=date)
    
# Scrape every day from 2023-01-01 up to the current moment in a visible browser.
now = datetime.now()
r = pd.date_range(start='2023-01-01', end=now)
pr = scrap_prices(dates=r, headless=False)

driver: C:\Users\wojte\.wdm\drivers\chromedriver\win32\111.0.5563\chromedriver.exe
driver_version: 111.0.5563.64
browser_version: 111.0.5563.64


In [3]:
# Display the frame returned by scrap_prices.
pr

Unnamed: 0,Date,Price
0,2023-01-01 01:00:00,504.88
1,2023-01-01 02:00:00,491.33
2,2023-01-01 03:00:00,497.17
3,2023-01-01 04:00:00,513.19
4,2023-01-01 05:00:00,526.86
5,2023-01-01 06:00:00,575.12
6,2023-01-01 07:00:00,687.43
7,2023-01-01 08:00:00,748.73
8,2023-01-01 09:00:00,790.44
9,2023-01-01 10:00:00,760.47


In [None]:
# Open the report page in a visible browser window for the manual steps that follow.
driver = set_driver(headless=False)

In [None]:
def get_element(driver, selector: str, timeout: Union[int, float] = TIMEOUT) -> 'list[WebElement]':
    """Wait until elements matching the CSS ``selector`` are visible and return them.

    Unlike the single-element helper of the same name defined earlier in this
    notebook, the result is indexed and unpacked as a sequence by the callers
    in the following cells.

    NOTE: running this cell replaces that earlier ``get_element``. Helpers
    written against it (``click_params``, ``send_keys``, ``confirm``,
    ``get_table_html``) call ``.click()`` / ``.clear()`` directly on the result
    and will fail until the earlier definition is executed again.
    """
    visible_elements = WebDriverWait(driver=driver,
                                     timeout=timeout).until(
        EC.visibility_of_any_elements_located((By.CSS_SELECTOR, selector)))
    return visible_elements

# Click the last of the elements returned for this selector.
get_element(driver, 'span[class=nav-item-label]')[-1].click()

In [None]:
# The selector must match exactly two elements, otherwise the unpacking raises
# ValueError. Presumably these are the "from" and "to" date inputs - confirm
# against the page.
from_, to = get_element(driver, 'input[id*=_VisioToolbar]')

In [None]:
# Replace the content of both inputs: clear each one, then type its date.
from_.clear()
from_.send_keys('2022-12-01')
to.clear()
to.send_keys('2022-12-10')

In [None]:
# Click the first element whose class contains "ui-datepicker-close".
get_element(driver, 'button[class*=ui-datepicker-close]')[0].click()

In [None]:
# Click the last link titled "Eksport do CSV" (Polish for "Export to CSV").
# The result is not bound to a name: click() returns None, and storing it in
# `confirm` would overwrite the confirm() helper that get_prices calls.
get_element(driver, "a[title='Eksport do CSV']")[-1].click()

In [None]:
# Plot the scraped frame with the DATE column as the x axis.
pr.set_index(DATE).plot()

In [1]:
import contextlib
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.remote.webelement import WebElement
from selenium.common.exceptions import TimeoutException, NoSuchWindowException, NoSuchElementException
from typing import Union
from bs4 import BeautifulSoup as Bs
import pandas as pd
from datetime import datetime, timedelta
from driver_factory import DriverFactory
import time
import requests

# Past-weather page opened by set_up; build_df appends a '#day=...&month=...'
# fragment to it for every requested date.
main_page = 'https://www.pogodajutro.com/europe/poland?page=past-weather'


def set_up(headless: bool = True, timeout: Union[int, float] = 10):
    """Start a browser on ``main_page`` and click the consent button.

    Args:
        headless: forwarded to ``DriverFactory.get_driver``.
        timeout: longest time to wait for the consent button to become
            clickable.

    Returns:
        The driver, positioned on ``main_page`` with the consent button clicked.

    Raises:
        NoSuchElementException: the consent button did not become clickable
            within ``timeout``.
    """
    consent_selector = "button[class='fc-button fc-cta-consent fc-primary-button']"
    driver = DriverFactory.get_driver(headless=headless)
    try:
        driver.get(main_page)
        try:
            # Poll for the button instead of sleeping a fixed time, so a slow
            # page does not fail and a fast one does not wait needlessly.
            consent = WebDriverWait(driver=driver, timeout=timeout).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, consent_selector)))
        except TimeoutException as error:
            # Keep the exception type a missing button has always produced here.
            raise NoSuchElementException(
                f'consent button not found within {timeout} s: {consent_selector}'
            ) from error
        consent.click()
        # Short pause after the click before the driver is handed to the caller.
        time.sleep(1)
    except BaseException:
        # The caller never receives the driver on failure, so release it here.
        driver.quit()
        raise
    return driver

def _reshape_weather_table(table: pd.DataFrame, year: int, date) -> pd.DataFrame:
    """Reshape one scraped table so each original column label becomes a row with a ``Time`` stamp."""
    # Labels of the non-id columns go into 'Hour'; the values of the
    # 'Unnamed: 0' column become the new column labels.
    melted = pd.melt(table, id_vars=['Unnamed: 0'], var_name='Hour')
    hourly = pd.pivot(melted, index='Hour', columns='Unnamed: 0', values='value').reset_index()
    hourly['Year'] = year
    hourly['Month'] = date.month
    hourly['Day'] = date.day
    # Year, month, day and hour are joined with '-' and parsed as one timestamp.
    hourly['Time'] = pd.to_datetime(
        hourly[['Year', 'Month', 'Day', 'Hour']].astype(str).agg('-'.join, axis=1))
    return hourly.drop(columns=['Year', 'Month', 'Day', 'Hour'])


def build_df(driver, dates, years=(2019, 2020, 2021, 2022)):
    """Collect the past-weather table for every (date, year) combination.

    Only the day and month of each entry in ``dates`` are used; the year comes
    from ``years``.

    Args:
        driver: browser driver as returned by ``set_up``.
        dates: iterable of objects with ``day`` and ``month`` attributes.
        years: years whose tab is opened for each date.

    Returns:
        One frame with all reshaped tables stacked and a fresh integer index;
        an empty frame when no table could be read.
    """
    # Local import so the cell does not depend on an extra top-level import.
    from io import StringIO

    frames = []
    for date in dates:
        print(date)
        link = f'{main_page}#day={date.day}&month={date.month}'
        driver.get(link)
        for year in years:
            try:
                time.sleep(1)
                year_tab = driver.find_element(By.CSS_SELECTOR, f"button[data-year='{year}']")
                # Clicked through JavaScript rather than WebElement.click().
                driver.execute_script("arguments[0].click();", year_tab)
            except NoSuchElementException:
                print(f'No data for {year} {date.month} {date.day}')
                print(link)
                continue
            # NOTE(review): the page source is read immediately after the
            # click; whether the table already shows the selected year at that
            # point is not verified here - confirm against the site.
            html = driver.page_source
            # read_html gets a file-like object because newer pandas versions
            # deprecate literal HTML strings; the third table on the page is used.
            table = pd.read_html(StringIO(html))[2]
            frames.append(_reshape_weather_table(table, year, date))

    if not frames:
        # pd.concat rejects an empty list; mirror the empty frame of a run without data.
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames, ignore_index=True)

# Trial run over five consecutive days; build_df only reads the day and month
# of each entry in `dates`.
driver = set_up(headless=True)
dates = pd.date_range(start='01-01-2012', end='01-05-2012', freq='D')
# Disabled check, presumably meant for a range covering a whole leap year:
# assert len(dates) == 366
df = build_df(driver, dates)


driver: C:\Users\wojte\.wdm\drivers\chromedriver\win32\110.0.5481\chromedriver.exe
driver_version: 110.0.5481.77
browser_version: 110.0.5481.178
2012-01-01 00:00:00
No data for 2019 1 1
https://www.pogodajutro.com/europe/poland?page=past-weather#day=1&month=1
2012-01-02 00:00:00
No data for 2019 1 2
https://www.pogodajutro.com/europe/poland?page=past-weather#day=2&month=1
2012-01-03 00:00:00
No data for 2019 1 3
https://www.pogodajutro.com/europe/poland?page=past-weather#day=3&month=1
2012-01-04 00:00:00
No data for 2019 1 4
https://www.pogodajutro.com/europe/poland?page=past-weather#day=4&month=1
2012-01-05 00:00:00
No data for 2019 1 5
https://www.pogodajutro.com/europe/poland?page=past-weather#day=5&month=1
