In [3]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re
import typing
import time

In [2]:
# Address of the peco-online.ro index page; the notebook sends both its GET and POST requests here.
url = 'https://www.peco-online.ro/index.php'

In [3]:
def doGet(url: str, timeout: float = 30.0) -> BeautifulSoup:
    """Fetch ``url`` with an HTTP GET and parse the response body as HTML.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may wait indefinitely on an unresponsive host.

    Returns:
        The response content parsed with BeautifulSoup's ``html.parser``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser')

In [4]:
def doPost(url: str, form_data: dict, timeout: float = 30.0) -> BeautifulSoup:
    """Submit ``form_data`` to ``url`` with an HTTP POST and parse the reply as HTML.

    Args:
        url: Address to post to.
        form_data: Form fields sent as the request body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may wait indefinitely on an unresponsive host.

    Returns:
        The response content parsed with BeautifulSoup's ``html.parser``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    response = requests.post(url, data=form_data, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser')

In [5]:
# Download and parse the index page; the 'Retea' and 'carburant' <select> options are read from it.
page = doGet(url)

In [6]:
# Gas vendors: the stripped `value` of every <option> inside the 'Retea' <select>.
vendors = [
    opt.get('value').strip()
    for opt in page.find('select', attrs={'id': 'Retea'}).find_all('option')
]

In [7]:
# Gas types: the stripped `value` of every <option> inside the 'carburant' <select>.
types = [
    opt.get('value').strip()
    for opt in page.find('select', attrs={'id': 'carburant'}).find_all('option')
]

In [6]:
# Load the county records; the 'County' and 'ISO Code' columns are used further down.
counties_df = pd.read_json('../data/counties.json', orient='records')

In [18]:
def createFormData(vendors: list, types: list, counties: list) -> list:
    """Build one search-form payload per (county, fuel type) pair.

    Payloads are ordered county-major: every type for the first county,
    then every type for the second county, and so on. Each payload stores
    the same ``vendors`` list object under the ``'retea[]'`` key.

    Args:
        vendors: Vendor values sent with every payload.
        types: Fuel type values; one payload is produced per type.
        counties: County names; one group of payloads is produced per county.

    Returns:
        A list of dicts with the keys ``'carburant'``, ``'locatie'``,
        ``'nume_locatie'`` and ``'retea[]'``.
    """
    return [
        {
            'carburant': fuel_type,
            'locatie': 'Judet',
            'nume_locatie': county_name,
            'retea[]': vendors,
        }
        for county_name in counties
        for fuel_type in types
    ]

In [26]:
# Request the results page for every (county, fuel type) combination.
form_datas = createFormData(vendors, types, counties_df['County'].to_list())
data = []

for form_data in form_datas:
    try:
        result_page = doPost(url, form_data)
    except requests.RequestException as exc:
        # A single failed request should not abort the whole sweep and waste the
        # pages already fetched; report it and continue with the next combination.
        print(f"Request failed for {form_data['nume_locatie']} with type {form_data['carburant']}: {exc!r}")
    else:
        data.append({'type': form_data['carburant'], 'county': form_data['nume_locatie'], 'data': result_page})
    # Short pause between consecutive requests.
    time.sleep(0.15)

In [40]:
# Turn every fetched results page into flat records, one per table row.
scraped_data = []

for county_findings in data:
    dom = county_findings['data']
    try:
        # The first <tr> of the results table is skipped (presumably the header row — TODO confirm).
        findings = dom.find('table', id='tabelaRezultate').find_all('tr')[1:]
        for finding in findings:
            cells = finding.find_all('td')

            price = cells[0].find('span', class_='pret').text.strip()
            vendor = cells[1].find('img').get('title').strip()
            city = cells[1].find('span', class_='small d-block text-muted').text.strip()
            addr = cells[1].find('span', class_='').text.strip() # will be kept only for uniqueness of data

            scraped_data.append({'Type': county_findings['type'], 'County': county_findings['county'], 'City': city, 'Addr': addr, 'Vendor': vendor, 'Price': price})
    except Exception as exc:
        # `Exception` instead of a bare `except:` so KeyboardInterrupt / SystemExit
        # still stop the cell; the cause is included so the failure can be diagnosed.
        # Rows appended before the failure are kept; the rest of this page is skipped.
        print('Error at ' + county_findings['county'] + ' with type ' + county_findings['type'] + ' (' + repr(exc) + '). Info: \n' + str(dom))

In [42]:
# One row per scraped record; columns come from the dict keys (Type, County, City, Addr, Vendor, Price).
scraped_df = pd.DataFrame(scraped_data)

In [43]:
# Replace the '- -' placeholder (presumably "no price available" — TODO confirm) with NaN
# and convert prices to numbers. The result is assigned back to the column: calling
# replace(..., inplace=True) on scraped_df['Price'] acts on a selected column and is not
# guaranteed to update scraped_df itself (it does not under pandas Copy-on-Write), which
# would leave '- -' in place and make pd.to_numeric fail.
scraped_df['Price'] = pd.to_numeric(scraped_df['Price'].replace('- -', np.nan))
# better type definition: underscores become spaces
scraped_df['Type'] = scraped_df['Type'].str.replace('_', ' ', regex=False)
# delete county names from city names: drop everything from the last comma onwards
scraped_df['City'] = scraped_df['City'].str.replace(r',[^,]*$', '', regex=True)
# add timestamp to scrape
scraped_df['Timestamp'] = pd.to_datetime('now')

In [35]:
# Lookup table from the site's fuel type names (after underscore removal) to Hungarian names.
type_ro_hun_assoc = dict([
    ('Benzina Regular', '95-ös Benzin'),
    ('GPL', 'LPG'),
    ('Benzina Premium', 'Prémium Benzin'),
    ('Motorina Regular', 'Gázolaj'),
    ('Motorina Premium', 'Prémium Gázolaj'),
    ('AdBlue', 'AdBlue'),
])


def type_translation_to_hun(type: str) -> str:
    """Return the Hungarian name stored for ``type`` in ``type_ro_hun_assoc``.

    Raises:
        KeyError: If ``type`` has no entry in the lookup table.
    """
    translated = type_ro_hun_assoc[type]
    return translated

In [36]:
# Translate every fuel type name; a name missing from the lookup table raises KeyError.
scraped_df['Type'] = scraped_df['Type'].map(type_translation_to_hun)

In [None]:
# Attach each county's 'ISO Code' via a left join on 'County' (rows without a matching
# county are kept) and expose the code under the shorter column name 'ISO'.
scraped_df = (
    scraped_df
    .merge(counties_df[['County', 'ISO Code']], how='left', on='County')
    .rename(columns={'ISO Code': 'ISO'})
)

In [38]:
# The output file name carries only the current day of the month.
path = '../data/gas_tmp-{}.json'.format(pd.to_datetime('now').date().day)
scraped_df.to_json(path, orient='records')