In [None]:
!pip install pandas
!pip install beautifulsoup4

In [69]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re
import typing

In [70]:
# Entry point of the fuel price site; used for both the landing page GET and the search POSTs.
url = 'https://www.peco-online.ro/index.php'

In [71]:
def doGet(url: str, timeout: float = 30) -> BeautifulSoup:
    """Fetch ``url`` with an HTTP GET and return the parsed HTML document.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The response body parsed with BeautifulSoup's ``html.parser``.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx response (via ``raise_for_status``).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error pages instead of parsing them as if they were data.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')

In [72]:
def doPost(url: str, form_data: dict, timeout: float = 30) -> BeautifulSoup:
    """Submit ``form_data`` to ``url`` with an HTTP POST and return the parsed HTML.

    Args:
        url: Address the form is posted to.
        form_data: Form fields, sent form-encoded as the request body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The response body parsed with BeautifulSoup's ``html.parser``.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx response (via ``raise_for_status``).
    """
    response = requests.post(url, data=form_data, timeout=timeout)
    # Fail loudly on error pages instead of parsing them as if they were data.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')

In [73]:
# Download and parse the landing page; the vendor and fuel type dropdowns are read from it.
page = doGet(url=url)

In [74]:
# Gas vendors: the stripped `value` attribute of every <option> in the 'Retea' dropdown.
vendor_options = page.find('select', id='Retea').find_all('option')
vendors = [vendor_option.get('value').strip() for vendor_option in vendor_options]

In [75]:
# Gas types: the stripped `value` attribute of every <option> in the 'carburant' dropdown.
type_options = page.find('select', id='carburant').find_all('option')
types = [type_option.get('value').strip() for type_option in type_options]

In [55]:
# get all counties of Romania
# Source: the first `table.wikitable` on the Wikipedia page; the first <tr> (header) is skipped.
counties_page = doGet('https://en.wikipedia.org/wiki/Counties_of_Romania')
counties_table = counties_page.select_one('#mw-content-text > div.mw-parser-output > table.wikitable')
counties_table_rows = counties_table.find('tbody').find_all('tr')[1:]

# One list of stripped cell texts per row, with the last two cells of each row dropped.
# Rows are materialised as lists (not generators) so DataFrame construction is well defined.
counties_data = [
    [elem.text.strip() for elem in row.select('th, td')[:-2]]
    for row in counties_table_rows
]

counties_df = pd.DataFrame(counties_data).rename(columns={
    0: 'County',
    1: 'County Seat',
    3: 'Development Region',
    4: 'ISO Code',
    7: 'NUTS Code',
    8: 'Population',
})

# Strip bracketed footnote markers such as "[a]" (raw string: `\[` is not a valid str escape).
counties_df['County Seat'] = counties_df['County Seat'].str.replace(r'\[.*\]', '', regex=True)

# Positional columns 2, 5 and 6 were not given names above and are not kept.
counties_df = counties_df.drop(columns=[2, 5, 6])

# easier: overwrite the seat of row 9 with a plain name.
# NOTE(review): assumes row 9 is the Bucharest row of the scraped table; verify if the page changes.
# A single .loc[row, col] assignment is used because chained indexing may write to a temporary copy.
counties_df.loc[9, 'County Seat'] = 'Bucharest'

# Population is scraped with thousands separators ("1,234,567"); remove them before converting.
counties_df['Population'] = pd.to_numeric(
    counties_df['Population'].str.replace(',', '', regex=False)
)

In [156]:
# Cache the county table on disk as a JSON array of row objects.
counties_df.to_json(path_or_buf='../data/counties.json', orient='records')

In [10]:
# Reload the cached county table (lets the notebook skip the Wikipedia scrape).
counties_df = pd.read_json(path_or_buf='../data/counties.json', orient='records')

In [76]:
def createFormData(vendors: list, types: list, counties: list) -> list:
    """Build one search-form payload per (county, fuel type) combination.

    Args:
        vendors: Vendor identifiers; all of them are sent with every request
            under the ``'retea[]'`` field.
        types: Fuel type identifiers, sent one at a time as ``'carburant'``.
        counties: County names, sent one at a time as ``'nume_locatie'``.

    Returns:
        A list of dicts ordered county-major (all types of the first county,
        then all types of the second, ...). The list is empty when ``types``
        or ``counties`` is empty.
    """
    return [
        {
            'carburant': fuel_type,
            'locatie': 'Judet',
            'nume_locatie': county,
            # Each payload gets its own copy so mutating one payload cannot
            # affect the others or the caller's list.
            'retea[]': list(vendors),
        }
        for county in counties
        for fuel_type in types
    ]

In [77]:
# One payload per (county, fuel type); each is POSTed and the parsed response is kept
# alongside the fuel type and county it was requested for.
form_datas = createFormData(vendors, types, counties_df['County'].tolist())
data = [
    {
        'type': payload['carburant'],
        'county': payload['nume_locatie'],
        'data': doPost(url, payload),
    }
    for payload in form_datas
]

In [78]:
# Flatten every result page into one record per listed station.
scraped_data = []

for county_findings in data:
    dom = county_findings['data']
    results_table = dom.find('table', id='tabelaRezultate')
    if results_table is None:
        # The response contains no results table for this county / fuel type
        # combination; there is nothing to record, so move on instead of crashing.
        continue

    # The first <tr> is skipped (header row).
    for finding in results_table.find_all('tr')[1:]:
        cells = finding.find_all('td')
        if len(cells) < 2:
            # Not a data row (the price and station cells are both required).
            continue

        # First cell: the price text; second cell: vendor logo title and location text.
        price = cells[0].find('span', class_='pret').text.strip()
        vendor = cells[1].find('img').get('title').strip()
        city = cells[1].find('span', class_='small d-block text-muted').text.strip()

        scraped_data.append({
            'Type': county_findings['type'],
            'County': county_findings['county'],
            'City': city,
            'Vendor': vendor,
            'Price': price,
        })

In [79]:
# Columns are listed explicitly so that an empty scrape still yields a frame with the
# expected schema (the cleaning cells index these columns by name).
scraped_df = pd.DataFrame(scraped_data, columns=['Type', 'County', 'City', 'Vendor', 'Price'])

In [81]:
# delete all falsy values
# '- -' is the placeholder scraped when no price is shown; turn it into NaN and make the
# column numeric. The result is assigned back to the column: an in-place replace on
# `scraped_df['Price']` acts on a temporary object under pandas Copy-on-Write and would
# leave the frame unchanged.
scraped_df['Price'] = pd.to_numeric(scraped_df['Price'].replace('- -', np.nan))

In [82]:
# Human-readable fuel type labels: underscores in the form values become spaces.
scraped_df['Type'] = scraped_df['Type'].map(lambda raw_type: raw_type.replace('_', ' '))

In [83]:
# Keep only the part of the location text before the first comma (drops the county suffix).
scraped_df['City'] = scraped_df['City'].map(lambda location: re.sub(',.*$', '', location))

In [84]:
# add timestamp to scrape
# Explicitly naive UTC: `pd.to_datetime('now')` yields UTC on pandas 1.x but local time on
# pandas 2.x, so the stored value would otherwise depend on the installed pandas version.
scraped_df['Timestamp'] = pd.Timestamp.now(tz='UTC').tz_localize(None)

In [61]:
# Show the distinct values of the 'Type' column.
pd.unique(scraped_df['Type'])

array(['Benzina Regular', 'Motorina Regular', 'GPL', 'Benzina Premium',
       'Motorina Premium', 'AdBlue'], dtype=object)

In [86]:
# Romanian fuel type label -> Hungarian label.
type_ro_hun_assoc = dict([
    ('Benzina Regular', '95-ös Benzin'),
    ('GPL', 'LPG'),
    ('Benzina Premium', 'Prémium Benzin'),
    ('Motorina Regular', 'Gázolaj'),
    ('Motorina Premium', 'Prémium Gázolaj'),
    ('AdBlue', 'AdBlue'),
])


def type_translation_to_hun(type: str) -> str:
    """Return the Hungarian label for a Romanian fuel type label.

    Args:
        type: A key of ``type_ro_hun_assoc`` (e.g. ``'Benzina Regular'``).

    Raises:
        KeyError: If ``type`` has no entry in ``type_ro_hun_assoc``.
    """
    hungarian_label = type_ro_hun_assoc[type]
    return hungarian_label

In [87]:
# Replace each Romanian fuel type label with its Hungarian counterpart.
# Note: re-running this cell on already translated labels raises KeyError.
scraped_df['Type'] = scraped_df['Type'].map(type_translation_to_hun)

In [104]:
# Write the scrape as a JSON array of row objects; the file name carries only the
# day of the month, so a file from the same day number is overwritten.
day_of_month = pd.to_datetime('now').date().day
path = f'../data/gas_tmp-{day_of_month}.json'
scraped_df.to_json(path, orient='records')