In [1]:
import configparser
import re
from urllib.parse import quote

import lxml
import pandas as pd
import requests
import sqlalchemy
from bs4 import BeautifulSoup
from sqlalchemy import create_engine

In [2]:
# Database credentials are kept in an INI file outside the notebook so they
# never appear in the notebook source or its saved outputs.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this file exists.
config_path = '/home/sam/Everything Python/config_file_childcare.ini'
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so check the result and fail here instead of
# hitting a confusing NoSectionError on the lookups below.
if not config.read(config_path):
    raise FileNotFoundError(f'Could not read database config file: {config_path}')
user = config.get('db','username')
password = config.get('db','password')

In [3]:
# Display the installed SQLAlchemy version so the environment is recorded with the notebook.
sqlalchemy.__version__

'1.3.9'

In [4]:
# Percent-encode the credentials before placing them in the URL: characters
# such as '@', ':', '/' or '%' in a username or password would otherwise be
# read as URL delimiters and corrupt the connection string.
quoted_user = quote(user, safe='')
quoted_password = quote(password, safe='')
connection_string = f'postgresql+psycopg2://{quoted_user}:{quoted_password}@localhost/fuel_prices'
# executemany_mode is passed through unchanged; it controls how the psycopg2
# dialect issues bulk statements.
engine = create_engine(connection_string, executemany_mode = 'batch')

In [5]:
# Visit the FuelCheck dataset page; it contains the link to the JSON listing
# that holds all of the download links.
# A timeout is set because requests otherwise waits indefinitely for a server
# that never responds, which would hang the kernel.
r=requests.get('https://data.nsw.gov.au/data/dataset/fuel-check', timeout=30)

In [6]:
# Display the HTTP status of the dataset-page request.
r.status_code  # 200 means the download was successful

200

In [7]:
# Parse the dataset page so the tag list can be searched.
soup=BeautifulSoup(r.content,"lxml")

# This <ul> holds the format tags; the one labelled 'JSON' links to the listing
# of every downloadable resource.
a = soup.find("ul", class_="au-tags homepage-search-tags")
if a is None:
    # soup.find returns None when the element is absent; stop with a clear
    # message instead of an AttributeError on the next line.
    raise RuntimeError('Could not find the format tag list on the FuelCheck dataset page')

# Keep only the href of the <li> whose text is exactly 'JSON'; items without an
# anchor are skipped.
json_link = [item.a['href'] for item in a.find_all('li')
             if item.a is not None and item.text.strip()=='JSON']
if not json_link:
    raise RuntimeError('No JSON link found on the FuelCheck dataset page')

# Request the JSON listing itself (the href is appended to the site root).
url_json = f'https://data.nsw.gov.au/{json_link[0]}'

# Same timeout rationale as any network call: never block the kernel forever.
json_content = requests.get(url_json, timeout=30)

In [8]:
# Display the HTTP status of the JSON listing request.
json_content.status_code

200

In [9]:
# Decode the response body. The name `json_content` is rebound from the
# Response object to the decoded payload, so only decode while it is still a
# Response: re-running this cell is then a no-op instead of an AttributeError.
if isinstance(json_content, requests.Response):
    json_content = json_content.json()

In [10]:
# Case-insensitive pattern used to pick out resources whose format mentions
# 'xls' (optionally followed by 'x').
p = re.compile(r'xls[x]?', flags=re.I)

In [11]:
# Fetch the workbook keys that are already stored, so those workbooks can be
# skipped when deciding what to download.
result = engine.execute('SELECT DISTINCT(key) FROM servicestations;').fetchall()

# Each row has a single column. Take it by position and drop NULLs; joining the
# row values as strings would raise TypeError on a NULL key.
Keys = [row[0] for row in result if row[0] is not None]

In [12]:
# Map resource name -> download URL for every listed resource whose format
# matches the Excel pattern and whose name is not already among the stored keys.
fuel_check = {
    resource['name']: resource['url']
    for resource in json_content['result']['resources']
    if p.search(resource['format']) and resource['name'] not in Keys
}

In [13]:
def df_header(dataframe):
    '''
    Apply the standard eight column labels to a fuel-price frame.

    The labels are assigned positionally and in place on the frame that is
    passed in; that same frame object is returned so the function can be used
    with ``DataFrame.pipe``. The frame must have exactly eight columns,
    otherwise the assignment raises ``ValueError``.
    '''
    standard_labels = [
        'ServiceStationName',
        'address',
        'suburb',
        'postcode',
        'brand',
        'fuelcode',
        'priceupdateddate',
        'price',
    ]
    dataframe.columns = standard_labels
    return dataframe

In [14]:
def dtype_fix(dataframe):
    '''
    Convert the postcode, price and timestamp columns to typed values.

    ``postcode`` and ``price`` go through ``pd.to_numeric`` and
    ``priceupdateddate`` through ``pd.to_datetime(..., yearfirst=True)``. The
    converted columns are written back onto the frame that was passed in.
    Rows in which every column is missing are then filtered out, and the
    filtered frame is returned with its original index labels.
    '''
    for numeric_col in ('postcode', 'price'):
        dataframe[numeric_col] = pd.to_numeric(dataframe[numeric_col])
    dataframe['priceupdateddate'] = pd.to_datetime(
        dataframe['priceupdateddate'],
        yearfirst=True,
    )
    # Keep a row as soon as it has at least one non-missing value.
    has_any_value = dataframe.notna().any(axis=1)
    return dataframe.loc[has_any_value]

In [15]:
def read_url(key,link):
    '''
    Read one Excel workbook and return it as a typed DataFrame.

    The workbook is read without a header, using its first column as the
    index. Everything above the row labelled 'ServiceStationName' is dropped,
    then that header row itself is dropped. The remaining rows are relabelled
    with ``df_header``, typed with ``dtype_fix``, forward-filled, and tagged
    with ``key``.

    Parameters
    ----------
    key : value stored in the ``key`` column of every returned row.
    link : path or URL of the workbook, passed straight to ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame with the eight standard columns plus ``key``.

    Raises
    ------
    KeyError if no row is labelled 'ServiceStationName'; any error raised by
    ``pd.read_excel`` or the conversion helpers propagates unchanged.
    '''
    # No explicit engine: pandas picks the reader itself. Forcing xlrd fails on
    # .xlsx workbooks with xlrd >= 2.0, which only reads the legacy .xls format.
    download = ( pd.read_excel(link, header=None, index_col=0)
                   .loc['ServiceStationName':,]   # start at the header row
                   .iloc[1:,]                     # drop the header row itself
                   .reset_index()
                   .pipe(df_header)
                   .pipe(dtype_fix)
                   .ffill()                       # same as fillna(method='ffill'), which is deprecated
                   .assign(key = key)
               )
    return download

In [16]:
# Download and parse every pending workbook, one frame per (name, url) pair.
# Execution time: about 5 minutes.
dlds = [read_url(*name_and_url) for name_and_url in fuel_check.items()]

In [17]:
# Combine the per-workbook frames and drop rows that repeat the same price
# observation. Index labels are kept as they are (ignore_index=False), so they
# can repeat across workbooks.
if dlds:
    list_all = (
        pd.concat(dlds,ignore_index=False,sort=False)
          .drop_duplicates(subset=['address','suburb','brand','fuelcode','priceupdateddate','price'])
    )
else:
    # pd.concat raises ValueError on an empty list, which happens whenever no
    # new workbook was found. Fall back to an empty frame with the same columns
    # so the following cells still run.
    list_all = pd.DataFrame(columns=['ServiceStationName','address','suburb','postcode',
                                     'brand','fuelcode','priceupdateddate','price','key'])

In [None]:
# Append the combined rows to the `servicestations` table.
# NOTE(review): `index` is not passed, so pandas' default applies and the frame
# index is written as a column too; those labels can repeat because the frames
# were concatenated with ignore_index=False. Confirm this matches the table schema.
list_all.to_sql('servicestations',con=engine, if_exists='append') #took about 3 minutes to process

In [18]:
# Check the column dtypes produced by the conversion step.
list_all.dtypes

ServiceStationName            object
address                       object
suburb                        object
postcode                     float64
brand                         object
fuelcode                      object
priceupdateddate      datetime64[ns]
price                        float64
key                           object
dtype: object

In [19]:
# Number of rows and columns in the combined frame.
list_all.shape

(2489430, 9)

In [20]:
# Count rows in which every column is missing; 0 means no fully empty rows remain.
list_all.isna().all(axis=1).sum()

0

In [21]:
# Show every row that still shares address, brand, fuelcode, timestamp and price
# with another row (keep=False marks all members of a duplicate group).
# Note that 'suburb' is not part of this subset; an empty result means no such rows.
list_all[list_all.duplicated(subset=['address','brand','fuelcode','priceupdateddate','price'], keep=False)]

Unnamed: 0,ServiceStationName,address,suburb,postcode,brand,fuelcode,priceupdateddate,price,key
