In [None]:
import configparser
import re
from urllib.parse import quote_plus, urljoin

import lxml
import pandas as pd
import requests
import sqlalchemy
from bs4 import BeautifulSoup
from sqlalchemy import create_engine

In [None]:
# Database credentials are kept out of the notebook in an INI file with a
# [db] section holding `username` and `password`.
# NOTE(review): this is an absolute, machine-specific path; anyone else running
# the notebook has to point it at their own config file.
config_path = '/home/sam/Everything Python/config_file.ini'
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the file is missing or
# unreadable. Fail here with a clear message instead of a later NoSectionError.
if not config.read(config_path):
    raise FileNotFoundError(f'Could not read config file: {config_path}')
user = config.get('db','username')
password = config.get('db','password')

In [None]:
# Display the installed SQLAlchemy version as the cell output.
# NOTE(review): presumably recorded because engine options used in this
# notebook (e.g. executemany_mode) differ between SQLAlchemy releases - confirm.
sqlalchemy.__version__

In [None]:
# Percent-encode the credentials before placing them in the URL: characters
# such as '@', ':' or '/' in a password would otherwise be read as URL
# delimiters and corrupt the connection string. Plain alphanumeric values are
# unchanged by quote_plus, so existing configs keep working.
# NOTE(review): this assumes the config file stores the raw (not already
# URL-encoded) username and password - confirm.
connection_string = (
    f'postgresql+psycopg2://{quote_plus(user)}:{quote_plus(password)}'
    '@localhost/fuel_prices'
)
# Engine for the local `fuel_prices` database; executemany_mode is passed
# through to the psycopg2 dialect unchanged.
engine = create_engine(connection_string, executemany_mode = 'batch')

In [None]:
# Fetch the dataset landing page; its HTML contains the link to the JSON
# document that lists every downloadable file.
# A timeout is set because requests waits indefinitely by default, which would
# hang the notebook if the site stops responding.
r = requests.get('https://data.nsw.gov.au/data/dataset/fuel-check', timeout=30)

In [None]:
# Stop here on a 4xx/5xx reply so a later cell never tries to parse an error
# page; raise_for_status() is a no-op for successful responses.
r.raise_for_status()
r.status_code  # 200 means the download was successful

In [None]:
# Parse the landing page so it can be searched for the tag list.
soup=BeautifulSoup(r.content,"lxml")

# The <ul> of tags; one of its <li> entries links to the JSON document that is
# scraped for the file links.
a = soup.find("ul", class_="au-tags homepage-search-tags")
if a is None:
    # soup.find returns None when nothing matches; report that directly rather
    # than failing below with "'NoneType' object has no attribute 'find_all'".
    raise RuntimeError("Tag list 'au-tags homepage-search-tags' not found; "
                       "the page layout may have changed.")

# Keep the href of every <li> whose text is exactly 'JSON'. Items that have no
# <a> child, or whose <a> carries no href, are skipped.
json_link = [i.a['href'] for i in a.find_all('li')
             if i.a is not None and i.a.get('href') and i.text.strip()=='JSON']
if not json_link:
    raise RuntimeError("No 'JSON' link found in the tag list.")

# Request the JSON that holds the links to the fuel prices of each month.
# urljoin resolves the href against the site root whether it is relative,
# root-relative ('/data/...', which plain concatenation turned into '//data/...')
# or already an absolute URL.
url_json = urljoin('https://data.nsw.gov.au/', json_link[0])
json_content = requests.get(url_json, timeout=30)

In [None]:
# Stop here on a 4xx/5xx reply instead of trying to decode an error body as
# JSON in the next cell; raise_for_status() is a no-op on success.
json_content.raise_for_status()
json_content.status_code

In [None]:
# Decode the response body into Python objects. The name is rebound from the
# Response to the decoded data, so the guard keeps this cell safe to re-run:
# a second run would otherwise call .json() on the already-decoded dict.
if isinstance(json_content, requests.Response):
    json_content = json_content.json()

In [None]:
# Case-insensitive pattern used to pick out resources whose format mentions
# 'xls' (and therefore also 'xlsx').
p = re.compile(r'xls[x]?', flags=re.I)

In [None]:
# Read the distinct `key` values already stored in the servicestations table.
# They are used during the download step to skip files that were loaded before.
# NOTE(review): this assumes the servicestations table already exists; the
# query fails on a brand-new database - confirm how the first load is done.

# An explicit connection plus text() replaces connectionless engine.execute(),
# which newer SQLAlchemy releases deprecate and then remove; the `with` block
# also guarantees the connection is returned to the pool.
with engine.connect() as conn:
    result = conn.execute(
        sqlalchemy.text('SELECT DISTINCT(key) FROM servicestations;')
    ).fetchall()

# Each row has a single column. Index it directly and skip NULL keys, which
# the previous ''.join(row.values()) would have crashed on.
Keys = [row[0] for row in result if row[0] is not None]

In [None]:
# Map resource name -> download URL for every listed resource whose format
# matches the xls/xlsx pattern and whose name is not already among the keys
# read from the database (so nothing is downloaded twice).
fuel_check = {
    resource['name']: resource['url']
    for resource in json_content['result']['resources']
    if p.search(resource['format']) and resource['name'] not in Keys
}

In [None]:
# Number of files selected for download (entries in fuel_check).
len(fuel_check)

In [None]:
def df_header(dataframe):
    '''
    Give the dataframe the eight standard fuel-price column names.

    The names are assigned by position and in place, so the dataframe must
    have exactly eight columns. The same dataframe object is returned.
    '''
    standard_names = (
        'servicestationname',
        'address',
        'suburb',
        'postcode',
        'brand',
        'fuelcode',
        'priceupdateddate',
        'price',
    )
    dataframe.columns = list(standard_names)
    return dataframe

In [None]:
def dtype_fix(dataframe):
    '''
    Convert the postcode, price and priceupdateddate columns to proper types.

    postcode and price become numeric and priceupdateddate becomes datetime
    (parsed year-first). Values that cannot be converted are coerced to
    NaN/NaT rather than raising. The columns are replaced in place and the
    same dataframe object is returned.
    '''
    for numeric_col in ('postcode', 'price'):
        dataframe[numeric_col] = pd.to_numeric(dataframe[numeric_col],
                                               errors='coerce')

    parsed_dates = pd.to_datetime(dataframe['priceupdateddate'],
                                  errors='coerce',
                                  yearfirst=True)
    dataframe['priceupdateddate'] = parsed_dates
    return dataframe

In [None]:
def read_url(key, url):
    '''
    Read one Excel file and return it as a preprocessed dataframe.

    The sheet is read without a header row and given the eight standard
    column names, its columns are type-converted with dtype_fix, and a
    `key` column holding the given key is added to every row.
    '''
    col_names = [
        'servicestationname', 'address', 'suburb', 'postcode',
        'brand', 'fuelcode', 'priceupdateddate', 'price',
    ]

    # header=None: every sheet row, including any title row, is read as data.
    raw = pd.read_excel(url, header = None, names = col_names)
    typed = dtype_fix(raw)
    download = typed.assign(key = key)
    return download

In [None]:
# Download and preprocess every selected file: one dataframe per entry of
# fuel_check, in the same order.
dls = []
for file_key, file_url in fuel_check.items():
    dls.append(read_url(file_key, file_url))

In [None]:
# Stack the per-file dataframes into one, renumbering the index.
# pd.concat raises ValueError on an empty list, which happens whenever every
# published file is already in the database, so fall back to an empty frame
# with the expected columns and let the remaining cells run as no-ops.
if dls:
    fuel_dls = pd.concat(dls,
                         ignore_index=True,
                         sort = False)
else:
    fuel_dls = pd.DataFrame(columns=['servicestationname','address','suburb',
                                     'postcode','brand',
                                     'fuelcode','priceupdateddate',
                                     'price','key'])

In [None]:
# Drop rows without a numeric price, then forward-fill the remaining gaps
# from the row above.
# .ffill() is the direct equivalent of fillna(method='ffill'), whose `method`
# argument is deprecated in recent pandas releases.
# NOTE(review): forward-filling also replaces postcodes/dates that failed
# conversion with the previous row's value - confirm that is intended.
fuel_dls = (fuel_dls.query('price.notna()')
                    .ffill())

In [None]:
# Append the cleaned rows to the servicestations table in Postgres; the
# dataframe index is not written.
fuel_dls.to_sql(name='servicestations', con=engine, index=False, if_exists='append')