**Note:**
This data pipeline scrapes raw data from [joburgmarket](http://www.joburgmarket.co.za/dailyprices.php) and loads it into a SQL database.


In [1]:
import pandas as pd
import requests
import urllib
from scrapy.http import HtmlResponse
from sqlalchemy import create_engine, insert, Table, MetaData, select

# Custom upload with connection string
from engine_info import server_info
# From tables.py
import tables

import warnings
# NOTE: this silences every warning category for the rest of the session,
# including any deprecation or usage warnings libraries emit in later cells.
warnings.filterwarnings('ignore')

In [2]:
# URL of the Joburg Market daily-prices page that the data is scraped from
url = 'http://www.joburgmarket.co.za/dailyprices.php'

In [3]:
# Download the page. requests applies no timeout by default, so set one to
# avoid hanging forever, and fail loudly on an HTTP error status instead of
# parsing an error page as if it were price data.
page = requests.get(url, timeout=30)
page.raise_for_status()
html = page.content
# Wrap the raw HTML in a Scrapy response so XPath/CSS selectors can be run on it
response = HtmlResponse(url=url, body=html)

In [4]:
# Extract the date of the latest information displayed on the website
date = response.xpath('//div[@id="right2"]').css('p b ::text').get()
# .get() returns None when the selector matches nothing (e.g. the page layout
# changed). Stop here rather than writing a missing date into the tables below.
if date is None:
    raise ValueError("Could not find the report date on the Joburg market page.")
print(f"The latest information in the Joburg market website is for {date}.")

The latest information in the Joburg market website is for 30 September 2020.


In [5]:
# Create a connection to MS SQL Server.
# `server_info` (imported from engine_info) is URL-encoded so it can be passed
# through the `odbc_connect` query parameter of the SQLAlchemy URL.
params = urllib.parse.quote_plus(server_info)
engine = create_engine('mssql+pyodbc:///?odbc_connect=%s' % params)
# Explicit connection used for the date query; closed in the notebook's last cell.
connection = engine.connect()

In [6]:
# List the names of the tables that currently exist in the database.
# NOTE(review): Engine.table_names() is deprecated in newer SQLAlchemy releases
# in favour of inspect(engine).get_table_names() — confirm against the installed version.
engine.table_names()

['Capetown_Fresh_produce_market',
 'container',
 'Durban_Fresh_produce_market',
 'inventory',
 'Joburg_Fresh_produce_combined_cleaned',
 'Joburg_Fresh_produce_commodity_cleaned',
 'Joburg_Fresh_produce_commodity_raw',
 'Joburg_Fresh_produce_container_cleaned',
 'Joburg_Fresh_produce_container_raw',
 'Joburg_Fresh_produce_product_combination_raw',
 'Joburg_Fresh_produce_scrapping_date',
 'Joburg_Fresh_produced_cleaned_date',
 'master_date',
 'PickNPay_Prices',
 'product',
 'product_combination',
 'sales',
 'Shoprite_Prices',
 'sysdiagrams',
 'woolworths_Prices']

## scrapping_date update

In [7]:
# Metadata container bound to the engine; used below to reflect an existing table
metadata = MetaData(bind=engine)

In [8]:
# Reflect the scraping-dates table definition from the database.
# `autoload_with` is the keyword Table accepts for the engine to reflect from;
# `autoload_engine` is not a recognised argument.
scrapped_dates = Table('Joburg_Fresh_produce_scrapping_date', metadata, autoload=True, autoload_with=engine)

In [9]:
# Select every stored date. Without ORDER BY the row order is not guaranteed,
# so order by rowid to make "the last row fetched" well defined.
# Assumes rowid grows with each insert — TODO confirm against the table design.
stmt = select([scrapped_dates.columns.date]).order_by(scrapped_dates.columns.rowid)

In [10]:
# Run the query and fetch all rows; each row holds a single value, the date
sql_dates = connection.execute(stmt).fetchall()

In [11]:
# Date held by the last fetched row, treated as the latest date in the SQL database.
# Raises IndexError if the table returned no rows.
sql_dates[-1][0]

'29 September 2020'

In [12]:
# Check whether the scraped date is already stored in the database.
# Compare against every stored date rather than only the last fetched row:
# the row order is not guaranteed and the table may be empty.
date_column = tables.ScrappingDates.__table__.columns.keys()[1]
if date in {row[0] for row in sql_dates}:
    print("Warning!!! The date is already in the database, this will create duplicated data in the database.")
    # Define an empty frame so the following cells still run on a fresh kernel
    # and append nothing to the date table.
    date_df = pd.DataFrame(columns=[date_column])
else:
    date_df = pd.DataFrame([date], columns=[date_column])

In [13]:
# Show the single-row frame that is about to be appended to the date table
date_df

Unnamed: 0,date
0,30 September 2020


In [14]:
# Append the new date to the scraping-date table.
# NOTE(review): the recorded run of this cell failed with an IntegrityError.
# Only the `date` column is sent, and the table's `rowid` column does not accept NULL.
# Either the frame must supply `rowid` or the column needs a database-generated
# value — confirm the intended schema.
date_df.to_sql('Joburg_Fresh_produce_scrapping_date', con=engine, index=False, if_exists='append')

IntegrityError: (pyodbc.IntegrityError) ('23000', "[23000] [Microsoft][ODBC SQL Server Driver][SQL Server]Cannot insert the value NULL into column 'rowid', table 'Fresh_Produce_Market_Data.dbo.Joburg_Fresh_produce_scrapping_date'; column does not allow nulls. INSERT fails. (515) (SQLExecDirectW); [23000] [Microsoft][ODBC SQL Server Driver][SQL Server]The statement has been terminated. (3621)")
[SQL: INSERT INTO [Joburg_Fresh_produce_scrapping_date] (date) VALUES (?)]
[parameters: ('30 September 2020',)]
(Background on this error at: http://sqlalche.me/e/13/gkpj)

## commodity_raw update

In [None]:
# read_html returns a list of DataFrames parsed from the page;
# the first one is used as the commodity table.
commodity_tables = pd.read_html(url)
commodity_df = commodity_tables[0]

In [None]:
# Preview the first rows of the scraped commodity table
commodity_df.head()

In [None]:
# Prepend the scraped report date as the first column of the commodity frame
commodity_df.insert(0, 'Date', date)

In [None]:
# Preview the frame again to confirm the Date column is now first
commodity_df.head()

In [None]:
# Align the DataFrame's column names with the commodity SQL table so the frame
# can be appended to it. The table's first column (the id) is skipped.
commodity_sql_columns = tables.Commodity.__table__.columns.keys()[1:]
commodity_df.columns = commodity_sql_columns

In [None]:
# Preview the frame with the SQL column names applied
commodity_df.head()

In [None]:
# Append the frame to the raw commodity table.
# The database listing earlier in this notebook contains
# 'Joburg_Fresh_produce_commodity_raw' but no 'commodity_raw'. With
# if_exists='append', pandas would create a new table under the short name
# instead of adding to the existing one.
commodity_df.to_sql('Joburg_Fresh_produce_commodity_raw', con=engine, index=False, if_exists='append')

## container_raw update

In [None]:
# Option values from the page's <select> element(s); each value is used as the
# `commodity` query parameter of a detail page. The first option ('All') is dropped.
option_values = response.xpath('//select/option/@value').extract()
commodity_values = option_values[1:]
# Commodity names taken from the left-aligned cells of the main price table
name_cells = response.xpath('//table[@class="alltable"]').css('td.tleft2 ::text')
commodity_names = name_cells.extract()

In [None]:
# Fetch the container-level table for every commodity and combine them.
# Frames are collected in a list and concatenated once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows each time.
container_frames = []
for name, value in zip(commodity_names, commodity_values):
    # URL of the container-level detail page for this commodity
    url_one = f'http://www.joburgmarket.co.za/dailyprices.php?commodity={value}&containerall=1'

    temp_df = pd.read_html(url_one)[0]
    # Attach the commodity name column so rows stay identifiable after combining
    temp_df.insert(loc=0, column='commodity', value=name)
    container_frames.append(temp_df)

# pd.concat raises ValueError if no commodity was found, which flags the
# problem here rather than as a NameError in a later cell.
container_df = pd.concat(container_frames, ignore_index=True)

In [None]:
# Prepend the scraped report date as the first column of the container frame
container_df.insert(0, 'Date', date)

In [None]:
# Preview the combined container frame with the Date column added
container_df.head()

In [None]:
# Align the DataFrame's column names with the container SQL table so the frame
# can be appended to it. The table's first column (the id) is skipped.
container_sql_columns = tables.Container.__table__.columns.keys()[1:]
container_df.columns = container_sql_columns

In [None]:
# Preview the frame with the SQL column names applied
container_df.head()

In [None]:
# Append the container rows to the raw container table (the index is not written)
container_df.to_sql(
    name='Joburg_Fresh_produce_container_raw',
    con=engine,
    if_exists='append',
    index=False,
)

## product_combination_raw update

In [None]:
# Fetch the product-combination table for every commodity and combine them.
# Frames are collected in a list and concatenated once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows each time.
product_combo_frames = []
for name, value in zip(commodity_names, commodity_values):
    # URL of the product-combination detail page for this commodity
    url_two = f'http://www.joburgmarket.co.za/dailyprices.php?commodity={value}&containerall=2'

    temp_df = pd.read_html(url_two)[0]
    # Attach the commodity name column so rows stay identifiable after combining
    temp_df.insert(loc=0, column='commodity', value=name)
    product_combo_frames.append(temp_df)

# pd.concat raises ValueError if no commodity was found, which flags the
# problem here rather than as a NameError in a later cell.
product_combo_df = pd.concat(product_combo_frames, ignore_index=True)

In [None]:
# Inspect the last rows of the combined product-combination frame
product_combo_df.tail()

In [None]:
# Prepend the scraped report date as the first column of the product-combination frame
product_combo_df.insert(0, 'Date', date)

In [None]:
# Preview the frame to confirm the Date column is now first
product_combo_df.head()

In [None]:
# Align the product-combination DataFrame's column names with its SQL table so
# the frame can be appended to it. The table's first column (the id) is skipped.
product_combo_sql_columns = tables.ProductCombination.__table__.columns.keys()[1:]
product_combo_df.columns = product_combo_sql_columns

In [None]:
# Preview the frame with the SQL column names applied
product_combo_df.head()

In [None]:
# Append the product-combination rows to their raw SQL table (the index is not written)
product_combo_df.to_sql(
    name='Joburg_Fresh_produce_product_combination_raw',
    con=engine,
    if_exists='append',
    index=False,
)

In [None]:
# Release the database connection opened near the top of the notebook.
# This cell is not reached automatically if an earlier cell raises.
connection.close()