**Note:**
This data pipeline scrapes raw data from [joburgmarket](http://www.joburgmarket.co.za/dailyprices.php) and loads it into a SQL database.


In [1]:
import pandas as pd
import requests
import urllib
from scrapy.http import HtmlResponse
from sqlalchemy import create_engine, insert, Table, MetaData, select

# Custom upload with connection string
from engine_info import server_info
# From tables.py
import tables

import warnings
# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides deprecation and library warnings that could
# point at real problems — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# URL of the daily-prices page that the data is scraped from
url = 'http://www.joburgmarket.co.za/dailyprices.php'

In [3]:
# Download the daily-prices page. The timeout keeps the notebook from hanging
# on an unresponsive server, and raise_for_status() stops on an HTTP error
# instead of parsing an error page as if it were market data.
REQUEST_TIMEOUT_SECONDS = 30
page = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
page.raise_for_status()
html = page.content
# Wrap the raw HTML in a Scrapy response so it can be queried with XPath/CSS selectors
response = HtmlResponse(url=url, body=html)

In [4]:
# Extract the date of the latest information displayed on the website
date = response.xpath('//div[@id="right2"]').css('p b ::text').get()
if date is None:
    # .get() returns None when the selector matches nothing; fail here rather
    # than comparing and uploading a missing date further down.
    raise ValueError("Could not find the market date on the page; the page layout may have changed.")
print(f"The latest information in the Joburg market website is for {date}.")

The latest information in the Joburg market website is for 1 September 2020.


In [5]:
# Open a connection to MS SQL Server.
# The ODBC connection string from engine_info is URL-quoted so it can be
# passed through the `odbc_connect` query parameter of the SQLAlchemy URL.
params = urllib.parse.quote_plus(server_info)
connection_url = 'mssql+pyodbc:///?odbc_connect=%s' % params
engine = create_engine(connection_url)
connection = engine.connect()

In [6]:
# List the names of the tables that currently exist in the database
engine.table_names()

['Capetown_Fresh_produce_market',
 'Capetown_Fresh_produce_market_cleaned',
 'Durban_Fresh_produce_market',
 'Durban_Fresh_produce_market_cleaned',
 'Joburg_Fresh_produce_combined_cleaned',
 'Joburg_Fresh_produce_commodity_cleaned',
 'Joburg_Fresh_produce_commodity_raw',
 'Joburg_Fresh_produce_container_cleaned',
 'Joburg_Fresh_produce_container_raw',
 'Joburg_Fresh_produce_product_combination_raw',
 'Joburg_Fresh_produce_scrapping_date',
 'PickNPay_Prices',
 'Shoprite_Prices',
 'woolworths_Prices']

## scrapping_date update

In [7]:
# Metadata container bound to the engine; the table reflected below is registered on it
metadata = MetaData(bind=engine)

In [8]:
# Reflect the existing scraping-date table from the database so its columns
# can be used in queries. `autoload_with` is the documented argument for
# naming the engine to reflect from.
scrapped_dates = Table('Joburg_Fresh_produce_scrapping_date', metadata, autoload=True, autoload_with=engine)

In [9]:
# Query that selects only the `date` column of the scraping-date table
stmt = select([scrapped_dates.columns.date])

In [10]:
# Run the query and fetch every stored date (a list of one-column rows)
sql_dates = connection.execute(stmt).fetchall()

In [11]:
# Date held by the last fetched row, treated as the latest date in the SQL database.
# NOTE(review): the SELECT has no ORDER BY, so which row comes last is
# presumably not guaranteed — TODO confirm.
sql_dates[-1][0]

'31 August 2020'

In [12]:
# Check if the data in the database is up to date.
# The scraped date is compared against every stored date rather than only the
# last fetched row: the query has no ORDER BY, and an empty table has no last row.
stored_dates = {row[0] for row in sql_dates}
if date in stored_dates:
    # Stop here: every upload cell below appends to the database, so carrying
    # on would write this day's data a second time.
    raise RuntimeError(
        f"The date {date} is already in the database; running the upload cells "
        "again would create duplicated data."
    )

# One-row DataFrame whose single column is named after the second column of
# the ScrappingDates table definition (the first column is skipped).
date_df = pd.DataFrame([date], columns=[tables.ScrappingDates.__table__.columns.keys()[1]])

In [13]:
# Display the one-row DataFrame that is about to be uploaded
date_df

Unnamed: 0,date
0,1 September 2020


In [14]:
# Append the new date to the scraping-date SQL table.
# NOTE(review): the saved output of this cell is an IntegrityError — the INSERT
# supplies only `date`, and the table's non-nullable `rowid` column received NULL.
# Presumably `rowid` must either be auto-generated by the database or be
# provided in the DataFrame — TODO confirm against the table definition.
date_df.to_sql('Joburg_Fresh_produce_scrapping_date', con=engine, index=False, if_exists='append')

IntegrityError: (pyodbc.IntegrityError) ('23000', "[23000] [Microsoft][ODBC SQL Server Driver][SQL Server]Cannot insert the value NULL into column 'rowid', table 'Fresh_Produce_Market_Data.dbo.Joburg_Fresh_produce_scrapping_date'; column does not allow nulls. INSERT fails. (515) (SQLExecDirectW); [23000] [Microsoft][ODBC SQL Server Driver][SQL Server]The statement has been terminated. (3621)")
[SQL: INSERT INTO [Joburg_Fresh_produce_scrapping_date] (date) VALUES (?)]
[parameters: ('1 September 2020',)]
(Background on this error at: http://sqlalche.me/e/13/gkpj)

## commodity_raw update

In [16]:
# read_html returns a list of the tables found on the page; the commodity
# summary is the first one.
commodity_tables = pd.read_html(url)
commodity_df = commodity_tables[0]

In [17]:
# Preview the first rows of the scraped commodity table
commodity_df.head()

Unnamed: 0,Commodity,Total Value Sold,Total Qty Sold,Total Kg Sold,Qty Available
0,APPLES,"R1,384,561.00MTD: R26,320,686.00","15,402MTD: 301,679","190,267MTD: 3,666,996",110934
1,ARTICHOKES,"R0.00MTD: R56,220.00",0MTD: 465,0MTD: 553,1
2,ASPARAGUS,"R29,250.00MTD: R59,770.00",117MTD: 255,293MTD: 638,0
3,ATCHARA,"R0.00MTD: R1,429.20",0MTD: 24,0MTD: 69,206
4,AVOCADOS,"R476,110.00MTD: R12,859,706.40","4,074MTD: 136,591","26,636MTD: 824,555",15323


In [18]:
# Prepend a 'Date' column holding the scraped market date on every row
commodity_df.insert(0, 'Date', date)

In [19]:
# Preview the table with the new 'Date' column in front.
# NOTE(review): the saved output shows '24 August 2020' while the date cell
# printed '1 September 2020' — the stored outputs appear to come from
# different runs; TODO re-run top to bottom on a fresh kernel.
commodity_df.head()

Unnamed: 0,Date,Commodity,Total Value Sold,Total Qty Sold,Total Kg Sold,Qty Available
0,24 August 2020,APPLES,"R1,384,561.00MTD: R26,320,686.00","15,402MTD: 301,679","190,267MTD: 3,666,996",110934
1,24 August 2020,ARTICHOKES,"R0.00MTD: R56,220.00",0MTD: 465,0MTD: 553,1
2,24 August 2020,ASPARAGUS,"R29,250.00MTD: R59,770.00",117MTD: 255,293MTD: 638,0
3,24 August 2020,ATCHARA,"R0.00MTD: R1,429.20",0MTD: 24,0MTD: 69,206
4,24 August 2020,AVOCADOS,"R476,110.00MTD: R12,859,706.40","4,074MTD: 136,591","26,636MTD: 824,555",15323


In [20]:
# The commodity DataFrame is uploaded to the commodity_raw SQL table, so its
# headers are replaced by the table's column names (the leading id column is skipped).
commodity_sql_columns = tables.Commodity.__table__.columns.keys()[1:]
commodity_df.columns = commodity_sql_columns

In [21]:
# Preview the table after renaming the columns to the SQL column names
commodity_df.head()

Unnamed: 0,date,commodity,total_value_sold,total_qty_sold,total_kg_sold,qty_available
0,24 August 2020,APPLES,"R1,384,561.00MTD: R26,320,686.00","15,402MTD: 301,679","190,267MTD: 3,666,996",110934
1,24 August 2020,ARTICHOKES,"R0.00MTD: R56,220.00",0MTD: 465,0MTD: 553,1
2,24 August 2020,ASPARAGUS,"R29,250.00MTD: R59,770.00",117MTD: 255,293MTD: 638,0
3,24 August 2020,ATCHARA,"R0.00MTD: R1,429.20",0MTD: 24,0MTD: 69,206
4,24 August 2020,AVOCADOS,"R476,110.00MTD: R12,859,706.40","4,074MTD: 136,591","26,636MTD: 824,555",15323


In [22]:
# Append the commodity data to its SQL table.
# The full table name must be used: with if_exists='append' pandas creates the
# table when the name is not found, so a short name such as 'commodity_raw'
# would write to a new, separate table instead of the existing one.
commodity_df.to_sql('Joburg_Fresh_produce_commodity_raw', con=engine, index=False, if_exists='append')

## container_raw update

In [23]:
# Each <option> value links to one commodity's detailed statistics page;
# the first option is the 'All' entry and is dropped.
option_values = response.xpath('//select/option/@value').extract()
commodity_values = option_values[1:]
# Commodity names, read from the text of the `td.tleft2` cells of the `alltable` table
name_cells = response.xpath('//table[@class="alltable"]').css('td.tleft2 ::text')
commodity_names = name_cells.extract()

In [24]:
# Scrape the container table of every commodity and stack them into one DataFrame.
# NOTE(review): zip() pairs names and option values purely by position and stops
# at the shorter list — assumes both lists have the same order and length; TODO confirm.
container_frames = []
for name, value in zip(commodity_names, commodity_values):
    # URL of the container information for this commodity
    url_one = f'http://www.joburgmarket.co.za/dailyprices.php?commodity={value}&containerall=1'

    # The first table on the page holds the container data
    commodity_container_df = pd.read_html(url_one)[0]
    # Attach the commodity name column so rows stay identifiable after stacking
    commodity_container_df.insert(loc=0, column='commodity', value=name)
    container_frames.append(commodity_container_df)

# Concatenate once at the end instead of re-copying the growing frame on every iteration
container_df = pd.concat(container_frames, ignore_index=True)

In [25]:
# Prepend a 'Date' column holding the scraped market date on every row
container_df.insert(0, 'Date', date)

In [26]:
# Preview the first rows of the combined container table
container_df.head()

Unnamed: 0,Date,commodity,Container,Qty Available,Value Sold,Qty Sold,Kg Sold,Average Price per Kg
0,24 August 2020,APPLES,10 X 1KG ECONO PACK CARTON,16.0,"R100.00MTD: R42,032.00",1MTD: 548,"10MTD: 5,480",R10
1,24 August 2020,APPLES,10KG JUMBLE CARTON,0.0,"R900.00MTD: R1,000.00",30MTD: 34,300MTD: 340,R3
2,24 August 2020,APPLES,11KG JUMBLE CARTON,1096.0,"R8,325.00MTD: R272,944.00","137MTD: 3,967","1507MTD: 43,637",R5.52
3,24 August 2020,APPLES,12 X 1KG ECONO PACK CARTON,2088.0,"R124,946.00MTD: R1,748,980.00","1474MTD: 20,930","17688MTD: 251,160",R7.06
4,24 August 2020,APPLES,12.5KG M6 CARTON,11.0,"R0.00MTD: R17,003.00",0MTD: 287,"0MTD: 3,588",R0


In [27]:
# The container DataFrame is uploaded to the container_raw SQL table, so its
# headers are replaced by the table's column names (the leading id column is skipped).
container_sql_columns = tables.Container.__table__.columns.keys()[1:]
container_df.columns = container_sql_columns

In [28]:
# Preview the table after renaming the columns to the SQL column names
container_df.head()

Unnamed: 0,date,commodity,container,qty_available,value_sold,qty_sold,kg_sold,average_price_per_kg
0,24 August 2020,APPLES,10 X 1KG ECONO PACK CARTON,16.0,"R100.00MTD: R42,032.00",1MTD: 548,"10MTD: 5,480",R10
1,24 August 2020,APPLES,10KG JUMBLE CARTON,0.0,"R900.00MTD: R1,000.00",30MTD: 34,300MTD: 340,R3
2,24 August 2020,APPLES,11KG JUMBLE CARTON,1096.0,"R8,325.00MTD: R272,944.00","137MTD: 3,967","1507MTD: 43,637",R5.52
3,24 August 2020,APPLES,12 X 1KG ECONO PACK CARTON,2088.0,"R124,946.00MTD: R1,748,980.00","1474MTD: 20,930","17688MTD: 251,160",R7.06
4,24 August 2020,APPLES,12.5KG M6 CARTON,11.0,"R0.00MTD: R17,003.00",0MTD: 287,"0MTD: 3,588",R0


In [29]:
# Append the container data to its SQL table (the DataFrame index is not written)
container_df.to_sql(
    name='Joburg_Fresh_produce_container_raw',
    con=engine,
    if_exists='append',
    index=False,
)

## product_combination_raw update

In [30]:
# Scrape the product-combination table of every commodity and stack them into one DataFrame.
# NOTE(review): zip() pairs names and option values purely by position and stops
# at the shorter list — assumes both lists have the same order and length; TODO confirm.
product_combo_frames = []
for name, value in zip(commodity_names, commodity_values):
    # URL of the product-combination information for this commodity
    url_two = f'http://www.joburgmarket.co.za/dailyprices.php?commodity={value}&containerall=2'

    # The first table on the page holds the product-combination data
    commodity_combo_df = pd.read_html(url_two)[0]
    # Attach the commodity name column so rows stay identifiable after stacking
    commodity_combo_df.insert(loc=0, column='commodity', value=name)
    product_combo_frames.append(commodity_combo_df)

# Concatenate once at the end instead of re-copying the growing frame on every iteration
product_combo_df = pd.concat(product_combo_frames, ignore_index=True)

In [31]:
# Preview the last rows of the combined product-combination table
product_combo_df.tail()

Unnamed: 0,commodity,Container,Unit Mass,Product Combination,Total Value Sold,Total Qty Sold,Total Kg Sold,Average,Highest Price,Ave per Kg,Highest Price per Kg
1874,YELLOW PEPPERS,5KG BOX,5.0,"*,CL 2,L,*,*","R5,675.00",240.0,1200.0,R23.65,R120.00,R4.73,R24.00
1875,YELLOW PEPPERS,5KG BOX,5.0,"*,CL 2,L/M,*,*",R200.00,2.0,10.0,R100.00,R100.00,R20.00,R20.00
1876,YELLOW PEPPERS,5KG BOX,5.0,"*,CL 2,M,*,*","R1,100.00",55.0,275.0,R20.00,R20.00,R4.00,R4.00
1877,YELLOW PEPPERS,5KG BOX,5.0,"*,CL 2,S,*,*","R1,250.00",62.0,310.0,R20.16,R25.00,R4.03,R5.00
1878,YELLOW PEPPERS,8KG BOX,8.0,"*,*,*,*,*",R700.00,35.0,280.0,R20.00,R20.00,R2.50,R2.50


In [32]:
# Prepend a 'Date' column holding the scraped market date on every row
product_combo_df.insert(0, 'Date', date)

In [33]:
# Preview the first rows with the new 'Date' column in front
product_combo_df.head()

Unnamed: 0,Date,commodity,Container,Unit Mass,Product Combination,Total Value Sold,Total Qty Sold,Total Kg Sold,Average,Highest Price,Ave per Kg,Highest Price per Kg
0,24 August 2020,APPLES,10 X 1KG ECONO PACK CARTON,10.0,"GOLDEN DELICIOUS,CL 1,*,*,*",R0.00,0.0,0.0,R0.00,R0.00,R0.00,R0.00
1,24 August 2020,APPLES,10 X 1KG ECONO PACK CARTON,10.0,"ROYAL GALA,CL 1,*,*,*",R100.00,1.0,10.0,R100.00,R100.00,R10.00,R10.00
2,24 August 2020,APPLES,10KG JUMBLE CARTON,10.0,"GOLDEN DELICIOUS,CL 2,S,*,*",R900.00,30.0,300.0,R30.00,R30.00,R3.00,R3.00
3,24 August 2020,APPLES,11KG JUMBLE CARTON,11.0,"CRIPPS PINK,CL 2,S,*,*",R0.00,0.0,0.0,R0.00,R0.00,R0.00,R0.00
4,24 August 2020,APPLES,11KG JUMBLE CARTON,11.0,"FUJI,CL 2,L,*,*",R0.00,0.0,0.0,R0.00,R0.00,R0.00,R0.00


In [34]:
# The product-combination DataFrame is uploaded to the product_combination_raw
# SQL table, so its headers are replaced by the table's column names
# (the leading id column is skipped).
product_combo_sql_columns = tables.ProductCombination.__table__.columns.keys()[1:]
product_combo_df.columns = product_combo_sql_columns

In [35]:
# Preview the table after renaming the columns to the SQL column names
product_combo_df.head()

Unnamed: 0,date,commodity,container,unit_mass,product_combination,total_value_sold,total_qty_sold,total_kg_sold,average,highest_price,ave_per_kg,highest_price_per_kg
0,24 August 2020,APPLES,10 X 1KG ECONO PACK CARTON,10.0,"GOLDEN DELICIOUS,CL 1,*,*,*",R0.00,0.0,0.0,R0.00,R0.00,R0.00,R0.00
1,24 August 2020,APPLES,10 X 1KG ECONO PACK CARTON,10.0,"ROYAL GALA,CL 1,*,*,*",R100.00,1.0,10.0,R100.00,R100.00,R10.00,R10.00
2,24 August 2020,APPLES,10KG JUMBLE CARTON,10.0,"GOLDEN DELICIOUS,CL 2,S,*,*",R900.00,30.0,300.0,R30.00,R30.00,R3.00,R3.00
3,24 August 2020,APPLES,11KG JUMBLE CARTON,11.0,"CRIPPS PINK,CL 2,S,*,*",R0.00,0.0,0.0,R0.00,R0.00,R0.00,R0.00
4,24 August 2020,APPLES,11KG JUMBLE CARTON,11.0,"FUJI,CL 2,L,*,*",R0.00,0.0,0.0,R0.00,R0.00,R0.00,R0.00


In [36]:
# Append the product-combination data to its SQL table (the DataFrame index is not written)
product_combo_df.to_sql(
    name='Joburg_Fresh_produce_product_combination_raw',
    con=engine,
    if_exists='append',
    index=False,
)

In [37]:
# Release the database connection opened at the start of the notebook
connection.close()