**Note:**
This data pipeline scrapes raw data from [joburgmarket](http://www.joburgmarket.co.za/dailyprices.php) into a SQL database.


In [1]:
import pandas as pd
import requests
import urllib
from scrapy.http import HtmlResponse
from sqlalchemy import create_engine, insert, Table, MetaData, select

# Custom upload with connection string
from engine_info import server_info
# From tables.py
import tables

import warnings
# Silence every warning for the rest of the notebook.
# NOTE(review): this blanket filter also hides any deprecation warnings raised by the
# pandas / SQLAlchemy calls in later cells — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Page on the Joburg Market website that the data is scraped from
url = 'http://www.joburgmarket.co.za/dailyprices.php'

In [3]:
# Download the page. A timeout keeps the notebook from hanging on a dead connection,
# and raise_for_status() stops the run on an HTTP error so an error page is never
# parsed as if it were market data.
page = requests.get(url, timeout=30)
page.raise_for_status()
html = page.content
# Wrap the raw HTML in a Scrapy response so it can be queried with XPath/CSS selectors
response = HtmlResponse(url=url, body=html)

In [4]:
# Extract the date of the latest information displayed on the website
date = response.xpath('//div[@id="right2"]').css('p b ::text').get()
# .get() returns None when the selector matches nothing; stop here rather than
# writing a missing date into every table further down.
if date is None:
    raise ValueError("Could not find the market date on the page; the page layout may have changed.")
print(f"The latest information in the Joburg market website is for {date}.")

The latest information in the Joburg market website is for 21 August 2020.


In [5]:
# Open a connection to MS SQL Server.
# The ODBC connection string imported from engine_info is URL-quoted and passed to
# SQLAlchemy through the pyodbc `odbc_connect` query parameter.
params = urllib.parse.quote_plus(server_info)
connection_url = 'mssql+pyodbc:///?odbc_connect=%s' % params
engine = create_engine(connection_url)
connection = engine.connect()

In [6]:
# List the tables that currently exist in the database
# NOTE(review): Engine.table_names() is deprecated in newer SQLAlchemy releases in favour
# of inspect(engine).get_table_names() — confirm against the installed version.
engine.table_names()

['commodity_raw', 'container_raw', 'product_combination_raw', 'scrapping_date']

## scrapping_date update

In [7]:
# Metadata container bound to the engine; the table reflected below is registered on it
metadata = MetaData(bind=engine)

In [8]:
# Reflect the existing scrapping_date table from the database.
# `autoload_with` is the keyword that tells SQLAlchemy which engine to reflect from
# (`autoload_engine` is not a recognised Table argument).
scrapped_dates = Table('scrapping_date', metadata, autoload=True, autoload_with=engine)

In [9]:
# SELECT only the `date` column of the reflected scrapping_date table
stmt = select([scrapped_dates.c.date])

In [10]:
# Run the query and fetch every returned row into a list
sql_dates = connection.execute(stmt).fetchall()

In [11]:
# First date returned by the query.
# NOTE(review): the SELECT has no ORDER BY, so this row is not guaranteed to be the
# latest date in the table, and indexing [0] fails if the table is empty.
sql_dates[0][0]

'20 August 2020'

In [12]:
# Check if the data in the database is up to date.
# Compare against every stored date rather than only the first row: the SELECT has no
# ORDER BY, so the first row is not guaranteed to be the latest, and an empty table
# must not raise IndexError.
stored_dates = {row[0] for row in sql_dates}

if date in stored_dates:
    # Close connection
    connection.close()
    # Stop the run explicitly so the cells below do not append this date's data again
    raise RuntimeError(f"Data for {date} is already in the database; nothing to update.")

# One-row DataFrame whose single column is named after the second column of the
# ScrappingDates table definition (the first column is skipped).
date_df = pd.DataFrame([date], columns=[tables.ScrappingDates.__table__.columns.keys()[1]])

In [13]:
# Display the one-row DataFrame that will be appended to scrapping_date
date_df

Unnamed: 0,date
0,21 August 2020


In [14]:
# Append the new date to the scrapping_date SQL table (the DataFrame index is not written).
# NOTE(review): the date is recorded before the data tables below are loaded; if a later
# cell fails, the date is already stored even though the load is incomplete.
date_df.to_sql(name='scrapping_date', con=engine, if_exists='append', index=False)

## commodity_raw update

In [15]:
# pd.read_html returns a list of DataFrames, one per HTML table found on the page;
# the commodity summary is the first of them.
commodity_tables = pd.read_html(url)
commodity_df = commodity_tables[0]

In [16]:
# Preview the first rows of the scraped commodity table
commodity_df.head()

Unnamed: 0,Commodity,Total Value Sold,Total Qty Sold,Total Kg Sold,Qty Available
0,AMADUMBE,"R0.00MTD: R39,870.00",0MTD: 97,"0MTD: 1,940",2
1,APPLES,"R1,266,304.01MTD: R23,897,725.00","14,845MTD: 275,655","171,119MTD: 3,332,046",97019
2,ARTICHOKES,"R3,120.00MTD: R56,220.00",26MTD: 465,31MTD: 553,1
3,ASPARAGUS,"R3,750.00MTD: R262,725.00",5MTD: 364,"25MTD: 1,820",3
4,ATCHARA,"R78.00MTD: R1,429.20",1MTD: 24,4MTD: 69,206


In [17]:
# Prepend a 'Date' column holding the scraped market date on every row.
# This mutates commodity_df in place, so re-running the cell raises because the
# column already exists.
commodity_df.insert(0, 'Date', date)

In [18]:
# Preview the commodity table with the Date column attached
commodity_df.head()

Unnamed: 0,Date,Commodity,Total Value Sold,Total Qty Sold,Total Kg Sold,Qty Available
0,21 August 2020,AMADUMBE,"R0.00MTD: R39,870.00",0MTD: 97,"0MTD: 1,940",2
1,21 August 2020,APPLES,"R1,266,304.01MTD: R23,897,725.00","14,845MTD: 275,655","171,119MTD: 3,332,046",97019
2,21 August 2020,ARTICHOKES,"R3,120.00MTD: R56,220.00",26MTD: 465,31MTD: 553,1
3,21 August 2020,ASPARAGUS,"R3,750.00MTD: R262,725.00",5MTD: 364,"25MTD: 1,820",3
4,21 August 2020,ATCHARA,"R78.00MTD: R1,429.20",1MTD: 24,4MTD: 69,206


In [19]:
# commodity_df is loaded into the commodity_raw SQL table, so its headers must match
# the table's column names. Take the SQL column names in order, skipping the first
# (id) column, and assign them positionally.
commodity_sql_columns = tables.Commodity.__table__.columns.keys()[1:]
commodity_df.columns = commodity_sql_columns

In [20]:
# Preview the commodity table after renaming the columns to the SQL names
commodity_df.head()

Unnamed: 0,date,commodity,total_value_sold,total_qty_sold,total_kg_sold,qty_available
0,21 August 2020,AMADUMBE,"R0.00MTD: R39,870.00",0MTD: 97,"0MTD: 1,940",2
1,21 August 2020,APPLES,"R1,266,304.01MTD: R23,897,725.00","14,845MTD: 275,655","171,119MTD: 3,332,046",97019
2,21 August 2020,ARTICHOKES,"R3,120.00MTD: R56,220.00",26MTD: 465,31MTD: 553,1
3,21 August 2020,ASPARAGUS,"R3,750.00MTD: R262,725.00",5MTD: 364,"25MTD: 1,820",3
4,21 August 2020,ATCHARA,"R78.00MTD: R1,429.20",1MTD: 24,4MTD: 69,206


In [21]:
# Append the commodity rows to the commodity_raw SQL table (the DataFrame index is not written)
commodity_df.to_sql(name='commodity_raw', con=engine, if_exists='append', index=False)

## container_raw update

In [22]:
# Each <option> value identifies a commodity in the detail-page URL; the first
# entry is the 'All' option and is dropped.
option_values = response.xpath('//select/option/@value').getall()
commodity_values = option_values[1:]
# Commodity names as listed in the summary table
name_cells = response.xpath('//table[@class="alltable"]').css('td.tleft2 ::text')
commodity_names = name_cells.getall()

In [23]:
# Scrape the per-container table of every commodity and stack them into one DataFrame.
# Frames are collected in a list and concatenated once at the end, instead of
# re-copying the growing DataFrame on every iteration.
# NOTE(review): zip() pairs names and values purely by position and stops at the
# shorter list — confirm the drop-down options and the table rows always line up.
container_frames = []
for name, value in zip(commodity_names, commodity_values):
    # URL for container information which will update value for each commodity
    url_one = f'http://www.joburgmarket.co.za/dailyprices.php?commodity={value}&containerall=1'

    # First table on the commodity's container page
    commodity_containers = pd.read_html(url_one)[0]
    # Attach the commodity name column to the container dataframe
    commodity_containers.insert(loc=0, column='commodity', value=name)
    container_frames.append(commodity_containers)

if not container_frames:
    # Fail here with a clear message instead of a NameError in a later cell
    raise ValueError("No commodities were found on the page; nothing to load into container_raw.")

container_df = pd.concat(container_frames, ignore_index=True)

In [24]:
# Prepend a 'Date' column holding the scraped market date on every row.
# This mutates container_df in place, so re-running the cell raises because the
# column already exists.
container_df.insert(0, 'Date', date)

In [25]:
# Preview the combined container table with the Date column attached
container_df.head()

Unnamed: 0,Date,commodity,Container,Qty Available,Value Sold,Qty Sold,Kg Sold,Average Price per Kg
0,21 August 2020,AMADUMBE,20KG POCKET,2.0,"R0.00MTD: R39,870.00",0MTD: 97,"0MTD: 1,940",R0
1,21 August 2020,APPLES,10 X 1KG ECONO PACK CARTON,17.0,"R0.00MTD: R41,932.00",0MTD: 547,"0MTD: 5,470",R0
2,21 August 2020,APPLES,10KG JUMBLE CARTON,30.0,R0.00MTD: R100.00,0MTD: 4,0MTD: 40,R0
3,21 August 2020,APPLES,11KG JUMBLE CARTON,1224.0,"R28,005.00MTD: R246,919.00","399MTD: 3,569","4389MTD: 39,259",R6.38
4,21 August 2020,APPLES,12 X 1KG ECONO PACK CARTON,2737.0,"R108,898.00MTD: R1,563,470.00","1456MTD: 18,809","17472MTD: 225,708",R6.23


In [26]:
# container_df is loaded into the container_raw SQL table, so its headers must match
# the table's column names. Take the SQL column names in order, skipping the first
# (id) column, and assign them positionally.
container_sql_columns = tables.Container.__table__.columns.keys()[1:]
container_df.columns = container_sql_columns

In [27]:
# Preview the container table after renaming the columns to the SQL names
container_df.head()

Unnamed: 0,date,commodity,container,qty_available,value_sold,qty_sold,kg_sold,average_price_per_kg
0,21 August 2020,AMADUMBE,20KG POCKET,2.0,"R0.00MTD: R39,870.00",0MTD: 97,"0MTD: 1,940",R0
1,21 August 2020,APPLES,10 X 1KG ECONO PACK CARTON,17.0,"R0.00MTD: R41,932.00",0MTD: 547,"0MTD: 5,470",R0
2,21 August 2020,APPLES,10KG JUMBLE CARTON,30.0,R0.00MTD: R100.00,0MTD: 4,0MTD: 40,R0
3,21 August 2020,APPLES,11KG JUMBLE CARTON,1224.0,"R28,005.00MTD: R246,919.00","399MTD: 3,569","4389MTD: 39,259",R6.38
4,21 August 2020,APPLES,12 X 1KG ECONO PACK CARTON,2737.0,"R108,898.00MTD: R1,563,470.00","1456MTD: 18,809","17472MTD: 225,708",R6.23


In [28]:
# Append the container rows to the container_raw SQL table (the DataFrame index is not written)
container_df.to_sql(name='container_raw', con=engine, if_exists='append', index=False)

## product_combination_raw update

In [29]:
# Scrape the product-combination table of every commodity and stack them into one DataFrame.
# Frames are collected in a list and concatenated once at the end, instead of
# re-copying the growing DataFrame on every iteration.
# NOTE(review): zip() pairs names and values purely by position and stops at the
# shorter list — confirm the drop-down options and the table rows always line up.
product_combo_frames = []
for name, value in zip(commodity_names, commodity_values):
    # URL for product combination information which will update value for each commodity
    url_two = f'http://www.joburgmarket.co.za/dailyprices.php?commodity={value}&containerall=2'

    # First table on the commodity's product-combination page
    commodity_combos = pd.read_html(url_two)[0]
    # Attach the commodity name column to the product_combo dataframe
    commodity_combos.insert(loc=0, column='commodity', value=name)
    product_combo_frames.append(commodity_combos)

if not product_combo_frames:
    # Fail here with a clear message instead of a NameError in a later cell
    raise ValueError("No commodities were found on the page; nothing to load into product_combination_raw.")

product_combo_df = pd.concat(product_combo_frames, ignore_index=True)

In [30]:
# Preview the last rows of the combined product-combination table
product_combo_df.tail()

Unnamed: 0,commodity,Container,Unit Mass,Product Combination,Total Value Sold,Total Qty Sold,Total Kg Sold,Average,Highest Price,Ave per Kg,Highest Price per Kg
2078,YELLOW PEPPERS,5KG BOX,5.0,"*,CL 2,L,*,*",R85.00,3.0,15.0,R28.33,R30.00,R5.67,R6.00
2079,YELLOW PEPPERS,5KG BOX,5.0,"*,CL 2,L/M,*,*",R0.00,0.0,0.0,R0.00,R0.00,R0.00,R0.00
2080,YELLOW PEPPERS,5KG BOX,5.0,"*,CL 2,M,*,*","R1,980.00",170.0,850.0,R11.65,R60.00,R2.33,R12.00
2081,YELLOW PEPPERS,5KG BOX,5.0,"*,CL 2,S,*,*",R340.00,16.0,80.0,R21.25,R40.00,R4.25,R8.00
2082,YELLOW PEPPERS,8KG BOX,8.0,"*,*,*,*,*","R1,170.00",12.0,96.0,R97.50,R100.00,R12.19,R12.50


In [31]:
# Prepend a 'Date' column holding the scraped market date on every row.
# This mutates product_combo_df in place, so re-running the cell raises because the
# column already exists.
product_combo_df.insert(0, 'Date', date)

In [32]:
# Preview the product-combination table with the Date column attached
product_combo_df.head()

Unnamed: 0,Date,commodity,Container,Unit Mass,Product Combination,Total Value Sold,Total Qty Sold,Total Kg Sold,Average,Highest Price,Ave per Kg,Highest Price per Kg
0,21 August 2020,AMADUMBE,20KG POCKET,20.0,"*,*,L,*,*",R0.00,0.0,0.0,R0.00,R0.00,R0.00,R0.00
1,21 August 2020,APPLES,10 X 1KG ECONO PACK CARTON,10.0,"GOLDEN DELICIOUS,CL 1,*,*,*",R0.00,0.0,0.0,R0.00,R0.00,R0.00,R0.00
2,21 August 2020,APPLES,10 X 1KG ECONO PACK CARTON,10.0,"ROYAL GALA,CL 1,*,*,*",R0.00,0.0,0.0,R0.00,R0.00,R0.00,R0.00
3,21 August 2020,APPLES,10KG JUMBLE CARTON,10.0,"GOLDEN DELICIOUS,CL 2,S,*,*",R0.00,0.0,0.0,R0.00,R0.00,R0.00,R0.00
4,21 August 2020,APPLES,11KG JUMBLE CARTON,11.0,"FUJI,CL 2,L,*,*","R18,905.00",270.0,2970.0,R70.02,R75.00,R6.37,R6.82


In [33]:
# The product_combo_df must be transferred to the product_combination_raw SQL table.
# Rename the column names in DataFrame to match column names in SQL table
product_combo_df.columns = tables.ProductCombination.__table__.columns.keys()[1:] # Exclude the id column

In [34]:
# Preview the product-combination table after renaming the columns to the SQL names
product_combo_df.head()

Unnamed: 0,date,commodity,container,unit_mass,product_combination,total_value_sold,total_qty_sold,total_kg_sold,average,highest_price,ave_per_kg,highest_price_per_kg
0,21 August 2020,AMADUMBE,20KG POCKET,20.0,"*,*,L,*,*",R0.00,0.0,0.0,R0.00,R0.00,R0.00,R0.00
1,21 August 2020,APPLES,10 X 1KG ECONO PACK CARTON,10.0,"GOLDEN DELICIOUS,CL 1,*,*,*",R0.00,0.0,0.0,R0.00,R0.00,R0.00,R0.00
2,21 August 2020,APPLES,10 X 1KG ECONO PACK CARTON,10.0,"ROYAL GALA,CL 1,*,*,*",R0.00,0.0,0.0,R0.00,R0.00,R0.00,R0.00
3,21 August 2020,APPLES,10KG JUMBLE CARTON,10.0,"GOLDEN DELICIOUS,CL 2,S,*,*",R0.00,0.0,0.0,R0.00,R0.00,R0.00,R0.00
4,21 August 2020,APPLES,11KG JUMBLE CARTON,11.0,"FUJI,CL 2,L,*,*","R18,905.00",270.0,2970.0,R70.02,R75.00,R6.37,R6.82


In [35]:
# Append the product-combination rows to the product_combination_raw SQL table
# (the DataFrame index is not written)
product_combo_df.to_sql(name='product_combination_raw', con=engine, if_exists='append', index=False)

In [36]:
# Close the connection opened at the start of the notebook now that all tables are loaded
connection.close()