**Note:**
This data pipeline scrapes raw data from [joburgmarket](http://www.joburgmarket.co.za/dailyprices.php) and loads it into an SQL database.


In [109]:
import urllib
import warnings

import pandas as pd
import requests
from scrapy.http import HtmlResponse
from sqlalchemy import create_engine, insert, inspect

# Custom upload with connection string
from engine_info import server_info
# From tables.py
import tables

warnings.filterwarnings('ignore')

In [58]:
# URL of the Joburg Market daily-prices page that the data is scraped from
url = 'http://www.joburgmarket.co.za/dailyprices.php'

In [59]:
# Download the daily-prices page. The timeout stops the notebook from hanging
# forever on a stalled connection, and raise_for_status() aborts on an HTTP
# error instead of parsing an error page as if it were market data.
page = requests.get(url, timeout=30)
page.raise_for_status()
html = page.content
# Wrap the raw bytes in a Scrapy HtmlResponse so XPath/CSS selectors can be run on them
response = HtmlResponse(url=url, body=html)

In [60]:
# Extract the date that the figures currently shown on the website refer to
date = response.xpath('//div[@id="right2"]').css('p b ::text').get()
# .get() returns None when nothing matches; stop here rather than stamping
# every row written further down with a missing date.
if date is None:
    raise ValueError('Could not find the report date on the page; the page layout may have changed.')
print(f"The information in the website is for {date}.")

The information in the website is for 20 August 2020.


In [61]:
# Connect to MS SQL Server.
# The ODBC connection string is URL-encoded so it can be carried in the
# odbc_connect query parameter of the SQLAlchemy URL.
params = urllib.parse.quote_plus(server_info)
engine = create_engine(f'mssql+pyodbc:///?odbc_connect={params}')
connection = engine.connect()

In [62]:
# List the tables currently in the database.
# Engine.table_names() is deprecated since SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API provides the same list of table names.
inspect(engine).get_table_names()

['commodity_raw', 'container_raw', 'product_combination_raw', 'scrapping_date']

## scrapping_date update

In [143]:
# One-row frame holding the scrape date. Its single column is named after the
# second column of the ScrappingDates table definition (index 0 is skipped).
date_df = pd.DataFrame({tables.ScrappingDates.__table__.columns.keys()[1]: [date]})

In [144]:
# Show the frame that is about to be written to the scrapping_date table
date_df

Unnamed: 0,date
0,20 August 2020


In [145]:
# Append the scrape date to the scrapping_date table (the DataFrame index is not written)
date_df.to_sql(name='scrapping_date', con=engine, if_exists='append', index=False)

## commodity_raw update

In [63]:
# pd.read_html returns a list of DataFrames, one per table it parses from the page;
# keep the first one as the commodity frame
commodity_df = pd.read_html(url)[0]

In [64]:
# Preview the first rows of the scraped commodity table
commodity_df.head()

Unnamed: 0,Commodity,Total Value Sold,Total Qty Sold,Total Kg Sold,Qty Available
0,AMADUMBE,"R0.00MTD: R39,870.00",0MTD: 97,"0MTD: 1,940",2
1,APPLES,"R1,205,932.00MTD: R22,664,221.00","13,799MTD: 261,296","157,462MTD: 3,163,863",91755
2,ARTICHOKES,"R600.00MTD: R53,100.00",4MTD: 439,3MTD: 522,1
3,ASPARAGUS,"R34,000.00MTD: R258,975.00",50MTD: 359,"250MTD: 1,795",8
4,ATCHARA,"R0.00MTD: R1,351.20",0MTD: 23,0MTD: 65,207


In [65]:
# Prepend the scrape date as the first column ('Date'); the same value is used for every row
commodity_df.insert(0, 'Date', date)

In [67]:
# Check that the Date column now leads the frame
commodity_df.head()

Unnamed: 0,Date,Commodity,Total Value Sold,Total Qty Sold,Total Kg Sold,Qty Available
0,20 August 2020,AMADUMBE,"R0.00MTD: R39,870.00",0MTD: 97,"0MTD: 1,940",2
1,20 August 2020,APPLES,"R1,205,932.00MTD: R22,664,221.00","13,799MTD: 261,296","157,462MTD: 3,163,863",91755
2,20 August 2020,ARTICHOKES,"R600.00MTD: R53,100.00",4MTD: 439,3MTD: 522,1
3,20 August 2020,ASPARAGUS,"R34,000.00MTD: R258,975.00",50MTD: 359,"250MTD: 1,795",8
4,20 August 2020,ATCHARA,"R0.00MTD: R1,351.20",0MTD: 23,0MTD: 65,207


In [68]:
# commodity_df is going to be written to the commodity_raw SQL table, so its
# column labels are replaced, position by position, with the column names
# taken from the Commodity table definition in tables.py.
commodity_df.columns = tables.Commodity.__table__.columns.keys()[1:] # Exclude the id column

In [69]:
# Check the renamed columns before writing to the database
commodity_df.head()

Unnamed: 0,date,commodity,total_value_sold,total_qty_sold,total_kg_sold,qty_available
0,20 August 2020,AMADUMBE,"R0.00MTD: R39,870.00",0MTD: 97,"0MTD: 1,940",2
1,20 August 2020,APPLES,"R1,205,932.00MTD: R22,664,221.00","13,799MTD: 261,296","157,462MTD: 3,163,863",91755
2,20 August 2020,ARTICHOKES,"R600.00MTD: R53,100.00",4MTD: 439,3MTD: 522,1
3,20 August 2020,ASPARAGUS,"R34,000.00MTD: R258,975.00",50MTD: 359,"250MTD: 1,795",8
4,20 August 2020,ATCHARA,"R0.00MTD: R1,351.20",0MTD: 23,0MTD: 65,207


In [70]:
# Append the commodity rows to the commodity_raw table (the DataFrame index is not written)
commodity_df.to_sql(name='commodity_raw', con=engine, if_exists='append', index=False)

## container_raw update

In [87]:
# Option values of the page's <select> element(s); each value is used further
# down as the `commodity` query parameter of a per-commodity URL.
# The first option ('All') is dropped.
commodity_values = response.xpath('//select/option/@value').getall()[1:]
# Commodity names, taken from the text of the td.tleft2 cells of the 'alltable' table
commodity_names = response.xpath('//table[@class="alltable"]').css('td.tleft2 ::text').getall()
# NOTE(review): the two lists are paired positionally with zip() in the loops
# further down — confirm they always have the same length and order.

In [110]:
# Scrape the per-container table of every commodity.
# The frames are collected in a list and concatenated once at the end, which
# avoids re-copying the growing DataFrame on every iteration and removes the
# need to special-case the first commodity.
container_frames = []
for name, value in zip(commodity_names, commodity_values):
    # Per-commodity URL for the container breakdown (containerall=1)
    url_one = f'http://www.joburgmarket.co.za/dailyprices.php?commodity={value}&containerall=1'

    # First table on the page, tagged with the commodity it belongs to
    temp_df = pd.read_html(url_one)[0]
    temp_df.insert(loc=0, column='commodity', value=name)
    container_frames.append(temp_df)

# pd.concat raises ValueError if no commodity was scraped (empty list)
container_df = pd.concat(container_frames, ignore_index=True)

In [113]:
# Prepend the scrape date as the first column ('Date'); the same value is used for every row
container_df.insert(0, 'Date', date)

In [114]:
# Preview the combined container frame with the Date column attached
container_df.head()

Unnamed: 0,Date,commodity,Container,Qty Available,Value Sold,Qty Sold,Kg Sold,Average Price per Kg
0,20 August 2020,AMADUMBE,20KG POCKET,2.0,"R0.00MTD: R39,870.00",0MTD: 97,"0MTD: 1,940",R0
1,20 August 2020,APPLES,10 X 1KG ECONO PACK CARTON,17.0,"R100.00MTD: R41,932.00",1MTD: 547,"10MTD: 5,470",R10
2,20 August 2020,APPLES,11KG JUMBLE CARTON,343.0,"R1,190.00MTD: R218,914.00","17MTD: 3,170","187MTD: 34,870",R6.36
3,20 August 2020,APPLES,12 X 1KG ECONO PACK CARTON,3233.0,"R40,738.00MTD: R1,454,572.00","432MTD: 17,353","5184MTD: 208,236",R7.86
4,20 August 2020,APPLES,12.5KG M6 CARTON,12.0,"R0.00MTD: R16,903.00",0MTD: 286,"0MTD: 3,575",R0


In [115]:
# container_df is going to be written to the container_raw SQL table, so its
# column labels are replaced, position by position, with the column names
# taken from the Container table definition in tables.py.
container_df.columns = tables.Container.__table__.columns.keys()[1:] # Exclude the id column

In [119]:
# Check the renamed columns before writing to the database
container_df.head()

Unnamed: 0,date,commodity,container,qty_available,value_sold,qty_sold,kg_sold,average_price_per_kg
0,20 August 2020,AMADUMBE,20KG POCKET,2.0,"R0.00MTD: R39,870.00",0MTD: 97,"0MTD: 1,940",R0
1,20 August 2020,APPLES,10 X 1KG ECONO PACK CARTON,17.0,"R100.00MTD: R41,932.00",1MTD: 547,"10MTD: 5,470",R10
2,20 August 2020,APPLES,11KG JUMBLE CARTON,343.0,"R1,190.00MTD: R218,914.00","17MTD: 3,170","187MTD: 34,870",R6.36
3,20 August 2020,APPLES,12 X 1KG ECONO PACK CARTON,3233.0,"R40,738.00MTD: R1,454,572.00","432MTD: 17,353","5184MTD: 208,236",R7.86
4,20 August 2020,APPLES,12.5KG M6 CARTON,12.0,"R0.00MTD: R16,903.00",0MTD: 286,"0MTD: 3,575",R0


In [117]:
# Append the container rows to the container_raw table (the DataFrame index is not written)
container_df.to_sql(name='container_raw', con=engine, if_exists='append', index=False)

## product_combination_raw update

In [121]:
# Scrape the product-combination table of every commodity.
# The frames are collected in a list and concatenated once at the end, which
# avoids re-copying the growing DataFrame on every iteration and removes the
# need to special-case the first commodity.
product_combo_frames = []
for name, value in zip(commodity_names, commodity_values):
    # Per-commodity URL for the product-combination breakdown (containerall=2)
    url_two = f'http://www.joburgmarket.co.za/dailyprices.php?commodity={value}&containerall=2'

    # First table on the page, tagged with the commodity it belongs to
    temp_df = pd.read_html(url_two)[0]
    temp_df.insert(loc=0, column='commodity', value=name)
    product_combo_frames.append(temp_df)

# pd.concat raises ValueError if no commodity was scraped (empty list)
product_combo_df = pd.concat(product_combo_frames, ignore_index=True)

In [124]:
# Look at the last rows to check that the final commodities were scraped too
product_combo_df.tail()

Unnamed: 0,commodity,Container,Unit Mass,Product Combination,Total Value Sold,Total Qty Sold,Total Kg Sold,Average,Highest Price,Ave per Kg,Highest Price per Kg
2157,YELLOW PEPPERS,5KG BOX,5.0,"*,CL 2,L,*,*",R590.00,19.0,95.0,R31.05,R60.00,R6.21,R12.00
2158,YELLOW PEPPERS,5KG BOX,5.0,"*,CL 2,L/M,*,*",R0.00,0.0,0.0,R0.00,R0.00,R0.00,R0.00
2159,YELLOW PEPPERS,5KG BOX,5.0,"*,CL 2,M,*,*","R2,565.00",170.0,850.0,R15.09,R40.00,R3.02,R8.00
2160,YELLOW PEPPERS,5KG BOX,5.0,"*,CL 2,S,*,*",R60.00,1.0,5.0,R60.00,R60.00,R12.00,R12.00
2161,YELLOW PEPPERS,8KG BOX,8.0,"*,*,*,*,*",R80.00,1.0,8.0,R80.00,R80.00,R10.00,R10.00


In [125]:
# Prepend the scrape date as the first column ('Date'); the same value is used for every row
product_combo_df.insert(0, 'Date', date)

In [126]:
# Preview the combined product-combination frame with the Date column attached
product_combo_df.head()

Unnamed: 0,Date,commodity,Container,Unit Mass,Product Combination,Total Value Sold,Total Qty Sold,Total Kg Sold,Average,Highest Price,Ave per Kg,Highest Price per Kg
0,20 August 2020,AMADUMBE,20KG POCKET,20.0,"*,*,L,*,*",R0.00,0.0,0.0,R0.00,R0.00,R0.00,R0.00
1,20 August 2020,APPLES,10 X 1KG ECONO PACK CARTON,10.0,"GOLDEN DELICIOUS,CL 1,*,*,*",R100.00,1.0,10.0,R100.00,R100.00,R10.00,R10.00
2,20 August 2020,APPLES,10 X 1KG ECONO PACK CARTON,10.0,"ROYAL GALA,CL 1,*,*,*",R0.00,0.0,0.0,R0.00,R0.00,R0.00,R0.00
3,20 August 2020,APPLES,11KG JUMBLE CARTON,11.0,"GOLDEN DELICIOUS,CL 2,L,*,*",R0.00,0.0,0.0,R0.00,R0.00,R0.00,R0.00
4,20 August 2020,APPLES,11KG JUMBLE CARTON,11.0,"TOPRED,CL 2,L,*,*",R0.00,0.0,0.0,R0.00,R0.00,R0.00,R0.00


In [127]:
# product_combo_df is going to be written to the product_combination_raw SQL
# table, so its column labels are replaced, position by position, with the
# column names taken from the ProductCombination table definition in tables.py.
product_combo_df.columns = tables.ProductCombination.__table__.columns.keys()[1:] # Exclude the id column

In [128]:
# Check the renamed columns before writing to the database
product_combo_df.head()

Unnamed: 0,date,commodity,container,unit_mass,product_combination,total_value_sold,total_qty_sold,total_kg_sold,average,highest_price,ave_per_kg,highest_price_per_kg
0,20 August 2020,AMADUMBE,20KG POCKET,20.0,"*,*,L,*,*",R0.00,0.0,0.0,R0.00,R0.00,R0.00,R0.00
1,20 August 2020,APPLES,10 X 1KG ECONO PACK CARTON,10.0,"GOLDEN DELICIOUS,CL 1,*,*,*",R100.00,1.0,10.0,R100.00,R100.00,R10.00,R10.00
2,20 August 2020,APPLES,10 X 1KG ECONO PACK CARTON,10.0,"ROYAL GALA,CL 1,*,*,*",R0.00,0.0,0.0,R0.00,R0.00,R0.00,R0.00
3,20 August 2020,APPLES,11KG JUMBLE CARTON,11.0,"GOLDEN DELICIOUS,CL 2,L,*,*",R0.00,0.0,0.0,R0.00,R0.00,R0.00,R0.00
4,20 August 2020,APPLES,11KG JUMBLE CARTON,11.0,"TOPRED,CL 2,L,*,*",R0.00,0.0,0.0,R0.00,R0.00,R0.00,R0.00


In [129]:
# Append the product-combination rows to the product_combination_raw table
# (the DataFrame index is not written)
product_combo_df.to_sql(name='product_combination_raw', con=engine, if_exists='append', index=False)

In [146]:
# Close the explicit connection, then dispose of the engine so that the pooled
# connections used by the to_sql(con=engine) calls are released as well
connection.close()
engine.dispose()