**Note:**
This data pipeline scrapes raw data from [joburgmarket](http://www.joburgmarket.co.za/dailyprices.php) into a SQL database.


In [57]:
import re
import urllib

import pandas as pd
import requests
from scrapy.http import HtmlResponse
from sqlalchemy import create_engine, insert, inspect


# Custom upload with connection string
from engine_info import server_info
# From tables.py
import tables

import warnings
# Silences every warning category for the rest of the session, which also hides
# deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [58]:
# Source page whose daily price tables are scraped by this notebook
url = "http://www.joburgmarket.co.za/dailyprices.php"

In [59]:
# Download the page. A timeout stops the cell from hanging forever on a stalled
# connection, and raise_for_status() aborts on a 4xx/5xx reply instead of letting
# an error page be scraped as if it were data.
page = requests.get(url, timeout=30)
page.raise_for_status()
html = page.content
# Wrap the raw bytes in a Scrapy response so XPath/CSS selectors can be run on them
response = HtmlResponse(url=url, body=html)

In [60]:
# Extract the date of the latest information displayed on the website:
# the first text node of a <b> inside a <p> inside the div with id "right2".
date = response.xpath('//div[@id="right2"]').css('p b ::text').get()
# .get() returns None when nothing matches; stop here rather than stamp every
# row written to the database with a missing date.
if date is None:
    raise ValueError('Could not find the report date on the page; the page layout may have changed.')
print(f"The information in the website is for {date}.")

The information in the website is for 20 August 2020.


In [61]:
# Build a SQLAlchemy engine for MS SQL Server from the ODBC connection string
# held in engine_info.server_info (URL-quoted so it can ride in the query string).
params = urllib.parse.quote_plus(server_info)
engine = create_engine(f'mssql+pyodbc:///?odbc_connect={params}')
# NOTE(review): no cell visible here uses or closes `connection`.
connection = engine.connect()

In [62]:
# Check what is in the database.
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector is the supported way to list tables (inspect is imported in the import cell).
inspect(engine).get_table_names()

['commodity_raw', 'container_raw', 'product_combination_raw', 'scrapping_date']

## commodity_raw update

In [63]:
# pd.read_html returns a list of DataFrames; the first one is kept as the
# commodity summary.
# Note: this downloads `url` again rather than reusing the `html` fetched earlier.
commodity_df = pd.read_html(url)[0]

In [64]:
# Preview the first rows of the freshly scraped table
commodity_df.head()

Unnamed: 0,Commodity,Total Value Sold,Total Qty Sold,Total Kg Sold,Qty Available
0,AMADUMBE,"R0.00MTD: R39,870.00",0MTD: 97,"0MTD: 1,940",2
1,APPLES,"R1,205,932.00MTD: R22,664,221.00","13,799MTD: 261,296","157,462MTD: 3,163,863",91755
2,ARTICHOKES,"R600.00MTD: R53,100.00",4MTD: 439,3MTD: 522,1
3,ASPARAGUS,"R34,000.00MTD: R258,975.00",50MTD: 359,"250MTD: 1,795",8
4,ATCHARA,"R0.00MTD: R1,351.20",0MTD: 23,0MTD: 65,207


In [65]:
# Attach the scraped date as the first column of the dataframe.
# This mutates commodity_df in place, so re-running the cell without reloading
# the frame fails because the 'Date' column already exists.
commodity_df.insert(loc=0, column='Date', value=date)

In [67]:
# Preview again to confirm the Date column was added in front
commodity_df.head()

Unnamed: 0,Date,Commodity,Total Value Sold,Total Qty Sold,Total Kg Sold,Qty Available
0,20 August 2020,AMADUMBE,"R0.00MTD: R39,870.00",0MTD: 97,"0MTD: 1,940",2
1,20 August 2020,APPLES,"R1,205,932.00MTD: R22,664,221.00","13,799MTD: 261,296","157,462MTD: 3,163,863",91755
2,20 August 2020,ARTICHOKES,"R600.00MTD: R53,100.00",4MTD: 439,3MTD: 522,1
3,20 August 2020,ASPARAGUS,"R34,000.00MTD: R258,975.00",50MTD: 359,"250MTD: 1,795",8
4,20 August 2020,ATCHARA,"R0.00MTD: R1,351.20",0MTD: 23,0MTD: 65,207


In [68]:
# commodity_df is destined for the commodity_raw SQL table.
# Rename the DataFrame columns to the SQL table's column names. The assignment is
# positional, so it assumes the scraped column order matches the order declared
# on tables.Commodity -- TODO confirm against tables.py.
commodity_df.columns = tables.Commodity.__table__.columns.keys()[1:] # Exclude the id column

In [69]:
# Preview once more to check the renamed columns
commodity_df.head()

Unnamed: 0,date,commodity,total_value_sold,total_qty_sold,total_kg_sold,qty_available
0,20 August 2020,AMADUMBE,"R0.00MTD: R39,870.00",0MTD: 97,"0MTD: 1,940",2
1,20 August 2020,APPLES,"R1,205,932.00MTD: R22,664,221.00","13,799MTD: 261,296","157,462MTD: 3,163,863",91755
2,20 August 2020,ARTICHOKES,"R600.00MTD: R53,100.00",4MTD: 439,3MTD: 522,1
3,20 August 2020,ASPARAGUS,"R34,000.00MTD: R258,975.00",50MTD: 359,"250MTD: 1,795",8
4,20 August 2020,ATCHARA,"R0.00MTD: R1,351.20",0MTD: 23,0MTD: 65,207


In [70]:
# Append the rows to the commodity_raw table; the DataFrame index is not written.
# Appending means every re-run of this cell adds the same rows again.
commodity_df.to_sql(
    name='commodity_raw',
    con=engine,
    if_exists='append',
    index=False,
)

## container_raw update

In [87]:
# `value` attribute of every <option> under any <select> on the page; each value is
# later used as the `commodity` query parameter of the per-commodity detail page.
# NOTE(review): this matches all <select> elements, not just the commodity one --
# confirm the page has a single dropdown.
commodity_values = response.xpath('//select/option/@value').extract()[1:] # Exclude 'All' option
# Commodity names: text of the td.tleft2 cells in the table with class "alltable".
# NOTE(review): names and values are paired by position later on; presumably both
# lists come out in the same order and length -- verify.
commodity_names = response.xpath('//table[@class="alltable"]').css('td.tleft2 ::text').extract()

In [None]:
# Download the per-container table of every commodity and stack them into one
# DataFrame, tagging each row with the commodity it belongs to.
#
# The original version inserted the 'commodity' column into commodity_df (not
# container_df) on the first iteration. That fails because commodity_df already
# has a 'commodity' column, and it left the first commodity's rows untagged.
# Every frame now goes through the same code path and is concatenated once.
container_frames = []
for name, value in zip(commodity_names, commodity_values):
    # URL of the container breakdown for this commodity
    url_one = f'http://www.joburgmarket.co.za/dailyprices.php?commodity={value}&containerall=1'

    # First table on the detail page
    temp_df = pd.read_html(url_one)[0]
    # Attach the commodity name as the first column
    temp_df.insert(loc=0, column='commodity', value=name)
    container_frames.append(temp_df)

# A single concat instead of growing the frame inside the loop
container_df = pd.concat(container_frames)


In [None]:
# Attach the scraped date as the first column of the container dataframe.
# In-place mutation: re-running the cell on the same frame fails because the
# 'Date' column already exists.
container_df.insert(loc=0, column='Date', value=date)

In [None]:
# Exploratory: all text nodes under an <h1> that is a direct child of div#right2
response.css('div#right2 > h1 ::text').extract()

In [None]:
# Exploratory: the same heading text, selected with XPath only
response.xpath('//div[@id="right2"]/h1//text()').extract()

In [None]:
# Exploratory: first text node of any <h1> inside the div with id "right2"
response.xpath('//div[@id="right2"]').css('h1 ::text').extract_first()

In [None]:
# Exploratory: same selector as the one used to set `date`, via extract_first()
response.xpath('//div[@id="right2"]').css('p b ::text').extract_first()

In [None]:
# Exploratory: first heading text node via .get()
response.xpath('//div[@id="right2"]/h1//text()').get()

In [None]:
# Repeats the `date` assignment made near the top of the notebook (same selector).
date = response.xpath('//div[@id="right2"]').css('p b ::text').get()

In [None]:
#response.css('select option ::text').getall()[1:]

In [73]:
# Peek at a few <option> value attributes.
# NOTE(review): the saved output below lists five items, which a [1:5] slice
# cannot produce, so that output is stale -- re-run before trusting it.
response.xpath('//select/option/@value').extract()[1:5]

['', '112', '90', '113', '272']

In [None]:
# NOTE(review): `sample` is not assigned in any cell visible here; on a fresh
# kernel this raises NameError.
sample

In [None]:
# Print the first run of digits found in each entry of `sample`.
# `re` was used here without ever being imported; it is now imported in the
# notebook's import cell.
# NOTE(review): `sample` is not assigned in any cell visible here.
for item in sample:
    # findall(...)[0] raises IndexError when an entry contains no digits
    print(re.findall(r'\d+', item)[0])

In [None]:
#link?commodity={}

In [None]:
# Column names for the cleaned frame: Date and Commodity first, then each sold
# measure followed by its month-to-date ("MTD") twin, then the available quantity.
col_headings = (
    ['Date', 'Commodity']
    + [
        heading
        for measure in ('Total Value Sold', 'Total Qty Sold', 'Total Kg Sold')
        for heading in (measure, f'{measure} MTD')
    ]
    + ['Qty Available']
)

In [75]:
# Preview the first five commodity names (text of the td.tleft2 cells)
response.xpath('//table[@class="alltable"]').css('td.tleft2 ::text').extract()[:5]

['AMADUMBE', 'APPLES', 'ARTICHOKES', 'ASPARAGUS', 'ATCHARA']

In [None]:
# NOTE(review): `product` is not assigned in any cell visible here; presumably it
# holds the commodity-name list extracted in the previous cell -- confirm.
product[:5]

In [None]:
# Build one record per commodity: [date, name, seven numeric values].
#
# The td.tleft text nodes are extracted once up front; the original re-ran the
# same selector over the whole page for every commodity. The unused inner
# enumerate index is dropped as well.
# NOTE(review): `product` is not assigned in any cell visible here.
cells = response.xpath('//table[@class="alltable"]').css('td.tleft ::text').extract()

purchases = []
for i, fresh_produce in enumerate(product):
    # Seven consecutive text nodes per commodity
    row = cells[(i * 7):(i * 7) + 7]
    # Strip thousands separators, the "R" currency prefix and the "MTD:" label,
    # e.g. "R1,234.56" -> 1234.56
    modified_list = [
        float(item.replace(",", "").replace("R", "").replace("MTD:", "").lstrip())
        for item in row
    ]
    purchases.append([date] + [fresh_produce] + modified_list)

In [None]:
# Inspect the first parsed record
purchases[0]

In [None]:
# Turn the parsed records into a DataFrame labelled with col_headings
df = pd.DataFrame(purchases, columns=col_headings)

In [None]:
# Preview the cleaned frame
df.head()

In [None]:
import urllib

In [None]:
#engine = create_engine('mssql+pyodbc://LAPTOP-7FR129DV/SQLEXPRESS/jhb_market/?driver=SQL Server?Trusted_Connection=yes')


In [None]:
# List the tables currently in the database. Engine.table_names() is deprecated in
# SQLAlchemy 1.4 and removed in 2.0, so the inspector is used instead
# (inspect is imported in the import cell).
inspect(engine).get_table_names()

In [None]:
# Report whether the product_raw table exists in the database.
# The original compared the name against a literal list containing only itself
# (always true) and then called alert(), which is not a Python function, so the
# cell always ended in NameError. The check now runs against the real table list.
if 'product_raw' in inspect(engine).get_table_names():
    print('table in database')