**Note:**
This data pipeline scrappes raw data from [joburgmarket](http://www.joburgmarket.co.za/dailyprices.php) to an SQL Database.


In [1]:
import pandas as pd
import requests
import urllib
from scrapy.http import HtmlResponse
from sqlalchemy import create_engine, MetaData


# Custom upload
from engine_info import server_info

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Url where the data is going to be scrapped from
url = 'http://www.joburgmarket.co.za/dailyprices.php'

In [13]:
# Extracting html content of the url
html = requests.get(url).content
# Creating a response variable to enable web scrapping in the url 
response = HtmlResponse(url=url, body=html)

In [16]:
# Extracting the date for latest information displayed on the website
date = response.xpath('//div[@id="right2"]').css('p b ::text').get()
print(f"The information in the website is for {date}.")

The information in the website is for 20 August 2020.


In [3]:
# Creating a connection to MS SQL SERVER
params = urllib.parse.quote_plus(server_info)
engine = create_engine('mssql+pyodbc:///?odbc_connect=%s' % params)
connection = engine.connect()

In [11]:
# Check what is in the database
engine.table_names()

['container_raw', 'product_combination_raw', 'product_raw', 'scrapping_date']

In [9]:
# Data scrapped comes in as a list, which must be converted to a Dataframe
commodity_df = pd.read_html(url)[0]

In [10]:
commodity_df.head()

Unnamed: 0,Commodity,Total Value Sold,Total Qty Sold,Total Kg Sold,Qty Available
0,AMADUMBE,"R0.00MTD: R39,870.00",0MTD: 97,"0MTD: 1,940",2
1,APPLES,"R1,205,932.00MTD: R22,664,221.00","13,799MTD: 261,296","157,462MTD: 3,163,863",91755
2,ARTICHOKES,"R600.00MTD: R53,100.00",4MTD: 439,3MTD: 522,1
3,ASPARAGUS,"R34,000.00MTD: R258,975.00",50MTD: 359,"250MTD: 1,795",8
4,ATCHARA,"R0.00MTD: R1,351.20",0MTD: 23,0MTD: 65,207


In [17]:
# Attach the date column to the dataframe
commodity_df.insert(loc=0, column='Date', value=date)

In [20]:
commodity_df.head()

Unnamed: 0,Date,Commodity,Total Value Sold,Total Qty Sold,Total Kg Sold,Qty Available
0,20 August 2020,AMADUMBE,"R0.00MTD: R39,870.00",0MTD: 97,"0MTD: 1,940",2
1,20 August 2020,APPLES,"R1,205,932.00MTD: R22,664,221.00","13,799MTD: 261,296","157,462MTD: 3,163,863",91755
2,20 August 2020,ARTICHOKES,"R600.00MTD: R53,100.00",4MTD: 439,3MTD: 522,1
3,20 August 2020,ASPARAGUS,"R34,000.00MTD: R258,975.00",50MTD: 359,"250MTD: 1,795",8
4,20 August 2020,ATCHARA,"R0.00MTD: R1,351.20",0MTD: 23,0MTD: 65,207


In [6]:
mydf[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169 entries, 0 to 168
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Commodity         169 non-null    object
 1   Total Value Sold  169 non-null    object
 2   Total Qty Sold    169 non-null    object
 3   Total Kg Sold     169 non-null    object
 4   Qty Available     169 non-null    int64 
dtypes: int64(1), object(4)
memory usage: 6.7+ KB


In [97]:
response.css('div#right2 > h1 ::text').extract()

['Daily Prices']

In [98]:
response.xpath('//div[@id="right2"]/h1//text()').extract()

['Daily Prices']

In [99]:
response.xpath('//div[@id="right2"]').css('h1 ::text').extract_first()

'Daily Prices'

In [100]:
response.xpath('//div[@id="right2"]').css('p b ::text').extract_first()

'19 August 2020'

In [101]:
response.xpath('//div[@id="right2"]/h1//text()').get()

'Daily Prices'

In [102]:
date = response.xpath('//div[@id="right2"]').css('p b ::text').get()

In [103]:
#response.css('select option ::text').getall()[1:]

In [141]:
sample = response.xpath('//select/option/@value').extract()[1:5]

In [142]:
sample

['112', '90', '113', '272']

In [106]:
for item in sample:
    print(re.findall(r'\d+', item)[0])

112
90
113
272


In [107]:
#link?commodity={}

In [108]:
col_headings = [
    'Date',
    'Commodity',
    'Total Value Sold',
    'Total Value Sold MTD',
    'Total Qty Sold',
    'Total Qty Sold MTD',
    'Total Kg Sold',
    'Total Kg Sold MTD',
    'Qty Available'
]

In [109]:
product = response.xpath('//table[@class="alltable"]').css('td.tleft2 ::text').extract()

In [110]:
product[:5]

['AMADUMBE', 'APPLES', 'ARTICHOKES', 'ASPARAGUS', 'ATCHARA']

In [111]:
purchases = []
for i, fresh_produce in enumerate(product):
    row = response.xpath('//table[@class="alltable"]').css('td.tleft ::text').extract()[(i * 7):(i * 7) + 7]
    modified_list = [] 
    for index, item in enumerate(row):
        item = item.replace(",", "").replace("R", "").replace("MTD:", "") # i.e. converted e.g. R 1,234.56 --> 1234.56
        item = item.lstrip()
        modified_list.append(float(item))
    purchases.append([date] + [fresh_produce] + modified_list)

In [112]:
purchases[0]

['19 August 2020', 'AMADUMBE', 0.0, 39870.0, 0.0, 97.0, 0.0, 1940.0, 2.0]

In [113]:
df = pd.DataFrame(data=purchases, columns=col_headings)

In [114]:
df.head()

Unnamed: 0,Date,Commodity,Total Value Sold,Total Value Sold MTD,Total Qty Sold,Total Qty Sold MTD,Total Kg Sold,Total Kg Sold MTD,Qty Available
0,19 August 2020,AMADUMBE,0.0,39870.0,0.0,97.0,0.0,1940.0,2.0
1,19 August 2020,APPLES,1219191.0,21457329.0,12036.0,247471.0,163679.0,3005996.0,80452.0
2,19 August 2020,ARTICHOKES,2520.0,52500.0,21.0,435.0,25.0,518.0,1.0
3,19 August 2020,ASPARAGUS,2000.0,224975.0,3.0,309.0,15.0,1545.0,18.0
4,19 August 2020,ATCHARA,128.0,1351.2,3.0,23.0,6.0,65.0,207.0


In [8]:
import urllib

In [11]:
#engine = create_engine('mssql+pyodbc://LAPTOP-7FR129DV/SQLEXPRESS/jhb_market/?driver=SQL Server?Trusted_Connection=yes')


[]


In [17]:
engine.table_names()

[]

In [14]:
if 'product_raw' in ['product_raw']:
    alert('table in database')

NameError: name 'alert' is not defined