In [57]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from quantulum3 import parser
from urllib3.util import parse_url
from datetime import date


In [45]:
# Download the recipe page and collect every text node from its HTML.
url = 'https://cookingonabootstrap.com/2020/02/25/berry-buckwheat-pancake-recipe/'
res = requests.get(url, timeout=30)  # a timeout stops the cell hanging forever on a dead connection
res.raise_for_status()  # fail loudly on a 4xx/5xx response rather than parsing an error page
html_page = res.content
soup = BeautifulSoup(html_page, 'html.parser')
# `string=True` is the current spelling of the deprecated `text=True` argument.
text = soup.find_all(string=True)

In [46]:
# Keep only the text nodes that contain at least one round bracket; the
# price notes on the recipe page are written inside brackets.
matchers = ['(', ')']
matching = list(
    filter(lambda node: any(marker in node for marker in matchers), text)
)

In [58]:
# Build a price table from the bracketed "(price/amount, shop)" notes.
#
# Steps for each candidate line:
#   1 - find bracket and slash positions
#   2 - extract price
#   3 - extract unit after slash with quantulum3.parser
#   4 - extract the item name from the text left of the brackets
#       (drop everything up to the last quantity detected by parser.parse)
#   5 - extract date from URL (done once, it is the same for the whole page)
#   6 - TODO: extract shop from end of bracketed section, matching against a
#       list of shops (not implemented yet, so there is no 'shop' column)
#   7 - add to dataframe
#
# dataframe cols: item_name, price, unit, amount, date
#
# NOTE: find_delimiters / extract_price / extract_unit / extract_name /
# extract_date are defined in a cell further down this notebook; that cell
# has to be run before this one.

page_date = extract_date(url)

# Collect plain dict records and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
price_rows = []

for idx, line in enumerate(matching):
    bracket_start, bracket_end, slash_pos = find_delimiters(line)
    if slash_pos != -1:  # .find returns -1 on a no match
        price = extract_price(line, bracket_start, slash_pos)
        print(price)
        unit, amount = extract_unit(line, slash_pos)
        print(unit, amount)
        item = extract_name(line, bracket_start)
        price_rows.append(
            {'item_name': item, 'price': price, 'unit': unit, 'amount': amount, 'date': page_date}
        )
    # Only the first seven matched lines (idx 0-6) are processed.
    if idx > 5:
        break

df_prices = pd.DataFrame(price_rows, columns=['item_name', 'price', 'unit', 'amount', 'date'])



1.48
dimensionless 12.0
medium egg
0.49
litre 1.0
whole milk
1.39
gram 170.0
baking powder
2.4
no amount after slash
kilogram 1.0
buckwheat flour
1.5
no amount after slash
kilogram 1.0
gluten-free flour
2.6
no amount after slash
kilogram 1.0
mixed berries


In [59]:
# Display the price table built by the extraction loop.
df_prices

Unnamed: 0,item_name,price,unit,amount,date
0,medium egg,1.48,dimensionless,12.0,2020-02-25
1,whole milk,0.49,litre,1.0,2020-02-25
2,baking powder,1.39,gram,170.0,2020-02-25
3,buckwheat flour,2.4,kilogram,1.0,2020-02-25
4,gluten-free flour,1.5,kilogram,1.0,2020-02-25
5,mixed berries,2.6,kilogram,1.0,2020-02-25


In [50]:
def find_delimiters(line):
    """Locate the first bracketed section of ``line`` and the slash inside it.

    Parameters
    ----------
    line : str
        A text line such as ``'1 tsp baking powder, 3p (£1.39/170g, Asda)'``.

    Returns
    -------
    list of int
        ``[bracket_start, bracket_end, slash_pos]``. Each entry is an index
        into ``line`` or -1 when that delimiter was not found. The closing
        bracket is searched for after the opening one, and the slash only
        counts when it sits between the two, so a '/' elsewhere in the line
        is not mistaken for a price/amount separator.
    """
    bracket_start = line.find('(')
    if bracket_start == -1:
        # Without an opening bracket there is no section for a slash to be in.
        return [-1, line.find(')'), -1]

    bracket_end = line.find(')', bracket_start + 1)
    # An unclosed bracket is treated as running to the end of the line.
    search_end = bracket_end if bracket_end != -1 else len(line)
    slash_pos = line.find('/', bracket_start + 1, search_end)
    return [bracket_start, bracket_end, slash_pos]

def extract_price(line, bracket_start, slash_pos):
    """Return the price written between the opening bracket and the slash.

    Parameters
    ----------
    line : str
        The full text line.
    bracket_start : int
        Index of the opening bracket in ``line``.
    slash_pos : int
        Index of the slash that ends the price.

    Returns
    -------
    float
        The price in pounds. A value written with a '£' sign is returned as
        is; a value followed by 'p' is treated as pence and divided by 100.

    Raises
    ------
    ValueError
        If the text has neither a '£' nor a 'p' marker, or the number next to
        the marker cannot be converted to a float.
    """
    str_price = line[bracket_start + 1:slash_pos]

    # Check '£' first and stop there: a stray letter 'p' elsewhere in the text
    # (e.g. "approx £1.20") must not send the value down the pence branch.
    pound_pos = str_price.find('£')
    if pound_pos != -1:
        return float(str_price[pound_pos + 1:])

    pence_pos = str_price.find('p')
    if pence_pos != -1:
        return float(str_price[:pence_pos]) / 100

    raise ValueError("no '£' or 'p' price marker found in {!r}".format(str_price))

def extract_unit(line, slash_pos):
    """Return the unit name and amount of the first quantity after the slash.

    Parameters
    ----------
    line : str
        The full text line.
    slash_pos : int
        Index of the slash; everything after it is handed to quantulum3.

    Returns
    -------
    tuple
        ``(unit, amount)``: the ``unit.name`` and ``value`` of the first
        quantity the parser reports.

    Raises
    ------
    ValueError
        If no quantity is found even after assuming an amount of 1.
    """
    after_slash = line[slash_pos + 1:]
    quantities = parser.parse(after_slash)

    if not quantities:
        # A bare unit such as "kg" has no number for the parser to pick up,
        # so assume an amount of 1 and parse again. Testing for the empty
        # result replaces the old bare `except:`, which also hid real errors.
        print('no amount after slash')
        quantities = parser.parse('1' + after_slash)
        if not quantities:
            raise ValueError('no quantity found after slash in {!r}'.format(line))

    quantity = quantities[0]
    return quantity.unit.name, quantity.value

def extract_name(line, bracket_start):
    """Return the item name: the text between the last leading quantity and the first comma.

    Parameters
    ----------
    line : str
        The full text line, e.g. ``'180ml whole milk, 9p (49p/1l, Asda)'``.
    bracket_start : int
        Index of the opening bracket; only used as the end of the name when
        the line contains no comma.

    Returns
    -------
    str
        The item name with surrounding whitespace removed. It is also
        printed, as before, so the loop output stays readable.
    """
    # The name normally ends at the first comma; without one, stop at the
    # opening bracket, and without that either, use the whole line.
    name_end = line.find(',')
    if name_end == -1:
        name_end = bracket_start
    if name_end == -1:
        name_end = len(line)

    lead = line[:name_end]
    # Drop everything up to the end of the last quantity the parser detects
    # ("4 tbsp or 40g buckwheat flour" -> "buckwheat flour"). With no quantity
    # at all, the whole lead text is the name.
    quantities = parser.parse(lead)
    quant_end = quantities[-1].span[1] if quantities else 0

    item_name = lead[quant_end:].strip()
    print(item_name)
    return item_name

def extract_date(url):
    """Return the publication date encoded in a ``/YYYY/MM/DD/`` URL path.

    Parameters
    ----------
    url : str
        A post URL such as
        ``'https://cookingonabootstrap.com/2020/02/25/berry-buckwheat-pancake-recipe/'``.

    Returns
    -------
    datetime.date
        The date built from the year, month and day path segments.

    Raises
    ------
    ValueError
        If the path has no year/month/day segments, or they do not form a
        valid calendar date.
    """
    import re
    from urllib.parse import urlsplit

    url_path = urlsplit(url).path
    # Match on the path segments rather than fixed character offsets, so
    # non-zero-padded dates and paths with a leading prefix also work.
    match = re.search(r'/(\d{4})/(\d{1,2})/(\d{1,2})(?!\d)', url_path)
    if match is None:
        raise ValueError('no /YYYY/MM/DD/ date found in URL path: {!r}'.format(url_path))

    year, month, day = (int(part) for part in match.groups())
    page_date = date(year, month, day)
    return page_date


In [53]:
# Check the date parsed from the page URL. `date` itself is the class imported
# from datetime, so printing it would show the class, not the parsed value.
print(extract_date(url))

2020-02-25


In [148]:
# Scratch check: what quantulum3 returns for the text after the slash in the milk line.
parser.parse( '1l, Asda)')

[Quantity(1, "Unit(name="litre", entity=Entity("volume"), uri=Litre)")]

In [126]:
# Scratch check: index of the first comma in `line`.
# NOTE(review): `line` is whatever the extraction loop left behind - presumably its last iteration; confirm.
line.find(',')

18

In [114]:
# Scratch: end offset of the last quantity the parser finds left of the opening bracket.
# Relies on `line` and `bracket_start` left over from the extraction loop.
# extract_name parses up to the first comma instead of up to the bracket.
quant_end = parser.parse(line[:bracket_start])[-1].span[1]

In [9]:
# Scratch: parse a whole ingredient line. In the recorded output the "3p" price is read as 3 pints.
parser.parse( '1 tsp baking powder, 3p (£1.39/170g, Dr Oetker at Asda)')

[Quantity(1, "Unit(name="teaspoon", entity=Entity("volume"), uri=Teaspoon)"),
 Quantity(3, "Unit(name="pint", entity=Entity("volume"), uri=Pint)")]

In [18]:
# Sample ingredient line reused by the scratch cells that reference `line2`.
line2 =  '1 tsp baking powder, 3p (£1.39/170g, Dr Oetker at Asda)'

In [11]:
# Scratch: parse the whole sample line held in `line2`.
parser.parse(line2)

[Quantity(1, "Unit(name="teaspoon", entity=Entity("volume"), uri=Teaspoon)"),
 Quantity(3, "Unit(name="pint", entity=Entity("volume"), uri=Pint)")]

In [20]:
# Scratch: parse only the part of `line2` from index 31 onward (the text after the slash).
parser.parse(line2[31:])

[Quantity(170, "Unit(name="gram", entity=Entity("mass"), uri=Gram)")]

In [21]:
# Scratch: show the text that the slice from index 31 onward selects.
line2[31:]

'170g, Dr Oetker at Asda)'

In [22]:
# Scratch: a count with no physical unit; the recorded output is a "dimensionless" quantity.
parser.parse('12 eggs, Asda)')

[Quantity(12, "Unit(name="dimensionless", entity=Entity("dimensionless"), uri=Dimensionless_quantity)")]

In [45]:
# Failure case kept on purpose: with no number before the unit, the recorded run raised
# IndexError on [0]. This is the case the "assume 1" fallback in extract_unit handles.
parser.parse('kg, Asda)')[0]

IndexError: list index out of range

In [47]:
# Prototype of the "missing amount" fallback: if the parser finds no quantity
# in a bare unit such as "kg", assume an amount of 1 and parse again.
test_line = 'kg, Asda)'

try:
    parsed = parser.parse(test_line)[0]
except IndexError:
    # Retry only on failure. The old `finally` prepended '1' and re-parsed
    # even when the first parse had succeeded.
    print('no amount after slash')
    test_line = '1' + test_line
    parsed = parser.parse(test_line)[0]

no amount after slash


In [48]:
# Show the quantity produced by the fallback prototype.
parsed

Quantity(1, "Unit(name="kilogram", entity=Entity("mass"), uri=Kilogram)")

In [125]:
# Inspect the bracket-containing text nodes that feed the extraction loop.
matching

['Buckwheat isn’t actually a grain – it’s from the same plant family as rhubarb, so ideal for gluten-free baking. (I’m not gluten-free, but my Mum is, and many of my readers are, so I do develop gf and coeliac-friendly recipes from time to time.)',
 '1 medium egg, 12p (£1.48/12 medium eggs, Asda)',
 '180ml whole milk, 9p (49p/1l, Asda)',
 '1 tsp baking powder, 3p (£1.39/170g, Dr Oetker at Asda)',
 '4 tbsp or 40g buckwheat flour, 10p (£2.40/kg, Doves Farm at Ocado)',
 '4 tbsp or 40g gluten-free flour, 6p (£1.50/kg, Asda)',
 '100g mixed berries, 26p (£2.60/kg, frozen at Asda)',
 "Jack Monroe is an award winning food writer and bestselling author. Books include A Girl Called Jack, A Year In 120 Recipes and Cooking On A Bootstrap. She has won the Fortnum & Mason Food and Drink award (ironically), the Observer Food Monthly Best Food Blog, Marie Claire 'Woman At The Top', Red Magazine's 'Red Hot Women', the YMCA Courage & Inspiration Award, the Woman Of The Year Entrepreneur award, the Women