In [23]:
import requests as r
import numpy as np
import pandas as pd
import matplotlib.pyplot
import json
from bs4 import BeautifulSoup
import re
from tqdm import tqdm

# Scrape data from [lavkarbo.no](https://www.lavkarbo.no/)

In [2]:
rootpage = 'https://www.lavkarbo.no/'

# CATEGORIES
# Fetch the food-table index page; its links are parsed in the next cells.
cat_ext = 'matvaretabell'
# A timeout keeps a stalled connection from hanging the kernel indefinitely.
r_cat = r.get(rootpage + cat_ext, timeout=30)
# Fail here on an HTTP error instead of parsing an error page further down.
r_cat.raise_for_status()

In [3]:
# Parse the raw bytes of the index-page response with the stdlib HTML parser.
soup = BeautifulSoup(r_cat.content, 'html.parser')

In [56]:
# Every <li> element on the index page.
# NOTE(review): list_items is not referenced by any later cell visible here.
list_items = soup.find_all('li')

In [57]:
# Collect site-relative link targets; absolute http(s) links are skipped.
# href=True restricts the search to anchors that actually carry an href, so
# an <a> without one is ignored instead of raising TypeError on None.
categories = [
    anchor['href']
    for anchor in soup.find_all('a', href=True)
    if not anchor['href'].startswith('http')
]

In [58]:
categories

['/matvaretabell/melk-og-melkeprodukter',
 '/matvaretabell/egg',
 '/matvaretabell/fjoerfe-og-kjoett',
 '/matvaretabell/fisk-og-skalldyr',
 '/matvaretabell/korn-og-bakevarer-froe-og-noetter',
 '/matvaretabell/poteter-groennsaker-frukt-og-baer',
 '/matvaretabell/sukker-og-soete-produkter',
 '/matvaretabell/margarin-smoer-matolje-ol',
 '/matvaretabell/drikke',
 '/matvaretabell/spedbarnsmat',
 '/matvaretabell/diverse-retter-produkter-og-ingredienser']

In [59]:
# First entry of the category list, used below to prototype the table parsing.
milk = categories[0]

In [60]:
"""you should build a pipeline for this"""
# rootpage ends with '/' and the category path starts with '/', so strip one
# of them to request the canonical single-slash URL.
milk_page = r.get(rootpage.rstrip('/') + milk, timeout=30)
# Stop on an HTTP error rather than parsing an error page as if it were data.
milk_page.raise_for_status()

In [61]:
# Parse the raw bytes of the milk category page with the stdlib HTML parser.
milk_soup = BeautifulSoup(milk_page.content, 'html.parser')

In [62]:
# All <table> elements carrying the CSS class 'foods-table' on the milk page.
# NOTE(review): 'milk_tabels' (sic) is not referenced by any later cell visible
# here; the name is kept unchanged in case unseen cells use it.
milk_tabels = milk_soup.find_all('table', 'foods-table')

In [64]:
# Column names come from the <th> cells in the header of the first table on the page.
columns = [th.string for th in milk_soup.table.thead.find_all('th')]

In [65]:
columns

['Navn', 'Protein', 'Karbo', 'Fett']

In [66]:
# Every <td> on the page in document order. The cells arrive as one flat
# sequence, so rows are rebuilt by taking one cell per header column.
cells = [td.string for td in milk_soup.find_all('td')]
# Row width follows the scraped header instead of a hard-coded 4.
n_cols = len(columns)

# Step through complete rows only; a trailing partial row is dropped.
milk_data = [
    cells[start:start + n_cols]
    for start in range(0, len(cells) - n_cols + 1, n_cols)
]


In [67]:
milk_data[:3]

[['Kakao, med lettmelk, tilberedt', '4,0g', '12g', '2,3g'],
 ['Lettmelk, 1,0 % fett', '3,5g', '4,6g', '1,0g'],
 ['Lettmelk, 0,5 % fett, med vitamin D', '3,5g', '4,6g', '0,50g']]

In [68]:
def _parse_amount(text):
    """Convert a cell such as '4,0g' or '12g' to a float; NaN if it has no digits."""
    text = text or ''
    # Decimal-comma values first, so '4,0g' is not read as the integer 4.
    match = re.search(r'(?P<integer>\d+)\,(?P<decimal>\d+)', text)
    if match:
        return float(match.group('integer') + '.' + match.group('decimal'))
    match = re.search(r'(?P<integer>\d+)', text)
    if match:
        return float(match.group('integer'))
    return float('nan')


def cat_to_data(cat, root=rootpage):
    """Scrape one category page into a DataFrame.

    Parameters
    ----------
    cat : str
        Site-relative category path, e.g. '/matvaretabell/egg'.
    root : str
        Base URL the path is appended to. Defaults to ``rootpage``.

    Returns
    -------
    pandas.DataFrame
        One row per complete table row: a leading 'category' column (the last
        path segment with '-' replaced by spaces), then one column per header
        cell of the page's first table. The first of those holds the cell text
        as-is; the rest are parsed to float (NaN when a cell has no digits).
        If the page has neither a header nor cells, an empty frame with only
        the 'category' column is returned.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page has table cells but no header row to name them.
    """
    # Honour `root` (the original always used the global) and avoid a double
    # slash when root ends with '/' and cat starts with '/'.
    url = root.rstrip('/') + '/' + cat.lstrip('/')
    page = r.get(url, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    category = cat.strip('/').split('/')[-1].replace('-', ' ')

    # Read the header from THIS page, not from a soup left over in the kernel.
    table = soup.find('table')
    has_header = table is not None and table.thead is not None
    header = table.thead.find_all('th') if has_header else []
    cells = soup.find_all('td')
    if not header:
        if cells:
            raise ValueError(f'{url}: found table cells but no header row')
        return pd.DataFrame(columns=['category'])

    columns = ['category'] + [th.get_text() for th in header]
    width = len(header)

    # The cells come as one flat sequence; rebuild rows `width` cells at a
    # time and drop a trailing partial row.
    rows = []
    for start in range(0, len(cells) - width + 1, width):
        name_cell, *amount_cells = cells[start:start + width]
        amounts = [_parse_amount(td.get_text()) for td in amount_cells]
        rows.append([category, name_cell.get_text(), *amounts])
    return pd.DataFrame(rows, columns=columns)

In [69]:
# Scrape every category, then stack the per-category frames in one concat.
# DataFrame.append was removed in pandas 2.0 and re-copied the growing frame
# on every iteration. Row labels are kept exactly as cat_to_data returns them.
frames = [cat_to_data(c) for c in tqdm(categories)]
# pd.concat rejects an empty list, so keep the old "empty frame" result then.
data = pd.concat(frames) if frames else pd.DataFrame()

100%|██████████████████████████████████████████| 11/11 [00:21<00:00,  1.92s/it]


In [70]:
data.shape

(1677, 5)

In [26]:
data.columns

Index(['category', 'Navn', 'Protein', 'Karbo', 'Fett'], dtype='object')

In [31]:
# Keep only the foods whose 'Karbo' value is below 5.0, as an independent
# frame so later edits do not touch `data`.
lavkarbo = data.loc[data['Karbo'] < 5.0].copy()

In [28]:
pd.unique(data['category'])

array(['melk og melkeprodukter', 'egg', 'fjoerfe og kjoett',
       'fisk og skalldyr', 'korn og bakevarer froe og noetter',
       'poteter groennsaker frukt og baer', 'sukker og soete produkter',
       'margarin smoer matolje ol', 'drikke',
       'diverse retter produkter og ingredienser'], dtype=object)

In [32]:
# Rename the food-name column to 'text'; rebinding keeps the cell safe to re-run.
lavkarbo = lavkarbo.rename(columns={'Navn': 'text'})

In [52]:
lavkarbo.head()

Unnamed: 0,category,text,Protein,Karbo,Fett
1,melk og melkeprodukter,"Lettmelk, 1,0 % fett",3.5,4.6,1.0
2,melk og melkeprodukter,"Lettmelk, 0,5 % fett, med vitamin D",3.5,4.6,0.5
3,melk og melkeprodukter,"Helmelk, 3,9 % fett",3.3,4.6,3.9
5,melk og melkeprodukter,"Helmelk, 3,5 % fett",3.4,4.5,3.5
6,melk og melkeprodukter,"Lettmelk, 1,2 % fett",3.5,4.5,1.2


# Translate to English

In [36]:
from translate import translate

In [108]:
def get_random_ingredients():
    """Pick one random low-carb food per category and translate the list.

    Reads the notebook-level ``lavkarbo`` frame (columns 'category' and
    'text') and sends the comma-separated picks through ``translate``.

    Returns
    -------
    str
        The 'text' field of the first translation in the first element of the
        ``translate`` response. Unlike the earlier version, the string sent
        for translation has no leading padding and no trailing comma.
    """
    picks = []
    # Iterate the categories present in `lavkarbo` itself: a category with no
    # low-carb rows would make .sample() raise on an empty selection.
    for category in pd.unique(lavkarbo['category']):
        in_category = lavkarbo.loc[lavkarbo['category'] == category, 'text']
        # Take the sampled value directly. Slicing str(Series) depended on the
        # index label width and cut off names pandas truncates in its repr.
        name = str(in_category.sample().iloc[0])
        # Commas inside a name would be mistaken for list separators.
        picks.append(name.replace(',', ' '))

    ingredienter = ', '.join(picks)

    ingredients = translate(ingredienter)
    return ingredients[0]['translations'][0]['text']

In [109]:
get_random_ingredients()

'    Normanna blue cheese, Scrambled eggs fried in fat, Pork thin rib with sworn raw, Snails canned, Flaxseed, Cauliflower cooked, Lozenges without sugar, Wheat germ oil, Mineral water with carbonated acid Olden, Salad Greek with feta cheese olives,'

# Call Spoonacular API for recipe

In [110]:
from recipe import get_recipe

ImportError: cannot import name 'recipe' from 'recipe' (C:\Users\perha\Desktop\code\Python\Diet\recipe.py)