In [57]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from tqdm import tqdm

In [55]:
def extract_info_from_ing_url(ing_url: str) -> list:

    # Get response
    response = requests.get(ing_url)
    soup = BeautifulSoup(response.text, "html.parser")

    card = soup.find('div', class_='ingredient')

    # Get title
    title = card.find('h1').text

    #Get desc
    desc_div = card.find('div', class_='ingredient-description')
    desc = desc_div.text if desc_div else None



    #Get also known as
    also_known_as_div = card.find('div', class_='ingredient-info-header')
    if also_known_as_div:
        if 'also known as' in also_known_as_div.text.lower():
            also_known_as = also_known_as_div.next_sibling.text
        else:
            also_known_as = None
    else:
        also_known_as = None
        

    #Get Scientific Name
    try:
        scientific_name_div = card.find('div', class_="flex-item").find('div', class_='ingredient-scientific-name').next_element
        scientific_name = scientific_name_div.text
    except:
        scientific_name = None



    subs = soup.find_all('div', class_='sortable-row card my-3 border-0')
    
    list_of_subs = []
    for sub in subs:
        name = sub.find('a').text
        list_of_subs.append(name)

    return [title, desc, also_known_as, scientific_name, list_of_subs]


In [60]:
def extract_cards_from_page(url: str) -> list:

    base_url = 'https://foodsubs.com'

    response = requests.get(url)
    bs = BeautifulSoup(response.text, "html.parser")

    container = bs.find('div', class_='ingredients-container')
    cards = container.find_all('div', class_='ingredient card border-0 mb-2')

    extracted_info = []
    for card in tqdm(cards):
        href = card.find('a', class_='card-learn-more')['href']

        extracted_info.append(extract_info_from_ing_url(base_url + href))

    return extracted_info
    

In [95]:
def all_subs_of_type(type_of_food: str) -> pd.DataFrame:
    
    print(f"Scrapping group type {type_of_food}")
    
    full_accompanimnets = []
    for i in range(21):
        url = f'https://foodsubs.com/groups/{type_of_food}?page.number={i}&page.size=40&i=true'
        full_accompanimnets.extend(extract_cards_from_page(url=url))

    dict_of_substitutes = {
    'Name': [],
    'Description': [],
    'Also known as': [],
    'Scientific name': [],
    'Substitutions': [],
    }
    
    for index, (_, value) in enumerate(dict_of_substitutes.items()):
        for item in full_accompanimnets:
            value.append(item[index])

    subs_df = pd.DataFrame(dict_of_substitutes)

    subs_df.to_csv(f'data\scrapped_{type_of_food}.csv')

    return subs_df

In [97]:
list_of_types = [
    'flavorings',
    'accompaniments',
    'baked-goods',
    'baking-supplies',
    'dairy',
    'equipment',
    'vegetarian',
    'vegetables',
    'miscellaneous',
    'fats-oils',
    'fish',
    'fruit',
    'grain-products',
    'grains',
    'legumes-nuts',
    'liquids',
    'meats',
]

In [98]:
for food in list_of_types:
    all_subs_of_type(food)

Scrapping group type fruit


100%|██████████| 40/40 [00:47<00:00,  1.18s/it]
100%|██████████| 40/40 [00:47<00:00,  1.19s/it]
100%|██████████| 40/40 [00:47<00:00,  1.18s/it]
100%|██████████| 40/40 [00:47<00:00,  1.18s/it]
100%|██████████| 40/40 [00:47<00:00,  1.18s/it]
100%|██████████| 40/40 [00:47<00:00,  1.18s/it]
100%|██████████| 40/40 [00:47<00:00,  1.18s/it]
100%|██████████| 40/40 [00:47<00:00,  1.18s/it]
100%|██████████| 40/40 [00:47<00:00,  1.18s/it]
100%|██████████| 40/40 [00:48<00:00,  1.22s/it]
100%|██████████| 40/40 [00:47<00:00,  1.19s/it]
100%|██████████| 40/40 [00:47<00:00,  1.18s/it]
100%|██████████| 31/31 [00:36<00:00,  1.18s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Scrapping group type grain-products


100%|██████████| 40/40 [00:47<00:00,  1.19s/it]
100%|██████████| 40/40 [00:47<00:00,  1.19s/it]
100%|██████████| 40/40 [00:47<00:00,  1.19s/it]
100%|██████████| 40/40 [00:47<00:00,  1.18s/it]
100%|██████████| 40/40 [00:47<00:00,  1.19s/it]
100%|██████████| 40/40 [00:47<00:00,  1.18s/it]
100%|██████████| 40/40 [00:47<00:00,  1.19s/it]
100%|██████████| 40/40 [00:47<00:00,  1.19s/it]
100%|██████████| 31/31 [00:36<00:00,  1.19s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Scrapping group type grains


100%|██████████| 40/40 [00:47<00:00,  1.18s/it]
100%|██████████| 40/40 [00:47<00:00,  1.18s/it]
100%|██████████| 16/16 [00:18<00:00,  1.18s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Scrapping group type legumes-nuts


100%|██████████| 40/40 [00:47<00:00,  1.18s/it]
100%|██████████| 40/40 [00:47<00:00,  1.18s/it]
100%|██████████| 40/40 [00:46<00:00,  1.17s/it]
100%|██████████| 40/40 [00:47<00:00,  1.19s/it]
100%|██████████| 40/40 [00:47<00:00,  1.18s/it]
100%|██████████| 2/2 [00:02<00:00,  1.26s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Scrapping group type liquids


100%|██████████| 40/40 [00:47<00:00,  1.18s/it]
100%|██████████| 40/40 [00:47<00:00,  1.18s/it]
100%|██████████| 40/40 [00:47<00:00,  1.18s/it]
100%|██████████| 40/40 [00:47<00:00,  1.18s/it]
100%|██████████| 40/40 [00:47<00:00,  1.19s/it]
100%|██████████| 40/40 [00:47<00:00,  1.19s/it]
100%|██████████| 40/40 [00:47<00:00,  1.19s/it]
100%|██████████| 40/40 [00:47<00:00,  1.18s/it]
100%|██████████| 40/40 [00:47<00:00,  1.19s/it]
100%|██████████| 40/40 [00:47<00:00,  1.19s/it]
100%|██████████| 40/40 [00:47<00:00,  1.19s/it]
100%|██████████| 1/1 [00:01<00:00,  1.20s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


Scrapping group type meats


100%|██████████| 40/40 [00:47<00:00,  1.19s/it]
100%|██████████| 40/40 [00:47<00:00,  1.19s/it]
100%|██████████| 40/40 [00:47<00:00,  1.18s/it]
100%|██████████| 40/40 [00:47<00:00,  1.19s/it]
100%|██████████| 40/40 [00:47<00:00,  1.18s/it]
100%|██████████| 40/40 [00:47<00:00,  1.19s/it]
100%|██████████| 40/40 [00:47<00:00,  1.19s/it]
100%|██████████| 40/40 [00:47<00:00,  1.18s/it]
100%|██████████| 30/30 [00:35<00:00,  1.19s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
