In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import json

In [2]:

# Base URL of the webpage
base_url = "https://mediadive.dsmz.de"

# Function to extract data from a single page
def extract_data_from_page(url, page, retries=5):
    params = {"p": page}
    for attempt in range(retries):
        try:
            response = requests.get(url, params=params)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            data = []
            table_rows = soup.find('table', class_="table mb-4").find_all('tr')[1:]  # Skip the header row
            
            for row in table_rows:
                columns = row.find_all('td')
                row_data = [page]  # Add the page number

                # Name and Name Link
                name_tag = columns[0].find('a')
                name = name_tag.get_text(strip=True)
                name_link = base_url + name_tag['href']
                row_data.append(name)
                row_data.append(name_link)

                # ChEBI-ID and Link
                chid_tag = columns[1].find('a')
                if chid_tag:
                    chid = chid_tag.get_text(strip=True)
                    chid_link = base_url + chid_tag['href']
                    row_data.append(chid)
                    row_data.append(chid_link)
                else:
                    row_data.append(None)
                    row_data.append(None)
                
                # CAS Registry NumberⓇ and Link
                cas_tag = columns[2].find('a')
                if cas_tag:
                    cas = cas_tag.get_text(strip=True)
                    cas_link = base_url + cas_tag['href']
                    row_data.append(cas)
                    row_data.append(cas_link)
                else:
                    row_data.append(None)
                    row_data.append(None)
                
                # Formula
                formula = columns[3].get_text(strip=True)
                row_data.append(formula if formula else None)

                # Mass
                mass = columns[4].get_text(strip=True)
                row_data.append(mass if mass else None)

                data.append(row_data)

            return data
        except (requests.exceptions.RequestException, ConnectionResetError) as e:
            print(f"Error fetching page {page}: {e}. Retrying {attempt + 1}/{retries}...")
            time.sleep(1)  # Wait before retrying
    return []

# Main scraping process
all_data = []
num_pages = 5  # Adjust the number of pages you want to scrape

max_workers = 64  # Adjust based on the MacBook M3 Pro capabilities

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    future_to_page = {executor.submit(extract_data_from_page, base_url + "/ingredients", page): page for page in range(1, num_pages + 1)}

    for future in tqdm(as_completed(future_to_page), total=num_pages, desc="Extracting data from pages"):
        page_data = future.result()
        all_data.extend(page_data)

# Sort the data based on the page number to maintain the order
all_data.sort(key=lambda x: x[0])

# Create a DataFrame from the extracted data
columns = ["Page", "Name", "Name Link", "ChEBI-ID", "ChEBI-ID Link", "CAS Registry NumberⓇ", "CAS Registry NumberⓇ Link", "Formula", "Mass"]
df = pd.DataFrame(all_data, columns=columns)
df.drop(columns=["Page"], inplace=True)  # Remove the page column if not needed

# print(df)  # Print the DataFrame to check if it's populated
# df.to_csv('ingredients.csv', index=False)

df

Extracting data from pages: 100%|██████████| 5/5 [00:02<00:00,  2.44it/s]


Unnamed: 0,Name,Name Link,ChEBI-ID,ChEBI-ID Link,CAS Registry NumberⓇ,CAS Registry NumberⓇ Link,Formula,Mass
0,Peptone,https://mediadive.dsmz.de/ingredients/1?p=1,,https://mediadive.dsmz.dehttps://www.ebi.ac.uk...,73049-73-7,https://mediadive.dsmz.dehttps://commonchemist...,complex,complex
1,Meat extract,https://mediadive.dsmz.de/ingredients/2?p=1,,https://mediadive.dsmz.dehttps://www.ebi.ac.uk...,,https://mediadive.dsmz.dehttps://commonchemist...,complex,complex
2,Agar,https://mediadive.dsmz.de/ingredients/3?p=1,2509,https://mediadive.dsmz.dehttps://www.ebi.ac.uk...,9002-18-0,https://mediadive.dsmz.dehttps://commonchemist...,complex,complex
3,Distilled water,https://mediadive.dsmz.de/ingredients/4?p=1,15377,https://mediadive.dsmz.dehttps://www.ebi.ac.uk...,7732-18-5,https://mediadive.dsmz.dehttps://commonchemist...,H2O,18.0153
4,Glucose,https://mediadive.dsmz.de/ingredients/5?p=1,17234,https://mediadive.dsmz.dehttps://www.ebi.ac.uk...,50-99-7,https://mediadive.dsmz.dehttps://commonchemist...,C6H12O6,180.16
...,...,...,...,...,...,...,...,...
95,Trypticase peptone,https://mediadive.dsmz.de/ingredients/124?p=5,,https://mediadive.dsmz.dehttps://www.ebi.ac.uk...,91079-40-2,https://mediadive.dsmz.dehttps://commonchemist...,complex,complex
96,Phytone peptone,https://mediadive.dsmz.de/ingredients/125?p=5,,https://mediadive.dsmz.dehttps://www.ebi.ac.uk...,,https://mediadive.dsmz.dehttps://commonchemist...,complex,complex
97,KOH,https://mediadive.dsmz.de/ingredients/126?p=5,32035,https://mediadive.dsmz.dehttps://www.ebi.ac.uk...,1310-58-3,https://mediadive.dsmz.dehttps://commonchemist...,HKO,56.106
98,Uric acid,https://mediadive.dsmz.de/ingredients/127?p=5,27226,https://mediadive.dsmz.dehttps://www.ebi.ac.uk...,69-93-2,https://mediadive.dsmz.dehttps://commonchemist...,C5-H4-N4-O3,168.11


In [3]:
# Function to extract additional data from individual Name links
def extract_additional_data(name_link, pbar, retries=50):
    for attempt in range(retries):
        try:
            response = requests.get(name_link)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser').find('div', class_="content-wrapper")

            id_value = soup.find('div', class_="col-lg-6").find('p', class_="text-muted").get_text(strip=True).split(": ")[1] if soup.find('div', class_="col-lg-6").find('p', class_="text-muted") else None

            identifiers_div = soup.find('div', id='identifiers', class_="box")
            identifiers_dict = {}
            if identifiers_div:
                for li in identifiers_div.find_all('li'):
                    identifier_name = li.find('span').get_text(strip=True)
                    identifier_text = li.find('a').get_text(strip=True) if li.find('a') else None
                    identifier_href = li.find('a')['href'] if li.find('a') else None
                    identifiers_dict[identifier_name] = {
                        'text': identifier_text,
                        'href': identifier_href
                    }
            else:
                identifiers_dict = None
            pubchem_cid = identifiers_dict.get('PubChem CID', {}).get('text') if identifiers_dict else None
            gestis_zvg_nr = identifiers_dict.get('GESTIS ZVG-Nr.', {}).get('text') if identifiers_dict else None
            identifiers_str = json.dumps(identifiers_dict)
            
            group = soup.find('div', id='groups', class_="box").find('a').get_text(strip=True) if soup.find('div', id="groups", class_="box") else None
            other_ingredients_div = soup.find('div', id='groups', class_="box")
            other_ingredients = []
            if other_ingredients_div:
                for li in other_ingredients_div.find_all('li'):
                    ingredient_name = li.find('a').get_text(strip=True)
                    other_ingredients.append(ingredient_name)
            else:
                other_ingredients = None
            
            kegg_id = identifiers_dict.get('KEGG', {}).get('text') if identifiers_dict else None
            
            chemical_soup = soup.find('div', id='chemical-data', class_="box")
            density = chemical_soup.find('span', id='in-density').get_text(strip=True) if chemical_soup and chemical_soup.find('span', id='in-density') else None
            
            synonym_div = soup.find('div', id='synonyms' ,class_="box")
            synonym = [span.get_text(strip=True) for span in synonym_div.find_all('span', class_="badge")] if synonym_div else None
            
            used_attributes_div = soup.find('div', id="attributes", class_="box")
            used_attributes = [span.get_text(strip=True) for span in used_attributes_div.find_all('span', class_="badge")] if used_attributes_div else None
            
            found_in_div = soup.find('div', id='finders', class_="box")
            found_in = [a.get_text(strip=True) for a in found_in_div.find_all('a')] if found_in_div else None
            solutions_list_link = base_url + found_in_div.find('a', class_="btn primary").get('href') if found_in_div and found_in_div.find('a', class_="btn primary") else None
            media_list_link = base_url + found_in_div.find('a', class_="btn danger").get('href') if found_in_div and found_in_div.find('a', class_="btn danger") else None

            solutions_list, media_list = None, None
            
            if solutions_list_link:
                solutions_list = extract_list_data(solutions_list_link, "solution", pbar)
                
            if media_list_link:
                media_list = extract_list_data(media_list_link, "media", pbar)
                
            return [name_link, id_value, pubchem_cid, gestis_zvg_nr, group, other_ingredients, kegg_id, density, synonym, used_attributes, found_in, solutions_list_link, media_list_link, identifiers_str, solutions_list, media_list]
        except (requests.exceptions.RequestException, ConnectionResetError) as e:
            print(f"Error fetching {name_link}: {e}. Retrying {attempt + 1}/{retries}...")
            time.sleep(1)
    return [name_link, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]

# Function to extract data from list links
def extract_list_data(list_link, list_type, pbar):
    try:
        response = requests.get(list_link)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser').find('div', class_="content-wrapper")
        page_num = int(soup.find('button', title="last")['value']) if soup.find('button', title="last") else 1
        list_data = []
        with ThreadPoolExecutor(max_workers=16) as executor:
            future_to_page = {executor.submit(extract_list_page, list_link + "&p=" + str(p), list_type): p for p in range(1, page_num+1)}
            for future in as_completed(future_to_page):
                list_data.extend(future.result())
                pbar.update(1)
        return list_data
    except Exception as e:
        print(f"Error fetching list from {list_link}: {e}")
        return None

# Function to extract data from a single page of list
def extract_list_page(page_link, list_type):
    response = requests.get(page_link)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser').find('div', class_="content-wrapper")
    if list_type == "solution":
        return [item['href'].split("/")[-1] for item in soup.find('div', class_="row").find_all('a', class_="btn mr-10")]
    elif list_type == "media":
        return [item['href'].split("/")[-1] for item in soup.find('div', class_="row").find_all('a', class_="btn")]
    else:
        return []

# Extract additional data from all Name links
# df = pd.read_csv('ingredients.csv')
additional_data = []
name_links = df['Name Link'].dropna().unique()

# Function to calculate total number of pages for all solution and media links
def calculate_total_pages(name_link):
    total_pages = 0
    try:
        response = requests.get(name_link)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser').find('div', class_="content-wrapper")
        
        found_in_div = soup.find('div', id='finders', class_="box")
        solutions_list_link = base_url + found_in_div.find('a', class_="btn primary").get('href') if found_in_div and found_in_div.find('a', class_="btn primary") else None
        media_list_link = base_url + found_in_div.find('a', class_="btn danger").get('href') if found_in_div and found_in_div.find('a', class_="btn danger") else None

        if solutions_list_link:
            response = requests.get(solutions_list_link)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser').find('div', class_="content-wrapper")
            total_pages += int(soup.find('button', title="last")['value']) if soup.find('button', title="last") else 1
            
        if media_list_link:
            response = requests.get(media_list_link)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser').find('div', class_="content-wrapper")
            total_pages += int(soup.find('button', title="last")['value']) if soup.find('button', title="last") else 1
    except Exception as e:
        print(f"Error calculating pages for {name_link}: {e}")
    return total_pages

# Calculate the total number of pages for all solution and media links
total_pages = 0
with tqdm(total=len(name_links), desc="Calculating total pages") as pbar:
    with ThreadPoolExecutor(max_workers=64) as executor:
        future_to_name_link = {executor.submit(calculate_total_pages, name_link): name_link for name_link in name_links}
        for future in as_completed(future_to_name_link):
            total_pages += future.result()
            pbar.update(1)

with tqdm(total=total_pages, desc="Overall Progress") as pbar:
    with ThreadPoolExecutor(max_workers=64) as executor:
        future_to_name_link = {executor.submit(extract_additional_data, name_link, pbar): name_link for name_link in name_links}
        for future in as_completed(future_to_name_link):
            link_data = future.result()
            additional_data.append(link_data)

    additional_columns = ["Name Link", "ID", "PubChem CID", "GESTIS ZVG-Nr.", "group", "Other ingredients from the group", "kegg_id", "density", "Synonym", "Used attributes", "Found in", "solutions list link", "media list link", "identifiers", "solutions list", "media list"]
    additional_df = pd.DataFrame(additional_data, columns=additional_columns)

    merged_df = pd.merge(df, additional_df, on='Name Link', how='left').fillna("")
merged_df = merged_df.fillna("")
merged_df

Calculating total pages: 100%|██████████| 100/100 [00:43<00:00,  2.28it/s]
Overall Progress: 100%|██████████| 4947/4947 [04:26<00:00, 18.55it/s]   


Unnamed: 0,Name,Name Link,ChEBI-ID,ChEBI-ID Link,CAS Registry NumberⓇ,CAS Registry NumberⓇ Link,Formula,Mass,ID,PubChem CID,...,kegg_id,density,Synonym,Used attributes,Found in,solutions list link,media list link,identifiers,solutions list,media list
0,Peptone,https://mediadive.dsmz.de/ingredients/1?p=1,,https://mediadive.dsmz.dehttps://www.ebi.ac.uk...,73049-73-7,https://mediadive.dsmz.dehttps://commonchemist...,complex,complex,C1,,...,,,"[Pepton, Tryptones]","[BD BACTO (18×), Oxoid (11×), Bacto (9×), Difc...","[327 solutions, 311 media]",https://mediadive.dsmz.de/solution-finder/Pept...,https://mediadive.dsmz.de/finder/Peptone///,"{""CAS Registry Number\u00ae"": {""text"": ""73049-...","[819, 829, 868, 1138, 1165, 1185, 1487, 1496, ...","[1, 1a, 2, 7, 8, 10, 11, 11a, 11b, 21, 1, 1a, ..."
1,Meat extract,https://mediadive.dsmz.de/ingredients/2?p=1,,https://mediadive.dsmz.dehttps://www.ebi.ac.uk...,,https://mediadive.dsmz.dehttps://commonchemist...,complex,complex,C2,,...,,,,"[OXOID (2×), Difco (1×)]","[52 solutions, 53 media]",https://mediadive.dsmz.de/solution-finder/Meat...,https://mediadive.dsmz.de/finder/Meat+extract///,,"[189, 867, 868, 1435, 2666, 4356, 5676, 5721, ...","[1, 1a, 2, 11, 11a, 11b, 31, 51, 57, 58, 1, 1a..."
2,Agar,https://mediadive.dsmz.de/ingredients/3?p=1,2509,https://mediadive.dsmz.dehttps://www.ebi.ac.uk...,9002-18-0,https://mediadive.dsmz.dehttps://commonchemist...,complex,complex,C3,71571511,...,C08815,,"[(Agar, Agar agar, Agar agar flake, Agar Difco...","[Bacto (21×), (16×), BD-Difco, Bacto (16×), D...","[908 solutions, 892 media]",https://mediadive.dsmz.de/solution-finder/Agar...,https://mediadive.dsmz.de/finder/Agar///,"{""CAS Registry Number\u00ae"": {""text"": ""9002-1...","[18, 52, 71, 110, 139, 256, 292, 436, 479, 790...","[1, 1a, 2, 3, 6, 7, 8, 9, 9a, 10, 1, 1a, 2, 3,..."
3,Distilled water,https://mediadive.dsmz.de/ingredients/4?p=1,15377,https://mediadive.dsmz.dehttps://www.ebi.ac.uk...,7732-18-5,https://mediadive.dsmz.dehttps://commonchemist...,H2O,18.0153,C4,962,...,C00001,1.00293,"[a.dest, acqua, add distilled water, Add disti...","[or deionized water (2×), ad (2×), for liquid ...","[4049 solutions, 2774 media]",https://mediadive.dsmz.de/solution-finder/Dist...,https://mediadive.dsmz.de/finder/Distilled+wat...,"{""CAS Registry Number\u00ae"": {""text"": ""7732-1...","[732, 734, 756, 758, 759, 761, 763, 765, 766, ...","[1, 1a, 2, 3, 6, 7, 8, 9, 9a, 10, 1, 1a, 2, 3,..."
4,Glucose,https://mediadive.dsmz.de/ingredients/5?p=1,17234,https://mediadive.dsmz.dehttps://www.ebi.ac.uk...,50-99-7,https://mediadive.dsmz.dehttps://commonchemist...,C6H12O6,180.16,C5,5793,...,C00293,1.544,"[(+)-Glucose, Anhydrous dextrose, Brake (pH-en...","[1 M (22×), 10% (9×), 1.0 M (8×), 2.5%, steril...","[592 solutions, 583 media]",https://mediadive.dsmz.de/solution-finder/Gluc...,https://mediadive.dsmz.de/finder/Glucose///,"{""CAS Registry Number\u00ae"": {""text"": ""50-99-...","[18, 71, 72, 85, 292, 296, 437, 456, 541, 542,...","[3, 7, 8, 10, 11, 11a, 11b, 13, 21, 48, 3, 7, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Trypticase peptone,https://mediadive.dsmz.de/ingredients/124?p=5,,https://mediadive.dsmz.dehttps://www.ebi.ac.uk...,91079-40-2,https://mediadive.dsmz.dehttps://commonchemist...,complex,complex,C124,,...,,,[Trypticase Pepton],"[BD BBL (93×), BD-BBL (54×), BBL (2×), BD BACT...","[172 solutions, 181 media]",https://mediadive.dsmz.de/solution-finder/Tryp...,https://mediadive.dsmz.de/finder/Trypticase+pe...,"{""CAS Registry Number\u00ae"": {""text"": ""91079-...","[446, 453, 463, 644, 655, 672, 829, 1571, 2585...","[75, 104, 104a, 104b, 104c, 104d, 141, 141b, 1..."
96,Phytone peptone,https://mediadive.dsmz.de/ingredients/125?p=5,,https://mediadive.dsmz.dehttps://www.ebi.ac.uk...,,https://mediadive.dsmz.dehttps://commonchemist...,complex,complex,C125,,...,,,,"[BD-BBL (7×), BD-Difco (1×)]","[9 solutions, 9 media]",https://mediadive.dsmz.de/solution-finder/Phyt...,https://mediadive.dsmz.de/finder/Phytone+pepto...,,[],"[75, J13, J220, J240, J641, J802, J925, J1230,..."
97,KOH,https://mediadive.dsmz.de/ingredients/126?p=5,32035,https://mediadive.dsmz.dehttps://www.ebi.ac.uk...,1310-58-3,https://mediadive.dsmz.dehttps://commonchemist...,HKO,56.106,C126,14797,...,C12568,,"[Aetzkali, caustic potash, hydroxyde de potass...","[0.1 N (1×), 1 N (1×)]","[8 solutions, 7 media]",https://mediadive.dsmz.de/solution-finder/KOH/...,https://mediadive.dsmz.de/finder/KOH///,"{""CAS Registry Number\u00ae"": {""text"": ""1310-5...","[1871, 2509, 5229, 6130, 6243]","[76, 909, 1556, J1130, C13, C16, C106]"
98,Uric acid,https://mediadive.dsmz.de/ingredients/127?p=5,27226,https://mediadive.dsmz.dehttps://www.ebi.ac.uk...,69-93-2,https://mediadive.dsmz.dehttps://commonchemist...,C5-H4-N4-O3,168.11,C127,,...,,,"[1H-Purine-2,6,8(3H)-trione, 7,9-dihydro-, 7,9...",,"[1 solutions, 1 media]",https://mediadive.dsmz.de/solution-finder/Uric...,https://mediadive.dsmz.de/finder/Uric+acid///,"{""CAS Registry Number\u00ae"": {""text"": ""69-93-...",[],[76]


In [None]:
# Save the merged DataFrame to a CSV file
merged_df.to_csv('ingredients.csv', index=False)