In [1]:
import numpy as np
import pandas as pd
import time
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# Base URL of the webpage
base_url = "https://mediadive.dsmz.de"
max_workers = 32  # Reduced the number to avoid too many open browser instances


# Function to initialize a new WebDriver instance
def create_driver():
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run headless Chrome
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)
    driver.implicitly_wait(5)  # Implicit wait for elements to be present
    return driver


# Function to ensure the page is fully loaded
def ensure_page_loaded(driver):
    WebDriverWait(driver, 10).until(
        lambda d: d.execute_script('return document.readyState') == 'complete'
    )


# Function to extract data from a single page using Selenium
def extract_data_from_link(link, extractor_method, pbar, new_names, retries=1):
    driver = create_driver()  # Each thread gets its own driver instance
    try:
        for attempt in range(retries):
            try:
                driver.get(link)
                ensure_page_loaded(driver)
                
                # Wait for the presence of a specific element (in this case, <pre> tag)
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'pre')))
                
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                extractor = Extractor(link, soup, extractor_method)
                data = extractor.extract()
                if data is None:
                    data = [link] + [None] * (len(new_names) - 1)
                pbar.update(1)
                return data
            except Exception as e:
                # print(f"Error fetching link {link}: {e}. Retrying {attempt + 1}/{retries}...")
                time.sleep(1)  # Wait before retrying
    finally:
        driver.quit()  # Ensure the driver is closed after use
    # Return empty data structure if all retries fail
    return [link] + [None] * (len(new_names) - 1)


def scrape_link(old_df, on_old_name, extractor_method, new_names):
    link_data_temp = []
    links = old_df[on_old_name].dropna().unique()

    with tqdm(total=len(links), desc=on_old_name + " link Progress") as pbar:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_link = {executor.submit(extract_data_from_link, link, extractor_method, pbar, new_names): link for
                              link in links}
            for future in as_completed(future_to_link):
                data = future.result()
                link_data_temp.append(data)
        link_df = pd.DataFrame(link_data_temp, columns=new_names)
        merged_df = pd.merge(old_df, link_df, on=on_old_name, how='left')
    return merged_df


class Extractor:
    def __init__(self, link, soup, method):
        self.link = link
        self.soup = soup
        self.method = method

    def extract(self):
        return self.method(self.link, self.soup)

In [2]:

# Read the CSV file
merged_merged_merged_df = pd.read_csv('strains.csv', low_memory=False)

# # Use only the first few rows for testing
# merged_merged_merged_df = merged_merged_merged_df.head()

# Modify the '16S rRNA gene no. link' to append the desired query parameters
merged_merged_merged_df['16S rRNA gene no. link txt'] = merged_merged_merged_df['16S rRNA gene no. link'].apply(
    lambda x: x.rsplit('.', 1)[0] + '?report=fasta&log$=seqview&format=text' if pd.notnull(x) and x != '' else np.NaN)

merged_merged_merged_df

Unnamed: 0,Organism Group,Name,Name Link,DSM No.,Taxonomy Link,Growth media,Growth Media Links,external links,DSMZ Catalogue,Bacdive Link,...,Wink compendium link,Supplied as raw,Supplied as dict,Price of Freeze Dried,Price of Active culture on request,Price of DNA,Price Category,Culture link,Synonyms Full,16S rRNA gene no. link txt
0,Bacterium,Heyndrickxia coagulans DSM 1,https://mediadive.dsmz.de/strains/view/DSM 1,1,https://mediadive.dsmz.de/taxonomy?level=speci...,"['453', '1', 'J22']","['https://mediadive.dsmz.de/medium/453', 'http...",['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,https://bacdive.dsmz.de/strain/654,...,,"""Delivery formPricesFreeze Dried100,- €Active ...","{'Freeze Dried': '100,- €', 'Active culture on...","100,- €","240,- €","150,- €",1.0,https://www.dsmz.de/search?tx_kesearch_pi1[swo...,BacilluscoagulansHammer 1915 (Approved Lists 1...,https://www.ncbi.nlm.nih.gov/nuccore/DQ297928?...
1,Bacterium,Paenibacillus macquariensis subsp. macquariens...,https://mediadive.dsmz.de/strains/view/DSM 2,2,https://mediadive.dsmz.de/taxonomy?level=speci...,['1'],['https://mediadive.dsmz.de/medium/1'],['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,https://bacdive.dsmz.de/strain/11477,...,,"""Delivery formPricesFreeze Dried100,- €Active ...","{'Freeze Dried': '100,- €', 'Active culture on...","100,- €","240,- €","150,- €",1.0,https://www.dsmz.de/search?tx_kesearch_pi1[swo...,BacillusmacquariensisMarshall and Ohye 1966 (A...,
2,Bacterium,Sporosarcina psychrophila DSM 3,https://mediadive.dsmz.de/strains/view/DSM 3,3,https://mediadive.dsmz.de/taxonomy?level=speci...,"['1', 'J22']","['https://mediadive.dsmz.de/medium/1', 'https:...",['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,https://bacdive.dsmz.de/strain/11984,...,,"""Delivery formPricesFreeze Dried100,- €Active ...","{'Freeze Dried': '100,- €', 'Active culture on...","100,- €","240,- €","150,- €",1.0,https://www.dsmz.de/search?tx_kesearch_pi1[swo...,Bacilluspsychrophilus(exLarkin and Stokes 1967...,
3,Bacterium,Sporosarcina globispora DSM 4,https://mediadive.dsmz.de/strains/view/DSM 4,4,https://mediadive.dsmz.de/taxonomy?level=speci...,"['514', 'J22']","['https://mediadive.dsmz.de/medium/514', 'http...",['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,https://bacdive.dsmz.de/strain/11976,...,,"""Delivery formPricesFreeze Dried100,- €Active ...","{'Freeze Dried': '100,- €', 'Active culture on...","100,- €","240,- €","150,- €",1.0,https://www.dsmz.de/search?tx_kesearch_pi1[swo...,BacillusglobisporusLarkin and Stokes 1967 (App...,
4,Bacterium,Psychrobacillus insolitus DSM 5,https://mediadive.dsmz.de/strains/view/DSM 5,5,https://mediadive.dsmz.de/taxonomy?level=speci...,['123'],['https://mediadive.dsmz.de/medium/123'],['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,https://bacdive.dsmz.de/strain/1565,...,,"""Delivery formPricesFreeze Dried100,- €Active ...","{'Freeze Dried': '100,- €', 'Active culture on...","100,- €","240,- €","150,- €",1.0,https://www.dsmz.de/search?tx_kesearch_pi1[swo...,BacillusinsolitusLarkin and Stokes 1967 (Appro...,https://www.ncbi.nlm.nih.gov/nuccore/AM980508?...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46258,,Phage (phagum) DSM 117437,https://mediadive.dsmz.de/strains/view/DSM 117437,117437,,['381'],['https://mediadive.dsmz.de/medium/381'],['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,,...,,"""Delivery formPricesLiquid Suspension160,- €Pr...","{'Liquid Suspension': '160,- €'}",,,,6.0,,,
46259,,Phage (phagum) DSM 117679,https://mediadive.dsmz.de/strains/view/DSM 117679,117679,,['92'],['https://mediadive.dsmz.de/medium/92'],['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,,...,,"""Delivery formPricesLiquid Suspension160,- €Pr...","{'Liquid Suspension': '160,- €'}",,,,6.0,,,
46260,,Phage (phagum) DSM 117680,https://mediadive.dsmz.de/strains/view/DSM 117680,117680,,['92'],['https://mediadive.dsmz.de/medium/92'],['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,,...,,"""Delivery formPricesLiquid Suspension160,- €Pr...","{'Liquid Suspension': '160,- €'}",,,,6.0,,,
46261,,Staphylococcus epidermidis DSM 117681,https://mediadive.dsmz.de/strains/view/DSM 117681,117681,,['92'],['https://mediadive.dsmz.de/medium/92'],['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,,...,,"""Delivery formPricesFreeze Dried100,- €Active ...","{'Freeze Dried': '100,- €', 'Active culture on...","100,- €","240,- €","150,- €",1.0,https://www.dsmz.de/search?tx_kesearch_pi1[swo...,,


In [3]:

# Function to extract the 16S rRNA gene data from the page
def sixteenS_method(sixteenS_link, soup):
    sixteenS = []
    pres = soup.find_all('pre') if soup.find('pre') else None
    if pres:
        for pre in pres:
            sixteenS.append(pre.get_text(strip=True))
    return [sixteenS_link, sixteenS]

# Scrape the 16S rRNA gene data from the links
four_merged_df = scrape_link(merged_merged_merged_df, '16S rRNA gene no. link txt', sixteenS_method,
                             ['16S rRNA gene no. link txt', '16S rRNA gene'])

four_merged_df

16S rRNA gene no. link txt link Progress: 100%|█████████▉| 9526/9570 [14:49<00:04, 10.71it/s] 


Unnamed: 0,Organism Group,Name,Name Link,DSM No.,Taxonomy Link,Growth media,Growth Media Links,external links,DSMZ Catalogue,Bacdive Link,...,Supplied as raw,Supplied as dict,Price of Freeze Dried,Price of Active culture on request,Price of DNA,Price Category,Culture link,Synonyms Full,16S rRNA gene no. link txt,16S rRNA gene
0,Bacterium,Heyndrickxia coagulans DSM 1,https://mediadive.dsmz.de/strains/view/DSM 1,1,https://mediadive.dsmz.de/taxonomy?level=speci...,"['453', '1', 'J22']","['https://mediadive.dsmz.de/medium/453', 'http...",['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,https://bacdive.dsmz.de/strain/654,...,"""Delivery formPricesFreeze Dried100,- €Active ...","{'Freeze Dried': '100,- €', 'Active culture on...","100,- €","240,- €","150,- €",1.0,https://www.dsmz.de/search?tx_kesearch_pi1[swo...,BacilluscoagulansHammer 1915 (Approved Lists 1...,https://www.ncbi.nlm.nih.gov/nuccore/DQ297928?...,[>DQ297928.1 Bacillus coagulans strain ATCC 70...
1,Bacterium,Paenibacillus macquariensis subsp. macquariens...,https://mediadive.dsmz.de/strains/view/DSM 2,2,https://mediadive.dsmz.de/taxonomy?level=speci...,['1'],['https://mediadive.dsmz.de/medium/1'],['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,https://bacdive.dsmz.de/strain/11477,...,"""Delivery formPricesFreeze Dried100,- €Active ...","{'Freeze Dried': '100,- €', 'Active culture on...","100,- €","240,- €","150,- €",1.0,https://www.dsmz.de/search?tx_kesearch_pi1[swo...,BacillusmacquariensisMarshall and Ohye 1966 (A...,,
2,Bacterium,Sporosarcina psychrophila DSM 3,https://mediadive.dsmz.de/strains/view/DSM 3,3,https://mediadive.dsmz.de/taxonomy?level=speci...,"['1', 'J22']","['https://mediadive.dsmz.de/medium/1', 'https:...",['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,https://bacdive.dsmz.de/strain/11984,...,"""Delivery formPricesFreeze Dried100,- €Active ...","{'Freeze Dried': '100,- €', 'Active culture on...","100,- €","240,- €","150,- €",1.0,https://www.dsmz.de/search?tx_kesearch_pi1[swo...,Bacilluspsychrophilus(exLarkin and Stokes 1967...,,
3,Bacterium,Sporosarcina globispora DSM 4,https://mediadive.dsmz.de/strains/view/DSM 4,4,https://mediadive.dsmz.de/taxonomy?level=speci...,"['514', 'J22']","['https://mediadive.dsmz.de/medium/514', 'http...",['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,https://bacdive.dsmz.de/strain/11976,...,"""Delivery formPricesFreeze Dried100,- €Active ...","{'Freeze Dried': '100,- €', 'Active culture on...","100,- €","240,- €","150,- €",1.0,https://www.dsmz.de/search?tx_kesearch_pi1[swo...,BacillusglobisporusLarkin and Stokes 1967 (App...,,
4,Bacterium,Psychrobacillus insolitus DSM 5,https://mediadive.dsmz.de/strains/view/DSM 5,5,https://mediadive.dsmz.de/taxonomy?level=speci...,['123'],['https://mediadive.dsmz.de/medium/123'],['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,https://bacdive.dsmz.de/strain/1565,...,"""Delivery formPricesFreeze Dried100,- €Active ...","{'Freeze Dried': '100,- €', 'Active culture on...","100,- €","240,- €","150,- €",1.0,https://www.dsmz.de/search?tx_kesearch_pi1[swo...,BacillusinsolitusLarkin and Stokes 1967 (Appro...,https://www.ncbi.nlm.nih.gov/nuccore/AM980508?...,[>AM980508.1 Bacillus insolitus partial 16S rR...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46258,,Phage (phagum) DSM 117437,https://mediadive.dsmz.de/strains/view/DSM 117437,117437,,['381'],['https://mediadive.dsmz.de/medium/381'],['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,,...,"""Delivery formPricesLiquid Suspension160,- €Pr...","{'Liquid Suspension': '160,- €'}",,,,6.0,,,,
46259,,Phage (phagum) DSM 117679,https://mediadive.dsmz.de/strains/view/DSM 117679,117679,,['92'],['https://mediadive.dsmz.de/medium/92'],['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,,...,"""Delivery formPricesLiquid Suspension160,- €Pr...","{'Liquid Suspension': '160,- €'}",,,,6.0,,,,
46260,,Phage (phagum) DSM 117680,https://mediadive.dsmz.de/strains/view/DSM 117680,117680,,['92'],['https://mediadive.dsmz.de/medium/92'],['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,,...,"""Delivery formPricesLiquid Suspension160,- €Pr...","{'Liquid Suspension': '160,- €'}",,,,6.0,,,,
46261,,Staphylococcus epidermidis DSM 117681,https://mediadive.dsmz.de/strains/view/DSM 117681,117681,,['92'],['https://mediadive.dsmz.de/medium/92'],['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,,...,"""Delivery formPricesFreeze Dried100,- €Active ...","{'Freeze Dried': '100,- €', 'Active culture on...","100,- €","240,- €","150,- €",1.0,https://www.dsmz.de/search?tx_kesearch_pi1[swo...,,,


In [4]:

# Save the scraped data to a new CSV file
four_merged_df.to_csv('strains2.csv', index=False)