In [25]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import ast
import os

In [18]:
# Base URL of the webpage
base_url = "https://mediadive.dsmz.de"
max_workers = 128

In [19]:
# Function to extract data from a single page
def extract_data_from_link(link, extractor_method, pbar, new_names, retries=5):
    for attempt in range(retries):
        try:
            response = requests.get(link)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            extractor = Extractor(link, soup, extractor_method)
            data = extractor.extract()
            if data is None:
                print('No soup for {}'.format(link))
                # pbar.set_description('Failed to extract data from {}'.format(link))
                data = [link]
                data.extend([None]*(len(new_names)-1))
            pbar.update(1)
            return data
        except (requests.exceptions.RequestException, ConnectionResetError) as e:
            print(f"Error fetching link {link}: {e}. Retrying {attempt + 1}/{retries}...")
            time.sleep(1)  # Wait before retrying
    return [link].extend([None]*(len(new_names)-1))

def scrape_link(old_df, on_old_name, extractor_method, new_names):
    global link_data_temp
    link_data_temp = []
    links = old_df[on_old_name].dropna().unique()
    
    with tqdm(total=len(links), desc=on_old_name+" link Progress") as pbar:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_link = {executor.submit(extract_data_from_link, link, extractor_method, pbar, new_names): link for link in links}
            for future in as_completed(future_to_link):
                data = future.result()
                link_data_temp.append(data)
        link_df = pd.DataFrame(link_data_temp, columns=new_names)
        merged_df = pd.merge(old_df, link_df, on=on_old_name, how='left')
    return merged_df

class Extractor:
    def __init__(self, link, soup, method):
        self.link = link
        self.soup = soup
        self.method = method
    
    def extract(self):
        return self.method(self.link, self.soup)

In [20]:
df = pd.read_csv("strains.csv", low_memory=False)
df

Unnamed: 0,Organism Group,Name,Name Link,DSM No.,Taxonomy Link,Growth media,Growth Media Links,external links,DSMZ Catalogue,Bacdive Link,...,Wink compendium,Wink compendium link,Supplied as raw,Supplied as dict,Price of Freeze Dried,Price of Active culture on request,Price of DNA,Price Category,Culture link,Synonyms Full
0,Bacterium,Heyndrickxia coagulans DSM 1,https://mediadive.dsmz.de/strains/view/DSM 1,1,https://mediadive.dsmz.de/taxonomy?level=speci...,"['453', '1', 'J22']","['https://mediadive.dsmz.de/medium/453', 'http...",['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,https://bacdive.dsmz.de/strain/654,...,,,"""Delivery formPricesFreeze Dried100,- €Active ...","{'Freeze Dried': '100,- €', 'Active culture on...","100,- €","240,- €","150,- €",1.0,https://www.dsmz.de/search?tx_kesearch_pi1[swo...,BacilluscoagulansHammer 1915 (Approved Lists 1...
1,Bacterium,Paenibacillus macquariensis subsp. macquariens...,https://mediadive.dsmz.de/strains/view/DSM 2,2,https://mediadive.dsmz.de/taxonomy?level=speci...,['1'],['https://mediadive.dsmz.de/medium/1'],['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,https://bacdive.dsmz.de/strain/11477,...,,,"""Delivery formPricesFreeze Dried100,- €Active ...","{'Freeze Dried': '100,- €', 'Active culture on...","100,- €","240,- €","150,- €",1.0,https://www.dsmz.de/search?tx_kesearch_pi1[swo...,BacillusmacquariensisMarshall and Ohye 1966 (A...
2,Bacterium,Sporosarcina psychrophila DSM 3,https://mediadive.dsmz.de/strains/view/DSM 3,3,https://mediadive.dsmz.de/taxonomy?level=speci...,"['1', 'J22']","['https://mediadive.dsmz.de/medium/1', 'https:...",['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,https://bacdive.dsmz.de/strain/11984,...,,,"""Delivery formPricesFreeze Dried100,- €Active ...","{'Freeze Dried': '100,- €', 'Active culture on...","100,- €","240,- €","150,- €",1.0,https://www.dsmz.de/search?tx_kesearch_pi1[swo...,Bacilluspsychrophilus(exLarkin and Stokes 1967...
3,Bacterium,Sporosarcina globispora DSM 4,https://mediadive.dsmz.de/strains/view/DSM 4,4,https://mediadive.dsmz.de/taxonomy?level=speci...,"['514', 'J22']","['https://mediadive.dsmz.de/medium/514', 'http...",['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,https://bacdive.dsmz.de/strain/11976,...,,,"""Delivery formPricesFreeze Dried100,- €Active ...","{'Freeze Dried': '100,- €', 'Active culture on...","100,- €","240,- €","150,- €",1.0,https://www.dsmz.de/search?tx_kesearch_pi1[swo...,BacillusglobisporusLarkin and Stokes 1967 (App...
4,Bacterium,Psychrobacillus insolitus DSM 5,https://mediadive.dsmz.de/strains/view/DSM 5,5,https://mediadive.dsmz.de/taxonomy?level=speci...,['123'],['https://mediadive.dsmz.de/medium/123'],['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,https://bacdive.dsmz.de/strain/1565,...,,,"""Delivery formPricesFreeze Dried100,- €Active ...","{'Freeze Dried': '100,- €', 'Active culture on...","100,- €","240,- €","150,- €",1.0,https://www.dsmz.de/search?tx_kesearch_pi1[swo...,BacillusinsolitusLarkin and Stokes 1967 (Appro...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46258,,Phage (phagum) DSM 117437,https://mediadive.dsmz.de/strains/view/DSM 117437,117437,,['381'],['https://mediadive.dsmz.de/medium/381'],['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,,...,,,"""Delivery formPricesLiquid Suspension160,- €Pr...","{'Liquid Suspension': '160,- €'}",,,,6.0,,
46259,,Phage (phagum) DSM 117679,https://mediadive.dsmz.de/strains/view/DSM 117679,117679,,['92'],['https://mediadive.dsmz.de/medium/92'],['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,,...,,,"""Delivery formPricesLiquid Suspension160,- €Pr...","{'Liquid Suspension': '160,- €'}",,,,6.0,,
46260,,Phage (phagum) DSM 117680,https://mediadive.dsmz.de/strains/view/DSM 117680,117680,,['92'],['https://mediadive.dsmz.de/medium/92'],['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,,...,,,"""Delivery formPricesLiquid Suspension160,- €Pr...","{'Liquid Suspension': '160,- €'}",,,,6.0,,
46261,,Staphylococcus epidermidis DSM 117681,https://mediadive.dsmz.de/strains/view/DSM 117681,117681,,['92'],['https://mediadive.dsmz.de/medium/92'],['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,,...,,,"""Delivery formPricesFreeze Dried100,- €Active ...","{'Freeze Dried': '100,- €', 'Active culture on...","100,- €","240,- €","150,- €",1.0,https://www.dsmz.de/search?tx_kesearch_pi1[swo...,


In [22]:

# Define the function to create the links
def create_16rs_link(genbank_dict):
    if pd.isna(genbank_dict):
        return np.nan
    else:
        return ast.literal_eval(genbank_dict).get('16S rRNA gene:')[1] if ast.literal_eval(genbank_dict).get('16S rRNA gene:') else np.nan

# Apply the function to the DataFrame and create a new column
df['16S rRNA Link'] = df['Genbank dict'].apply(create_16rs_link)

# Display the DataFrame
df

Unnamed: 0,Organism Group,Name,Name Link,DSM No.,Taxonomy Link,Growth media,Growth Media Links,external links,DSMZ Catalogue,Bacdive Link,...,Wink compendium link,Supplied as raw,Supplied as dict,Price of Freeze Dried,Price of Active culture on request,Price of DNA,Price Category,Culture link,Synonyms Full,16S rRNA Link
0,Bacterium,Heyndrickxia coagulans DSM 1,https://mediadive.dsmz.de/strains/view/DSM 1,1,https://mediadive.dsmz.de/taxonomy?level=speci...,"['453', '1', 'J22']","['https://mediadive.dsmz.de/medium/453', 'http...",['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,https://bacdive.dsmz.de/strain/654,...,,"""Delivery formPricesFreeze Dried100,- €Active ...","{'Freeze Dried': '100,- €', 'Active culture on...","100,- €","240,- €","150,- €",1.0,https://www.dsmz.de/search?tx_kesearch_pi1[swo...,BacilluscoagulansHammer 1915 (Approved Lists 1...,https://www.ncbi.nlm.nih.gov/nuccore/DQ297928
1,Bacterium,Paenibacillus macquariensis subsp. macquariens...,https://mediadive.dsmz.de/strains/view/DSM 2,2,https://mediadive.dsmz.de/taxonomy?level=speci...,['1'],['https://mediadive.dsmz.de/medium/1'],['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,https://bacdive.dsmz.de/strain/11477,...,,"""Delivery formPricesFreeze Dried100,- €Active ...","{'Freeze Dried': '100,- €', 'Active culture on...","100,- €","240,- €","150,- €",1.0,https://www.dsmz.de/search?tx_kesearch_pi1[swo...,BacillusmacquariensisMarshall and Ohye 1966 (A...,
2,Bacterium,Sporosarcina psychrophila DSM 3,https://mediadive.dsmz.de/strains/view/DSM 3,3,https://mediadive.dsmz.de/taxonomy?level=speci...,"['1', 'J22']","['https://mediadive.dsmz.de/medium/1', 'https:...",['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,https://bacdive.dsmz.de/strain/11984,...,,"""Delivery formPricesFreeze Dried100,- €Active ...","{'Freeze Dried': '100,- €', 'Active culture on...","100,- €","240,- €","150,- €",1.0,https://www.dsmz.de/search?tx_kesearch_pi1[swo...,Bacilluspsychrophilus(exLarkin and Stokes 1967...,
3,Bacterium,Sporosarcina globispora DSM 4,https://mediadive.dsmz.de/strains/view/DSM 4,4,https://mediadive.dsmz.de/taxonomy?level=speci...,"['514', 'J22']","['https://mediadive.dsmz.de/medium/514', 'http...",['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,https://bacdive.dsmz.de/strain/11976,...,,"""Delivery formPricesFreeze Dried100,- €Active ...","{'Freeze Dried': '100,- €', 'Active culture on...","100,- €","240,- €","150,- €",1.0,https://www.dsmz.de/search?tx_kesearch_pi1[swo...,BacillusglobisporusLarkin and Stokes 1967 (App...,
4,Bacterium,Psychrobacillus insolitus DSM 5,https://mediadive.dsmz.de/strains/view/DSM 5,5,https://mediadive.dsmz.de/taxonomy?level=speci...,['123'],['https://mediadive.dsmz.de/medium/123'],['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,https://bacdive.dsmz.de/strain/1565,...,,"""Delivery formPricesFreeze Dried100,- €Active ...","{'Freeze Dried': '100,- €', 'Active culture on...","100,- €","240,- €","150,- €",1.0,https://www.dsmz.de/search?tx_kesearch_pi1[swo...,BacillusinsolitusLarkin and Stokes 1967 (Appro...,https://www.ncbi.nlm.nih.gov/nuccore/AM980508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46258,,Phage (phagum) DSM 117437,https://mediadive.dsmz.de/strains/view/DSM 117437,117437,,['381'],['https://mediadive.dsmz.de/medium/381'],['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,,...,,"""Delivery formPricesLiquid Suspension160,- €Pr...","{'Liquid Suspension': '160,- €'}",,,,6.0,,,
46259,,Phage (phagum) DSM 117679,https://mediadive.dsmz.de/strains/view/DSM 117679,117679,,['92'],['https://mediadive.dsmz.de/medium/92'],['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,,...,,"""Delivery formPricesLiquid Suspension160,- €Pr...","{'Liquid Suspension': '160,- €'}",,,,6.0,,,
46260,,Phage (phagum) DSM 117680,https://mediadive.dsmz.de/strains/view/DSM 117680,117680,,['92'],['https://mediadive.dsmz.de/medium/92'],['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,,...,,"""Delivery formPricesLiquid Suspension160,- €Pr...","{'Liquid Suspension': '160,- €'}",,,,6.0,,,
46261,,Staphylococcus epidermidis DSM 117681,https://mediadive.dsmz.de/strains/view/DSM 117681,117681,,['92'],['https://mediadive.dsmz.de/medium/92'],['https://www.dsmz.de/collection/catalogue/det...,https://www.dsmz.de/collection/catalogue/detai...,,...,,"""Delivery formPricesFreeze Dried100,- €Active ...","{'Freeze Dried': '100,- €', 'Active culture on...","100,- €","240,- €","150,- €",1.0,https://www.dsmz.de/search?tx_kesearch_pi1[swo...,,


In [23]:
def ncbi_method(ncbi_link, soup):
    
    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(html_snippet, 'html.parser')
    
    # Find the <a> tag with the id 'btn_download'
    a_tag = soup.find('a', id='btn_download')
    
    # Extract the URL from the href attribute
    download_url = a_tag['href']
    
    # Send a GET request to download the file
    response = requests.get(download_url)
    
    # Check if the request was successful
    if response.status_code == 200:
        # Specify the directory and file name
        download_dir = '/temp'
        os.makedirs(download_dir, exist_ok=True)  # Create directory if it doesn't exist
        file_path = os.path.join(download_dir, 'downloaded_file.fasta')
        
        # Write the content to the file
        with open(file_path, 'wb') as file:
            file.write(response.content)
        
        print(f"File downloaded successfully to {file_path}!")
    else:
        print("Failed to download the file.")
    sixteenS = None
    return [ncbi_link, sixteenS]

merged_df = scrape_link(df, '16S rRNA Link', ncbi_method, ['16S rRNA Link', '16S rRNA gene'])

16S rRNA Link link Progress:  74%|███████▍  | 7092/9570 [03:04<00:38, 63.89it/s]

Error fetching link https://www.ncbi.nlm.nih.gov/nucc </div>
 </div>
    <div class=: 404 Client Error: Not Found for url: https://www.ncbi.nlm.nih.gov/nucc%20%3C/div%3E%0A%20%3C/div%3E%0A%20%20%20%20%3Cdiv%20class=. Retrying 1/5...


16S rRNA Link link Progress:  75%|███████▍  | 7149/9570 [03:06<00:35, 67.74it/s]

Error fetching link https://www.ncbi.nlm.nih.gov/nucc </div>
 </div>
    <div class=: 404 Client Error: Not Found for url: https://www.ncbi.nlm.nih.gov/nucc%20%3C/div%3E%0A%20%3C/div%3E%0A%20%20%20%20%3Cdiv%20class=. Retrying 2/5...


16S rRNA Link link Progress:  75%|███████▌  | 7215/9570 [03:07<02:35, 15.18it/s]

Error fetching link https://www.ncbi.nlm.nih </div>
 </div>
    <div class=: HTTPSConnectionPool(host='www.ncbi.nlm.nih%20%3c', port=443): Max retries exceeded with url: /div%3E%0A%20%3C/div%3E%0A%20%20%20%20%3Cdiv%20class= (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x3bcb07ac0>: Failed to resolve 'www.ncbi.nlm.nih%20%3c' ([Errno 8] nodename nor servname provided, or not known)")). Retrying 1/5...
Error fetching link https://www.ncbi.nlm.nih.gov/nucc </div>
 </div>
    <div class=: 404 Client Error: Not Found for url: https://www.ncbi.nlm.nih.gov/nucc%20%3C/div%3E%0A%20%3C/div%3E%0A%20%20%20%20%3Cdiv%20class=. Retrying 3/5...


16S rRNA Link link Progress:  76%|███████▌  | 7275/9570 [03:09<00:44, 51.08it/s]

Error fetching link https://www.ncbi.nlm.nih </div>
 </div>
    <div class=: HTTPSConnectionPool(host='www.ncbi.nlm.nih%20%3c', port=443): Max retries exceeded with url: /div%3E%0A%20%3C/div%3E%0A%20%20%20%20%3Cdiv%20class= (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x3a6ecd0f0>: Failed to resolve 'www.ncbi.nlm.nih%20%3c' ([Errno 8] nodename nor servname provided, or not known)")). Retrying 2/5...
Error fetching link https://www.ncbi.nlm.nih.gov/nucc </div>
 </div>
    <div class=: 404 Client Error: Not Found for url: https://www.ncbi.nlm.nih.gov/nucc%20%3C/div%3E%0A%20%3C/div%3E%0A%20%20%20%20%3Cdiv%20class=. Retrying 4/5...


16S rRNA Link link Progress:  77%|███████▋  | 7338/9570 [03:10<01:28, 25.11it/s]

Error fetching link https://www.ncbi.nlm.nih </div>
 </div>
    <div class=: HTTPSConnectionPool(host='www.ncbi.nlm.nih%20%3c', port=443): Max retries exceeded with url: /div%3E%0A%20%3C/div%3E%0A%20%20%20%20%3Cdiv%20class= (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x3b8ae0130>: Failed to resolve 'www.ncbi.nlm.nih%20%3c' ([Errno 8] nodename nor servname provided, or not known)")). Retrying 3/5...
Error fetching link https://www.ncbi.nlm.nih.gov/nucc </div>
 </div>
    <div class=: 404 Client Error: Not Found for url: https://www.ncbi.nlm.nih.gov/nucc%20%3C/div%3E%0A%20%3C/div%3E%0A%20%20%20%20%3Cdiv%20class=. Retrying 5/5...


16S rRNA Link link Progress:  77%|███████▋  | 7386/9570 [03:11<00:37, 58.45it/s]

Error fetching link https://www.ncbi.nlm.nih </div>
 </div>
    <div class=: HTTPSConnectionPool(host='www.ncbi.nlm.nih%20%3c', port=443): Max retries exceeded with url: /div%3E%0A%20%3C/div%3E%0A%20%20%20%20%3Cdiv%20class= (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x17554d7e0>: Failed to resolve 'www.ncbi.nlm.nih%20%3c' ([Errno 8] nodename nor servname provided, or not known)")). Retrying 4/5...


16S rRNA Link link Progress:  78%|███████▊  | 7450/9570 [03:13<01:59, 17.74it/s]

Error fetching link https://www.ncbi.nlm.nih </div>
 </div>
    <div class=: HTTPSConnectionPool(host='www.ncbi.nlm.nih%20%3c', port=443): Max retries exceeded with url: /div%3E%0A%20%3C/div%3E%0A%20%20%20%20%3Cdiv%20class= (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x3baa437c0>: Failed to resolve 'www.ncbi.nlm.nih%20%3c' ([Errno 8] nodename nor servname provided, or not known)")). Retrying 5/5...


16S rRNA Link link Progress:  80%|████████  | 7677/9570 [03:17<02:08, 14.68it/s]

Error fetching link https: </div>
 </div>
    <div class=: Invalid URL 'https: </div>\n </div>\n    <div class=': No host supplied. Retrying 1/5...


16S rRNA Link link Progress:  81%|████████  | 7747/9570 [03:19<01:11, 25.52it/s]

Error fetching link https: </div>
 </div>
    <div class=: Invalid URL 'https: </div>\n </div>\n    <div class=': No host supplied. Retrying 2/5...


16S rRNA Link link Progress:  81%|████████  | 7771/9570 [03:19<01:10, 25.37it/s]

Error fetching link https: </div>
 </div>
    <div class=: Invalid URL 'https: </div>\n </div>\n    <div class=': No host supplied. Retrying 3/5...


16S rRNA Link link Progress:  82%|████████▏ | 7834/9570 [03:21<01:51, 15.62it/s]

Error fetching link https: </div>
 </div>
    <div class=: Invalid URL 'https: </div>\n </div>\n    <div class=': No host supplied. Retrying 4/5...


16S rRNA Link link Progress:  82%|████████▏ | 7894/9570 [03:22<03:11,  8.77it/s]

Error fetching link https: </div>
 </div>
    <div class=: Invalid URL 'https: </div>\n </div>\n    <div class=': No host supplied. Retrying 5/5...


16S rRNA Link link Progress: 100%|█████████▉| 9567/9570 [05:00<00:00, 31.83it/s]


TypeError: object of type 'NoneType' has no len()

In [24]:
merged_df

NameError: name 'merged_df' is not defined