In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# MediaDive `/gas` listing that is paged through below
base_url = "https://mediadive.dsmz.de/gas"

# One accumulator per table column; scrape_page() appends to these in place
mediums = []
N2, O2, CO2 = [], [], []
H2, CH4, CO = [], [], []
Air = []

# Function to scrape a single page
def scrape_page(page_url, timeout=30):
    """Scrape one page of the gas table into the module-level column lists.

    Each data row contributes one value to ``mediums``, ``N2``, ``O2``,
    ``CO2``, ``H2``, ``CH4``, ``CO`` and ``Air`` (taken from the first eight
    ``<td>`` cells, in that order).

    Args:
        page_url: Full URL of the listing page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True if at least one row was collected; False if the page has no
        table container or no usable data rows. The caller treats False as
        the end of pagination.

    Raises:
        requests.exceptions.RequestException: on connection errors, timeouts
            or a non-2xx response (via ``raise_for_status``).
    """
    response = requests.get(page_url, timeout=timeout)
    response.raise_for_status()  # Check if the request was successful

    soup = BeautifulSoup(response.text, 'html.parser')

    # The data table lives inside this wrapper; without it there is nothing to read
    container = soup.find('div', class_="table-responsive my-15")
    if container is None:
        return False

    table_rows = container.find_all('tr')[1:]  # Skip the header row

    # Destination list for each of the first eight cells of a row
    targets = (mediums, N2, O2, CO2, H2, CH4, CO, Air)
    collected = 0
    for row in table_rows:
        cells = row.find_all('td')
        if len(cells) < len(targets):
            # Placeholder / malformed row: skip it so the lists stay aligned
            continue
        for target, cell in zip(targets, cells):
            target.append(cell.get_text(strip=True))
        collected += 1
    return collected > 0

# Walk pages 1-48 in order; stop as soon as a page reports no rows
for page in range(1, 49):
    page_url = f'{base_url}?p={page}'
    print(f'Scraping page: {page}')
    if scrape_page(page_url):
        continue
    break


# Assemble the scraped columns into a DataFrame (dict keys become column names)
df = pd.DataFrame({
    'Medium': mediums,
    'N2': N2,
    'O2': O2,
    'CO2': CO2,
    'H2': H2,
    'CH4': CH4,
    'CO': CO,
    'Air': Air
})

# Persist first, then end the cell with the preview: only the last
# expression of a cell is displayed, so a bare `df` before to_csv() shows nothing.
df.to_csv('data.csv')
df.head()

Scraping page: 1
Scraping page: 2
Scraping page: 3
Scraping page: 4
Scraping page: 5
Scraping page: 6
Scraping page: 7
Scraping page: 8
Scraping page: 9
Scraping page: 10
Scraping page: 11
Scraping page: 12
Scraping page: 13
Scraping page: 14
Scraping page: 15
Scraping page: 16
Scraping page: 17
Scraping page: 18
Scraping page: 19
Scraping page: 20
Scraping page: 21
Scraping page: 22
Scraping page: 23
Scraping page: 24
Scraping page: 25
Scraping page: 26
Scraping page: 27
Scraping page: 28
Scraping page: 29
Scraping page: 30
Scraping page: 31
Scraping page: 32
Scraping page: 33
Scraping page: 34
Scraping page: 35
Scraping page: 36
Scraping page: 37
Scraping page: 38
Scraping page: 39
Scraping page: 40
Scraping page: 41
Scraping page: 42
Scraping page: 43
Scraping page: 44
Scraping page: 45
Scraping page: 46
Scraping page: 47
Scraping page: 48


In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# MediaDive `/steps` listing; scrape_page() appends `?p=<page>` to it
base_url = "https://mediadive.dsmz.de/steps"

# Function to scrape a single page
def scrape_page(page, timeout=30):
    """Fetch one page of the steps listing and return its rows as parallel lists.

    Args:
        page: Page number, sent as the ``p`` query parameter on ``base_url``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``(data, page)``. ``data`` is ``None`` when the page has no table
        container; otherwise it is a 6-tuple of equally long lists:
        ``(ids, steps, solutions, solution_links, mediums, medium_links)``.
        ``ids`` holds each row's ``id`` attribute; the link lists hold the
        ``href`` of the first anchor in the cell, or ``None`` if there is none.

    Raises:
        requests.exceptions.RequestException: on connection errors, timeouts
            or a non-2xx response (via ``raise_for_status``).
    """
    page_url = f'{base_url}?p={page}'
    response = requests.get(page_url, timeout=timeout)
    response.raise_for_status()  # Check if the request was successful

    soup = BeautifulSoup(response.text, 'html.parser')

    # No table wrapper means there is nothing to read on this page
    container = soup.find('div', class_="table-responsive my-15")
    if container is None:
        return None, page

    # Temporary lists to hold data for this page
    temp_ids = []
    temp_steps = []
    temp_solutions = []
    temp_solutions_link = []
    temp_mediums = []
    temp_mediums_link = []

    for row in container.find_all('tr')[1:]:  # Skip the header row
        columns = row.find_all('td')
        if len(columns) < 3:
            # Malformed row: skip it entirely so all six lists stay aligned
            continue

        solution_anchor = columns[1].find('a')
        medium_anchor = columns[2].find('a')

        temp_ids.append(row.get('id'))
        temp_steps.append(columns[0].get_text(strip=True))
        temp_solutions.append(columns[1].get_text(strip=True))
        temp_solutions_link.append(solution_anchor['href'] if solution_anchor is not None else None)
        temp_mediums.append(columns[2].get_text(strip=True))
        temp_mediums_link.append(medium_anchor['href'] if medium_anchor is not None else None)

    return (temp_ids, temp_steps, temp_solutions, temp_solutions_link, temp_mediums, temp_mediums_link), page

# Size of the worker pool used for the concurrent page fetches
num_threads = 10

# page number -> tuple of column lists returned by scrape_page()
results_dict = {}

# Submit pages 1-307 to the pool and collect each result as it finishes
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    futures = {}
    for page in range(1, 308):
        futures[executor.submit(scrape_page, page)] = page
    with tqdm(total=307, desc="Scraping pages", position=0, leave=True) as pbar:
        for future in as_completed(futures):
            result, page = future.result()
            if result:
                # Pages without a result are simply left out of the dict
                results_dict[page] = result
            pbar.update(1)

# Final column lists, filled below in ascending page order
ids, Steps, Solutions = [], [], []
Solutions_link, Mediums, Mediums_link = [], [], []

# Same order as the tuple stored per page in results_dict
column_lists = (ids, Steps, Solutions, Solutions_link, Mediums, Mediums_link)

with tqdm(total=307, desc="Concatenating results", position=1, leave=True) as pbar:
    for page in range(1, 308):
        page_columns = results_dict.get(page)
        if page_columns is not None:
            for target, chunk in zip(column_lists, page_columns):
                target.extend(chunk)
        pbar.update(1)

# Column name -> collected values, in the order they appear in the CSV
frame_columns = {
    'ID': ids,
    'Step': Steps,
    'Solution': Solutions,
    'Solution_link': Solutions_link,
    'Medium': Mediums,
    'Mediums_link': Mediums_link,
}
df = pd.DataFrame(frame_columns)

# Write the table without the DataFrame index, then report completion
df.to_csv('Steps_new.csv', index=False)
print('Scraping complete. Data saved to Steps_new.csv')

Scraping pages: 100%|██████████| 307/307 [00:38<00:00,  7.95it/s]

Concatenating results: 100%|██████████| 307/307 [00:00<00:00, 224095.25it/s]

Scraping complete. Data saved to Steps_new.csv





**MEDIA**

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import re

# Site root; listing paths and relative hrefs are appended to it
base_url = 'https://mediadive.dsmz.de'

# Function to extract data from a single page
def extract_data_from_page(url, page, pbar, retries=5, timeout=30):
    """Download one listing page and parse every data row on it.

    Args:
        url: Listing URL; the page number is sent as the ``p`` query parameter.
        page: Page number to fetch.
        pbar: Progress bar handed through to ``extract_data_from_row``.
        retries: Maximum number of download attempts.
        timeout: Seconds to wait for the server on each attempt.

    Returns:
        A list with one entry per parsed row (whatever
        ``extract_data_from_row`` returned for it), or ``[]`` if every
        download attempt failed.
    """
    params = {"p": page}
    for attempt in range(retries):
        try:
            response = requests.get(url, params=params, timeout=timeout)
            response.raise_for_status()
        except (requests.exceptions.RequestException, ConnectionResetError) as e:
            print(f"Error fetching page {page}: {e}. Retrying {attempt + 1}/{retries}...")
            if attempt + 1 < retries:
                time.sleep(1)  # Back off only when another attempt will follow
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        data = []
        for row in soup.find_all('tr')[1:]:  # Skip the header row
            row_data = extract_data_from_row(row, pbar)
            if row_data:
                # An empty result would not match the 13-column frame built later
                data.append(row_data)
        return data
    return []

def extract_data_from_row(row, pbar, retries=5):
    """Turn one ``<tr>`` of the media listing into a flat list of 13 values.

    Output order: ID, source, name, name link, type, complex-medium flag
    (``'yes'``/``'no'``), raw pH text, min pH, max pH, taxonomic range titles
    (list), strains count text, strains link, PDF link. Links are built by
    prefixing the anchor's ``href`` with the module-level ``base_url``;
    values whose anchor/span is missing are ``None``.

    Args:
        row: Parsed table row exposing ``find_all('td')`` (e.g. a bs4 Tag)
            with at least nine cells.
        pbar: Progress bar; ``update(1)`` is called once per parsed row.
        retries: Unused. Kept so existing callers keep working: parsing an
            already-downloaded row performs no network I/O, so there is
            nothing to retry.

    Returns:
        list: the 13 values described above.
    """
    columns = row.find_all('td')

    # ID and source are plain text cells
    medium_id = columns[1].get_text(strip=True)
    source = columns[2].get_text(strip=True)

    # Name and link
    name_tag = columns[3].find('a')
    name = name_tag.get_text(strip=True) if name_tag is not None else None
    name_link = base_url + name_tag['href'] if name_tag is not None else None

    # Type, plus the derived yes/no flag
    type_tag = columns[4].find('span')
    medium_type = type_tag.get_text(strip=True) if type_tag is not None else None
    complex_medium = 'yes' if medium_type == 'complex' else 'no'

    # Final pH: either a single value or a "min - max" range.
    # Split on the bare hyphen and strip, so "6.8 - 7.2" and "6.8-7.2" both work.
    ph = columns[5].get_text(strip=True)
    if "-" in ph:
        min_ph, max_ph = (part.strip() for part in ph.split("-", 1))
    else:
        min_ph = max_ph = ph

    # Tax range: titles of the spans that carry no class attribute
    tax_ranges = [span['title'] for span in columns[6].find_all('span', class_=None)]

    # Strains count and link share one anchor
    strains_tag = columns[7].find('a')
    strains_num = strains_tag.get_text(strip=True) if strains_tag is not None else None
    strains_link = base_url + strains_tag['href'] if strains_tag is not None else None

    # PDF
    pdf_tag = columns[8].find('a')
    pdf = base_url + pdf_tag['href'] if pdf_tag is not None else None

    pbar.update(1)
    return [
        medium_id, source, name, name_link, medium_type, complex_medium,
        ph, min_ph, max_ph, tax_ranges, strains_num, strains_link, pdf,
    ]



def extract_data_from_strainslink(strainslink, retries=5, timeout=30):
    """Placeholder: check that a strains page can be fetched; extracts nothing.

    This version always returns ``[]``. A later cell in this notebook defines
    another ``extract_data_from_strainslink`` that takes an extra ``pbar``
    argument and replaces this one once it has run.

    Args:
        strainslink: URL of the strains page to request.
        retries: Maximum number of download attempts.
        timeout: Seconds to wait for the server on each attempt.

    Returns:
        list: always empty, whether or not the request succeeded.
    """
    for attempt in range(retries):
        try:
            response = requests.get(strainslink, timeout=timeout)
            response.raise_for_status()
            return []
        except (requests.exceptions.RequestException, ConnectionResetError):
            print(f"Error fetching strains link: {strainslink}. Retrying {attempt + 1}/{retries}...")
            if attempt + 1 < retries:
                time.sleep(1)  # Back off only when another attempt will follow
    return []

# Function to count the data rows on one listing page (used to size the progress bar)
def calculate_page_rows(url, page, retries=5, timeout=30):
    """Return the number of table rows on a listing page, excluding the header.

    Args:
        url: Listing URL; the page number is sent as the ``p`` query parameter.
        page: Page number to fetch.
        retries: Maximum number of download attempts.
        timeout: Seconds to wait for the server on each attempt.

    Returns:
        int: count of ``<tr>`` elements after the first one, or 0 if every
        download attempt failed.
    """
    params = {"p": page}
    for attempt in range(retries):
        try:
            response = requests.get(url, params=params, timeout=timeout)
            response.raise_for_status()
        except (requests.exceptions.RequestException, ConnectionResetError) as e:
            print(f"Error fetching page {page}: {e}. Retrying {attempt + 1}/{retries}...")
            if attempt + 1 < retries:
                time.sleep(1)  # Back off only when another attempt will follow
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        return len(soup.find_all('tr')[1:])  # Skip the header row
    return 0

# Main scraping process
num_pages = 166  # Adjust the number of pages you want to scrape

# Upper bound on simultaneous requests. Every request goes to the same host,
# so hundreds of parallel connections mostly provoke resets and retries;
# 32 matches the default cap ThreadPoolExecutor applies to itself.
max_workers = 32

page_data_segments = [None] * num_pages  # Slot i holds the rows of page i + 1

# First pass: count the rows on every page so the row progress bar has a total
total_rows = 0
with tqdm(total=num_pages, desc="Calculating total pages") as pbar:
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_count_page = {
            executor.submit(calculate_page_rows, base_url + "/media", page): page
            for page in range(1, num_pages + 1)
        }
        for future in as_completed(future_to_count_page):
            total_rows += future.result()
            pbar.update(1)

# Second pass: parse the rows; pages finish in arbitrary order, so each
# result is filed under its page number.
with tqdm(total=total_rows, desc="Rows Progress") as pbar:
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_page = {
            executor.submit(extract_data_from_page, base_url + "/media", page, pbar): page
            for page in range(1, num_pages + 1)
        }
        for future in as_completed(future_to_page):
            page = future_to_page[future]
            page_data = future.result()
            if page_data:
                page_data_segments[page - 1] = page_data

# Concatenate all page data segments in page order, skipping empty slots
all_data = [row for segment in page_data_segments if segment for row in segment]

# Create a DataFrame from the extracted data
columns = ["ID", "Source", "Name", "Name Link", "Type", "Complex Medium", "Final PH", "min pH", "max pH", "Tax.range type", "Strains number", "Strains links", "PDF"]
df = pd.DataFrame(all_data, columns=columns)

df



Calculating total pages: 100%|██████████| 1/1 [00:01<00:00,  1.98s/it]
Rows Progress: 100%|██████████| 20/20 [00:02<00:00,  9.43it/s]


Unnamed: 0,ID,Source,Name,Name Link,Type,Complex Medium,Final PH,min pH,max pH,Tax.range type,Strains number,Strains links,PDF
0,1,DSMZ,NUTRIENT AGAR,https://mediadive.dsmz.de/medium/1,complex,yes,7.0,7.0,7.0,"[Bacteria, Phages]",2305,https://mediadive.dsmz.de/strains/medium/1,https://mediadive.dsmz.de/pdf/1
1,1a,DSMZ,REACTIVATION WITH LIQUID MEDIUM 1,https://mediadive.dsmz.de/medium/1a,complex,yes,7.0,7.0,7.0,[Bacteria],209,https://mediadive.dsmz.de/strains/medium/1a,https://mediadive.dsmz.de/pdf/1a
2,2,DSMZ,BACILLUS PASTEURII MEDIUM,https://mediadive.dsmz.de/medium/2,complex,yes,,,,[Bacteria],9,https://mediadive.dsmz.de/strains/medium/2,https://mediadive.dsmz.de/pdf/2
3,3,DSMZ,AZOTOBACTER MEDIUM,https://mediadive.dsmz.de/medium/3,defined,no,7.3,7.3,7.3,[Bacteria],40,https://mediadive.dsmz.de/strains/medium/3,https://mediadive.dsmz.de/pdf/3
4,6,DSMZ,ALLANTOIN MINERAL MEDIUM,https://mediadive.dsmz.de/medium/6,complex,yes,,,,[Bacteria],22,https://mediadive.dsmz.de/strains/medium/6,https://mediadive.dsmz.de/pdf/6
5,7,DSMZ,ANCYLOBACTER - SPIROSOMA MEDIUM,https://mediadive.dsmz.de/medium/7,complex,yes,7.0,7.0,7.0,[Bacteria],41,https://mediadive.dsmz.de/strains/medium/7,https://mediadive.dsmz.de/pdf/7
6,8,DSMZ,"BACILLUS ""RACEMILACTICUS"" MEDIUM",https://mediadive.dsmz.de/medium/8,complex,yes,6.8,6.8,6.8,[Bacteria],7,https://mediadive.dsmz.de/strains/medium/8,https://mediadive.dsmz.de/pdf/8
7,9,DSMZ,VY/2 AGAR,https://mediadive.dsmz.de/medium/9,complex,yes,7.2,7.2,7.2,[Bacteria],2849,https://mediadive.dsmz.de/strains/medium/9,https://mediadive.dsmz.de/pdf/9
8,9a,DSMZ,"VY/2, REDUCED MEDIUM",https://mediadive.dsmz.de/medium/9a,complex,yes,7.2,7.2,7.2,[Bacteria],4,https://mediadive.dsmz.de/strains/medium/9a,https://mediadive.dsmz.de/pdf/9a
9,10,DSMZ,ZYMOMONAS MEDIUM,https://mediadive.dsmz.de/medium/10,complex,yes,,,,[Bacteria],9,https://mediadive.dsmz.de/strains/medium/10,https://mediadive.dsmz.de/pdf/10


In [4]:
def _labelled_cell_text(soup, label):
    """Return the text of the ``<td>`` that follows the ``<td>`` containing ``label``, or None."""
    # NOTE(review): newer soupsieve releases deprecate `:contains` in favour of
    # `:-soup-contains` -- confirm the installed version before switching.
    label_cell = soup.select_one(f"td:contains('{label}')")
    if label_cell is None:
        return None
    value_cell = label_cell.find_next_sibling('td')
    return value_cell.get_text(strip=True) if value_cell is not None else None


# Function to extract data from a single medium detail page
def extract_data_from_namelink(namelink, pbar, retries=5, timeout=30):
    """Fetch a medium's detail page and pull out its metadata.

    Args:
        namelink: URL of the medium page.
        pbar: Progress bar; ``update(1)`` is called once per link.
        retries: Maximum number of download attempts.
        timeout: Seconds to wait for the server on each attempt.

    Returns:
        A 6-item list matching the columns the caller builds its DataFrame
        with: ``[namelink, last_modified, related_ids, related_names,
        strains_with_mod, cultivation_metadata]``. If every download attempt
        fails, the same shape is returned with ``None`` / empty lists.
    """
    for attempt in range(retries):
        try:
            response = requests.get(namelink, timeout=timeout)
            response.raise_for_status()
        except (requests.exceptions.RequestException, ConnectionResetError) as e:
            print(f"Error fetching name link {namelink}: {e}. Retrying {attempt + 1}/{retries}...")
            if attempt + 1 < retries:
                time.sleep(1)  # Back off only when another attempt will follow
            continue

        soup_n = BeautifulSoup(response.content, 'html.parser')

        # "Last modified" is only looked up when the metadata box is present
        metadata_table = soup_n.find('table', class_='table small', id="metadata-box")
        last_modified = _labelled_cell_text(soup_n, 'Last modified:') if metadata_table is not None else None

        # Related media: first cell holds the linked ID, second the name
        related_ids = []
        related_names = []
        related_table = soup_n.find('div', class_="box", id="related-media-box")
        if related_table is not None:
            for tr in related_table.find_all('tr'):
                cells = tr.find_all('td')
                anchor = cells[0].find('a') if cells else None
                if anchor is None or len(cells) < 2:
                    # Header or malformed row: skip so IDs and names stay paired
                    continue
                related_ids.append(anchor.get_text(strip=True))
                related_names.append('"' + cells[1].get_text(strip=True) + '"')

        strains_with_mod = _labelled_cell_text(soup_n, 'Strains with modifications:')

        # Cultivation metadata: every row of the table inside the BacDive box
        cultivation_metadata = []
        bacdive_box = soup_n.find('div', class_='box', id="bacdive-box")
        table_responsive = bacdive_box.find('div', class_="table-responsive") if bacdive_box is not None else None
        if table_responsive is not None:
            for row in table_responsive.find_all('tr'):
                cultivation_metadata.append([re.sub(r'\s+', ' ', td.get_text(strip=True)) for td in row.find_all('td')])

        pbar.update(1)
        return [namelink, last_modified, related_ids, related_names, strains_with_mod, cultivation_metadata]

    # All attempts failed: still advance the bar and keep the 6-column shape
    pbar.update(1)
    return [namelink, None, [], [], None, []]

# Fetch the detail page of every distinct medium link concurrently
name_links = df['Name Link'].dropna().unique()

with tqdm(total=len(name_links), desc="Namelink Progress") as pbar:
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_name_link = {
            executor.submit(extract_data_from_namelink, name_link, pbar): name_link
            for name_link in name_links
        }
        # Results arrive in completion order; the merge below re-keys them by link
        namelink_data = [future.result() for future in as_completed(future_to_name_link)]

namelink_data_columns = [
    "Name Link",
    "Last modified",
    "Related media ID for medium",
    "Related media name for medium",
    "Strains with modifications",
    "Cultivation metadata from strains",
]
namelink_df = pd.DataFrame(namelink_data, columns=namelink_data_columns)

# Left join keeps every listing row; missing values become empty strings
merged_df = pd.merge(df, namelink_df, on='Name Link', how='left').fillna("")

merged_df

Namelink Progress: 100%|██████████| 20/20 [00:03<00:00,  6.57it/s]


Unnamed: 0,ID,Source,Name,Name Link,Type,Complex Medium,Final PH,min pH,max pH,Tax.range type,Strains number,Strains links,PDF,Last modified,Related media ID for medium,Related media name for medium,Strains with modifications,Cultivation metadata from strains
0,1,DSMZ,NUTRIENT AGAR,https://mediadive.dsmz.de/medium/1,complex,yes,7.0,7.0,7.0,"[Bacteria, Phages]",2305,https://mediadive.dsmz.de/strains/medium/1,https://mediadive.dsmz.de/pdf/1,20.09.22,"[1, a, 2, 114, 115, a, 118, 238, 241, 258, 302...","[""NUTRIENT AGAR"", ""REACTIVATION WITH LIQUID ME...",183.0,"[[halophily, non-halophilic, 7 strains], [halo..."
1,1a,DSMZ,REACTIVATION WITH LIQUID MEDIUM 1,https://mediadive.dsmz.de/medium/1a,complex,yes,7.0,7.0,7.0,[Bacteria],209,https://mediadive.dsmz.de/strains/medium/1a,https://mediadive.dsmz.de/pdf/1a,22.02.22,"[1, a, 2, 114, 115, a, 118, 238, 241, 258, 302...","[""NUTRIENT AGAR"", ""REACTIVATION WITH LIQUID ME...",9.0,"[[pH, 6 - 8, 20 strains], [pH, 4 - 6, 14 strai..."
2,2,DSMZ,BACILLUS PASTEURII MEDIUM,https://mediadive.dsmz.de/medium/2,complex,yes,,,,[Bacteria],9,https://mediadive.dsmz.de/strains/medium/2,https://mediadive.dsmz.de/pdf/2,04.02.22,"[1, a, 2, 114, 115, a, 118, 238, 241, 258, 302...","[""NUTRIENT AGAR"", ""REACTIVATION WITH LIQUID ME...",,"[[phyla, 1, 9 strains], [temperature, mesophil..."
3,3,DSMZ,AZOTOBACTER MEDIUM,https://mediadive.dsmz.de/medium/3,defined,no,7.3,7.3,7.3,[Bacteria],40,https://mediadive.dsmz.de/strains/medium/3,https://mediadive.dsmz.de/pdf/3,22.02.22,[],[],,"[[phyla, 1, 40 strains], [temperature, mesophi..."
4,6,DSMZ,ALLANTOIN MINERAL MEDIUM,https://mediadive.dsmz.de/medium/6,complex,yes,,,,[Bacteria],22,https://mediadive.dsmz.de/strains/medium/6,https://mediadive.dsmz.de/pdf/6,30.11.23,[],[],1.0,"[[phyla, 2, 22 strains], [temperature, mesophi..."
5,7,DSMZ,ANCYLOBACTER - SPIROSOMA MEDIUM,https://mediadive.dsmz.de/medium/7,complex,yes,7.0,7.0,7.0,[Bacteria],41,https://mediadive.dsmz.de/strains/medium/7,https://mediadive.dsmz.de/pdf/7,22.02.22,[],[],1.0,"[[pH, 6 - 8, 3 strains], [pH, 10 - 12, 2 strai..."
6,8,DSMZ,"BACILLUS ""RACEMILACTICUS"" MEDIUM",https://mediadive.dsmz.de/medium/8,complex,yes,6.8,6.8,6.8,[Bacteria],7,https://mediadive.dsmz.de/strains/medium/8,https://mediadive.dsmz.de/pdf/8,22.02.22,[],[],,"[[phyla, 1, 7 strains], [oxygen, facultative a..."
7,9,DSMZ,VY/2 AGAR,https://mediadive.dsmz.de/medium/9,complex,yes,7.2,7.2,7.2,[Bacteria],2849,https://mediadive.dsmz.de/strains/medium/9,https://mediadive.dsmz.de/pdf/9,22.02.22,"[9, a]","[""VY/2 AGAR"", ""VY/2, REDUCED MEDIUM""]",8.0,"[[pH, 6 - 8, 9 strains], [pH, 8 - 10, 4 strain..."
8,9a,DSMZ,"VY/2, REDUCED MEDIUM",https://mediadive.dsmz.de/medium/9a,complex,yes,7.2,7.2,7.2,[Bacteria],4,https://mediadive.dsmz.de/strains/medium/9a,https://mediadive.dsmz.de/pdf/9a,22.02.22,"[9, a]","[""VY/2 AGAR"", ""VY/2, REDUCED MEDIUM""]",,"[[phyla, 1, 4 strains], [temperature, mesophil..."
9,10,DSMZ,ZYMOMONAS MEDIUM,https://mediadive.dsmz.de/medium/10,complex,yes,,,,[Bacteria],9,https://mediadive.dsmz.de/strains/medium/10,https://mediadive.dsmz.de/pdf/10,10.11.21,[],[],4.0,"[[phyla, 1, 9 strains], [oxygen, anaerobe, 4 s..."


In [5]:
# Function to collect every strain name listed for one medium
def extract_data_from_strainslink(strainslink, pbar, retries=5, timeout=30):
    """Collect the strain names from all pages of a medium's strains listing.

    The first request is used only to read the page count from the "last"
    pagination button; each page (including the first) is then fetched via
    ``extract_strainslink_page``.

    Args:
        strainslink: URL of the strains listing for one medium.
        pbar: Progress bar handed through to ``extract_strainslink_page``.
        retries: Maximum number of attempts for the initial request.
        timeout: Seconds to wait for the server on the initial request.

    Returns:
        ``[strainslink, strain_names]`` on success, or ``[strainslink, None]``
        if the initial request failed on every attempt.
    """
    for attempt in range(retries):
        # Only the download is retried: a retry after the page loop had
        # started would re-fetch pages and double-count the progress bar.
        try:
            response = requests.get(strainslink, timeout=timeout)
            response.raise_for_status()
        except (requests.exceptions.RequestException, ConnectionResetError) as e:
            print(f"Error fetching strains link {strainslink}: {e}. Retrying {attempt + 1}/{retries}...")
            if attempt + 1 < retries:
                time.sleep(1)  # Back off only when another attempt will follow
            continue

        soup_s = BeautifulSoup(response.content, 'html.parser').find('div', class_="content-wrapper")
        # Without a wrapper or a "last" button, treat the listing as one page
        last_button = soup_s.find('button', title="last") if soup_s is not None else None
        page_num = int(last_button['value']) if last_button is not None else 1

        strain_names = []
        for p in range(1, page_num + 1):
            strain_names.extend(extract_strainslink_page(strainslink + "?p=" + str(p), pbar))
        return [strainslink, strain_names]
    return [strainslink, None]

def extract_strainslink_page(strainslink, pbar, retries=5, timeout=30):
    """Return the strain names listed on one page of a strains listing.

    Args:
        strainslink: Full URL of the page (including any ``?p=`` parameter).
        pbar: Progress bar; ``update(1)`` is called once per collected name.
        retries: Maximum number of download attempts.
        timeout: Seconds to wait for the server on each attempt.

    Returns:
        list of str: text of the first anchor in each data row; ``[]`` if the
        page has no table or every download attempt failed.
    """
    for attempt in range(retries):
        try:
            response = requests.get(strainslink, timeout=timeout)
            response.raise_for_status()
        except (requests.exceptions.RequestException, ConnectionResetError) as e:
            print(f"Error fetching {strainslink}: {e}. Retrying {attempt + 1}/{retries}...")
            if attempt + 1 < retries:
                time.sleep(1)  # Back off only when another attempt will follow
            continue

        soup = BeautifulSoup(response.content, 'html.parser').find('div', class_="content-wrapper")
        table = soup.find('div', class_="table-responsive my-15") if soup is not None else None

        page_names = []
        if table is not None:
            for row in table.find_all('tr')[1:]:  # Skip the header row
                anchor = row.find('a')
                if anchor is None:
                    # Row without a link carries no strain name
                    continue
                page_names.append(anchor.get_text(strip=True))
                pbar.update(1)
        return page_names
    return []

# Needs `df`, `merged_df` and `max_workers` from the cells above
strains_links = df['Strains links'].dropna().unique()

# The counts are read from `df` and written into `merged_df`.
# NOTE(review): this presumably relies on both frames sharing the same row index -- confirm.
merged_df['Strains number'] = pd.to_numeric(df['Strains number'])

# The progress bar is sized by the total number of strains across all media
with tqdm(total=int(merged_df['Strains number'].sum()), desc="Strains link Progress") as pbar:
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_strains_link = {
            executor.submit(extract_data_from_strainslink, strains_link, pbar): strains_link
            for strains_link in strains_links
        }
        # Results arrive in completion order; the merge below re-keys them by link
        strainslink_data = [future.result() for future in as_completed(future_to_strains_link)]

strainslink_data_columns = ["Strains links", "Strains Name"]
strainslink_df = pd.DataFrame(strainslink_data, columns=strainslink_data_columns)

# Left join keeps every medium; missing values become empty strings
merged_merged_df = pd.merge(merged_df, strainslink_df, on='Strains links', how='left').fillna("")

merged_merged_df

Strains link Progress: 100%|██████████| 6126/6126 [11:06<00:00,  9.19it/s] 


Unnamed: 0,ID,Source,Name,Name Link,Type,Complex Medium,Final PH,min pH,max pH,Tax.range type,Strains number,Strains links,PDF,Last modified,Related media ID for medium,Related media name for medium,Strains with modifications,Cultivation metadata from strains,test
0,1,DSMZ,NUTRIENT AGAR,https://mediadive.dsmz.de/medium/1,complex,yes,7.0,7.0,7.0,"[Bacteria, Phages]",2305,https://mediadive.dsmz.de/strains/medium/1,https://mediadive.dsmz.de/pdf/1,20.09.22,"[1, a, 2, 114, 115, a, 118, 238, 241, 258, 302...","[""NUTRIENT AGAR"", ""REACTIVATION WITH LIQUID ME...",183.0,"[[halophily, non-halophilic, 7 strains], [halo...","['Gottschalkia' methylica DSM 21470, Acerihabi..."
1,1a,DSMZ,REACTIVATION WITH LIQUID MEDIUM 1,https://mediadive.dsmz.de/medium/1a,complex,yes,7.0,7.0,7.0,[Bacteria],209,https://mediadive.dsmz.de/strains/medium/1a,https://mediadive.dsmz.de/pdf/1a,22.02.22,"[1, a, 2, 114, 115, a, 118, 238, 241, 258, 302...","[""NUTRIENT AGAR"", ""REACTIVATION WITH LIQUID ME...",9.0,"[[pH, 6 - 8, 20 strains], [pH, 4 - 6, 14 strai...","[Achromobacter sp. DSM 26587, Acidovorax avena..."
2,2,DSMZ,BACILLUS PASTEURII MEDIUM,https://mediadive.dsmz.de/medium/2,complex,yes,,,,[Bacteria],9,https://mediadive.dsmz.de/strains/medium/2,https://mediadive.dsmz.de/pdf/2,04.02.22,"[1, a, 2, 114, 115, a, 118, 238, 241, 258, 302...","[""NUTRIENT AGAR"", ""REACTIVATION WITH LIQUID ME...",,"[[phyla, 1, 9 strains], [temperature, mesophil...","[Sporosarcina pasteurii DSM 276, Sporosarcina ..."
3,3,DSMZ,AZOTOBACTER MEDIUM,https://mediadive.dsmz.de/medium/3,defined,no,7.3,7.3,7.3,[Bacteria],40,https://mediadive.dsmz.de/strains/medium/3,https://mediadive.dsmz.de/pdf/3,22.02.22,[],[],,"[[phyla, 1, 40 strains], [temperature, mesophi...","[Azomonas agilis DSM 89, Azomonas agilis DSM 3..."
4,6,DSMZ,ALLANTOIN MINERAL MEDIUM,https://mediadive.dsmz.de/medium/6,complex,yes,,,,[Bacteria],22,https://mediadive.dsmz.de/strains/medium/6,https://mediadive.dsmz.de/pdf/6,30.11.23,[],[],1.0,"[[phyla, 2, 22 strains], [temperature, mesophi...","[Bacillus sp. DSM 1302, Bacillus sp. DSM 1303,..."
5,7,DSMZ,ANCYLOBACTER - SPIROSOMA MEDIUM,https://mediadive.dsmz.de/medium/7,complex,yes,7.0,7.0,7.0,[Bacteria],41,https://mediadive.dsmz.de/strains/medium/7,https://mediadive.dsmz.de/pdf/7,22.02.22,[],[],1.0,"[[pH, 6 - 8, 3 strains], [pH, 10 - 12, 2 strai...","[Ancylobacter aquaticus DSM 101, Ancylobacter ..."
6,8,DSMZ,"BACILLUS ""RACEMILACTICUS"" MEDIUM",https://mediadive.dsmz.de/medium/8,complex,yes,6.8,6.8,6.8,[Bacteria],7,https://mediadive.dsmz.de/strains/medium/8,https://mediadive.dsmz.de/pdf/8,22.02.22,[],[],,"[[phyla, 1, 7 strains], [oxygen, facultative a...","[Bacillus sp. DSM 445, Bacillus sp. DSM 2309, ..."
7,9,DSMZ,VY/2 AGAR,https://mediadive.dsmz.de/medium/9,complex,yes,7.2,7.2,7.2,[Bacteria],2849,https://mediadive.dsmz.de/strains/medium/9,https://mediadive.dsmz.de/pdf/9,22.02.22,"[9, a]","[""VY/2 AGAR"", ""VY/2, REDUCED MEDIUM""]",8.0,"[[pH, 6 - 8, 9 strains], [pH, 8 - 10, 4 strain...","[Aggregicoccus edonensis DSM 27872, Angiococcu..."
8,9a,DSMZ,"VY/2, REDUCED MEDIUM",https://mediadive.dsmz.de/medium/9a,complex,yes,7.2,7.2,7.2,[Bacteria],4,https://mediadive.dsmz.de/strains/medium/9a,https://mediadive.dsmz.de/pdf/9a,22.02.22,"[9, a]","[""VY/2 AGAR"", ""VY/2, REDUCED MEDIUM""]",,"[[phyla, 1, 4 strains], [temperature, mesophil...","[Chondromyces robustus DSM 14739, Polyangium s..."
9,10,DSMZ,ZYMOMONAS MEDIUM,https://mediadive.dsmz.de/medium/10,complex,yes,,,,[Bacteria],9,https://mediadive.dsmz.de/strains/medium/10,https://mediadive.dsmz.de/pdf/10,10.11.21,[],[],4.0,"[[phyla, 1, 9 strains], [oxygen, anaerobe, 4 s...","[Zymomonas mobilis DSM 424, Zymomonas mobilis ..."
