In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Listing URL that the scraper below walks through page by page
base_url = 'https://mediadive.dsmz.de/gas'

# Module-level accumulators, one per output column. scrape_page() appends one
# entry to each of them for every table row it reads, so they stay the same
# length and can be zipped into a DataFrame afterwards.
mediums = []
N2, O2, CO2 = [], [], []
H2, CH4, CO, Air = [], [], [], []

# Function to scrape a single page
def scrape_page(page_url, timeout=30):
    """Fetch one listing page and append its table rows to the module-level lists.

    Each data row contributes one value to each of ``mediums``, ``N2``, ``O2``,
    ``CO2``, ``H2``, ``CH4``, ``CO`` and ``Air`` (table columns 0-7, in that order).

    Args:
        page_url: Full URL of the page to download.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the scraping loop forever.

    Returns:
        True when the page contained data rows; False when the table container
        or its data rows are missing, which the caller treats as the end of
        pagination.

    Raises:
        requests.exceptions.RequestException: on connection problems, timeouts
            or an HTTP error status (via ``raise_for_status``).
    """
    response = requests.get(page_url, timeout=timeout)
    response.raise_for_status()  # Check if the request was successful

    # Parse the page content
    soup = BeautifulSoup(response.text, 'html.parser')

    # find() returns None when the wrapper div is absent; treat that like an
    # empty page instead of failing with AttributeError on .find_all
    container = soup.find('div', class_="table-responsive my-15")
    if container is None:
        return False

    table_rows = container.find_all('tr')[1:]  # Skip the header row
    if not table_rows:
        # Header only (or no rows at all): nothing left to paginate through
        return False

    # Destination list for each table column, in column order
    targets = (mediums, N2, O2, CO2, H2, CH4, CO, Air)

    for row in table_rows:
        cells = row.find_all('td')
        if len(cells) < len(targets):
            # Skip malformed rows as a whole, so the lists never end up with
            # different lengths (which would break the DataFrame construction)
            continue
        # Read every value first, append afterwards: a row is stored
        # completely or not at all
        values = [cell.get_text(strip=True) for cell in cells[:len(targets)]]
        for target, value in zip(targets, values):
            target.append(value)
    return True

# Highest page number requested (pages 1..MAX_PAGES)
MAX_PAGES = 48

# Walk the pages in order; stop early once scrape_page reports the end of pagination
for page in range(1, MAX_PAGES + 1):
    page_url = f'{base_url}?p={page}'
    print(f'Scraping page: {page}')
    if not scrape_page(page_url):
        break


# Create a DataFrame
df = pd.DataFrame({
    'Medium': mediums,
    'N2':N2,
    'O2':O2,
    'CO2':CO2,
    'H2':H2,
    'CH4':CH4,
    'CO':CO,
    'Air':Air
})

# Save first; the DataFrame index is written as the first CSV column
df.to_csv('data.csv')

# Keep the frame as the cell's last expression so the notebook renders it
df

Scraping page: 1
Scraping page: 2
Scraping page: 3
Scraping page: 4
Scraping page: 5
Scraping page: 6
Scraping page: 7
Scraping page: 8
Scraping page: 9
Scraping page: 10
Scraping page: 11
Scraping page: 12
Scraping page: 13
Scraping page: 14
Scraping page: 15
Scraping page: 16
Scraping page: 17
Scraping page: 18
Scraping page: 19
Scraping page: 20
Scraping page: 21
Scraping page: 22
Scraping page: 23
Scraping page: 24
Scraping page: 25
Scraping page: 26
Scraping page: 27
Scraping page: 28
Scraping page: 29
Scraping page: 30
Scraping page: 31
Scraping page: 32
Scraping page: 33
Scraping page: 34
Scraping page: 35
Scraping page: 36
Scraping page: 37
Scraping page: 38
Scraping page: 39
Scraping page: 40
Scraping page: 41
Scraping page: 42
Scraping page: 43
Scraping page: 44
Scraping page: 45
Scraping page: 46
Scraping page: 47
Scraping page: 48


In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# Base URL of the page to scrape
base_url = 'https://mediadive.dsmz.de/steps'

# Function to scrape a single page
def scrape_page(page, timeout=30):
    """Download page ``page`` of the steps listing and extract its table rows.

    Args:
        page: Page number, sent as the ``p`` query parameter.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block a worker thread forever.

    Returns:
        A pair ``(data, page)``. ``data`` is None when the page has no table
        rows at all; otherwise it is a 6-tuple of equally long lists
        ``(ids, steps, solutions, solution_links, mediums, medium_links)``
        with one entry per data row. Link entries are None when the cell
        holds no anchor (or the anchor has no ``href``).

    Raises:
        requests.exceptions.RequestException: on connection problems, timeouts
            or an HTTP error status (via ``raise_for_status``).
    """
    page_url = f'{base_url}?p={page}'
    response = requests.get(page_url, timeout=timeout)
    response.raise_for_status()  # Check if the request was successful

    # Parse the page content
    soup = BeautifulSoup(response.text, 'html.parser')

    # find() returns None when the wrapper div is absent; handle that as
    # "no rows" instead of failing with AttributeError on .find_all
    container = soup.find('div', class_="table-responsive my-15")
    all_rows = container.find_all('tr') if container is not None else []

    # Check if rows are empty (end of pagination)
    if not all_rows:
        return None, page

    table_rows = all_rows[1:]  # Skip the header row

    # Temporary lists to hold data for this page
    temp_ids = []
    temp_steps = []
    temp_solutions = []
    temp_solutions_link = []
    temp_mediums = []
    temp_mediums_link = []

    def link_of(cell):
        """Return the href of the first anchor in ``cell``, or None."""
        anchor = cell.find('a')
        return anchor.get('href') if anchor is not None else None

    # Loop through the rows and extract data
    for row in table_rows:
        columns = row.find_all('td')
        if len(columns) < 3:
            # Columns 0-2 are read below; skip the whole row rather than
            # raising IndexError after some lists were already extended
            continue

        temp_ids.append(row.get('id'))
        temp_steps.append(columns[0].get_text(strip=True))

        temp_solutions.append(columns[1].get_text(strip=True))
        temp_solutions_link.append(link_of(columns[1]))

        temp_mediums.append(columns[2].get_text(strip=True))
        temp_mediums_link.append(link_of(columns[2]))

    return (temp_ids, temp_steps, temp_solutions, temp_solutions_link, temp_mediums, temp_mediums_link), page

# Number of threads
num_threads = 10

# Pages 1..NUM_PAGES are requested. Defined once so the ranges and the
# progress-bar totals below cannot drift apart.
NUM_PAGES = 307

# Dictionary to hold the results from each page
results_dict = {}

# Page numbers whose request failed; reported once the run has finished
failed_pages = []

# Using ThreadPoolExecutor to parallelize the scraping process
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    futures = {executor.submit(scrape_page, page): page for page in range(1, NUM_PAGES + 1)}
    with tqdm(total=NUM_PAGES, desc="Scraping pages", position=0, leave=True) as pbar:
        for future in as_completed(futures):
            try:
                result, page = future.result()
            except requests.exceptions.RequestException as exc:
                # One bad page must not discard the pages already scraped:
                # record it, report it, and keep going.
                failed_pages.append(futures[future])
                tqdm.write(f'Page {futures[future]} failed: {exc}')
            else:
                if result:
                    results_dict[page] = result
            pbar.update(1)

# Lists to hold the final results in order
ids = []
Steps = []
Solutions = []
Solutions_link = []
Mediums = []
Mediums_link = []

# Same order as the tuple returned by scrape_page
final_lists = (ids, Steps, Solutions, Solutions_link, Mediums, Mediums_link)

# Concatenate results in the order of pages (futures complete out of order)
with tqdm(total=NUM_PAGES, desc="Concatenating results", position=1, leave=True) as pbar:
    for page in range(1, NUM_PAGES + 1):
        if page in results_dict:
            for target, page_values in zip(final_lists, results_dict[page]):
                target.extend(page_values)
        pbar.update(1)

# Create a DataFrame
df = pd.DataFrame({
    'ID': ids,
    'Step': Steps,
    'Solution': Solutions,
    'Solution_link': Solutions_link,
    'Medium': Mediums,
    'Mediums_link': Mediums_link
})

# Save the DataFrame to a CSV file
df.to_csv('Steps_new.csv', index=False)

if failed_pages:
    print(f'Warning: {len(failed_pages)} page(s) could not be fetched and are missing from the CSV: {sorted(failed_pages)}')

print('Scraping complete. Data saved to Steps_new.csv')

Scraping pages: 100%|██████████| 307/307 [00:38<00:00,  7.95it/s]

Concatenating results: 100%|██████████| 307/307 [00:00<00:00, 224095.25it/s]

Scraping complete. Data saved to Steps_new.csv





In [14]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Base URL of the webpage
base_url = "https://mediadive.dsmz.de"


def _parse_media_row(row):
    """Convert one table row into a flat record, or None if the row is too short.

    Record order: ID, source, name, name link, types, final pH, tax-range
    titles, strain link texts, strain links, PDF links.
    """
    columns = row.find_all('td')
    # Column 8 is the last one read below; shorter rows are skipped instead of
    # raising an IndexError that the retry loop would not catch
    if len(columns) < 9:
        return None

    # Name and link: prefer the anchor, fall back to the plain cell text when
    # the cell holds no link (the original code raised AttributeError here)
    name_tag = columns[3].find('a')
    if name_tag is not None:
        name = name_tag.get_text(strip=True)
        name_link = base_url + name_tag['href']
    else:
        name = columns[3].get_text(strip=True)
        name_link = None

    # Look the strain anchors up once; both their texts and links are needed
    strain_anchors = columns[7].find_all('a')

    return [
        columns[1].get_text(strip=True),  # ID
        columns[2].get_text(strip=True),  # Source
        name,
        name_link,
        [span.get_text(strip=True) for span in columns[4].find_all('span')],  # Type
        columns[5].get_text(strip=True),  # Final pH
        [span['title'] for span in columns[6].find_all('span', class_=None)],  # Tax range
        [a.get_text(strip=True) for a in strain_anchors],  # Strains (link text)
        [base_url + a['href'] for a in strain_anchors],  # Strains (links)
        [base_url + a['href'] for a in columns[8].find_all('a')],  # PDF
    ]


# Function to extract data from a single page
def extract_data_from_page(url, page, retries=5, timeout=30, retry_delay=5):
    """Download one listing page and return its table rows as flat records.

    Args:
        url: Listing URL to request.
        page: Page number, sent as the ``p`` query parameter.
        retries: Maximum number of download attempts.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block a worker thread forever.
        retry_delay: Seconds to wait between two attempts.

    Returns:
        A list with one 10-item record per parsed row (see
        ``_parse_media_row`` for the order), or an empty list when every
        attempt failed.
    """
    params = {"p": page}
    for attempt in range(retries):
        try:
            response = requests.get(url, params=params, timeout=timeout)
            response.raise_for_status()
        except (requests.exceptions.RequestException, ConnectionResetError) as e:
            if attempt + 1 < retries:
                print(f"Error fetching page {page}: {e}. Retrying {attempt + 1}/{retries}...")
                time.sleep(retry_delay)  # Wait before retrying
            else:
                # No further attempt follows, so do not sleep or announce a retry
                print(f"Error fetching page {page}: {e}. Giving up after {retries} attempts.")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        table_rows = soup.find_all('tr')[1:]  # Skip the header row
        records = (_parse_media_row(row) for row in table_rows)
        return [record for record in records if record is not None]
    return []

# Main scraping process
num_pages = 166  # Adjust the number of pages you want to scrape

# Upper bound on simultaneous requests.
# NOTE(review): the work is network-bound, so the practical limit is likely
# what the server tolerates rather than local CPU capacity - confirm and lower
# this value if connections get reset.
max_workers = 32

page_data_segments = [None] * num_pages  # Initialize a list to hold data for each page

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    future_to_page = {executor.submit(extract_data_from_page, base_url + "/media", page): page for page in range(1, num_pages + 1)}

    for future in tqdm(as_completed(future_to_page), total=num_pages, desc="Extracting data from pages"):
        page = future_to_page[future]
        page_data = future.result()
        if page_data:
            page_data_segments[page-1] = page_data  # Store the data segment for the corresponding page

# extract_data_from_page returns an empty list once its retries are exhausted;
# make such gaps visible instead of silently writing an incomplete file
missing_pages = [index + 1 for index, segment in enumerate(page_data_segments) if not segment]
if missing_pages:
    print(f"Warning: no data for page(s) {missing_pages}; they are missing from the output.")

# Concatenate all page data segments in order
all_data = [row for segment in page_data_segments if segment for row in segment]

# Create a DataFrame from the extracted data
columns = ["ID", "Source", "Name", "Name Link", "Type", "Final PH", "Tax.range type", "Strains number", "Strains links", "PDF"]
df = pd.DataFrame(all_data, columns=columns)

df.to_csv('runkun_media', index=False)

# Keep the frame as the cell's last expression so the notebook renders it
df

Extracting data from pages:  99%|█████████▉| 165/166 [00:19<00:00,  6.66it/s]

Error fetching page 166: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer')). Retrying 1/5...


Extracting data from pages: 100%|██████████| 166/166 [00:28<00:00,  5.80it/s]


Unnamed: 0,ID,Source,Name,Name Link,Type,Final PH,Tax.range type,Strains number,Strains links,PDF
0,1,DSMZ,NUTRIENT AGAR,https://mediadive.dsmz.de/medium/1,[complex],7.0,"[Bacteria, Phages]",[2299],[https://mediadive.dsmz.de/strains/medium/1],[https://mediadive.dsmz.de/pdf/1]
1,1a,DSMZ,REACTIVATION WITH LIQUID MEDIUM 1,https://mediadive.dsmz.de/medium/1a,[complex],7.0,[Bacteria],[209],[https://mediadive.dsmz.de/strains/medium/1a],[https://mediadive.dsmz.de/pdf/1a]
2,2,DSMZ,BACILLUS PASTEURII MEDIUM,https://mediadive.dsmz.de/medium/2,[complex],,[Bacteria],[9],[https://mediadive.dsmz.de/strains/medium/2],[https://mediadive.dsmz.de/pdf/2]
3,3,DSMZ,AZOTOBACTER MEDIUM,https://mediadive.dsmz.de/medium/3,[defined],7.3,[Bacteria],[40],[https://mediadive.dsmz.de/strains/medium/3],[https://mediadive.dsmz.de/pdf/3]
4,6,DSMZ,ALLANTOIN MINERAL MEDIUM,https://mediadive.dsmz.de/medium/6,[complex],,[Bacteria],[22],[https://mediadive.dsmz.de/strains/medium/6],[https://mediadive.dsmz.de/pdf/6]
...,...,...,...,...,...,...,...,...,...,...
3308,P5,public,RS Medium - Nutrient Medium (NM) Component,https://mediadive.dsmz.de/medium/P5,[complex],7.0,[],[],[],[https://mediadive.dsmz.de/pdf/P5]
3309,P6,public,mTA10,https://mediadive.dsmz.de/medium/P6,[complex],7.2,[],[],[],[https://mediadive.dsmz.de/pdf/P6]
3310,P7,public,N27 RHODOSPIRILLACEAE MEDIUM (modified),https://mediadive.dsmz.de/medium/P7,[complex],6.8,[],[],[],[https://mediadive.dsmz.de/pdf/P7]
3311,P8,public,M9-SNG medium,https://mediadive.dsmz.de/medium/P8,[complex],7.2,[],[],[],[https://mediadive.dsmz.de/pdf/P8]
