In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Function to extract data from a single page
def extract_data_from_page(url, page, timeout=30):
    """Fetch one page of the strain listing and parse its table rows.

    Args:
        url: Address of the listing; the page number is sent as the ``p``
            query parameter.
        page: Page number to request.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the whole scrape.

    Returns:
        A ``(data, soup)`` tuple. ``data`` holds one list per parsed table
        row, laid out as ``[organism_group, name, name_link, taxonomy_link,
        growth_media_links, external_links]``. The two ``*_links`` entries
        are lists of href strings; ``name_link`` and ``taxonomy_link`` are
        ``None`` when the cell contains no usable link. ``soup`` is the
        parsed document.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.get(url, params={"p": page}, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    data = []
    for row in soup.find_all('tr')[1:]:  # Skip the header row
        columns = row.find_all('td')
        if len(columns) < 5:
            # Not a full data row (e.g. a <th>-only or spacer row); indexing
            # the five expected cells below would raise IndexError.
            continue

        organism_group = columns[0].get_text(strip=True)

        # Name and link: fall back to the plain cell text when the name is
        # not wrapped in an anchor, mirroring the taxonomy handling below.
        name_tag = columns[1].find('a')
        if name_tag is not None:
            name = name_tag.get_text(strip=True)
            name_link = name_tag.get('href')
        else:
            name = columns[1].get_text(strip=True)
            name_link = None

        # Taxonomy link
        taxonomy_tag = columns[2].find('a')
        taxonomy_link = taxonomy_tag.get('href') if taxonomy_tag is not None else None

        # href=True keeps only anchors that actually carry an href, so a bare
        # <a> (e.g. a named anchor) cannot trigger a KeyError.
        growth_media_links = [a['href'] for a in columns[3].find_all('a', href=True)]
        external_links = [a['href'] for a in columns[4].find_all('a', href=True)]

        data.append([
            organism_group,
            name,
            name_link,
            taxonomy_link,
            growth_media_links,
            external_links,
        ])

    return data, soup

# Strain listing endpoint; individual pages are selected by the extractor
base_url = "https://mediadive.dsmz.de/strains"
all_data = []

# Visit pages 1 through 20 and pool every extracted row into one list
for page in range(1, 21):
    page_data, soup = extract_data_from_page(base_url, page)
    all_data += page_data

    # Progress report, emitted once the page has been fetched and parsed
    print("Processing page", page)

# One column per field of an extracted row, in the order the extractor emits them
columns = [
    "Organism Group",
    "Name",
    "Name Link",
    "Taxonomy Link",
    "Growth Media Links",
    "External Links",
]
df = pd.DataFrame(data=all_data, columns=columns)

# Persist the table as CSV, leaving out the DataFrame's integer index
df.to_csv('dsmz_strains.csv', index=False)

print("Data scraped and saved to dsmz_strains.csv")

Processing page 1
Processing page 2
Processing page 3
Processing page 4
Processing page 5
Processing page 6
Processing page 7
Processing page 8
Processing page 9
Processing page 10
Processing page 11
Processing page 12
Processing page 13
Processing page 14
Processing page 15
Processing page 16
Processing page 17
Processing page 18
Processing page 19
Processing page 20
Data scraped and saved to dsmz_strains.csv
