In [1]:
# env: AppCurateGEO 
import requests
from bs4 import BeautifulSoup
import os
import re
import pandas as pd
from collections import defaultdict

from selenium import webdriver 
from selenium.webdriver.common.by import By 
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

def process_gse(gse_id, super_series=None, timeout=30):
    """Scrape one GEO series into a list of flat row dictionaries.

    The series page is fetched as HTML for descriptive metadata and as SOFT
    text for platform / sample accessions. The result is then expanded to one
    row per supplementary file, so the output can be fed to ``pd.DataFrame``.

    Parameters
    ----------
    gse_id : str
        GEO series accession (e.g. ``"GSE271936"``); inserted verbatim into
        the ``acc=`` query parameter of the GEO URLs.
    super_series : str, optional
        Accession stored under the ``"SuperSeries"`` key. Falls back to
        ``gse_id`` when empty or ``None``.
    timeout : float or None, optional
        Timeout in seconds handed to each ``requests.get`` call. ``None``
        disables the timeout (the behaviour before this parameter existed).

    Returns
    -------
    list of dict
        Always at least one dictionary. Every dictionary holds whichever of
        the ``desired_fields`` labels were found on the page plus
        ``"Platforms"``, ``"Samples"``, ``"Series"`` and ``"SuperSeries"``.
        Rows produced from the "(custom)" file list add ``"Custom File"``;
        rows produced from the supplementary-file table add
        ``"Supplementary file"``, ``"Size"`` and ``"File type/resource"``.
        When no file information could be extracted, the single metadata
        dictionary is returned on its own.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (including timeouts).
    selenium.common.exceptions.WebDriverException
        Propagated from the Selenium session used for the "(custom)" list.
    """
    # If SuperSeries is not set, use the current GSE
    if not super_series:
        super_series = gse_id

    url = f"https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={gse_id}"
    # A timeout keeps one unresponsive request from stalling the whole run.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    desired_fields = {
        "Title",
        "Summary",
        "Overall design",
        "Contact name",
        "E-mail(s)",
        "Phone",
        "Organization name",
        "Department",
        "Lab",
        "City",
        "State/province",
        "Country",
    }

    # Only rows with exactly two cells are treated as label/value pairs.
    data = {}
    for row in soup.find_all('tr'):
        cols = row.find_all('td')
        if len(cols) == 2:
            label = cols[0].get_text(strip=True)
            if label in desired_fields:
                data[label] = cols[1].get_text(strip=True)

    # Get platforms & samples from SOFT
    soft_url = f"https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={gse_id}&format=soft"
    soft_text = requests.get(soft_url, timeout=timeout).text

    # Distinct accessions mentioned anywhere in the SOFT text.
    platforms = set(re.findall(r"(GPL\d+)", soft_text))
    samples = set(re.findall(r"(GSM\d+)", soft_text))

    data["Platforms"] = ", ".join(sorted(platforms))
    data["Samples"] = len(samples)
    data["Series"] = gse_id
    data["SuperSeries"] = super_series

    supp_data = []

    # -------------------------
    # Case 1: Selenium "(custom)" scraping if link exists
    # -------------------------
    if soup.find("a", string="(custom)"):
        checkbox_xpath = "//input[@type='checkbox']"
        driver = webdriver.Chrome()
        try:
            driver.get(url)
            wait = WebDriverWait(driver, 10)

            custom_link = wait.until(
                EC.element_to_be_clickable((By.LINK_TEXT, "(custom)"))
            )
            custom_link.click()

            wait.until(EC.presence_of_all_elements_located((By.XPATH, checkbox_xpath)))
            checkboxes = driver.find_elements(By.XPATH, checkbox_xpath)

            # The visible text of each checkbox's parent element is its label.
            labels = [
                checkbox.find_element(By.XPATH, "./..").text.strip()
                for checkbox in checkboxes
            ]
        finally:
            # Always release the browser, even if a wait times out.
            driver.quit()

        # Drop the "(all files)" toggle by content rather than by position, so
        # a real file is never discarded when the toggle is not the last
        # entry; blank labels carry no file information and are skipped too.
        files = [
            label for label in labels
            if label and "(all files)" not in label.lower()
        ]

        for file_label in files:
            row_dict = data.copy()
            row_dict["Custom File"] = file_label
            supp_data.append(row_dict)

    # -------------------------
    # Case 2: Default parsing (original code)
    # -------------------------
    else:
        supp_table = None
        # Walk the tables from last to first and take the first one whose
        # header row carries the three expected column names.
        for table in reversed(soup.find_all('table')):
            header_row = table.find('tr')
            if not header_row:
                continue
            headers = [cell.get_text(strip=True) for cell in header_row.find_all(['td', 'th'])]
            if (
                "Supplementary file" in headers
                and "Size" in headers
                and "File type/resource" in headers
            ):
                supp_table = table
                break

        if supp_table:
            for row in supp_table.find_all('tr')[1:]:  # skip header
                cells = row.find_all('td')
                if len(cells) >= 4:
                    row_dict = data.copy()
                    row_dict.update({
                        "Supplementary file": cells[0].get_text(strip=True),
                        "Size": cells[1].get_text(strip=True),
                        # Column index 2 is skipped; the file type is read from index 3.
                        "File type/resource": cells[3].get_text(strip=True),
                    })
                    supp_data.append(row_dict)

    # If no file rows could be extracted (no table, no parseable table rows,
    # or an empty custom list), at least record the metadata once.
    if not supp_data:
        supp_data.append(data)

    return supp_data

# NOTE: This notebook must be run from a local directory, NOT from a OneDrive-synced folder.
# Local copy: /Users/mmarcao/Documents/GEO_app_test

In [2]:
# Search configuration
# Free-text term that is typed into the GEO search box.
query = "pediatric glioma spatial transcriptomics"
# Directory the output CSV is written to.
dir_base = "./"

In [3]:
# Run the GEO search for `query` in a browser and collect every GSE accession
# shown across all result pages into `gse_list`.
next_page_xpath = "//a[@title='Next page of results']"
gse_text_xpath = "//*[contains(text(), 'GSE')]"

gse_list = []
driver = webdriver.Chrome()
try:
    driver.get("https://www.ncbi.nlm.nih.gov/geo/")  # open webpage

    search_bar = driver.find_element(by=By.CLASS_NAME, value="jig-ncbiclearbutton")  # identify search bar
    search_bar.send_keys(query)  # enter your query

    # see search results
    search_button = driver.find_element(by=By.ID, value="setacc")
    search_button.click()
    wait = WebDriverWait(driver, 10)
    search_link = wait.until(EC.element_to_be_clickable(
        (By.XPATH, "//p[contains(., 'results for') and contains(., 'DataSets')]//a")
    ))
    search_link.click()

    while True:
        # Extract the accessions on the current page. Only the GSE<digits>
        # tokens are kept, so surrounding text or blank elements never end up
        # in the list, and each accession is recorded once (in first-seen order).
        for element in driver.find_elements(By.XPATH, gse_text_xpath):
            for gse_id in re.findall(r"GSE\d+", element.text):
                if gse_id not in gse_list:
                    gse_list.append(gse_id)

        # find_elements returns an empty list when nothing matches, so reaching
        # the last page needs no exception handling.
        next_buttons = driver.find_elements(By.XPATH, next_page_xpath)
        if not next_buttons:
            break
        # navigate to the next page
        next_buttons[0].click()
finally:
    # Close the browser even if a lookup or wait above fails.
    driver.quit()

print(gse_list)

['GSE271936', 'GSE268577', 'GSE194329']


### Scrape each GSE and collect the results into a DataFrame

In [4]:
# Scrape every series in gse_list and pool the returned rows into one list.
all_data = []
for gse in gse_list:
    print(f"Processing {gse} ...")
    all_data += process_gse(gse)

# Build a single table from the pooled row dictionaries.
df_combined = pd.DataFrame(data=all_data)

# Write the combined table to CSV inside dir_base (created if missing).
os.makedirs(dir_base, exist_ok=True)
combined_path = os.path.join(dir_base, "geo_webscrap.csv")
df_combined.to_csv(path_or_buf=combined_path, index=False)

print(df_combined)

Processing GSE271936 ...
Processing GSE268577 ...
Processing GSE194329 ...
                                                Title  \
0   Immune and tumor cell landscape in pediatric h...   
1   Immune and tumor cell landscape in pediatric h...   
2   Immune and tumor cell landscape in pediatric h...   
3   Immune and tumor cell landscape in pediatric h...   
4   Immune and tumor cell landscape in pediatric h...   
..                                                ...   
57  Spatial multi-omics reveals vulnerabilities of...   
58  Spatial multi-omics reveals vulnerabilities of...   
59  Spatial multi-omics reveals vulnerabilities of...   
60  Spatial multi-omics reveals vulnerabilities of...   
61  Spatial multi-omics reveals vulnerabilities of...   

                                              Summary  \
0   Single cell RNA-seq profiling of tumor, myeloi...   
1   Single cell RNA-seq profiling of tumor, myeloi...   
2   Single cell RNA-seq profiling of tumor, myeloi...   
3   Single c