In [1]:
# env: AppCurateGEO 
import requests
from bs4 import BeautifulSoup
import os
import re
import pandas as pd
from collections import defaultdict

from selenium import webdriver 
from selenium.webdriver.common.by import By 
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

def _infer_file_type(file_name):
    """Guess a file type from the extension(s) of ``file_name``.

    A trailing compression suffix (gz, bz2, xz, zip, Z) is skipped when another
    extension precedes it, so ``counts.txt.gz`` yields ``txt`` while
    ``GSE1_RAW.tar`` yields ``tar``. Only the extension closest to the end is
    considered, so extra dots inside the base name (``v1.2_counts.csv.gz``) do
    not leak into the result.

    Returns ``"unknown"`` when the name has no usable extension.
    """
    extensions = file_name.split(".")[1:]
    if len(extensions) > 1 and extensions[-1].lower() in ("gz", "bz2", "xz", "zip", "z"):
        extensions = extensions[:-1]
    return (extensions[-1] if extensions else "") or "unknown"


def _scrape_custom_files(url, data):
    """Open ``url`` in Chrome, click the "(custom)" link and list the files shown.

    Returns one dict per file row: a copy of ``data`` extended with
    "Supplementary file", "Size" and "File type/resource". The browser is
    always closed, even when a wait or lookup fails.
    """
    file_rows = []
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        wait = WebDriverWait(driver, 10)

        wait.until(EC.element_to_be_clickable((By.LINK_TEXT, "(custom)"))).click()

        # The file table is identified by rows that carry a checkbox input.
        checkbox_rows = (By.XPATH, "//table//tr[td/input[@type='checkbox']]")
        wait.until(EC.presence_of_all_elements_located(checkbox_rows))

        for row in driver.find_elements(*checkbox_rows):
            cells = row.find_elements(By.TAG_NAME, "td")
            if len(cells) < 2:
                continue

            file_name = cells[0].text.strip()
            # "(all files)" is a select-all entry, not an actual file.
            if file_name.lower() == "(all files)":
                continue

            file_rows.append({
                **data,
                "Supplementary file": file_name,
                "Size": cells[1].text.strip(),  # page column "File size", stored as "Size"
                "File type/resource": _infer_file_type(file_name),
            })
    finally:
        driver.quit()
    return file_rows


def _parse_supplementary_table(soup, data):
    """Read the static supplementary-file table from the parsed accession page.

    The last table whose first row contains the headers "Supplementary file",
    "Size" and "File type/resource" is used. Returns one dict per data row that
    has at least four cells (a copy of ``data`` plus the three file fields), or
    an empty list when no such table or row exists.
    """
    required_headers = ("Supplementary file", "Size", "File type/resource")

    supp_table = None
    for table in reversed(soup.find_all('table')):
        header_row = table.find('tr')
        if header_row is None:
            continue
        headers = [cell.get_text(strip=True) for cell in header_row.find_all(['td', 'th'])]
        if all(name in headers for name in required_headers):
            supp_table = table
            break

    if supp_table is None:
        return []

    file_rows = []
    for row in supp_table.find_all('tr')[1:]:  # skip header
        cells = row.find_all('td')
        if len(cells) >= 4:
            file_rows.append({
                **data,
                "Supplementary file": cells[0].get_text(strip=True),
                "Size": cells[1].get_text(strip=True),
                "File type/resource": cells[3].get_text(strip=True),
            })
    return file_rows


def process_gse(gse_id, super_series=None, timeout=30):
    """Scrape metadata and supplementary-file listings for one GEO series.

    Two HTTP requests are made: the HTML accession page (series/contact fields
    and the supplementary-file table) and the SOFT export (platform and sample
    accessions). When the page contains a "(custom)" link, the file list is
    read through a Selenium-driven Chrome session instead of the static table.

    Parameters
    ----------
    gse_id : str
        Series accession, e.g. ``"GSE40407"``.
    super_series : str, optional
        Accession recorded in the "SuperSeries" field; defaults to ``gse_id``.
    timeout : float, optional
        Seconds passed as ``timeout`` to each ``requests.get`` call so that a
        stalled connection cannot block the caller indefinitely.

    Returns
    -------
    list of dict
        One dict per supplementary file (metadata plus "Supplementary file",
        "Size", "File type/resource"). If no file row could be extracted, a
        single dict holding only the metadata is returned, so every series
        yields at least one record.

    Notes
    -----
    Exceptions raised by ``requests`` or Selenium are not caught here.
    """
    # If SuperSeries is not set, use the current GSE
    if not super_series:
        super_series = gse_id

    url = f"https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={gse_id}"
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    desired_fields = [
        "Title",
        "Summary",
        "Overall design",
        "Contact name",
        "E-mail(s)",
        "Phone",
        "Organization name",
        "Department",
        "Lab",
        "City",
        "State/province",
        "Country",
    ]

    # Metadata sits in two-cell rows: label in the first cell, value in the second.
    data = {}
    for row in soup.find_all('tr'):
        cols = row.find_all('td')
        if len(cols) == 2:
            label = cols[0].get_text(strip=True)
            if label in desired_fields:
                data[label] = cols[1].get_text(strip=True)

    # Get platforms & samples from SOFT
    soft_url = f"https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={gse_id}&format=soft"
    soft_text = requests.get(soft_url, timeout=timeout).text

    platforms = set(re.findall(r"(GPL\d+)", soft_text))
    samples = set(re.findall(r"(GSM\d+)", soft_text))

    data["Platforms"] = ", ".join(sorted(platforms))
    data["Samples"] = len(samples)
    data["Series"] = gse_id
    data["SuperSeries"] = super_series

    # Case 1: the file list sits behind the "(custom)" link -> drive a browser.
    # Case 2: the file list is a plain HTML table on the page.
    if soup.find("a", string="(custom)"):
        supp_data = _scrape_custom_files(url, data)
    else:
        supp_data = _parse_supplementary_table(soup, data)

    # Whatever path was taken, record the metadata at least once.
    if not supp_data:
        supp_data.append(data)

    return supp_data

In [2]:
# Base directory for this notebook's files: gds_result.txt is read from here
# and the generated CSV files are written here.
dir_base = "./"

## Already working

### Extracting GSE accessions from a manual GEO DataSets query export

In [3]:
# Parameters
gds_file_path = os.path.join(dir_base, "gds_result.txt")
proximity_window = 10  # Max gap between consecutive GSE numbers that still share a cluster

# Step 1: Read the file content
with open(gds_file_path, 'r', encoding='utf-8') as f:
    text = f.read()

# Step 2: Extract all GSE accession IDs (numeric parts as strings), unique and
# sorted numerically
gse_ids = sorted(set(re.findall(r'GSE(\d+)\b', text)), key=int)

# Step 3: Cluster GSEs by numeric proximity.
# `clusters` is a list of (cluster_index, [gse_numbers]) with indices starting
# at 1. Because the IDs are sorted, a number joins the current cluster when it
# lies within `proximity_window` of the previous number; otherwise it opens a
# new cluster.
clusters = []
for gse_num in map(int, gse_ids):
    if clusters and gse_num - clusters[-1][1][-1] <= proximity_window:
        clusters[-1][1].append(gse_num)
    else:
        clusters.append((len(clusters) + 1, [gse_num]))

# Step 4: Create a mapping from GSE number to cluster index
gse_to_cluster = {
    gse_num: cluster_id
    for cluster_id, gse_nums in clusters
    for gse_num in gse_nums
}

# Step 5: Build DataFrame with columns 'GSE' and 'Cluster'
data = [
    {"GSE": f"GSE{gse_num_str}", "Cluster": f"Cluster{gse_to_cluster[int(gse_num_str)]}"}
    for gse_num_str in gse_ids
]

# Passing `columns` keeps both columns present even when no GSE was found, so
# the drop_duplicates call below does not fail on an empty result.
df = pd.DataFrame(data, columns=["GSE", "Cluster"])
df = df.drop_duplicates(subset=['GSE']).reset_index(drop=True)


# Optional: save to CSV
df.to_csv(os.path.join(dir_base, "gds_processed.csv"), index=False)

print(df)

          GSE    Cluster
0    GSE40407   Cluster1
1    GSE48568   Cluster2
2    GSE59620   Cluster3
3    GSE94758   Cluster4
4   GSE112996   Cluster5
..        ...        ...
75  GSE285327  Cluster65
76  GSE285328  Cluster65
77  GSE285332  Cluster65
78  GSE285341  Cluster65
79  GSE291687  Cluster66

[80 rows x 2 columns]


### Web scraping each GSE into a combined DataFrame

In [4]:
# Accessions to scrape: the 'GSE' column of the DataFrame built from gds_result.txt
gse_list = df['GSE']

# process_gse returns a list of row dicts per accession; gather them all into
# one flat list, announcing each accession as it starts.
all_data = []
for gse in gse_list:
    print(f"Processing {gse} ...")
    all_data += process_gse(gse)

# One row per scraped record
df_combined = pd.DataFrame(all_data)

# Make sure the output directory exists, then write the combined table as CSV
os.makedirs(dir_base, exist_ok=True)
combined_path = os.path.join(dir_base, "geo_webscrap.csv")
df_combined.to_csv(combined_path, index=False)

print(df_combined)

Processing GSE40407 ...
Processing GSE48568 ...
Processing GSE59620 ...
Processing GSE94758 ...
Processing GSE112996 ...
Processing GSE114989 ...
Processing GSE122688 ...
Processing GSE122689 ...
Processing GSE146161 ...
Processing GSE147196 ...
Processing GSE147580 ...
Processing GSE147665 ...
Processing GSE155722 ...
Processing GSE155723 ...
Processing GSE155724 ...
Processing GSE156231 ...
Processing GSE156232 ...
Processing GSE157711 ...
Processing GSE158803 ...
Processing GSE160711 ...
Processing GSE162798 ...
Processing GSE168233 ...
Processing GSE172416 ...
Processing GSE174743 ...
Processing GSE174749 ...
Processing GSE189925 ...
Processing GSE193460 ...
Processing GSE200563 ...
Processing GSE200916 ...
Processing GSE207592 ...
Processing GSE215858 ...
Processing GSE216055 ...
Processing GSE216069 ...
Processing GSE218989 ...
Processing GSE221322 ...
Processing GSE221733 ...
Processing GSE222859 ...
Processing GSE222901 ...
Processing GSE223501 ...
Processing GSE227560 ...
Proc