# Setup and Data Acquisition

This notebook downloads the necessary spatial transcriptomics and single-cell RNA-seq data for the PDAC project.
We use GEO datasets: GSE235315 (spatial Visium) and GSE194247, CD45- and GSE235449,CD45+ (scRNA-seq reference).

In [1]:
# Install GEOparse if not already installed
# !pip install geopy GEOparse --quiet

In [2]:
# Import libraries
import os
import GEOparse
import pandas as pd

In [3]:
# Create data directories
os.makedirs('./data/scRNA_seq/GSE194247', exist_ok=True)
os.makedirs('./data/spatial/GSE235315', exist_ok=True)

In [4]:
# Download GSE194247 (scRNA-seq) metadata only for now
gse_sc = GEOparse.get_GEO(geo='GSE194247', destdir='./data/scRNA_seq/GSE194247')
print(gse_sc.metadata)

09-Jun-2025 21:50:10 DEBUG utils - Directory ./data/scRNA_seq/GSE194247 already exists. Skipping.
09-Jun-2025 21:50:10 INFO GEOparse - File already exist: using local version.
09-Jun-2025 21:50:10 INFO GEOparse - Parsing ./data/scRNA_seq/GSE194247\GSE194247_family.soft.gz: 
09-Jun-2025 21:50:10 DEBUG GEOparse - DATABASE: GeoMiame
09-Jun-2025 21:50:10 DEBUG GEOparse - SERIES: GSE194247
09-Jun-2025 21:50:10 DEBUG GEOparse - PLATFORM: GPL24676
09-Jun-2025 21:50:10 DEBUG GEOparse - SAMPLE: GSM5831620
09-Jun-2025 21:50:10 DEBUG GEOparse - SAMPLE: GSM5831621
09-Jun-2025 21:50:10 DEBUG GEOparse - SAMPLE: GSM5831622
09-Jun-2025 21:50:10 DEBUG GEOparse - SAMPLE: GSM5831623
09-Jun-2025 21:50:10 DEBUG GEOparse - SAMPLE: GSM5831624


{'title': ['Deep single-cell transcriptome analysis reveals transitional cancer cell state in the pancreatic cancer tumor microenvironment associated with poor prognosis'], 'geo_accession': ['GSE194247'], 'status': ['Public on May 01 2022'], 'submission_date': ['Jan 24 2022'], 'last_update_date': ['Feb 13 2024'], 'pubmed_id': ['38297291'], 'summary': ['Single cell transcriptome analysis of non-immune cell population in human pancreatic cancer tumor microenvironment'], 'overall_design': ['Integrative analysis of single-cell transcriptome of non-immune cell population from 17 pancreatic cancer tumor tissues'], 'type': ['Expression profiling by high throughput sequencing'], 'contributor': ['Galam,,Leem', 'Seongryong,,Kim'], 'sample_id': ['GSM5831620', 'GSM5831621', 'GSM5831622', 'GSM5831623', 'GSM5831624'], 'contact_name': ['Jong-Eun,,Park'], 'contact_institute': ['Korea Advanced Institute of Science and Technology'], 'contact_address': ['291, Daehak-ro, Yuseong-gu'], 'contact_city': ['Da

In [5]:
# Download GSE235315 (Visium spatial data) metadata only
gse_spatial = GEOparse.get_GEO(geo='GSE235315', destdir='./data/spatial/GSE235315')
print(gse_spatial.metadata)

09-Jun-2025 21:50:10 DEBUG utils - Directory ./data/spatial/GSE235315 already exists. Skipping.
09-Jun-2025 21:50:10 INFO GEOparse - File already exist: using local version.
09-Jun-2025 21:50:10 INFO GEOparse - Parsing ./data/spatial/GSE235315\GSE235315_family.soft.gz: 
09-Jun-2025 21:50:10 DEBUG GEOparse - DATABASE: GeoMiame
09-Jun-2025 21:50:10 DEBUG GEOparse - SERIES: GSE235315
09-Jun-2025 21:50:10 DEBUG GEOparse - PLATFORM: GPL24676
09-Jun-2025 21:50:10 DEBUG GEOparse - SAMPLE: GSM7498811
09-Jun-2025 21:50:10 DEBUG GEOparse - SAMPLE: GSM7498812
09-Jun-2025 21:50:10 DEBUG GEOparse - SAMPLE: GSM7498813
09-Jun-2025 21:50:10 DEBUG GEOparse - SAMPLE: GSM7498814
09-Jun-2025 21:50:10 DEBUG GEOparse - SAMPLE: GSM7498815
09-Jun-2025 21:50:10 DEBUG GEOparse - SAMPLE: GSM7498816
09-Jun-2025 21:50:10 DEBUG GEOparse - SAMPLE: GSM7498817


{'title': ['Integrative single-cell transcriptome analysis of human pancreatic cancer reveals an intermediate cancer cell population associated with poor prognosis [Spatial]'], 'geo_accession': ['GSE235315'], 'status': ['Public on Nov 17 2023'], 'submission_date': ['Jun 20 2023'], 'last_update_date': ['Feb 13 2024'], 'pubmed_id': ['38297291'], 'summary': ['We identified five distinct functional subclusters of pancreatic cancer cells and six distinct cancer-associated fibroblast subclusters. We deeply profiled their characteristics, and we found that these subclusters successfully deconvoluted most of the features suggested in bulk transcriptome analysis of pancreatic cancer. Among those subclusters, we identified a novel cancer cell subcluster, Ep_VGLL1, showing intermediate characteristics between the extremities of basal-like and classical dichotomy, despite its prognostic value. Molecular features of Ep_VGLL1 suggest its transitional properties between basal-like and classical subty

You can manually download the H5AD files from the GEO supplementary links or continue with automated wget fetch if URLs are available.

The following cell will:
1. Download all 5 .tar.gz sample files from non-immune PDAC scRNA-seq data GSE194247
2. Extract each into a sample folder
3. Load filtered 10x matrices
4. Merge and save as adata_all_raw.h5ad
immune cell scRNA-seq dataset GSE235449 has 1 sample and is directly downloaded.  

In [6]:
import os
import tarfile
import scanpy as sc
import urllib.request

# Define samples and FTP base
samples = {
    "GSM5831620": "GSM5831620_5_GEX_4.tar.gz",
    "GSM5831621": "GSM5831621_5_GEX_5.tar.gz",
    "GSM5831622": "GSM5831622_5_GEX_6.tar.gz",
    "GSM5831623": "GSM5831623_5_GEX_9.tar.gz",
    "GSM5831624": "GSM5831624_GEX_45_MM.tar.gz"
}
ftp_base = "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM5831nnn/"

# Create directories
os.makedirs("./data/scRNA_seq/GSE194247/raw", exist_ok=True)

# Download and extract
for gsm, fname in samples.items():
    url = f"{ftp_base}{gsm}/suppl/{fname}"
    out_path = f"./data/scRNA_seq/GSE194247/raw/{fname}"
    print(f"Downloading {fname}...")
    urllib.request.urlretrieve(url, out_path)

    print(f"Extracting {fname}...")
    with tarfile.open(out_path, "r:gz") as tar:
        extract_path = f"./data/scRNA_seq/GSE194247/raw/{gsm}"
        os.makedirs(extract_path, exist_ok=True)
        tar.extractall(path=extract_path)

# Read and merge using Scanpy
adatas = []
for gsm in samples.keys():
    mtx_path = f"./data/scRNA_seq/GSE194247/raw/{gsm}/filtered_feature_bc_matrix"
    if os.path.exists(mtx_path):
        print(f"Reading {gsm}...")
        adata = sc.read_10x_mtx(mtx_path, var_names='gene_symbols')
        adata.obs['sample_id'] = gsm
        adatas.append(adata)
    else:
        print(f"Missing: {mtx_path}")

print("Concatenating all samples...")
adata_all = adatas[0].concatenate(adatas[1:], batch_key="sample")

# Save to disk
adata_all.write("./data/scRNA_seq/GSE194247/adata_all_raw.h5ad")
print("Saved merged scRNA-seq file: adata_all_raw.h5ad")

Downloading GSM5831620_5_GEX_4.tar.gz...
Extracting GSM5831620_5_GEX_4.tar.gz...
Downloading GSM5831621_5_GEX_5.tar.gz...
Extracting GSM5831621_5_GEX_5.tar.gz...
Downloading GSM5831622_5_GEX_6.tar.gz...
Extracting GSM5831622_5_GEX_6.tar.gz...
Downloading GSM5831623_5_GEX_9.tar.gz...
Extracting GSM5831623_5_GEX_9.tar.gz...
Downloading GSM5831624_GEX_45_MM.tar.gz...
Extracting GSM5831624_GEX_45_MM.tar.gz...
Reading GSM5831620...
Reading GSM5831621...
Reading GSM5831622...
Reading GSM5831623...
Reading GSM5831624...
Concatenating all samples...


  adata_all = adatas[0].concatenate(adatas[1:], batch_key="sample")


Saved merged scRNA-seq file: adata_all_raw.h5ad


In [19]:
# Single sample
mtx_path = f"data/scRNA_seq/GSE235449/raw/GSM7502530/filtered_feature_bc_matrix"
adata = sc.read_10x_mtx(mtx_path, var_names='gene_symbols')
adata.obs['sample_id'] = gsm
adata.write("./data/scRNA_seq/GSE235449/adata_all_raw.h5ad")

The following cell will:
1. Download all 7 .tar.gz spatial sample files from GSE235315
2. Extract each into its own folder under data/spatial/GSE235315/raw/GSM74988xx/
3. Prepare them for scanpy.read_visium() usage

In [8]:
import anndata as ad
from pathlib import Path

# Define samples and filenames
samples = {
    "GSM7498811": "GSM7498811_SS1905133_processed.tar.gz",
    "GSM7498812": "GSM7498812_SS1923404_processed.tar.gz",
    "GSM7498813": "GSM7498813_SS1945070_processed.tar.gz",
    "GSM7498814": "GSM7498814_SS1960050_processed.tar.gz",
    "GSM7498815": "GSM7498815_SS2002154_processed.tar.gz",
    "GSM7498816": "GSM7498816_SS2005094_processed.tar.gz",
    "GSM7498817": "GSM7498817_SS2021309_processed.tar.gz"
}

ftp_base = "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM7498nnn/"

# Make output directory
os.makedirs("./data/spatial/GSE235315/raw", exist_ok=True)

# Download and extract each sample
for gsm, fname in samples.items():
    url = f"{ftp_base}{gsm}/suppl/{fname}"
    out_path = f"./data/spatial/GSE235315/raw/{fname}"
    print(f"Downloading {fname}...")
    urllib.request.urlretrieve(url, out_path)

    print(f"Extracting {fname}...")
    extract_dir = f"./data/spatial/GSE235315/raw/{gsm}"
    os.makedirs(extract_dir, exist_ok=True)
    with tarfile.open(out_path, "r:gz") as tar:
        tar.extractall(path=extract_dir)

# Example: Load one sample with Scanpy (edit path if different layout)
#print("To load a sample in Scanpy, use:")
#print(">>> sc.read_visium('data/spatial/GSE235315/raw/GSM7498811/')")

Downloading GSM7498811_SS1905133_processed.tar.gz...
Extracting GSM7498811_SS1905133_processed.tar.gz...
Downloading GSM7498812_SS1923404_processed.tar.gz...
Extracting GSM7498812_SS1923404_processed.tar.gz...
Downloading GSM7498813_SS1945070_processed.tar.gz...
Extracting GSM7498813_SS1945070_processed.tar.gz...
Downloading GSM7498814_SS1960050_processed.tar.gz...
Extracting GSM7498814_SS1960050_processed.tar.gz...
Downloading GSM7498815_SS2002154_processed.tar.gz...
Extracting GSM7498815_SS2002154_processed.tar.gz...
Downloading GSM7498816_SS2005094_processed.tar.gz...
Extracting GSM7498816_SS2005094_processed.tar.gz...
Downloading GSM7498817_SS2021309_processed.tar.gz...
Extracting GSM7498817_SS2021309_processed.tar.gz...


In [9]:
# Define base directory
sample_ids = [
    'GSM7498811', 'GSM7498812', 'GSM7498813',
    'GSM7498814', 'GSM7498815', 'GSM7498816', 'GSM7498817']

# Read and append spatial samples
adatas = []
for sid in sample_ids:
    path = f"./data/spatial/GSE235315/raw/{sid}/"
    print(f'Loading {sid}...')
    adata = sc.read_visium(path)
    adata.var_names_make_unique()
    adata.obs['sample_id'] = sid
    adatas.append(adata)

# Merge all into one AnnData object
adata_merged = ad.concat(adatas, label='batch', keys=sample_ids)
print(adata_merged)

# Save merged AnnData for downstream use
adata_merged.write('./data/spatial/GSE235315/adata_spatial_merged.h5ad')
print('Saved merged Visium data.')

Loading GSM7498811...


  adata = sc.read_visium(path)
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  adata = sc.read_visium(path)


Loading GSM7498812...


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  adata = sc.read_visium(path)


Loading GSM7498813...


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  adata = sc.read_visium(path)


Loading GSM7498814...


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  adata = sc.read_visium(path)


Loading GSM7498815...


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  adata = sc.read_visium(path)


Loading GSM7498816...


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  adata = sc.read_visium(path)


Loading GSM7498817...


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("obs")


AnnData object with n_obs × n_vars = 25435 × 17943
    obs: 'in_tissue', 'array_row', 'array_col', 'sample_id', 'batch'
    obsm: 'spatial'
Saved merged Visium data.
