## Downloading Gene Expression datasets for EMT signature genes in BRCA from GEO (TCGA-BRCA): GSE96058



In [1]:
from pathlib import Path
import pandas as pd
import gzip
import shutil
import requests

PROJECT = Path.home() / "emt_network"
RAW = PROJECT / "data" / "raw"
PROCESSED = PROJECT / "data" / "processed"
SIG = PROJECT / "resources" / "signatures"

RAW.mkdir(parents=True, exist_ok=True)
PROCESSED.mkdir(parents=True, exist_ok=True)

gse = "GSE96058"
print("OK:", PROJECT, RAW, PROCESSED)


OK: /home/sameeksha/emt_network /home/sameeksha/emt_network/data/raw /home/sameeksha/emt_network/data/processed


In [2]:
ks_tumor_path = SIG / "EM_gene_signature_tumor_KS.xlsx"
gs76_path = SIG / "EMT_signature_76GS.xlsx"

ks_tumor = pd.read_excel(ks_tumor_path, header=None)
ks_genes = ks_tumor.iloc[:, 0].astype(str).str.strip().tolist()

gs76 = pd.read_excel(gs76_path)
gs76_genes = gs76.iloc[:, 1].astype(str).str.strip().dropna().tolist()

emt_markers = ["CDH1","VIM","EPCAM","ZEB1","ZEB2","SNAI1","SNAI2","TWIST1"]

gene_set = sorted(set(ks_genes + gs76_genes + emt_markers))
len(gene_set), gene_set[:15]


(359,
 ['2006-09-01 00:00:00',
  'ABCC3',
  'ABHD11',
  'AGR2',
  'AKAP12',
  'AKAP2',
  'AKR1B10',
  'AKT3',
  'ANGPTL2',
  'ANK2',
  'ANKRD22',
  'ANTXR2',
  'AP1M2',
  'AP1S2',
  'ARHGAP32'])

In [3]:
from pathlib import Path
import requests

PROJECT = Path.home() / "emt_network"
RAW = PROJECT / "data" / "raw"
RAW.mkdir(parents=True, exist_ok=True)

gse = "GSE96058"
geo_expr_gz = RAW / f"{gse}_gene_expression_transformed.csv.gz"

url = (
    "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE96nnn/GSE96058/suppl/"
    "GSE96058_gene_expression_3273_samples_and_136_replicates_transformed.csv.gz"
)

print("Downloading:", url)
r = requests.get(url, stream=True, timeout=180)
r.raise_for_status()

with open(geo_expr_gz, "wb") as f:
    for chunk in r.iter_content(chunk_size=1024 * 1024):
        if chunk:
            f.write(chunk)

geo_expr_gz, geo_expr_gz.stat().st_size


Downloading: https://ftp.ncbi.nlm.nih.gov/geo/series/GSE96nnn/GSE96058/suppl/GSE96058_gene_expression_3273_samples_and_136_replicates_transformed.csv.gz


(PosixPath('/home/sameeksha/emt_network/data/raw/GSE96058_gene_expression_transformed.csv.gz'),
 591676211)

In [4]:
import pandas as pd

# read only a few rows to see columns/format
preview = pd.read_csv(geo_expr_gz, compression="gzip", nrows=5)
preview.head(), preview.columns[:10], preview.shape


(  Unnamed: 0        F1        F2        F3        F4        F5        F6  \
 0  5_8S_rRNA -3.321928 -3.321928 -3.321928 -3.321928 -3.321928 -3.321928   
 1    5S_rRNA  4.911099 -3.321928 -3.321928  3.656393  4.190104  2.556304   
 2     6M1-18 -3.321928 -3.321928 -3.321928 -3.321928 -3.321928 -3.321928   
 3      7M1-2 -3.321928 -3.321928 -3.321928 -3.321928 -3.321928 -3.321928   
 4        7SK -0.539253 -0.576620 -1.651323  0.126633  0.783715 -1.759556   
 
          F7        F8        F9  ...  F2974repl  F3006repl  F3028repl  \
 0 -3.321928 -3.321928 -3.321928  ...  -3.321928  -3.321928  -3.321928   
 1  2.590351  5.691788 -3.321928  ...   4.084251   2.287523   3.205371   
 2 -3.321928 -3.321928 -3.321928  ...  -3.321928  -3.321928  -3.321928   
 3 -3.321928 -3.321928 -3.321928  ...  -3.321928  -3.321928  -3.321928   
 4 -1.033968 -0.129513  0.494308  ...   0.233159  -0.026301   1.249039   
 
    F3057repl  F3058repl  F3085repl  F3127repl  F3135repl  F3250repl  F3265repl  
 0  -3.3

In [5]:
import pandas as pd

RAW = PROJECT / "data" / "raw"
PROCESSED = PROJECT / "data" / "processed"
PROCESSED.mkdir(parents=True, exist_ok=True)

geo_expr_gz = RAW / "GSE96058_gene_expression_transformed.csv.gz"
out_path = PROCESSED / "geo_brca_expression.tsv"

gene_col = "Unnamed: 0"
chunksize = 20000

kept = []
found = set()

for chunk in pd.read_csv(geo_expr_gz, compression="gzip", chunksize=chunksize):
    chunk[gene_col] = chunk[gene_col].astype(str).str.strip()
    sub = chunk[chunk[gene_col].isin(gene_set)]
    if not sub.empty:
        kept.append(sub)
        found.update(sub[gene_col].tolist())

geo_sub = pd.concat(kept, axis=0).drop_duplicates(subset=[gene_col])
geo_sub = geo_sub.set_index(gene_col)
geo_sub = geo_sub.apply(pd.to_numeric, errors="coerce")

print("Extracted shape (genes x samples):", geo_sub.shape)
print("Genes found:", len(found), "out of", len(gene_set))

geo_sub.to_csv(out_path, sep="\t")
print("Saved:", out_path)


Extracted shape (genes x samples): (346, 3409)
Genes found: 346 out of 359
Saved: /home/sameeksha/emt_network/data/processed/geo_brca_expression.tsv


In [6]:
#Sanity check
for g in ["CDH1", "VIM", "EPCAM", "ZEB1", "TWIST1"]:
    print(g, g in geo_sub.index)


CDH1 True
VIM True
EPCAM True
ZEB1 True
TWIST1 True
