In [2]:
## Locate the project root relative to the notebook's working directory
from pathlib import Path

# Absolute form of the current working directory
# (expected to be the folder that holds this notebook)
notebook_dir = Path.cwd().resolve()

# Two levels up: notebooks -> code -> project root
PROJECT_ROOT = notebook_dir.parents[1]

print(PROJECT_ROOT)


/Users/sanyabadole/Desktop/gbm-drug-discovery


In [3]:
## Confirm that the downloaded raw data folder is present
DATA_RAW = PROJECT_ROOT.joinpath("data", "raw", "GSE174554")

# Prints the resolved folder path followed by True/False for its existence
raw_data_found = DATA_RAW.exists()
print(DATA_RAW, raw_data_found)

/Users/sanyabadole/Desktop/gbm-drug-discovery/data/raw/GSE174554 True


In [39]:
## Read the gzipped metadata text file into a dataframe

import pandas as pd

metadata_path = DATA_RAW / "GSE174554_Tumor_normal_metadata.txt.gz"
print(metadata_path, metadata_path.exists())

# The separator is a regex matching a single whitespace character, which is
# why the python parsing engine is requested. It is written as a raw string:
# "\s" inside a plain string literal is an invalid escape sequence and emits
# a DeprecationWarning/SyntaxWarning on current Python versions.
df_meta = pd.read_csv(
    metadata_path,
    sep=r"\s",
    compression="gzip",
    engine="python",
)

# (rows, columns) of the loaded table
df_meta.shape


/Users/sanyabadole/Desktop/gbm-drug-discovery/data/raw/GSE174554/GSE174554_Tumor_normal_metadata.txt.gz True


(254288, 2)

In [40]:
## Quick check of what the table looks like

# Only the last expression of a cell is rendered, so the column list is
# printed explicitly; otherwise its value would be silently discarded.
print(df_meta.columns.tolist())

# First rows of the table (rendered as the cell output)
df_meta.head()

Unnamed: 0,Sample#_Barcode,Tumor_Normal_annotation
0,SF10022_CTATCTAAGCAAGCCA,Tumor
1,SF10022_AAACCCAGTCTACGAT,Normal
2,SF10022_AAAGGGCTCACCCTGT,Normal
3,SF10022_AACAACCAGACCCGCT,Normal
4,SF10022_AACAAGAGTGTAAACA,Normal


In [41]:
# Give the two original columns snake_case names
column_renames = {
    "Sample#_Barcode": "sample_barcode",
    "Tumor_Normal_annotation": "tumor_annotation", 
}
df_meta = df_meta.rename(columns=column_renames)

# Cut each value at its first underscore only (n=1):
# the text before it becomes sample_id, the remainder becomes barcode
split_parts = df_meta["sample_barcode"].str.split("_", n=1, expand=True)
df_meta[["sample_id", "barcode"]] = split_parts

df_meta.head()


Unnamed: 0,sample_barcode,tumor_annotation,sample_id,barcode
0,SF10022_CTATCTAAGCAAGCCA,Tumor,SF10022,CTATCTAAGCAAGCCA
1,SF10022_AAACCCAGTCTACGAT,Normal,SF10022,AAACCCAGTCTACGAT
2,SF10022_AAAGGGCTCACCCTGT,Normal,SF10022,AAAGGGCTCACCCTGT
3,SF10022_AACAACCAGACCCGCT,Normal,SF10022,AACAACCAGACCCGCT
4,SF10022_AACAAGAGTGTAAACA,Normal,SF10022,AACAAGAGTGTAAACA


In [43]:
import re # for regex

# Matches a trailing 'v' followed by one or more digits (e.g. "v2")
v_suffix_pattern = r"v[0-9]+$"

# 1. patient_id: the sample ID with any trailing 'v' + digits removed
df_meta["patient_id"] = df_meta["sample_id"].str.replace(
    v_suffix_pattern, "", regex=True
)

# 2. tumor_stage: sample IDs carrying the suffix are labelled "recurrent",
#    all others "primary"
has_v_suffix = df_meta["sample_id"].str.contains(v_suffix_pattern, regex=True)
df_meta["tumor_stage"] = has_v_suffix.map({False: "primary", True: "recurrent"})

# One row per distinct (sample_id, patient_id, tumor_stage) combination
stage_overview = df_meta[["sample_id", "patient_id", "tumor_stage"]].drop_duplicates()
stage_overview.head(10)


Unnamed: 0,sample_id,patient_id,tumor_stage
0,SF10022,SF10022,primary
3082,SF10127,SF10127,primary
4134,SF12090,SF12090,primary
4260,SF4297,SF4297,primary
8405,SF6996,SF6996,primary
11226,SF9259R,SF9259R,primary
12824,SF9259S,SF9259S,primary
13778,SF11979,SF11979,primary
14525,SF7062,SF7062,primary
17652,SF9510,SF9510,primary


In [44]:
# Columns carried forward into the cleaned table, in output order
keep_columns = [
    "sample_barcode",
    "sample_id",
    "patient_id",
    "tumor_stage",
    "tumor_annotation",
]

# .copy() makes df_meta_clean an independent frame rather than a view of df_meta
df_meta_clean = df_meta.loc[:, keep_columns].copy()

df_meta_clean.head()


Unnamed: 0,sample_barcode,sample_id,patient_id,tumor_stage,tumor_annotation
0,SF10022_CTATCTAAGCAAGCCA,SF10022,SF10022,primary,Tumor
1,SF10022_AAACCCAGTCTACGAT,SF10022,SF10022,primary,Normal
2,SF10022_AAAGGGCTCACCCTGT,SF10022,SF10022,primary,Normal
3,SF10022_AACAACCAGACCCGCT,SF10022,SF10022,primary,Normal
4,SF10022_AACAAGAGTGTAAACA,SF10022,SF10022,primary,Normal


In [49]:
## Number of nuclei (rows) per patient / tumor stage / annotation
group_keys = ["patient_id", "tumor_stage", "tumor_annotation"]
nuclei_counts = df_meta_clean.groupby(group_keys).size()

# Show only the first ten groups
nuclei_counts.head(10)

patient_id  tumor_stage  tumor_annotation
SF10022     primary      Normal              2490
                         Tumor                592
SF10099     primary      Normal               119
                         Tumor                449
            recurrent    Normal               232
                         Tumor                  2
SF10108     primary      Normal              1594
                         Tumor               2852
SF10127     primary      Normal               155
                         Tumor                897
dtype: int64

In [47]:
# Define the processed data folder and create it (with parents) if missing
processed_dir = PROJECT_ROOT / "data" / "processed"
processed_dir.mkdir(parents=True, exist_ok=True)

# Save clean metadata as CSV, without writing the dataframe index
output_path = processed_dir / "GSE174554_metadata_clean.csv"
df_meta_clean.to_csv(output_path, index=False)

# Message typo fixed: "Savedd" -> "Saved"
print("Saved clean metadata to:", output_path)


Savedd clean metadata to: /Users/sanyabadole/Desktop/gbm-drug-discovery/data/processed/GSE174554_metadata_clean.csv
