# 01 - Download and Organize Data

This notebook collates the RNA-seq gene count tables previously downloaded from the GDC portal, cleans a combined expression matrix, and saves the organized outputs under `data/processed/` for downstream analysis.


**Workflow overview**

1. Discover every `.tsv` expression file that lives inside the raw GDC download bundles (ignoring manifest `.txt` files).
2. Build a tidy manifest so we can trace each file back to its case identifier.
3. Load the TPM counts, align the genes across all samples, and write a single expression matrix that other notebooks can use.


In [2]:
from __future__ import annotations

import sys
from pathlib import Path

import pandas as pd


def find_project_root(start: Path) -> Path:
    for candidate in [start, *start.parents]:
        if (candidate / 'README.md').exists():
            return candidate
    raise FileNotFoundError('Unable to locate repository root (README.md not found)')


PROJECT_ROOT = find_project_root(Path.cwd())
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src.config import ProjectConfig

config = ProjectConfig()
RAW_DATA_DIR = config.raw_data_dir / 'star gene counts'
CLINICAL_DATA_DIR = config.clinical_data_dir
PROCESSED_DATA_DIR = config.processed_data_dir
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

RAW_DATA_DIR, CLINICAL_DATA_DIR, PROCESSED_DATA_DIR


(PosixPath('/Users/lennonmccartney/Desktop/tcga-brca-multiomics-subtyping/data/raw/star gene counts'),
 PosixPath('/Users/lennonmccartney/Desktop/tcga-brca-multiomics-subtyping/data/processed'))

## Discover downloaded expression bundles

Each download bundle from GDC sits in its own UUID-named directory. We walk the tree, capture every `.tsv` file (ignoring plain `.txt` documents such as manifests), and track minimal metadata for reproducibility.


In [3]:
import pandas as pd


def collect_expression_files(root: Path) -> pd.DataFrame:
    '''Return metadata for every RNA-seq expression TSV found under ``root``.'''

    records: list[dict[str, object]] = []
    bundle_dirs = sorted(p for p in root.iterdir() if p.is_dir())
    for bundle in bundle_dirs:
        for path in sorted(bundle.glob('*.tsv')):
            if path.name.lower().endswith('.txt'):
                continue
            sample_id = path.name.split('.')[0]
            relative = path.relative_to(root)
            case_id = bundle.name
            records.append(
                {
                    'sample_id': sample_id,
                    'case_id': case_id,
                    'file_name': path.name,
                    'path': path,
                    'relative_path': relative.as_posix(),
                }
            )
    if not records:
        raise FileNotFoundError(f'No TSV files discovered under {root}')
    return pd.DataFrame.from_records(records)


expression_index = collect_expression_files(RAW_DATA_DIR)
print(f'Discovered {len(expression_index)} expression TSV files.')
expression_index.head()


Discovered 1197 expression TSV files.


Unnamed: 0,sample_id,case_id,file_name,path,relative_path
0,ba295155-272e-43eb-9d6a-e4c9c392e68b,0019c951-16c5-48d0-85c8-58d96b12d330,ba295155-272e-43eb-9d6a-e4c9c392e68b.rna_seq.a...,/Users/lennonmccartney/Desktop/tcga-brca-multi...,0019c951-16c5-48d0-85c8-58d96b12d330/ba295155-...
1,8d1641ea-7552-4d23-9298-094e0056386a,0022cd20-f64f-4773-b9ff-a3de0b71b259,8d1641ea-7552-4d23-9298-094e0056386a.rna_seq.a...,/Users/lennonmccartney/Desktop/tcga-brca-multi...,0022cd20-f64f-4773-b9ff-a3de0b71b259/8d1641ea-...
2,2f51534b-248b-4999-bc3f-e42a2e98332e,00469928-b243-4cae-acd7-134508e99ceb,2f51534b-248b-4999-bc3f-e42a2e98332e.rna_seq.a...,/Users/lennonmccartney/Desktop/tcga-brca-multi...,00469928-b243-4cae-acd7-134508e99ceb/2f51534b-...
3,b321a3f5-043d-42c6-8c9d-5784d45cb85c,0081f507-b104-4214-9ea1-31dd69130991,b321a3f5-043d-42c6-8c9d-5784d45cb85c.rna_seq.a...,/Users/lennonmccartney/Desktop/tcga-brca-multi...,0081f507-b104-4214-9ea1-31dd69130991/b321a3f5-...
4,cafc9e36-c5f0-45df-ad03-16210ff0d870,0094f9d0-45ec-4aad-bca0-71c60bdd7113,cafc9e36-c5f0-45df-ad03-16210ff0d870.rna_seq.a...,/Users/lennonmccartney/Desktop/tcga-brca-multi...,0094f9d0-45ec-4aad-bca0-71c60bdd7113/cafc9e36-...


## Helper to read a single expression table

We only need the `gene_id` column plus a quantitative abundance metric (`tpm_unstranded`). The helper below returns one `pd.Series` per sample and removes the technical summary rows (the `N_*` counters at the top of each file).


In [4]:
from pathlib import Path


def read_expression_table(path: Path, value_column: str = "unstranded") -> pd.Series:
    '''Load one RNA-seq table and return the chosen value column indexed by gene ID.'''

    usecols = ["gene_id", value_column]
    df = pd.read_csv(
        path,
        sep="\t",
        comment="#",
        usecols=usecols,
        dtype={value_column: "float32"},
    ).dropna(subset=["gene_id"])
    series = df.set_index("gene_id")[value_column]
    series = series[~series.index.str.startswith("N_")]
    series.name = path.name.split(".")[0]
    return series


# Quick sanity check on the first file
first_sample = read_expression_table(Path(expression_index.loc[0, "path"]))
first_sample.head()


gene_id
ENSG00000000003.15    4370.0
ENSG00000000005.6        7.0
ENSG00000000419.13    2625.0
ENSG00000000457.14    3005.0
ENSG00000000460.17    1578.0
Name: ba295155-272e-43eb-9d6a-e4c9c392e68b, dtype: float32

## Build the combined expression matrix

We iterate through every discovered file, ensure the genes line up across samples, and concatenate the resulting vectors column-wise. Progress messages every 100 samples make it easier to monitor long runs.


In [5]:
from typing import Optional

expression_series: list[pd.Series] = []
gene_index: Optional[pd.Index] = None

total_files = len(expression_index)
for idx, path in enumerate(expression_index["path"], start=1):
    series = read_expression_table(Path(path))
    if gene_index is None:
        gene_index = series.index
    elif not series.index.equals(gene_index):
        # Align to the reference order if a file arrives with an unexpected layout.
        series = series.reindex(gene_index)
    expression_series.append(series)
    if idx % 100 == 0 or idx == total_files:
        print(f"Loaded {idx}/{total_files} samples")

expression_matrix = pd.concat(expression_series, axis=1)
expression_matrix.head()


Loaded 100/1197 samples
Loaded 200/1197 samples
Loaded 300/1197 samples
Loaded 400/1197 samples
Loaded 500/1197 samples
Loaded 600/1197 samples
Loaded 700/1197 samples
Loaded 800/1197 samples
Loaded 900/1197 samples
Loaded 1000/1197 samples
Loaded 1100/1197 samples
Loaded 1197/1197 samples


Unnamed: 0_level_0,ba295155-272e-43eb-9d6a-e4c9c392e68b,8d1641ea-7552-4d23-9298-094e0056386a,2f51534b-248b-4999-bc3f-e42a2e98332e,b321a3f5-043d-42c6-8c9d-5784d45cb85c,cafc9e36-c5f0-45df-ad03-16210ff0d870,c763a483-415e-4cb4-9cdf-4e6c31e8a9c9,7135f14b-e84f-4ebf-8d95-b2a3c843fd4d,5fd42405-ebfe-4210-bc2f-d8310e3e14ee,9ccd787f-fde1-4fe6-a11f-d6203eaf9faf,4a88d54f-c88c-4ffd-84c9-069b53f2cb28,...,ca9d1ab5-ea78-46f4-9225-8147639d013c,97a50d88-e662-43d5-9cef-2915545e8968,8a27980e-a506-4cb3-91f2-3f0e5a19acfe,fe00aecb-7b06-4e76-b999-99ee41ad20ea,2c1b1cbb-6e9f-4416-8faa-abb31d6b4e0e,967f7008-e212-4114-ab43-dc2a6295f80c,deb7967f-9339-4bb1-ae0b-81a72a472bba,574d0a5f-8cb7-4783-8d5e-b07c1b3460dc,64b12ba7-a481-4fdb-9c74-38c94c7ef3c9,74179e5e-2d3c-417e-8844-6740ea9fb2e5
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003.15,4370.0,2443.0,3508.0,6928.0,2890.0,1635.0,3456.0,1410.0,1899.0,3362.0,...,4518.0,3958.0,5469.0,2212.0,3321.0,2586.0,2127.0,8020.0,1417.0,4263.0
ENSG00000000005.6,7.0,144.0,7.0,17.0,4.0,101.0,22.0,14.0,4.0,35.0,...,10.0,18.0,9.0,0.0,0.0,286.0,0.0,22722.0,2.0,9.0
ENSG00000000419.13,2625.0,2322.0,2421.0,1812.0,4025.0,1565.0,1779.0,1431.0,2167.0,3346.0,...,1657.0,2113.0,2153.0,1184.0,3880.0,1655.0,742.0,1676.0,1138.0,2071.0
ENSG00000000457.14,3005.0,1466.0,839.0,1651.0,2769.0,1183.0,2176.0,1556.0,1516.0,2135.0,...,2009.0,1830.0,985.0,1426.0,1559.0,1723.0,911.0,1065.0,904.0,1101.0
ENSG00000000460.17,1578.0,409.0,744.0,366.0,663.0,419.0,864.0,318.0,417.0,973.0,...,739.0,931.0,1452.0,343.0,880.0,600.0,328.0,918.0,233.0,717.0


In [6]:
print(f"Expression matrix shape: {expression_matrix.shape}")
expression_matrix.iloc[:5, :5]

Expression matrix shape: (60660, 1197)


Unnamed: 0_level_0,ba295155-272e-43eb-9d6a-e4c9c392e68b,8d1641ea-7552-4d23-9298-094e0056386a,2f51534b-248b-4999-bc3f-e42a2e98332e,b321a3f5-043d-42c6-8c9d-5784d45cb85c,cafc9e36-c5f0-45df-ad03-16210ff0d870
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000000003.15,4370.0,2443.0,3508.0,6928.0,2890.0
ENSG00000000005.6,7.0,144.0,7.0,17.0,4.0
ENSG00000000419.13,2625.0,2322.0,2421.0,1812.0,4025.0
ENSG00000000457.14,3005.0,1466.0,839.0,1651.0,2769.0
ENSG00000000460.17,1578.0,409.0,744.0,366.0,663.0


## Persist processed outputs

The TPM matrix is saved as a gzipped TSV to keep downstream tooling simple. A companion manifest captures where each column originated and the original file sizes.


In [7]:
expression_output_path = PROCESSED_DATA_DIR / "tcga_brca_expression_tpm.tsv.gz"
manifest_output_path = PROCESSED_DATA_DIR / "expression_file_index.tsv"

expression_matrix.to_csv(expression_output_path, sep="	", compression="gzip")

manifest_df = expression_index.copy()
manifest_df["file_size_mb"] = manifest_df["path"].map(lambda p: round(p.stat().st_size / 1024 ** 2, 3))
manifest_df = manifest_df.drop(columns=["path"])
manifest_df.to_csv(manifest_output_path, sep="	", index=False)

expression_output_path, manifest_output_path


(PosixPath('/Users/lennonmccartney/Desktop/tcga-brca-multiomics-subtyping/data/processed/tcga_brca_expression_tpm.tsv.gz'),
 PosixPath('/Users/lennonmccartney/Desktop/tcga-brca-multiomics-subtyping/data/processed/expression_file_index.tsv'))

## Quick summary statistics

Verify the number of samples/genes and inspect the distribution of file sizes to catch obvious anomalies.


In [8]:
summary = pd.Series(
    {
        "n_samples": expression_matrix.shape[1],
        "n_genes": expression_matrix.shape[0],
        "min_file_size_mb": expression_index["path"].map(lambda p: p.stat().st_size / 1024 ** 2).min(),
        "max_file_size_mb": expression_index["path"].map(lambda p: p.stat().st_size / 1024 ** 2).max(),
    }
)
summary


n_samples            1197.000000
n_genes             60660.000000
min_file_size_mb        4.007236
max_file_size_mb        4.075985
dtype: float64

## Inspect clinical XML files

Review the clinical, biospecimen, and SSF XML bundles to understand their structure before parsing them downstream.


In [4]:
from pathlib import Path

def collect_clinical_xml(root: Path) -> pd.DataFrame:
    records: list[dict[str, object]] = []
    if not root.exists():
        raise FileNotFoundError(f'Clinical directory not found: {root}')
    bundle_dirs = sorted(p for p in root.iterdir() if p.is_dir())
    for bundle in bundle_dirs:
        for xml_path in sorted(bundle.glob('*.xml')):
            filename = xml_path.name
            parts = filename.split('.')
            case_id = parts[-2] if len(parts) >= 2 else filename
            file_type = parts[1] if len(parts) >= 3 else 'unknown'
            records.append(
                {
                    'bundle_id': bundle.name,
                    'case_id': case_id,
                    'file_type': file_type,
                    'file_name': filename,
                    'relative_path': xml_path.relative_to(root).as_posix(),
                    'path': str(xml_path),
                }
            )
    if not records:
        raise FileNotFoundError(f'No XML files discovered under {root}')
    return pd.DataFrame.from_records(records)

clinical_index = collect_clinical_xml(CLINICAL_DATA_DIR)
bundle_count = clinical_index['bundle_id'].nunique()
print(f'Discovered {len(clinical_index)} XML files across {bundle_count} bundles.')
clinical_index.head()


Discovered 3369 XML files across 3369 bundles.


Unnamed: 0,bundle_id,case_id,file_type,file_name,relative_path,path
0,00049989-fa21-48fb-8dda-710c0dd5932e,TCGA-A2-A0CT,org_clinical,nationwidechildrens.org_clinical.TCGA-A2-A0CT.xml,00049989-fa21-48fb-8dda-710c0dd5932e/nationwid...,/Users/lennonmccartney/Desktop/tcga-brca-multi...
1,0026e7b3-6e38-44cb-83cb-6618a7681f0a,TCGA-A7-A0DA,org_biospecimen,nationwidechildrens.org_biospecimen.TCGA-A7-A0...,0026e7b3-6e38-44cb-83cb-6618a7681f0a/nationwid...,/Users/lennonmccartney/Desktop/tcga-brca-multi...
2,002ce63d-8c5d-4dcc-b919-ed5dbeb0be55,TCGA-BH-A0BD,org_ssf,nationwidechildrens.org_ssf.TCGA-BH-A0BD.xml,002ce63d-8c5d-4dcc-b919-ed5dbeb0be55/nationwid...,/Users/lennonmccartney/Desktop/tcga-brca-multi...
3,004b6bd4-19d0-4b40-99ef-1a76313fe7a5,TCGA-GM-A2DD,org_clinical,nationwidechildrens.org_clinical.TCGA-GM-A2DD.xml,004b6bd4-19d0-4b40-99ef-1a76313fe7a5/nationwid...,/Users/lennonmccartney/Desktop/tcga-brca-multi...
4,00a012e7-e97f-4fea-9402-37ac734217ca,TCGA-B6-A1KC,org_biospecimen,nationwidechildrens.org_biospecimen.TCGA-B6-A1...,00a012e7-e97f-4fea-9402-37ac734217ca/nationwid...,/Users/lennonmccartney/Desktop/tcga-brca-multi...


In [5]:
import xml.etree.ElementTree as ET

if clinical_index.empty:
    raise ValueError('Clinical XML index is empty; nothing to inspect.')
sample_xml_path = Path(clinical_index.loc[0, 'path'])
tree = ET.parse(sample_xml_path)
root = tree.getroot()

print(f'Previewing XML file: {sample_xml_path.name}')
print(f'Root tag: {root.tag}')
for element in list(root)[:5]:
    tag = element.tag.split('}')[-1]
    text = (element.text or '').strip().replace('', ' ')
    snippet = text[:80] + ('…' if len(text) > 80 else '')
    print(f'- {tag}: {snippet}')
    sub_elements = list(element)[:3]
    for child in sub_elements:
        child_tag = child.tag.split('}')[-1]
        child_text = (child.text or '').strip().replace('', ' ')
        child_snippet = child_text[:80] + ('…' if len(child_text) > 80 else '')
        print(f'    · {child_tag}: {child_snippet}')
    if sub_elements:
        print('')


Previewing XML file: nationwidechildrens.org_clinical.TCGA-A2-A0CT.xml
Root tag: {http://tcga.nci/bcr/xml/clinical/brca/2.7}tcga_bcr
- admin:  
    · bcr:  N a t i o n w i d e   C h i l d r e n ' s   H o s p i t a l 
    · file_uuid:  1 7 2 B 8 2 5 D - 7 A 5 C - 4 5 4 1 - 8 1 9 0 - 3 3 2 0 7 A F 4 7 4 F 9 
    · batch_number:  6 1 . 8 9 . 0 

- patient:  
    · additional_studies:  
    · tumor_tissue_site:  B r e a s t 
    · tumor_tissue_site_other:  



In [6]:
from collections import Counter

def element_text_map(element):
    data: dict[str, str] = {}
    for child in element:
        text = (child.text or '').strip()
        if not text:
            continue
        tag = child.tag.split('}')[-1]
        data[tag] = text
    return data

sample_xml_path = Path(clinical_index.loc[0, 'path'])
tree = ET.parse(sample_xml_path)
root = tree.getroot()

patient_elem = root.find('.//{*}patient')
patient_series = pd.Series(element_text_map(patient_elem), name='patient')

drug_rows = [element_text_map(node) for node in root.findall('.//{*}drug')]
drug_df = pd.DataFrame(drug_rows)

followup_rows = [element_text_map(node) for node in root.findall('.//{*}follow_up')]
followup_df = pd.DataFrame(followup_rows)

uniq_tags = Counter(elem.tag.split('}')[-1] for elem in root.iter())

print(f'Sample XML: {sample_xml_path.name}')
print(f'Patient fields captured: {patient_series.shape[0]}')
display(patient_series.sort_index())

print(f'Drug records: {len(drug_df)}')
display(drug_df)

print(f'Follow-up records: {len(followup_df)}')
display(followup_df)

print(f'Unique XML tags in file: {len(uniq_tags)}')
pd.Series(uniq_tags).sort_values(ascending=False).head(15)


Sample XML: nationwidechildrens.org_clinical.TCGA-A2-A0CT.xml
Patient fields captured: 44


age_at_initial_pathologic_diagnosis                                                                                          71
axillary_lymph_node_stage_method_type                                                                Sentinel node biopsy alone
bcr_patient_barcode                                                                                                TCGA-A2-A0CT
bcr_patient_uuid                                                                           378778d2-b331-4867-a93b-c64028c8b4c7
breast_carcinoma_estrogen_receptor_status                                                                              Positive
breast_carcinoma_immunohistochemistry_pos_cell_score                                                                          0
breast_carcinoma_progesterone_receptor_status                                                                          Negative
breast_carcinoma_surgical_procedure_name                                                                

Drug records: 4


Unnamed: 0,regimen_number,bcr_drug_barcode,bcr_drug_uuid,prescribed_dose,prescribed_dose_units,days_to_drug_therapy_start,days_to_drug_therapy_end,drug_name,regimen_indication,therapy_ongoing,day_of_form_completion,month_of_form_completion,year_of_form_completion,total_dose,total_dose_units,number_cycles
0,4,TCGA-A2-A0CT-D1976,fe69a660-e936-4a3f-8e5d-8cbfbb6dfeb4,20,mg/day,559,1537,Tamoxifen,ADJUVANT,NO,9,9,2010,,,
1,2,TCGA-A2-A0CT-D1974,76a54d5e-e978-4768-b8d6-d25277ef9803,900,mg,75,138,Cytoxan,ADJUVANT,NO,9,9,2010,3600.0,mg,4.0
2,3,TCGA-A2-A0CT-D1975,b6f4f507-e6ee-45bc-b50b-d9a2bb53bcaa,1,mg,160,433,Arimidex,ADJUVANT,NO,9,9,2010,,,
3,1,TCGA-A2-A0CT-D1973,46d7f184-146c-4978-a041-32b27f642b48,90,mg,75,138,Adriamycin,ADJUVANT,NO,9,9,2010,360.0,mg,4.0


Follow-up records: 1


Unnamed: 0,bcr_followup_barcode,bcr_followup_uuid,followup_case_report_form_submission_reason,radiation_therapy,postoperative_rx_tx,vital_status,days_to_last_followup,person_neoplasm_cancer_status,new_tumor_event_after_initial_treatment,day_of_form_completion,month_of_form_completion,year_of_form_completion
0,TCGA-A2-A0CT-F13730,E5069DEC-D655-42B1-81A6-4F008FC9EF78,Scheduled Follow-up Submission,NO,YES,Alive,2289,TUMOR FREE,NO,9,6,2011


Unique XML tags in file: 187


day_of_form_completion        6
year_of_form_completion       6
month_of_form_completion      6
days_to_drug_therapy_start    4
drug                          4
tx_on_clinical_trial          4
regimen_number                4
bcr_drug_barcode              4
bcr_drug_uuid                 4
total_dose                    4
total_dose_units              4
prescribed_dose_units         4
number_cycles                 4
prescribed_dose               4
days_to_drug_therapy_end      4
dtype: int64

## Inspect biospecimen XML files

Biospecimen files capture sample-level metadata (sample barcodes, aliquots, and vial identifiers) that map directly to the expression matrix columns.


In [19]:
def collect_biospecimen_xml(root: Path) -> pd.DataFrame:
    records: list[dict[str, object]] = []
    for bundle in sorted(p for p in root.iterdir() if p.is_dir()):
        for xml_path in sorted(bundle.glob('*.xml')):
            if 'biospecimen' not in xml_path.name.lower():
                continue
            filename = xml_path.name
            case_id = filename.split('.')[-2]
            records.append(
                {
                    'bundle_id': bundle.name,
                    'case_id': case_id,
                    'file_name': filename,
                    'relative_path': xml_path.relative_to(CLINICAL_DATA_DIR).as_posix(),
                    'path': str(xml_path),
                }
            )
    if not records:
        raise FileNotFoundError('No biospecimen XML files discovered')
    return pd.DataFrame.from_records(records)

biospecimen_index = collect_biospecimen_xml(CLINICAL_DATA_DIR)
print(f'Discovered {len(biospecimen_index)} biospecimen XML files.')
biospecimen_index.head()


Discovered 1098 biospecimen XML files.


Unnamed: 0,bundle_id,case_id,file_name,relative_path,path
0,0026e7b3-6e38-44cb-83cb-6618a7681f0a,TCGA-A7-A0DA,nationwidechildrens.org_biospecimen.TCGA-A7-A0...,0026e7b3-6e38-44cb-83cb-6618a7681f0a/nationwid...,/Users/lennonmccartney/Desktop/tcga-brca-multi...
1,00a012e7-e97f-4fea-9402-37ac734217ca,TCGA-B6-A1KC,nationwidechildrens.org_biospecimen.TCGA-B6-A1...,00a012e7-e97f-4fea-9402-37ac734217ca/nationwid...,/Users/lennonmccartney/Desktop/tcga-brca-multi...
2,00d770a2-3013-40b6-acf9-5b2d1f11caad,TCGA-GM-A2DO,nationwidechildrens.org_biospecimen.TCGA-GM-A2...,00d770a2-3013-40b6-acf9-5b2d1f11caad/nationwid...,/Users/lennonmccartney/Desktop/tcga-brca-multi...
3,01318ff3-7bfb-49cf-9427-16914cadb09e,TCGA-AC-A2QH,nationwidechildrens.org_biospecimen.TCGA-AC-A2...,01318ff3-7bfb-49cf-9427-16914cadb09e/nationwid...,/Users/lennonmccartney/Desktop/tcga-brca-multi...
4,015bb72b-c035-4274-8318-b0f4020a3a1e,TCGA-AR-A0U0,nationwidechildrens.org_biospecimen.TCGA-AR-A0...,015bb72b-c035-4274-8318-b0f4020a3a1e/nationwid...,/Users/lennonmccartney/Desktop/tcga-brca-multi...


In [20]:
def parse_biospecimen(path: Path) -> dict[str, pd.DataFrame]:
    tree = ET.parse(path)
    root = tree.getroot()
    def map_children(element):
        row = {}
        for child in element:
            text = (child.text or '').strip()
            if not text:
                continue
            tag = child.tag.split('}')[-1]
            row[tag] = text
        return row
    patient = root.find('.//{*}patient')
    patient_df = pd.DataFrame([map_children(patient)]) if patient is not None else pd.DataFrame()
    samples = [map_children(sample) for sample in root.findall('.//{*}samples//{*}sample')]
    portions = [map_children(portion) for portion in root.findall('.//{*}portions//{*}portion')]
    aliquots = [map_children(aliquot) for aliquot in root.findall('.//{*}aliquots//{*}aliquot')]
    return {
        'patient': patient_df,
        'samples': pd.DataFrame(samples),
        'portions': pd.DataFrame(portions),
        'aliquots': pd.DataFrame(aliquots),
    }

biospecimen_path = Path(biospecimen_index.loc[0, 'path'])
biospecimen_tables = parse_biospecimen(biospecimen_path)
print(f'Sample biospecimen file: {biospecimen_path.name}')
for table_name, df in biospecimen_tables.items():
    print(f"{table_name.title()} rows: {len(df)}")
    if not df.empty:
        display(df.head())

sample_barcodes = biospecimen_tables['samples'].get('bcr_sample_barcode')
if sample_barcodes is not None:
    print('Example sample barcodes:')
    print(sample_barcodes[:5])

Sample biospecimen file: nationwidechildrens.org_biospecimen.TCGA-A7-A0DA.xml
Patient rows: 1


Unnamed: 0,bcr_patient_barcode,bcr_patient_uuid,tissue_source_site,patient_id,days_to_index
0,TCGA-A7-A0DA,3afa1e93-1df8-4e4c-aaa4-557463f4bb77,A7,A0DA,0


Samples rows: 2


Unnamed: 0,sample_type_id,vial_number,sample_type,initial_weight,oct_embedded,days_to_collection,bcr_sample_barcode,bcr_sample_uuid,is_ffpe,pathology_report_uuid,pathology_report_file_name
0,1,A,Primary Tumor,350.0,False,177,TCGA-A7-A0DA-01A,4f441e61-6bea-4a12-841d-def270804bbe,NO,69AC5937-3FFD-40FB-9922-79DB3CED7510,TCGA-A7-A0DA.69AC5937-3FFD-40FB-9922-79DB3CED7...
1,10,A,Blood Derived Normal,,False,177,TCGA-A7-A0DA-10A,bb28c682-0071-4642-bab6-768b0fc322c6,NO,,


Portions rows: 2


Unnamed: 0,portion_number,portion_sequence,day_of_creation,month_of_creation,year_of_creation,weight,bcr_portion_barcode,bcr_portion_uuid,is_ffpe
0,31,3,14,12,2010,30.0,TCGA-A7-A0DA-01A-31,51866cc5-8c50-41fa-a490-f23b3adb541d,NO
1,1,1,14,5,2010,,TCGA-A7-A0DA-10A-01,dba0ceee-ecdd-4b46-b79c-0bed0553933d,NO


Aliquots rows: 11


Unnamed: 0,plate_id,center_id,day_of_shipment,month_of_shipment,year_of_shipment,bcr_aliquot_barcode,bcr_aliquot_uuid,concentration,quantity,volume,plate_row,plate_column,biospecimen_barcode_bottom,source_center,is_derived_from_ffpe
0,A10X,2,12,1,2011,TCGA-A7-A0DA-01A-31D-A10X-02,013b8901-2f00-46f4-b864-0a5e7d4efa6c,0.15,1.95,13.0,D,3,100490585,23,NO
1,A10Y,9,12,1,2011,TCGA-A7-A0DA-01A-31D-A10Y-09,878337fe-9f41-44f5-9760-3977e7d75308,0.08,2.08,26.0,D,3,100490489,23,NO
2,A111,1,12,1,2011,TCGA-A7-A0DA-01A-31D-A111-01,91c8c373-e923-4e7f-b8b3-2417e92760c7,0.15,1.0,6.67,D,3,100487609,23,NO
3,A112,5,12,1,2011,TCGA-A7-A0DA-01A-31D-A112-05,dd1d7840-21e5-4eb8-9795-91d0659cf8d9,0.15,4.01,26.7,D,3,100488857,23,NO
4,A114,13,12,1,2011,TCGA-A7-A0DA-01A-31R-A114-13,c8638f56-52c7-4aab-9a57-f2318f44df97,0.15,3.0,20.0,G,3,99014268,23,NO


Example sample barcodes:
0    TCGA-A7-A0DA-01A
1    TCGA-A7-A0DA-10A
Name: bcr_sample_barcode, dtype: object


## Inspect SSF XML files

Site-specific factor (SSF) files summarize additional pathologic measurements for each case. Parse one to understand the available fields.


In [25]:
def collect_ssf_xml(root: Path) -> pd.DataFrame:
    records: list[dict[str, object]] = []
    for bundle in sorted(p for p in root.iterdir() if p.is_dir()):
        for xml_path in sorted(bundle.glob('*.xml')):
            if 'ssf' not in xml_path.name.lower():
                continue
            filename = xml_path.name
            case_id = filename.split('.')[-2]
            records.append(
                {
                    'bundle_id': bundle.name,
                    'case_id': case_id,
                    'file_name': filename,
                    'relative_path': xml_path.relative_to(CLINICAL_DATA_DIR).as_posix(),
                    'path': str(xml_path),
                }
            )
    if not records:
        raise FileNotFoundError('No SSF XML files discovered')
    return pd.DataFrame.from_records(records)

ssf_index = collect_ssf_xml(CLINICAL_DATA_DIR)
print(f'Discovered {len(ssf_index)} SSF XML files.')
ssf_index.head()


Discovered 1097 SSF XML files.


Unnamed: 0,bundle_id,case_id,file_name,relative_path,path
0,002ce63d-8c5d-4dcc-b919-ed5dbeb0be55,TCGA-BH-A0BD,nationwidechildrens.org_ssf.TCGA-BH-A0BD.xml,002ce63d-8c5d-4dcc-b919-ed5dbeb0be55/nationwid...,/Users/lennonmccartney/Desktop/tcga-brca-multi...
1,00a19cf8-7c4d-4fc8-8919-709454bebaf7,TCGA-A8-A07I,nationwidechildrens.org_ssf.TCGA-A8-A07I.xml,00a19cf8-7c4d-4fc8-8919-709454bebaf7/nationwid...,/Users/lennonmccartney/Desktop/tcga-brca-multi...
2,00d12d5e-af74-4078-94e0-312e5b0b224e,TCGA-PL-A8LV,nationwidechildrens.org_ssf.TCGA-PL-A8LV.xml,00d12d5e-af74-4078-94e0-312e5b0b224e/nationwid...,/Users/lennonmccartney/Desktop/tcga-brca-multi...
3,01078de8-87d5-4e09-97b6-4c65d9c52a11,TCGA-C8-A274,nationwidechildrens.org_ssf.TCGA-C8-A274.xml,01078de8-87d5-4e09-97b6-4c65d9c52a11/nationwid...,/Users/lennonmccartney/Desktop/tcga-brca-multi...
4,011b4145-c839-4e70-803f-8c2045828d27,TCGA-BH-A0C0,nationwidechildrens.org_ssf.TCGA-BH-A0C0.xml,011b4145-c839-4e70-803f-8c2045828d27/nationwid...,/Users/lennonmccartney/Desktop/tcga-brca-multi...


In [24]:
def flatten_element(element, prefix: str = '') -> dict[str, str]:
    data: dict[str, str] = {}
    if element is None:
        return data
    for child in element:
        tag = child.tag.split('}')[-1]
        key = f'{prefix}_{tag}' if prefix else tag
        text = (child.text or '').strip()
        if len(child):
            if text:
                data[key] = text
            data.update(flatten_element(child, key))
        elif text:
            data[key] = text
    return data

def parse_ssf(path: Path) -> dict[str, pd.DataFrame]:
    tree = ET.parse(path)
    root = tree.getroot()

    patient_df = pd.DataFrame([flatten_element(root.find('.//{*}patient'))])

    tumor_samples = [flatten_element(node) for node in root.findall('.//{*}tumor_sample')]
    tumor_locations = [flatten_element(node) for node in root.findall('.//{*}tumor_location')]
    tumor_histologies = [flatten_element(node) for node in root.findall('.//{*}tumor_histology')]
    normal_controls = [flatten_element(node) for node in root.findall('.//{*}normal_control')]

    return {
        'patient': patient_df,
        'tumor_samples': pd.DataFrame(tumor_samples),
        'tumor_locations': pd.DataFrame(tumor_locations),
        'tumor_histologies': pd.DataFrame(tumor_histologies),
        'normal_controls': pd.DataFrame(normal_controls),
    }

ssf_path = Path(ssf_index.loc[0, 'path'])
ssf_tables = parse_ssf(ssf_path)
print(f'Sample SSF file: {ssf_path.name}')
for table_name, df in ssf_tables.items():
    print(f"{table_name.replace('_', ' ').title()} rows: {len(df)}")
    if not df.empty:
        display(df.head())


Sample SSF file: nationwidechildrens.org_ssf.TCGA-BH-A0BD.xml
Patient rows: 1


Unnamed: 0,bcr_patient_barcode,bcr_patient_uuid,tissue_source_site,patient_id,tumor_samples_tumor_sample_bcr_sample_uuid,tumor_samples_tumor_sample_days_to_sample_procurement,tumor_samples_tumor_sample_method_of_sample_procurement,tumor_samples_tumor_sample_other_method_of_sample_procurement,tumor_samples_tumor_sample_vessel_used,tumor_samples_tumor_sample_other_vessel_used,...,tumor_samples_tumor_sample_digital_image_submitted,tumor_samples_tumor_sample_ffpe_tumor_slide_submitted,tumor_samples_tumor_sample_other_dx,tumor_samples_tumor_sample_history_of_neoadjuvant_treatment,tumor_samples_tumor_sample_consent_or_death_status,tumor_samples_tumor_sample_days_to_consent,tumor_samples_tumor_sample_tumor_histologies_tumor_histology_histological_type,tumor_samples_tumor_sample_tumor_locations_laterality,tumor_samples_tumor_sample_tumor_locations_tumor_location_site_of_disease,normal_controls_normal_control_bcr_sample_uuid
0,TCGA-BH-A0BD,b379bfb8-284e-4300-8325-85cfd6809cb8,BH,A0BD,832b8766-0e59-4ad4-b4c9-1e39b2ce4fb1,16,"Other Method, specify",SEGMENTAL MASTECTOMY,Other,FISHERBRAND SPECIMEN STORAGE BAGS,...,NO,NO,No,No,Consented,5,Infiltrating Ductal Carcinoma,Left,Breast,b4ff4763-e3a8-4e6a-bd4e-a6b1e1b722a0


Tumor Samples rows: 1


Unnamed: 0,bcr_sample_uuid,days_to_sample_procurement,method_of_sample_procurement,other_method_of_sample_procurement,vessel_used,other_vessel_used,tumor_weight,sample_prescreened,tumor_nuclei_percent,tumor_necrosis_percent,...,top_slide_submitted,digital_image_submitted,ffpe_tumor_slide_submitted,other_dx,history_of_neoadjuvant_treatment,consent_or_death_status,days_to_consent,tumor_histologies_tumor_histology_histological_type,tumor_locations_laterality,tumor_locations_tumor_location_site_of_disease
0,832b8766-0e59-4ad4-b4c9-1e39b2ce4fb1,16,"Other Method, specify",SEGMENTAL MASTECTOMY,Other,FISHERBRAND SPECIMEN STORAGE BAGS,100,YES,70,30,...,YES,NO,NO,No,No,Consented,5,Infiltrating Ductal Carcinoma,Left,Breast


Tumor Locations rows: 1


Unnamed: 0,site_of_disease
0,Breast


Tumor Histologies rows: 1


Unnamed: 0,histological_type
0,Infiltrating Ductal Carcinoma


Normal Controls rows: 1


Unnamed: 0,bcr_sample_uuid
0,b4ff4763-e3a8-4e6a-bd4e-a6b1e1b722a0


In [71]:
from pathlib import Path
import xml.etree.ElementTree as ET

def parse_all_clinical_patients(index: pd.DataFrame) -> pd.DataFrame:
    rows: list[dict[str, object]] = []
    for row in index.itertuples():
        try:
            root = ET.parse(Path(row.path)).getroot()
        except ET.ParseError:
            continue
        patient = root.find('.//{*}patient')
        if patient is None:
            continue
        record = element_text_map(patient)
        record['source_case_id'] = row.case_id
        record['source_path'] = row.path
        rows.append(record)
    return pd.DataFrame(rows)

def parse_all_biospecimen_tables(index: pd.DataFrame) -> dict[str, pd.DataFrame]:
    sample_rows: list[pd.DataFrame] = []
    aliquot_rows: list[pd.DataFrame] = []
    patient_rows: list[pd.DataFrame] = []
    for row in index.itertuples():
        parsed = parse_biospecimen(Path(row.path))
        for key, container in [('samples', sample_rows), ('aliquots', aliquot_rows), ('patient', patient_rows)]:
            df = parsed.get(key)
            if df is None or df.empty:
                continue
            df = df.copy()
            df['source_case_id'] = row.case_id
            df['source_path'] = row.path
            container.append(df)
    def combine(frames: list[pd.DataFrame]) -> pd.DataFrame:
        return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    return {
        'samples': combine(sample_rows),
        'aliquots': combine(aliquot_rows),
        'patient': combine(patient_rows),
    }

def parse_all_ssf_tables(index: pd.DataFrame) -> dict[str, pd.DataFrame]:
    tumor_sample_rows: list[pd.DataFrame] = []
    patient_rows: list[pd.DataFrame] = []
    for row in index.itertuples():
        parsed = parse_ssf(Path(row.path))
        tumor_df = parsed.get('tumor_samples')
        if tumor_df is not None and not tumor_df.empty:
            df = tumor_df.copy()
            df['source_case_id'] = row.case_id
            df['source_path'] = row.path
            tumor_sample_rows.append(df)
        patient_df = parsed.get('patient')
        if patient_df is not None and not patient_df.empty:
            df = patient_df.copy()
            df['source_case_id'] = row.case_id
            df['source_path'] = row.path
            patient_rows.append(df)
    def combine(frames: list[pd.DataFrame]) -> pd.DataFrame:
        return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    return {
        'tumor_samples': combine(tumor_sample_rows),
        'patient': combine(patient_rows),
    }

clinical_patients_df = parse_all_clinical_patients(clinical_index) if not clinical_index.empty else pd.DataFrame()
biospecimen_all_tables = parse_all_biospecimen_tables(biospecimen_index) if not biospecimen_index.empty else {'samples': pd.DataFrame(), 'aliquots': pd.DataFrame(), 'patient': pd.DataFrame()}
ssf_all_tables = parse_all_ssf_tables(ssf_index) if not ssf_index.empty else {'tumor_samples': pd.DataFrame(), 'patient': pd.DataFrame()}

print('Agg clinical patient rows:', len(clinical_patients_df))
print('Agg biospecimen samples:', len(biospecimen_all_tables['samples']))
print('Agg biospecimen aliquots:', len(biospecimen_all_tables['aliquots']))
print('Agg SSF tumor samples:', len(ssf_all_tables['tumor_samples']))


Agg clinical patient rows: 3369
Agg biospecimen samples: 2305
Agg biospecimen aliquots: 14941
Agg SSF tumor samples: 1105


In [72]:
# Validate that metadata UUIDs exist in the expression outputs
expression_sample_ids = set(expression_matrix.columns)

def check_ids(series: pd.Series, label: str):
    clean = series.dropna().astype(str).str.strip()
    matches = clean[clean.isin(expression_sample_ids)]
    missing = clean[~clean.isin(expression_sample_ids)]
    print(f"{label}: {matches.nunique()} matches, {missing.nunique()} missing")
    if not missing.empty:
        display(pd.Series(missing.unique()[:10], name=f'{label} (examples)'))

biospecimen_samples_df = biospecimen_all_tables.get('samples', pd.DataFrame())
if not biospecimen_samples_df.empty and 'bcr_sample_uuid' in biospecimen_samples_df:
    check_ids(biospecimen_samples_df['bcr_sample_uuid'], 'Biospecimen sample UUIDs')

ssf_tumor_samples_df = ssf_all_tables.get('tumor_samples', pd.DataFrame())
if not ssf_tumor_samples_df.empty and 'bcr_sample_uuid' in ssf_tumor_samples_df:
    check_ids(ssf_tumor_samples_df['bcr_sample_uuid'], 'SSF tumor sample UUIDs')

biospecimen_aliquots_df = biospecimen_all_tables.get('aliquots', pd.DataFrame())
if not biospecimen_aliquots_df.empty and 'bcr_aliquot_uuid' in biospecimen_aliquots_df:
    check_ids(biospecimen_aliquots_df['bcr_aliquot_uuid'], 'Biospecimen aliquot UUIDs')

biospecimen_patients_df = biospecimen_all_tables.get('patient', pd.DataFrame())
if not biospecimen_patients_df.empty and 'bcr_patient_uuid' in biospecimen_patients_df:
    check_ids(biospecimen_patients_df['bcr_patient_uuid'], 'Biospecimen patient UUIDs (expect mismatch)')


Biospecimen sample UUIDs: 0 matches, 2305 missing


0    4f441e61-6bea-4a12-841d-def270804bbe
1    bb28c682-0071-4642-bab6-768b0fc322c6
2    0cc10a1e-a90a-4c55-87a2-a74d47474a4a
3    018b2eaa-216b-4726-a26e-ac726f4b47d3
4    c87025b5-5188-4f26-a73b-00d30ff77188
5    B0763E0D-56A7-4953-AD36-C3B218C059B3
6    8E23740C-1EE6-4CCC-A5A4-2E89F3E3557D
7    FD735DC5-2EC3-43BD-B1B1-8FA811BE0633
8    11F78E42-43E5-4BD5-9F88-E5A195AECE83
9    96E7BCE1-7B31-4513-ADE8-CFCE32FA8D92
Name: Biospecimen sample UUIDs (examples), dtype: object

SSF tumor sample UUIDs: 0 matches, 1105 missing


0    832b8766-0e59-4ad4-b4c9-1e39b2ce4fb1
1    0d2e50fa-daef-45a3-97b2-5d71ab03cfbf
2    11B582C7-95BF-4A07-8996-EF4D0D4F897E
3    7a3cd962-7512-4797-864a-cf374b5cb385
4    c1898b32-a9c5-410b-a197-572b2a6a2d7a
5    09b414c4-a314-43c5-823a-d91442b6b02b
6    405B2B8F-7E2E-4D80-89BE-299DA48374EF
7    61DB743A-627F-4666-B958-2542DBF3F320
8    357e1eca-0f5c-49d6-b437-0ed0afe3b178
9    c6f8aca1-259f-48ab-a470-a1afecb8a2a9
Name: SSF tumor sample UUIDs (examples), dtype: object

Biospecimen aliquot UUIDs: 0 matches, 14941 missing


0    013b8901-2f00-46f4-b864-0a5e7d4efa6c
1    878337fe-9f41-44f5-9760-3977e7d75308
2    91c8c373-e923-4e7f-b8b3-2417e92760c7
3    dd1d7840-21e5-4eb8-9795-91d0659cf8d9
4    c8638f56-52c7-4aab-9a57-f2318f44df97
5    9d04c180-7c23-490e-92cf-c018629b8b7f
6    8b368315-4ed7-432f-b3e5-8bff48b32293
7    31f208e7-dfcd-431e-9a78-765befdc6ccf
8    47723567-39d1-40e9-a8ea-1bf9548db03e
9    72c79f9e-a770-4a6c-a0a4-655ad717463c
Name: Biospecimen aliquot UUIDs (examples), dtype: object

Biospecimen patient UUIDs (expect mismatch): 0 matches, 1098 missing


0    3afa1e93-1df8-4e4c-aaa4-557463f4bb77
1    1502c7d7-1535-4e56-9f34-30623acd50d5
2    5FD37868-4762-4109-9DCF-6FDBAB5B645D
3    67C5F371-3FA9-47C5-8B15-C2DD9ACC8519
4    e3c336f5-c32f-4c5d-81fb-e2408ae145b2
5    8cf8b620-7ab6-4b6e-84bc-ff5a83f381fa
6    75113445-d2d6-44a0-866c-c9175e6d214b
7    332148f5-f070-4c20-8eb1-4d8c0673aa52
8    39de7761-e762-4811-b95c-8216b79ae06b
9    1c40b84e-a0e3-429f-a48c-21566cf881c0
Name: Biospecimen patient UUIDs (expect mismatch) (examples), dtype: object

In [73]:
# Validate that case-level identifiers align with the expression manifest
expression_case_ids = set(expression_index['case_id'].astype(str).str.strip())

def check_case_ids(series: pd.Series, label: str):
    clean = series.dropna().astype(str).str.strip()
    matches = clean[clean.isin(expression_case_ids)]
    missing = clean[~clean.isin(expression_case_ids)]
    print(f"{label}: {matches.nunique()} matches, {missing.nunique()} missing")
    if not missing.empty:
        display(pd.Series(missing.unique()[:10], name=f'{label} (examples)'))

if not clinical_patients_df.empty and 'bcr_patient_barcode' in clinical_patients_df:
    check_case_ids(clinical_patients_df['bcr_patient_barcode'], 'Clinical patient barcodes')

biospecimen_patients_df = biospecimen_all_tables.get('patient', pd.DataFrame())
if not biospecimen_patients_df.empty and 'bcr_patient_barcode' in biospecimen_patients_df:
    check_case_ids(biospecimen_patients_df['bcr_patient_barcode'], 'Biospecimen patient barcodes')

ssf_patients_df = ssf_all_tables.get('patient', pd.DataFrame())
if not ssf_patients_df.empty and 'bcr_patient_barcode' in ssf_patients_df:
    check_case_ids(ssf_patients_df['bcr_patient_barcode'], 'SSF patient barcodes')


Clinical patient barcodes: 0 matches, 1098 missing


0    TCGA-A2-A0CT
1    TCGA-A7-A0DA
2    TCGA-BH-A0BD
3    TCGA-GM-A2DD
4    TCGA-B6-A1KC
5    TCGA-A8-A07I
6    TCGA-D8-A1JM
7    TCGA-BH-A18Q
8    TCGA-PL-A8LV
9    TCGA-GM-A2DO
Name: Clinical patient barcodes (examples), dtype: object

Biospecimen patient barcodes: 0 matches, 1098 missing


0    TCGA-A7-A0DA
1    TCGA-B6-A1KC
2    TCGA-GM-A2DO
3    TCGA-AC-A2QH
4    TCGA-AR-A0U0
5    TCGA-A8-A07G
6    TCGA-A2-A0D4
7    TCGA-D8-A1XU
8    TCGA-AN-A0XW
9    TCGA-D8-A143
Name: Biospecimen patient barcodes (examples), dtype: object

SSF patient barcodes: 0 matches, 1097 missing


0    TCGA-BH-A0BD
1    TCGA-A8-A07I
2    TCGA-PL-A8LV
3    TCGA-C8-A274
4    TCGA-BH-A0C0
5    TCGA-E2-A1L7
6    TCGA-BH-A28O
7    TCGA-AC-A62V
8    TCGA-B6-A0IK
9    TCGA-E9-A247
Name: SSF patient barcodes (examples), dtype: object