In [1]:
import pandas as pd
import numpy as np
import glob
import yaml

import xml.etree.ElementTree as ET

In [2]:
# Parse the FFPE dataset configuration and pull out the metadata CSV location.
with open("config_dataset_ffpe.yaml", "r") as config_file:
    config_dataset = yaml.safe_load(config_file)

# Path of the metadata table that the following cells read and extend.
metadata_path = config_dataset['metadata_path']
metadata_path

'data/metadata_ffpe.csv'

In [4]:
# Load the metadata table from the path given in the config.
metadata = pd.read_csv(filepath_or_buffer=metadata_path)
metadata

Unnamed: 0,image_path,rna_path,case_id,sample_slide_id,sample_rna_id,sample_type,data_type_info,id_pair
0,TCGA-B0-4815-01Z-00-DX1.dd230cfa-5952-4fe7-b73...,c61e31b4-5c74-4d86-97cb-16be49b8a5b1.rna_seq.a...,TCGA-B0-4815,TCGA-B0-4815-01Z,TCGA-B0-4815-01A,Primary Tumor,DX,0
1,TCGA-B0-5088-01Z-00-DX1.69bb79f8-33cc-4c9c-be6...,4ab7d102-f5eb-4922-be66-9b516bebaf0a.rna_seq.a...,TCGA-B0-5088,TCGA-B0-5088-01Z,TCGA-B0-5088-01A,Primary Tumor,DX,1
2,TCGA-B0-4712-01Z-00-DX1.b584d650-4bdd-452c-992...,0cf6e899-af6e-44ac-a6c7-5f3fa68ea255.rna_seq.a...,TCGA-B0-4712,TCGA-B0-4712-01Z,TCGA-B0-4712-01A,Primary Tumor,DX,2
3,TCGA-BP-4334-01Z-00-DX1.68cf009b-04da-4173-98d...,98967392-eeb4-4c7f-8578-1faf0e5dc3d6.rna_seq.a...,TCGA-BP-4334,TCGA-BP-4334-01Z,TCGA-BP-4334-01A,Primary Tumor,DX,3
4,TCGA-CJ-6030-01Z-00-DX1.A762AB76-62E5-4680-991...,21273777-9444-4ea4-9e68-9ff6303e9997.rna_seq.a...,TCGA-CJ-6030,TCGA-CJ-6030-01Z,TCGA-CJ-6030-01A,Primary Tumor,DX,4
...,...,...,...,...,...,...,...,...
519,TCGA-BP-4165-01Z-00-DX1.1f8b54d9-1285-4226-a8f...,e328dc67-2ba2-408a-a137-75792cc02e52.rna_seq.a...,TCGA-BP-4165,TCGA-BP-4165-01Z,TCGA-BP-4165-01A,Primary Tumor,DX,519
520,TCGA-B0-5081-01Z-00-DX1.2fed8120-8e9c-4a23-bd1...,4b04816b-80fa-410b-95d7-9b1f815b59e7.rna_seq.a...,TCGA-B0-5081,TCGA-B0-5081-01Z,TCGA-B0-5081-01A,Primary Tumor,DX,520
521,TCGA-A3-3367-01Z-00-DX1.9fca6a91-bb71-4632-8b2...,acffe97d-bcaf-4c3c-877f-a48d9f5e81db.rna_seq.a...,TCGA-A3-3367,TCGA-A3-3367-01Z,TCGA-A3-3367-01A,Primary Tumor,DX,521
522,TCGA-A3-3317-01Z-00-DX1.ca503755-bfa6-462b-b37...,0416ba1c-a9d5-4e25-a95b-7cb73bfeeb65.rna_seq.a...,TCGA-A3-3317,TCGA-A3-3317-01Z,TCGA-A3-3317-01A,Primary Tumor,DX,522


In [5]:
def load_last_vist_day(sample_id):
    """Read vital status and the matching day count from a sample's clinical XML.

    The first file matching ``data/*/xml/*/*.{sample_id}.xml`` is parsed and
    every element is scanned for tags containing ``vital_status``,
    ``days_to_death`` or ``days_to_last_followup``. When a tag occurs more
    than once, the last occurrence in document order wins.

    (The misspelled function name is kept so existing callers keep working.)

    Parameters
    ----------
    sample_id : str
        Identifier embedded in the XML file name.

    Returns
    -------
    tuple
        ``(sample_id, status, days)`` where ``days`` is the text of
        ``days_to_last_followup`` when ``status == "Alive"`` and the text of
        ``days_to_death`` otherwise. The values are the raw element text
        (strings, or ``None`` for an empty element).
        ``(sample_id, np.nan, np.nan)`` is returned when no file matches, the
        file cannot be read or parsed, or a required tag is absent.
    """
    missing = (sample_id, np.nan, np.nan)

    matches = glob.glob(f'data/*/xml/*/*.{sample_id}.xml')
    if not matches:
        return missing

    # Only unreadable or malformed files are treated as "no data"; any other
    # exception is a real bug and is allowed to propagate.
    try:
        root = ET.parse(matches[0]).getroot()
    except (ET.ParseError, OSError):
        return missing

    wanted = ("vital_status", "days_to_death", "days_to_last_followup")
    found = {}
    for elem in root.iter():
        for key in wanted:
            # Substring match so namespaced tags such as "{ns}vital_status" hit.
            if key in elem.tag:
                found[key] = elem.text

    if "vital_status" not in found:
        return missing

    status = found["vital_status"]
    day_key = "days_to_last_followup" if status == "Alive" else "days_to_death"
    if day_key not in found:
        return missing

    return (sample_id, status, found[day_key])

In [6]:
# One (case_id, status, days) tuple per metadata row, in row order.
survival_records = [load_last_vist_day(case_id) for case_id in metadata.case_id]
survival_metadata = pd.DataFrame(
    survival_records,
    columns=["case_id", "censored", "event_time"],
)
survival_metadata

Unnamed: 0,case_id,censored,event_time
0,TCGA-B0-4815,,
1,TCGA-B0-5088,,
2,TCGA-B0-4712,,
3,TCGA-BP-4334,Dead,645
4,TCGA-CJ-6030,Dead,2299
...,...,...,...
519,TCGA-BP-4165,Alive,3037
520,TCGA-B0-5081,Dead,362
521,TCGA-A3-3367,Alive,2270
522,TCGA-A3-3317,Alive,1491


In [7]:
# Keep the first row per case, then drop rows without an event time.
unique_cases = survival_metadata.drop_duplicates('case_id')
cases_with_time = unique_cases[unique_cases.event_time.notna()]

# Event times are still text at this point; compare as integers and keep
# only strictly positive durations.
has_positive_time = cases_with_time.event_time.astype(int) > 0
survival_metadata = cases_with_time[has_positive_time]
survival_metadata

Unnamed: 0,case_id,censored,event_time
3,TCGA-BP-4334,Dead,645
4,TCGA-CJ-6030,Dead,2299
5,TCGA-BP-4965,Alive,1871
7,TCGA-DV-5565,Alive,1329
9,TCGA-CZ-4856,Alive,18
...,...,...,...
519,TCGA-BP-4165,Alive,3037
520,TCGA-B0-5081,Dead,362
521,TCGA-A3-3367,Alive,2270
522,TCGA-A3-3317,Alive,1491


In [8]:
# Tally how many cases fall under each value of the `censored` column.
survival_metadata["censored"].value_counts()

censored
Alive    309
Dead     156
Name: count, dtype: int64

In [9]:
# Build "<metadata stem>_survival.csv". Only a trailing ".csv" is stripped;
# str.replace would also remove ".csv" occurring earlier in the path
# (e.g. inside a directory name).
csv_suffix = ".csv"
if metadata_path.endswith(csv_suffix):
    survival_stem = metadata_path[:-len(csv_suffix)]
else:
    survival_stem = metadata_path
survival_metadata.to_csv(f'{survival_stem}_survival.csv', index=False)