In [1]:
import pandas as pd
import numpy as np
import glob
import yaml

import xml.etree.ElementTree as ET

In [2]:
# Read the dataset configuration (parsed with yaml.safe_load) and pull out
# the location of the metadata CSV.
with open("config_dataset.yaml", mode="r") as stream:
    config_dataset = yaml.safe_load(stream)

metadata_path = config_dataset.__getitem__('metadata_path')
metadata_path

'data/metadata_ff.csv'

In [4]:
# Load the metadata table (image/RNA file names, case and sample ids) from
# the CSV path taken from the configuration.
metadata = pd.read_csv(filepath_or_buffer=metadata_path)
metadata

Unnamed: 0,image_path,rna_path,case_id,sample_slide_id,sample_rna_id,sample_type,data_type_info,id_pair
0,TCGA-BP-4775-01A-01-TS1.e9eb861b-be79-49fd-a90...,b87fe250-4800-4a9f-945e-7544057449d9.rna_seq.a...,TCGA-BP-4775,TCGA-BP-4775-01A,TCGA-BP-4775-01A,Primary Tumor,TS,0
1,TCGA-B0-4815-01A-01-TS1.24541590-fdf9-4f7d-b6d...,c61e31b4-5c74-4d86-97cb-16be49b8a5b1.rna_seq.a...,TCGA-B0-4815,TCGA-B0-4815-01A,TCGA-B0-4815-01A,Primary Tumor,TS,1
2,TCGA-B0-5088-01A-01-TS1.d6eb9c0d-c866-4473-862...,4ab7d102-f5eb-4922-be66-9b516bebaf0a.rna_seq.a...,TCGA-B0-5088,TCGA-B0-5088-01A,TCGA-B0-5088-01A,Primary Tumor,TS,2
3,TCGA-B0-4712-01A-01-TS1.c7ac6556-e3c1-4bb2-9c2...,0cf6e899-af6e-44ac-a6c7-5f3fa68ea255.rna_seq.a...,TCGA-B0-4712,TCGA-B0-4712-01A,TCGA-B0-4712-01A,Primary Tumor,TS,3
4,TCGA-CJ-6030-11A-01-TS1.9be20b43-0879-4510-877...,be992b6a-3878-46dd-8e18-9fd0ee7e7b00.rna_seq.a...,TCGA-CJ-6030,TCGA-CJ-6030-11A,TCGA-CJ-6030-11A,Solid Tissue Normal,TS,4
...,...,...,...,...,...,...,...,...
527,TCGA-B0-5706-01A-01-TS1.89d9608f-f7ce-4e04-a82...,d278e333-b5e0-4ce8-9511-6c31d29c8957.rna_seq.a...,TCGA-B0-5706,TCGA-B0-5706-01A,TCGA-B0-5706-01A,Primary Tumor,TS,527
528,TCGA-B8-5553-01A-01-TS1.3ec5146b-9b14-4e53-b94...,297e8576-6af1-4f8d-b709-a14355728c0c.rna_seq.a...,TCGA-B8-5553,TCGA-B8-5553-01A,TCGA-B8-5553-01A,Primary Tumor,TS,528
529,TCGA-CJ-5684-01A-01-TS1.33ab47f6-eb89-4b4b-a6d...,628efe45-6269-4c4b-8f50-15be89877383.rna_seq.a...,TCGA-CJ-5684,TCGA-CJ-5684-01A,TCGA-CJ-5684-01A,Primary Tumor,TS,529
530,TCGA-CZ-5988-11A-01-TS1.365becdb-a9e2-4538-9e1...,fb312dd9-da8b-4190-999e-37cad1d683d5.rna_seq.a...,TCGA-CZ-5988,TCGA-CZ-5988-11A,TCGA-CZ-5988-11A,Solid Tissue Normal,TS,530


In [5]:
def load_last_vist_day(sample_id):
    """Return vital status and the matching time-to-event for one case.

    Searches ``data/*/xml/*/*.{sample_id}.xml``, parses the first match (in
    sorted order, so the choice is deterministic when several files match),
    and walks every element of the tree. For each of the tag fragments
    ``vital_status``, ``days_to_death`` and ``days_to_last_followup`` the
    last *non-empty* text encountered is kept, so an empty element later in
    the document does not erase a value found earlier.

    Parameters
    ----------
    sample_id : str
        Identifier embedded in the XML file name.

    Returns
    -------
    tuple
        ``(sample_id, status, days)``. ``days`` is the (whitespace-stripped)
        text of ``days_to_last_followup`` when ``status == "Alive"`` and of
        ``days_to_death`` otherwise. ``(sample_id, np.nan, np.nan)`` is
        returned when no file matches, the file cannot be read or parsed, or
        the status / required day count is absent.
    """
    missing = (sample_id, np.nan, np.nan)

    matches = sorted(glob.glob(f'data/*/xml/*/*.{sample_id}.xml'))
    if not matches:
        return missing

    # Only the failures we expect from file access / XML parsing are treated
    # as "no data"; anything else (e.g. KeyboardInterrupt) propagates.
    try:
        root = ET.parse(matches[0]).getroot()
    except (ET.ParseError, OSError):
        return missing

    found = {
        "vital_status": None,
        "days_to_death": None,
        "days_to_last_followup": None,
    }
    for elem in root.iter():
        if not isinstance(elem.tag, str):
            continue
        text = (elem.text or "").strip()
        if not text:
            # Empty placeholders must not overwrite a value seen earlier.
            continue
        # Substring match because tags may carry a namespace prefix.
        for fragment in found:
            if fragment in elem.tag:
                found[fragment] = text

    status = found["vital_status"]
    if status is None:
        return missing

    if status == "Alive":
        days = found["days_to_last_followup"]
    else:
        days = found["days_to_death"]

    if days is None:
        return missing
    return (sample_id, status, days)

In [6]:
# Look up survival information for every metadata row (repeated case_ids are
# looked up again), then arrange the returned 3-tuples into a frame.
# Note: the "censored" column receives the status element of each tuple.
survival_records = [load_last_vist_day(case_id) for case_id in metadata.case_id]
survival_metadata = pd.DataFrame(
    survival_records,
    columns=["case_id", "censored", "event_time"],
)
survival_metadata

Unnamed: 0,case_id,censored,event_time
0,TCGA-BP-4775,Alive,1843
1,TCGA-B0-4815,,
2,TCGA-B0-5088,,
3,TCGA-B0-4712,,
4,TCGA-CJ-6030,Dead,2299
...,...,...,...
527,TCGA-B0-5706,Alive,3205
528,TCGA-B8-5553,Alive,435
529,TCGA-CJ-5684,Alive,2231
530,TCGA-CZ-5988,Alive,693


In [7]:
# Keep one row per case, discard rows without an event time, and keep only
# strictly positive event times. event_time itself is left unconverted; the
# integer cast is used for the comparison only.
survival_metadata = (
    survival_metadata
    .drop_duplicates(subset='case_id')
    .loc[lambda frame: frame.event_time.notna()]
    .loc[lambda frame: frame.event_time.astype(int) > 0]
)
survival_metadata

Unnamed: 0,case_id,censored,event_time
0,TCGA-BP-4775,Alive,1843
4,TCGA-CJ-6030,Dead,2299
6,TCGA-DV-5565,Alive,1329
8,TCGA-BP-4994,Alive,1308
10,TCGA-BP-4986,Alive,785
...,...,...,...
522,TCGA-B0-4819,Dead,183
525,TCGA-BP-5170,Alive,2412
526,TCGA-B0-4814,Dead,168
528,TCGA-B8-5553,Alive,435


In [8]:
# Tally how many remaining cases carry each status value.
survival_metadata["censored"].value_counts()

censored
Alive    281
Dead     142
Name: count, dtype: int64

In [9]:
# Save the filtered table under the metadata path with ".csv" removed and
# "_survival.csv" appended; the DataFrame index is not written.
survival_csv_path = f'{metadata_path.replace(".csv", "")}_survival.csv'
survival_metadata.to_csv(survival_csv_path, index=False)