In [1]:
import pandas as pd
import numpy as np
import glob
import yaml

import xml.etree.ElementTree as ET

In [4]:
# Parse the dataset configuration file.
with open("config_dataset_ffpe.yaml") as config_file:
    config_dataset = yaml.safe_load(config_file)

# Path of the metadata CSV named in the config; echoed as the cell output.
metadata_path = config_dataset['metadata_path']
metadata_path

'data/metadata_ffpe.csv'

In [5]:
# Load the metadata table from the configured path and display it.
metadata = pd.read_csv(filepath_or_buffer=metadata_path)
metadata

Unnamed: 0,image_path,rna_path,case_id,sample_slide_id,sample_rna_id,sample_type,data_type_info,id_pair
0,TCGA-EE-A3J5-06Z-00-DX1.47202780-2B18-4661-AD4...,05d365ce-b1d0-46af-b74a-b254f2c7de8e.rna_seq.a...,TCGA-EE-A3J5,TCGA-EE-A3J5-06Z,TCGA-EE-A3J5-06A,Metastatic,DX,0
1,TCGA-D3-A8GE-06Z-00-DX1.757AE9F7-823E-4167-B12...,102d912c-5882-4ae4-9eaf-cbd2f55f66af.rna_seq.a...,TCGA-D3-A8GE,TCGA-D3-A8GE-06Z,TCGA-D3-A8GE-06A,Metastatic,DX,1
2,TCGA-D3-A1Q4-06Z-00-DX1.6AD7BBBD-BB47-4D71-B46...,ee3e7969-ff2a-4b27-8f15-6cb46eed7c61.rna_seq.a...,TCGA-D3-A1Q4,TCGA-D3-A1Q4-06Z,TCGA-D3-A1Q4-06A,Metastatic,DX,2
3,TCGA-D3-A2JF-06Z-00-DX1.1AD134CC-6844-45CC-BEC...,39927c92-75d7-424c-9bcd-63def0075a1d.rna_seq.a...,TCGA-D3-A2JF,TCGA-D3-A2JF-06Z,TCGA-D3-A2JF-06A,Metastatic,DX,3
4,TCGA-EB-A3XF-01Z-00-DX1.D381AAB4-242B-45E1-B6A...,393a470f-8641-493a-98fe-e0ec38c8d7f9.rna_seq.a...,TCGA-EB-A3XF,TCGA-EB-A3XF-01Z,TCGA-EB-A3XF-01A,Primary Tumor,DX,4
...,...,...,...,...,...,...,...,...
271,TCGA-XV-AAZW-01Z-00-DX1.26C215F6-0EFA-42D9-A3E...,0e8bcafe-b201-4c66-8a0b-d89a4fd7c7ae.rna_seq.a...,TCGA-XV-AAZW,TCGA-XV-AAZW-01Z,TCGA-XV-AAZW-01A,Primary Tumor,DX,271
272,TCGA-EB-A3XD-01Z-00-DX1.B0C8A3FE-21A5-4807-934...,7645a3b1-639d-4677-8bcf-c16df24fcc2b.rna_seq.a...,TCGA-EB-A3XD,TCGA-EB-A3XD-01Z,TCGA-EB-A3XD-01A,Primary Tumor,DX,272
273,TCGA-ER-A2NB-01Z-00-DX1.323F02C6-D07A-41B6-BEE...,22e9b9d1-ac71-43f5-8606-f9f53c6ecc08.rna_seq.a...,TCGA-ER-A2NB,TCGA-ER-A2NB-01Z,TCGA-ER-A2NB-01A,Primary Tumor,DX,273
274,TCGA-D3-A3C8-06Z-00-DX1.FE6A00E4-C1B4-42D8-9B0...,4ff81b7f-2dbc-453f-a68b-fe40315b2ef7.rna_seq.a...,TCGA-D3-A3C8,TCGA-D3-A3C8-06Z,TCGA-D3-A3C8-06A,Metastatic,DX,274


In [6]:
def load_last_vist_day(sample_id):
    """Read vital status and the matching day count from a sample's clinical XML.

    The first file matching ``data/*/xml/*/*.{sample_id}.xml`` (relative to the
    current working directory) is parsed. Every element is scanned, and the text
    of elements whose tag contains ``vital_status``, ``days_to_death`` or
    ``days_to_last_followup`` is recorded; when a tag occurs several times the
    last occurrence wins.

    Parameters
    ----------
    sample_id
        Identifier embedded in the XML file name.

    Returns
    -------
    tuple
        ``(sample_id, status, days)`` where ``days`` is the
        ``days_to_last_followup`` text when ``status == "Alive"`` and the
        ``days_to_death`` text otherwise. Values are the raw element text
        (strings, or ``None`` for an empty element).
        ``(sample_id, np.nan, np.nan)`` is returned when no file matches, the
        file cannot be read or parsed, or a required element is absent.
    """
    no_record = (sample_id, np.nan, np.nan)

    matches = glob.glob(f'data/*/xml/*/*.{sample_id}.xml')
    if not matches:
        return no_record

    # Only treat I/O and XML syntax problems as "no data"; anything else
    # (including KeyboardInterrupt) should surface instead of being swallowed.
    try:
        root = ET.parse(matches[0]).getroot()
    except (OSError, ET.ParseError):
        return no_record

    wanted = ("vital_status", "days_to_death", "days_to_last_followup")
    found = {}
    for elem in root.iter():
        tag = elem.tag
        if not isinstance(tag, str):
            # Defensive: non-element nodes carry a non-string tag.
            continue
        # Substring match so namespaced tags such as "{ns}vital_status" are hit.
        for field in wanted:
            if field in tag:
                # Later occurrences overwrite earlier ones (last one wins).
                # NOTE(review): an empty later element overwrites an earlier
                # non-empty value with None -- confirm this is intended.
                found[field] = elem.text

    if "vital_status" not in found:
        return no_record

    status = found["vital_status"]
    time_field = "days_to_last_followup" if status == "Alive" else "days_to_death"
    if time_field not in found:
        return no_record

    return (sample_id, status, found[time_field])

In [7]:
# Look up the survival record of every case and assemble the resulting
# (case_id, status, days) tuples into a three-column table.
survival_metadata = pd.DataFrame(
    metadata.case_id.apply(load_last_vist_day).tolist(),
    columns=["case_id", "censored", "event_time"],
)
survival_metadata

Unnamed: 0,case_id,censored,event_time
0,TCGA-EE-A3J5,Dead,1124
1,TCGA-D3-A8GE,Alive,804
2,TCGA-D3-A1Q4,Alive,3408
3,TCGA-D3-A2JF,Alive,1888
4,TCGA-EB-A3XF,Alive,278
...,...,...,...
271,TCGA-XV-AAZW,Dead,393
272,TCGA-EB-A3XD,Alive,1160
273,TCGA-ER-A2NB,Dead,857
274,TCGA-D3-A3C8,,


In [8]:
# Keep one row per case, then only rows that have an event time, then only
# rows whose event time is strictly positive once converted to int.
survival_metadata = (
    survival_metadata
    .drop_duplicates(subset='case_id')
    .loc[lambda frame: frame.event_time.notna()]
    .loc[lambda frame: frame.event_time.astype(int).gt(0)]
)
survival_metadata

Unnamed: 0,case_id,censored,event_time
0,TCGA-EE-A3J5,Dead,1124
1,TCGA-D3-A8GE,Alive,804
2,TCGA-D3-A1Q4,Alive,3408
3,TCGA-D3-A2JF,Alive,1888
4,TCGA-EB-A3XF,Alive,278
...,...,...,...
269,TCGA-D3-A2J6,Dead,1321
271,TCGA-XV-AAZW,Dead,393
272,TCGA-EB-A3XD,Alive,1160
273,TCGA-ER-A2NB,Dead,857


In [9]:
# Tally how many remaining cases fall under each value of the "censored" column.
survival_metadata["censored"].value_counts()

censored
Alive    124
Dead      88
Name: count, dtype: int64

In [10]:
# Derive the output name by stripping only a trailing ".csv" extension;
# str.replace would also remove ".csv" occurring elsewhere in the path.
survival_stem = metadata_path[:-len(".csv")] if metadata_path.endswith(".csv") else metadata_path
# Write the filtered survival table next to the metadata file, without the index.
survival_metadata.to_csv(f'{survival_stem}_survival.csv', index=False)