In [188]:
import os

import numpy as np
import pandas as pd

In [189]:
# Project root directory. The default is the location the notebook was written
# against; set the TFG_ROOT environment variable to run it from another machine
# without editing this cell.
ROOT = os.environ.get('TFG_ROOT', '/mnt/c/Users/sergio/Desktop/TFG')
TCIA_ROOT = f'{ROOT}/TCIA'  # folder of the TCIA cohort
GM_ROOT = f'{ROOT}/GM'  # folder of the Gregorio Marañón (GM) cohort

In [190]:
# Participant metadata tables of each cohort (TCIA: .csv, GM: .tsv).
TCIA = TCIA_ROOT + '/participants.csv'
GM = '/'.join((GM_ROOT, 'GM-BRATS+NORM', 'participants.tsv'))

In [193]:
# IDH label convention: "negative" means wildtype (label 0)
# and "positive" means mutated (label 1).

In [284]:
def idh_status_to_label(idh_status):
    """Map a free-text IDH status to a binary label.

    Matching is case-insensitive and substring-based.

    Parameters
    ----------
    idh_status : str
        Raw IDH status text. It must be a string: ``.lower()`` is called on
        it, so missing values (NaN/None) are not handled here.

    Returns
    -------
    int
        1 (mutated) when the text contains 'mutated', 'idh' or 'positivo';
        0 (wildtype) when it contains 'wildtype' or 'negativo'.

    Raises
    ------
    ValueError
        If the text matches none of the known keywords. The lower-cased text
        is passed as the exception argument.
    """
    idh_status = idh_status.lower()
    # The mutated keywords are tested first, so a text containing 'idh' maps
    # to 1 even when it also contains 'wildtype'.
    if 'mutated' in idh_status or 'idh' in idh_status or 'positivo' in idh_status:
        return 1  # 'mutated'
    elif 'wildtype' in idh_status or 'negativo' in idh_status:
        return 0  # 'wildtype'
    else:
        # Raise (rather than return) the error so an unrecognised status stops
        # the run instead of being stored in the label column.
        raise ValueError(idh_status)

def codeletion_to_label(codeletion):
    """Map a free-text 1p/19q co-deletion status to a binary label.

    Missing values (anything ``pd.isna`` reports as null) are returned
    unchanged. Otherwise the text is lower-cased and searched for keywords:
    'deletion' or 'presente' gives 1 (co-deleted), 'intact' or 'ausente'
    gives 0 (intact). Any other text raises ``ValueError`` carrying the
    lower-cased text.
    """
    # Leave missing entries untouched so they stay missing in the output.
    if pd.isna(codeletion):
        return codeletion

    text = codeletion.lower()

    # Co-deleted keywords take precedence over the intact ones.
    if any(keyword in text for keyword in ('deletion', 'presente')):
        return 1  # 'co-deleted'
    if any(keyword in text for keyword in ('intact', 'ausente')):
        return 0  # 'intact'
    raise ValueError(text)

### **TCIA**

In [280]:
# Load the raw TCIA participants table (semicolon-delimited file).
tcia_df = pd.read_csv(TCIA, delimiter=';')
print(tcia_df.shape[0])  # number of rows read

501


In [285]:
# Columns kept from the raw TCIA table.
TCIA_COLUMNS = ['participant_id', 'sex', 'age', 'who_cns_grade', 'diagnosis', 'idh_status', 'codeletion_1p19q_status']

# Participants left out of the cohort, written exactly as they appear in the
# raw file (i.e. before the IDs are zero-padded below).
TCIA_EXCLUDED_IDS = [
    'sub-138', 'sub-175', 'sub-181', 'sub-278', 'sub-289', 'sub-315',  # not present
    'sub-541',  # corrupted segmentation
]

# Select rows and columns in one step, then rename 'diagnosis' so the column
# name matches the GM table.
tcia_processed = (
    tcia_df
    .loc[~tcia_df['participant_id'].isin(TCIA_EXCLUDED_IDS), TCIA_COLUMNS]
    .rename(columns={'diagnosis': 'histologic_subtype'})
)

# Zero-pad IDs ending in 1-3 digits to 4 digits (e.g. 'sub-004' -> 'sub-0004').
tcia_processed['participant_id'] = tcia_processed['participant_id'].str.replace(r'sub-(\d{1,3})$', lambda x: 'sub-' + x.group(1).zfill(4), regex=True)
# Keep only the text before the first comma of the diagnosis.
tcia_processed['histologic_subtype'] = tcia_processed['histologic_subtype'].str.split(',').str[0]
# Encode the molecular markers as binary labels.
tcia_processed['idh_status'] = tcia_processed['idh_status'].apply(idh_status_to_label)
tcia_processed['codeletion_1p19q_status'] = tcia_processed['codeletion_1p19q_status'].apply(codeletion_to_label)
# Tag every row with its source cohort (a scalar is broadcast to all rows).
tcia_processed['database'] = 'TCIA'
tcia_processed = tcia_processed.reset_index(drop=True)
tcia_processed.head(2)

Unnamed: 0,participant_id,sex,age,who_cns_grade,histologic_subtype,idh_status,codeletion_1p19q_status,database
0,sub-0004,M,66,4,Glioblastoma,0,,TCIA
1,sub-0005,F,80,4,Glioblastoma,0,,TCIA


In [286]:
# Row count of the TCIA cohort after exclusions.
print(tcia_processed.shape[0])

494


### **Gregorio Marañón**

In [272]:
# Load the raw GM participants table (tab-separated, latin1-encoded file).
gm_df = pd.read_csv(GM, encoding='latin1', delimiter='\t')
print(gm_df.shape[0])  # number of rows read

42


In [287]:
# Columns kept from the raw GM table.
GM_COLUMNS = ['participant_id', 'sex', 'age', 'who_cns_grade', 'histologic_subtype', 'idh_status', 'codeletion_1p19q_status']

# Drop participant 'sub-0008' and keep the selected columns. The explicit
# .copy() makes this an independent frame, so the column assignments below do
# not write into a slice of gm_df.
gm_processed = gm_df.loc[gm_df['participant_id'] != 'sub-0008', GM_COLUMNS].copy()

# Title-case the subtype and replace the spelling 'Astrocitoma' with
# 'Astrocytoma'. The vectorized .str methods leave missing values as missing
# instead of failing on them.
gm_processed['histologic_subtype'] = (
    gm_processed['histologic_subtype']
    .str.title()
    .str.replace('Astrocitoma', 'Astrocytoma', regex=False)
)

# Remove the 'Glioma Difuso Linea Media' rows BEFORE encoding the labels, so
# the label functions never see the rows that are excluded anyway.
gm_processed = gm_processed[gm_processed['histologic_subtype'] != 'Glioma Difuso Linea Media'].copy()

# Encode the molecular markers as binary labels.
gm_processed['idh_status'] = gm_processed['idh_status'].apply(idh_status_to_label)
gm_processed['codeletion_1p19q_status'] = gm_processed['codeletion_1p19q_status'].apply(codeletion_to_label)
# Tag every row with its source cohort (a scalar is broadcast to all rows).
gm_processed['database'] = 'GM'
gm_processed = gm_processed.reset_index(drop=True)
gm_processed.head(2)

Unnamed: 0,participant_id,sex,age,who_cns_grade,histologic_subtype,idh_status,codeletion_1p19q_status,database
0,sub-0000,M,50,4,Glioblastoma,0,0.0,GM
1,sub-0002,F,57,4,Glioblastoma,0,0.0,GM


In [288]:
# Participant IDs present in the raw GM table but absent after processing.
set(gm_df['participant_id']) - set(gm_processed['participant_id'])

{'sub-0001', 'sub-0008'}

In [289]:
# Row count of the GM cohort after exclusions.
print(gm_processed.shape[0])

40


In [290]:
# Stack both cohorts (TCIA rows first, then GM) under a fresh 0..n-1 index.
cohorts = [tcia_processed, gm_processed]
final_df = pd.concat(cohorts, ignore_index=True)
# Expose the row number as an explicit first column named 'index'.
final_df.insert(loc=0, column='index', value=final_df.index)
final_df.head(2)

Unnamed: 0,index,participant_id,sex,age,who_cns_grade,histologic_subtype,idh_status,codeletion_1p19q_status,database
0,0,sub-0004,M,66,4,Glioblastoma,0,,TCIA
1,1,sub-0005,F,80,4,Glioblastoma,0,,TCIA


## Save dataset

In [291]:
# Write the merged table to the project root.
# NOTE(review): to_csv also writes the DataFrame index by default, which
# duplicates the explicit 'index' column - confirm downstream readers expect it.
output_path = ROOT + '/participants.csv'
final_df.to_csv(output_path)