In [5]:
import pandas as pd
import yaml
import os
from modelling_prep import *

In [None]:
# Path to the YAML configuration file for this run.
# None is a placeholder: set it to a real path before executing this cell.
configFile = None

if configFile is None:
    raise ValueError("configFile is not set; point it at the YAML configuration file before running this cell.")

# Read the configuration inside a context manager so the file handle is always closed.
with open(configFile) as configHandle:
    config = yaml.safe_load(configHandle)

# Clinical data file and the directory that holds the radiomic feature files
clinicalDataPath = config['clinical_data_path']
radiomicDirPath = config['radiomic_data_dir_path']

datasetName = config["dataset_name"]
negControlName = config['neg_control_name']
# NOTE(review): the original assignment had no right-hand side (a syntax error).
# The key name 'outcome_status' is an assumption -- confirm it against the config schema.
# .get() yields None when the key is absent, so a config without it still loads.
outcomeStatus = config.get('outcome_status')

# Exclusion conditions read from the configuration
subsetExcludeVariables = config['exclusion_variables']

# Build the radiomic feature file path from the radiomic *directory*
# (the original joined onto radiomicDataPath, which is not defined at this point).
radiomicDataPath = os.path.join(radiomicDirPath, "radiomic-features_" + negControlName + "_" + datasetName + ".csv")

In [2]:
# Directory intended for Aerts signature feature outputs
# (not referenced by the other cells visible in this notebook).
outputDir = "/Users/katyscott/Documents/HNC Project/RADCURE/aerts_signature_features"

# Name of the negative control; used below to build radiomic input and output file names.
negControlName = 'randomized_sampled_full'

In [20]:
# Clinical data spreadsheet
clinicalDataPath = "/Users/katyscott/Documents/RADCURE/RADCURE-DA-CLINICAL-2.xlsx"

# Radiomic feature file: the negative control name sits between a fixed prefix and suffix
radiomicPathPrefix = "/Users/katyscott/Documents/HNC Project/RADCURE/radiomic_features/RADCURE_complete_"
radiomicPathSuffix = "_radiomics_features.csv"
radiomicDataPath = radiomicPathPrefix + negControlName + radiomicPathSuffix

In [3]:
# Exclusion conditions passed to filterDataSetup: each key is a clinical column name,
# each value lists the entries of that column to drop from the analysis.
subsetExcludeVariables = {
    'RADCURE-challenge': [0],
    'Ds Site': [
        'Sarcoma',
        'Unknown',
        'Paraganglioma',
        'Salivary Glands',
        'Other',
        'benign tumor',
        'Orbit',
        'Lacrimal gland',
    ],
}

In [4]:
# Read the clinical spreadsheet (Excel) and the radiomic feature table (CSV)
# from the paths set above.
completeClinicalData = pd.read_excel(io=clinicalDataPath)
completeRadiomicData = pd.read_csv(filepath_or_buffer=radiomicDataPath)

In [5]:
# Look up the patient ID column label in each table; these labels become the index
# that the modelling_prep functions work with.
clinicalPatID, radiomicPatID = (
    getPatientIdentifierLabel(completeClinicalData),
    getPatientIdentifierLabel(completeRadiomicData),
)

In [6]:
# Index both tables by patient ID so the modelling_prep functions can align them.
# NOTE: both names are rebound to the re-indexed frames, so re-running this cell
# without reloading the data will fail (the ID column has already moved to the index).
completeClinicalData = completeClinicalData.set_index(keys=clinicalPatID)
completeRadiomicData = completeRadiomicData.set_index(keys=radiomicPatID)

In [7]:
# Drop any of the excluded variables
# Applies the conditions in subsetExcludeVariables and returns the filtered clinical
# and radiomic tables as a pair.
# NOTE(review): presumably the radiomic table is restricted to the patients kept in the
# clinical table -- confirm in modelling_prep.filterDataSetup.
filteredClinical, filteredRadiomics = filterDataSetup(completeClinicalData, completeRadiomicData, subsetExcludeVariables=subsetExcludeVariables)

In [8]:
# Get only radiomic features, remove Pyradiomics diagnostics columns
# The result is the feature table that the signature and volume subsets below are selected from.
featsOnlyRadiomics = dropPyradiomicsDiagnostics(filteredRadiomics)

In [9]:
# Set up the outcome labels for the R script that applies the CPH models
# Uses the 'Status' column (values 'Alive' / 'Dead') and the 'Length FU' follow-up column
# of the filtered clinical table.
# NOTE(review): presumably this is where the 'Status_bool' column summed later in the
# notebook comes from -- confirm in modelling_prep.outcomeLabelSetup.
outcomeLabels = outcomeLabelSetup(filteredClinical, statusLabel='Status', statusValues=['Alive', 'Dead'], followupLabel="Length FU")

In [11]:
# Write the filtered clinical table (non-RADCURE-challenge patients and low-instance
# disease sites removed) to CSV.
filteredClinicalOutPath = "/Users/katyscott/Documents/HNC Project/RADCURE/clinical_data/updated_filtered_clinical_data_RADCURE.csv"
filteredClinical.to_csv(filteredClinicalOutPath)

In [12]:
# Count patients per disease site after filtering.
# NOTE: the recorded output lists case variants separately ('nasal cavity' / 'Nasal Cavity',
# 'esophagus' / 'Esophagus'), so 'Ds Site' values are not case-normalised at this point.
filteredClinical['Ds Site'].value_counts()

Ds Site
Oropharynx           1076
Larynx                713
Nasopharynx           319
Hypopharynx           139
Lip & Oral Cavity      81
nasal cavity           28
Paranasal Sinus        24
Nasal Cavity           21
esophagus              13
Esophagus               9
Name: count, dtype: int64

## Set up Aerts signature

In [13]:
# Radiomic feature columns that make up the Aerts signature
aertsSignature = [
    'original_firstorder_Energy',
    'original_shape_Compactness1',
    'original_glrlm_GrayLevelNonUniformity',
    'wavelet-HLH_glrlm_GrayLevelNonUniformity',
]

# Keep only the signature columns of the feature table
aertsRadiomics = featsOnlyRadiomics.loc[:, aertsSignature]

# Attach the outcome labels by joining on the shared index
# (pandas' default inner join keeps only index values present in both tables).
aertsLabeledFeats = pd.merge(
    left=outcomeLabels,
    right=aertsRadiomics,
    left_index=True,
    right_index=True,
)

In [14]:
# Separate clinical into train/test based on column
# Separate radiomic into train/test based on clinical
# The split is driven by the 'RADCURE-challenge' column with the values listed below.

trainTestSplitInfo = {"RADCURE-challenge": ["training", "test"]}

# Both results are indexed by split value ("training" / "test") in the next cell.
splitClinical, splitAertsLabeledFeats = splitDataSetup(filteredClinical, aertsLabeledFeats, splitVariables=trainTestSplitInfo)

Getting split for  RADCURE-challenge


In [17]:
# Pull the training and test partitions out of the split results
trainClinical, testClinical = splitClinical["training"], splitClinical["test"]

trainAertsLabeledFeats, testAertsLabeledFeats = (
    splitAertsLabeledFeats['training'],
    splitAertsLabeledFeats['test'],
)

In [16]:
# Write the labelled Aerts signature features for each partition to CSV.
# NOTE(review): the training file name is derived from negControlName, but the test file
# name is fixed ("test_nc_random_..."), so changing negControlName overwrites the same
# test file -- confirm this is intended.
trainAertsOutPath = "/Users/katyscott/Documents/RADCURE/RADCURE_updated_data/uhn_radcure_plus_aerts/training/" + negControlName + "_aerts_w_labels.csv"
testAertsOutPath = "/Users/katyscott/Documents/RADCURE/RADCURE_updated_data/uhn_radcure_plus_aerts/test/test_nc_random_aerts_w_labels.csv"

trainAertsLabeledFeats.to_csv(trainAertsOutPath)
testAertsLabeledFeats.to_csv(testAertsOutPath)

In [None]:
# Show the shape of the training clinical partition
trainClinical.shape

In [None]:
# Show the shape of the test clinical partition
testClinical.shape

In [None]:
# Sum of the 'Status_bool' column over the training partition
# (presumably the number of events in the training set -- confirm how Status_bool is encoded).
trainAertsLabeledFeats["Status_bool"].sum()

## Set up volume-only data

In [17]:
# Select the single mesh-volume feature column
volumeRadiomics = featsOnlyRadiomics.loc[:, "original_shape_MeshVolume"]

# Attach the outcome labels by joining on the shared index
volumeLabeledFeats = pd.merge(
    left=outcomeLabels,
    right=volumeRadiomics,
    left_index=True,
    right_index=True,
)

In [18]:
# Same train/test split definition as used for the Aerts signature features
trainTestSplitInfo = {"RADCURE-challenge": ["training", "test"]}

# Split the clinical table again together with the volume features.
# Note: this rebinds splitClinical, replacing the result of the earlier Aerts split.
splitClinical, splitVolumeLabeledFeats = splitDataSetup(filteredClinical, volumeLabeledFeats, splitVariables=trainTestSplitInfo)

Getting split for  RADCURE-challenge


In [19]:
# Pull the training and test partitions out of the volume split results
trainClinical, testClinical = splitClinical["training"], splitClinical["test"]

trainVolumeLabeledFeats, testVolumeLabeledFeats = (
    splitVolumeLabeledFeats['training'],
    splitVolumeLabeledFeats['test'],
)

In [None]:
# Show the shape of the training clinical partition from the volume split
trainClinical.shape

In [None]:
# Write the labelled volume feature for each partition to CSV
trainVolumeOutPath = "/Users/katyscott/Documents/RADCURE/RADCURE_updated_data/uhn_radcure_plus_aerts/training/volume_w_labels.csv"
testVolumeOutPath = "/Users/katyscott/Documents/RADCURE/RADCURE_updated_data/uhn_radcure_plus_aerts/test/test_volume_w_labels.csv"

trainVolumeLabeledFeats.to_csv(trainVolumeOutPath)
testVolumeLabeledFeats.to_csv(testVolumeOutPath)

## Oropharynx Data Only (Kwan model)

In [20]:
# Load the RADCURE <-> OPC-Radiomics patient ID mapping; it is used below to check that
# the OPC-1 patients do not overlap with the RADCURE test set.
opc1_mapping_path = "/Users/katyscott/Documents/RADCURE/RADCURE patient id to OPC-Radiomics patient id mapping.csv"
opc1_patient_list = pd.read_csv(opc1_mapping_path)

In [21]:
# Select on the 'Ds Site' column, keeping the "Oropharynx" value
opcIdentifier = {'Ds Site': ["Oropharynx"]}
# Split the test clinical table and the test Aerts features by disease site.
# NOTE(review): opcIdentifier is passed positionally here while other calls use
# splitVariables=...; assumes it is the third positional parameter -- confirm.
testOPCClinical, testOPCAerts = splitDataSetup(testClinical, testAertsLabeledFeats, opcIdentifier)

Getting split for  Ds Site


In [22]:
# Same disease-site split for the test volume features.
# Note: testOPCClinical is rebound here, replacing the result of the previous cell.
testOPCClinical, testOPCVolume = splitDataSetup(testClinical, testVolumeLabeledFeats, opcIdentifier)

Getting split for  Ds Site


In [23]:
# Take the 'Oropharynx' entry from each disease-site split result.
# (The clinical name is spelled dfTestOPClinical; later cells refer to it by that name.)
dfTestOPCAerts = testOPCAerts['Oropharynx']
dfTestOPCVolume = testOPCVolume['Oropharynx']
dfTestOPClinical = testOPCClinical['Oropharynx']

In [None]:
# Write the oropharynx-only test partitions (Aerts signature and volume) to CSV
opcTestAertsOutPath = "/Users/katyscott/Documents/RADCURE/RADCURE_updated_data/uhn_radcure_plus_aerts/test/OPC_test_nc_random_aerts_w_labels.csv"
opcTestVolumeOutPath = "/Users/katyscott/Documents/RADCURE/RADCURE_updated_data/uhn_radcure_plus_aerts/test/OPC_test_volume_w_labels.csv"

dfTestOPCAerts.to_csv(opcTestAertsOutPath)
dfTestOPCVolume.to_csv(opcTestVolumeOutPath)

### Confirming Test OPC set doesn't overlap with the OPC-Radiomics dataset

In [None]:
# Patient IDs (index values) of the oropharynx test clinical table, as a plain list
test_opc_pat_list = dfTestOPClinical.index.tolist()

In [None]:
# IDs present both in the OPC-1 mapping ('RADCURE' column) and in the oropharynx test set.
# An empty list means the two sets do not overlap.
opc1_radcure_ids = set(opc1_patient_list['RADCURE'])
overlapping_ids = opc1_radcure_ids.intersection(test_opc_pat_list)
list(overlapping_ids)

## Get training OPC-1 dataset

In [25]:
# Radiomic rows whose patient ID appears in the 'RADCURE' column of the OPC-1 mapping.
# Note: this selects from the complete (unfiltered) tables, not the filtered ones.
is_opc1_patient = completeRadiomicData.index.isin(opc1_patient_list['RADCURE'])
opc1_train_radiomic = completeRadiomicData[is_opc1_patient]

# Clinical rows for exactly those patients that have OPC-1 radiomic rows
has_opc1_radiomics = completeClinicalData.index.isin(opc1_train_radiomic.index)
opc1_train_clinical = completeClinicalData[has_opc1_radiomics]

In [26]:
# Build outcome labels for the OPC-1 training patients with the same settings used for
# the full cohort: 'Status' column (values 'Alive' / 'Dead') and 'Length FU' follow-up.
opc1_outcome_labels = outcomeLabelSetup(opc1_train_clinical, statusLabel='Status', statusValues=['Alive', 'Dead'], followupLabel="Length FU")

In [28]:
# Strip the Pyradiomics diagnostics columns, then keep only the Aerts signature columns
opc1_train_featsOnly = dropPyradiomicsDiagnostics(opc1_train_radiomic)
opc1_train_aerts_radiomics = opc1_train_featsOnly.loc[:, aertsSignature]

# Attach the OPC-1 outcome labels by joining on the shared index
opc1AertsLabeledFeats = pd.merge(
    left=opc1_outcome_labels,
    right=opc1_train_aerts_radiomics,
    left_index=True,
    right_index=True,
)

In [29]:
# Write the labelled OPC-1 training Aerts features to CSV
opc1_train_out_path = "/Users/katyscott/Documents/RADCURE/RADCURE_updated_data/uhn_radcure_plus_aerts/training/OPC1_train_aerts_w_labels.csv"
opc1AertsLabeledFeats.to_csv(opc1_train_out_path)

### Getting the actual OPC-Radiomics (Kwan) subset based on p16 status

In [36]:
# Load the p16-positive OPC-1 clinical subset and take its 'Trial PatientID' column
opc1_p16_positive_path = "/Users/katyscott/Documents/RADCURE/OPC1_p16_positive_subset.xlsx"
opc1_p16_positive_clinical = pd.read_excel(opc1_p16_positive_path)
pat_ids_opc1_p16_positive = opc1_p16_positive_clinical['Trial PatientID']

In [40]:
# Keep the mapping rows whose 'OPC-Radiomics' ID is in the p16-positive patient list
is_p16_positive = opc1_patient_list['OPC-Radiomics'].isin(pat_ids_opc1_p16_positive)
pat_ids_radcure_opc1_p16_pos = opc1_patient_list[is_p16_positive]

In [42]:
# Load the multiple-lesion ID table
multi_lesion = pd.read_csv("/Users/katyscott/Downloads/multiple-lesions-ids.csv")

# p16-positive mapping rows whose 'RADCURE' ID also appears in the 'USUBJID' column
# of the multiple-lesion table
has_multiple_lesions = pat_ids_radcure_opc1_p16_pos['RADCURE'].isin(multi_lesion['USUBJID'])
pat_id_multilesion_p16_pos = pat_ids_radcure_opc1_p16_pos[has_multiple_lesions]