In [1]:
import pandas as pd
import yaml
import os
from modelling_prep import *

  from pandas.core import (
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Config setup

In [2]:
# LOAD IN CONFIGURATION FILE AND SET UP VARIABLES
configFile = "../config/Head-Neck-PET-CT_config.yaml"
# Read the YAML inside a context manager so the file handle is closed
# as soon as parsing finishes (the bare open() call left it dangling).
with open(configFile) as configStream:
    config = yaml.safe_load(configStream)

# Input data locations
clinicalDataPath = config['clinical_data_path']
radiomicDirPath = config['radiomic_data_dir_path']

# Dataset name (used to build input/output file names), the negative
# controls to iterate over, and the outcome description consumed by setupOutcome
datasetName = config["dataset_name"]
negControlNames = config['negative_control_names']
outcomeStatus = config['outcome_status']

# Optional exclusion rules and the train/test split definition
subsetExcludeVariables = config['exclusion_variables']
splitVariables = config['split_variables']

# Root directory under which every output file is written
outputDir = config['output_dir_path']

## Clinical data setup

In [3]:
# Read the clinical spreadsheet exactly as stored on disk
rawClinicalData = pd.read_excel(clinicalDataPath)
# The prep functions expect the patient identifier as the index, so look up
# which column holds it and index the table by that column
clinicalPatID = getPatientIdentifierLabel(rawClinicalData)
completeClinicalData = rawClinicalData.set_index(clinicalPatID)

## Helper Functions

In [4]:
def dataSetup(completeRadiomicData):
    """Index, filter and strip one radiomic feature table.

    Reads the notebook-level ``completeClinicalData`` and
    ``subsetExcludeVariables``.

    Parameters
    ----------
    completeRadiomicData : pandas.DataFrame
        Radiomic table as loaded from file, not yet indexed by patient ID.

    Returns
    -------
    tuple
        ``(filteredClinicalData, filteredRadiomicData, featsOnlyRadiomics)``:
        the two outputs of ``filterDataSetup`` followed by the output of
        ``dropPyradiomicsDiagnostics`` applied to the filtered radiomic table.
    """
    # Index the radiomic table by its patient identifier column for the prep functions
    patientIdColumn = getPatientIdentifierLabel(completeRadiomicData)
    indexedRadiomicData = completeRadiomicData.set_index(patientIdColumn)

    # Forward the exclusion rules only when some are configured; otherwise
    # filterDataSetup is called with its own default
    filterOptions = {}
    if subsetExcludeVariables:
        filterOptions["subsetExcludeVariables"] = subsetExcludeVariables

    filteredClinicalData, filteredRadiomicData = filterDataSetup(
        completeClinicalData, indexedRadiomicData, **filterOptions
    )

    # Keep only radiomic features by dropping the Pyradiomics diagnostics columns
    featsOnlyRadiomics = dropPyradiomicsDiagnostics(filteredRadiomicData)
    return filteredClinicalData, filteredRadiomicData, featsOnlyRadiomics

In [5]:
def setupOutcome(filteredClinicalData, outcomeStatus):
    """Return the event/follow-up columns used as outcome labels.

    Parameters
    ----------
    filteredClinicalData : pandas.DataFrame
        Clinical table containing the event and follow-up columns.
    outcomeStatus : dict
        Must provide ``'event_label'``, ``'event_values'`` and ``'time_label'``.

    Returns
    -------
    The ``[event_label, time_label]`` columns of the clinical table when the
    first event value is an ``int``; otherwise whatever ``outcomeLabelSetup``
    returns for the same inputs.
    """
    eventColumn = outcomeStatus['event_label']
    eventValues = outcomeStatus['event_values']
    timeColumn = outcomeStatus['time_label']

    # Integer event values are used as-is, so the raw columns are returned directly
    if isinstance(eventValues[0], int):
        return filteredClinicalData[[eventColumn, timeColumn]]

    # Non-integer event values: delegate to the helper that sets up the outcome
    # labels for the R script that applies the CPH models
    return outcomeLabelSetup(filteredClinicalData, eventColumn, eventValues, timeColumn)

In [6]:
def makeSignature(featsOnlyRadiomics, outcomeLabels, features=None):
    """Join outcome labels with all, or a chosen subset of, radiomic features.

    Parameters
    ----------
    featsOnlyRadiomics : pandas.DataFrame
        Radiomic feature columns; rows are matched to ``outcomeLabels`` by index.
    outcomeLabels : pandas.DataFrame
        Outcome columns; rows are matched to the features by index.
    features : str or iterable of str, optional
        Column name(s) of ``featsOnlyRadiomics`` to keep. ``None`` or an empty
        collection keeps every column.

    Returns
    -------
    pandas.DataFrame
        Index-on-index merge (pandas' default inner join) with the outcome
        columns first, followed by the selected feature columns.

    Raises
    ------
    KeyError
        If any requested feature is not a column of ``featsOnlyRadiomics``.
    """
    # Normalise the selection to a plain list. None replaces the former mutable
    # default, and list() avoids the ambiguous truth value of array-likes and
    # stops a tuple being read as one single column key.
    if features is None:
        selected = []
    elif isinstance(features, str):
        selected = [features]
    else:
        selected = list(features)

    if selected:
        missing = [name for name in selected if name not in featsOnlyRadiomics.columns]
        if missing:
            raise KeyError(f"Requested features not found in radiomic data: {missing}")
        signatureRadiomics = featsOnlyRadiomics[selected]
    else:
        signatureRadiomics = featsOnlyRadiomics

    return pd.merge(outcomeLabels, signatureRadiomics, left_index=True, right_index=True)

# Running preprocessing

## For original image data

In [7]:
# Build the path to the original-image radiomic feature file, then read it
radiomicFileName = f"radiomicfeatures_{datasetName}.csv"
radiomicDataPath = os.path.join(radiomicDirPath, radiomicFileName)
completeRadiomicData = pd.read_csv(radiomicDataPath)

In [8]:
# RUN DATA SETUP
# Index and filter the original-image radiomic table against the clinical data
filteredClinical, filteredOriginalRadData, featsOnlyOriginalRadiomics = dataSetup(completeRadiomicData)
# Build the outcome columns described by the config's outcome_status entry
outcomeLabels = setupOutcome(filteredClinical, outcomeStatus)
# No feature subset is passed, so every radiomic feature column is merged with the outcome labels
allFeatures = makeSignature(featsOnlyOriginalRadiomics, outcomeLabels)

Multiple patient identifier labels found. Using the first one.


In [None]:
# SAVE OUT FILTERED CLINICAL DATA
# The spreadsheet is written into the same directory as the source clinical file
clinicalDataDir = os.path.dirname(clinicalDataPath)
filteredClinicalOutFile = os.path.join(clinicalDataDir, f"filtered_r2r_clinical_data_{datasetName}.xlsx")
filteredClinical.to_excel(filteredClinicalOutFile, index_label="patientID")

In [None]:
# Set up output file name
outputAllFeatFile = "labeled_radiomic_features_" + datasetName + ".csv"
# Set up output directory; exist_ok=True replaces the check-then-create pattern
# (no race between the existence test and the creation) and keeps the cell re-runnable
allFeatsDirPath = os.path.join(outputDir, "all_features")
os.makedirs(allFeatsDirPath, exist_ok=True)
# Save out cph model features, writing the index under the "patientID" header
allFeatures.to_csv(os.path.join(allFeatsDirPath, outputAllFeatFile), index_label="patientID")

In [None]:
# Make signatures: each one is the outcome labels merged with a fixed subset of feature columns
aertsFeatureNames = [
    'original_firstorder_Energy',
    'original_shape_Compactness1',
    'original_glrlm_GrayLevelNonUniformity',
    'wavelet-HLH_glrlm_GrayLevelNonUniformity',
]
aertsData = makeSignature(featsOnlyOriginalRadiomics, outcomeLabels, features=aertsFeatureNames)

# Volume-only signature: a single shape feature
volumeFeatureNames = ["original_shape_MeshVolume"]
volumeData = makeSignature(featsOnlyOriginalRadiomics, outcomeLabels, features=volumeFeatureNames)

In [None]:
# Save out Aerts signature
outputAertsFile = "aerts_radiomic_features_" + datasetName + ".csv"

# Set up output directory; exist_ok=True avoids the check-then-create race
# and makes the cell safe to re-run
aertsFeatsDirPath = os.path.join(outputDir,"aerts_signature")
os.makedirs(aertsFeatsDirPath, exist_ok=True)

# Write the index under the "patientID" header
aertsData.to_csv(os.path.join(aertsFeatsDirPath, outputAertsFile), index_label="patientID")

In [None]:
# Save out volume signature
outputVolumeFile = "vol_only_radiomic_features_" + datasetName + ".csv"

# Set up output directory; exist_ok=True avoids the check-then-create race
# and makes the cell safe to re-run
volumeFeatsDirPath = os.path.join(outputDir,"volume_signature")
os.makedirs(volumeFeatsDirPath, exist_ok=True)

# Write the index under the "patientID" header
volumeData.to_csv(os.path.join(volumeFeatsDirPath, outputVolumeFile), index_label="patientID")

## For negative controls

In [None]:
# Process every configured negative control the same way as the original image data.
# All intermediate results use loop-local "nc*" names: the previous version rebound
# datasetName, completeRadiomicData, filteredClinical, outcomeLabels, allFeatures,
# aertsData and volumeData, so later cells (train/test split, demographics) silently
# operated on the LAST negative control instead of the original dataset.
# `or []` lets an empty/missing negative_control_names entry in the config mean "no controls".
for negControl in (negControlNames or []):
    # Put negative control at front of dataset
    ncDatasetName = negControl + "_" + config["dataset_name"]
    ncRadiomicDataPath = os.path.join(radiomicDirPath, ("radiomicfeatures_" + ncDatasetName + ".csv"))
    ncRadiomicData = pd.read_csv(ncRadiomicDataPath)

    ncClinical, ncFilteredRadData, ncFeatsOnlyRadiomics = dataSetup(ncRadiomicData)
    ncOutcomeLabels = setupOutcome(ncClinical, outcomeStatus)
    ncAllFeatures = makeSignature(ncFeatsOnlyRadiomics, ncOutcomeLabels)

    # Save out cph model features (create the directory here so this cell
    # does not depend on the original-data save cells having been run)
    ncAllFeatsDir = os.path.join(outputDir, "all_features")
    os.makedirs(ncAllFeatsDir, exist_ok=True)
    ncAllFeatFile = "labeled_radiomic_features_" + ncDatasetName + ".csv"
    ncAllFeatures.to_csv(os.path.join(ncAllFeatsDir, ncAllFeatFile), index_label="patientID")

    # AERTS SIGNATURE
    ncAertsData = makeSignature(ncFeatsOnlyRadiomics, ncOutcomeLabels,
                                features=['original_firstorder_Energy',
                                          'original_shape_Compactness1',
                                          'original_glrlm_GrayLevelNonUniformity',
                                          'wavelet-HLH_glrlm_GrayLevelNonUniformity'])

    # Save out Aerts signature
    ncAertsDir = os.path.join(outputDir, "aerts_signature")
    os.makedirs(ncAertsDir, exist_ok=True)
    ncAertsFile = "aerts_radiomic_features_" + ncDatasetName + ".csv"
    ncAertsData.to_csv(os.path.join(ncAertsDir, ncAertsFile), index_label="patientID")

    # VOLUME SIGNATURE
    ncVolumeData = makeSignature(ncFeatsOnlyRadiomics, ncOutcomeLabels,
                                 features=["original_shape_MeshVolume"])

    # Save out volume signature
    ncVolumeDir = os.path.join(outputDir, "volume_signature")
    os.makedirs(ncVolumeDir, exist_ok=True)
    ncVolumeFile = "vol_only_radiomic_features_" + ncDatasetName + ".csv"
    ncVolumeData.to_csv(os.path.join(ncVolumeDir, ncVolumeFile), index_label="patientID")

## Train / Test split

In [None]:
# Signatures to split; each key becomes part of the output directory and file names.
# NOTE(review): this cell uses whatever allFeatures / aertsData / volumeData /
# filteredClinical / datasetName are bound to when it runs. If an earlier cell
# rebinds those names, the split is performed on that data instead — confirm
# the intended inputs before trusting the outputs.
signatures = {'all': allFeatures, 'aerts': aertsData, 'volume': volumeData}

for sig, features in signatures.items():
    trainOutputDir = os.path.join(outputDir, "training", (sig + "_signature"))
    testOutputDir = os.path.join(outputDir, "test", (sig + "_signature"))

    # exist_ok=True replaces the check-then-create pattern (no race, re-runnable)
    os.makedirs(trainOutputDir, exist_ok=True)
    os.makedirs(testOutputDir, exist_ok=True)

    # Split the labelled features using the split definition from the config;
    # the result is indexed by split name
    splitClinical, splitFeatures = splitDataSetup(filteredClinical, features, splitVariables=splitVariables)

    trainFeatures = splitFeatures['training']
    testFeatures = splitFeatures['test']

    outputTrainFeatsFile = "training_" + sig + "_radiomic_features_" + datasetName + ".csv"
    outputTestFeatsFile = "test_" + sig + "_radiomic_features_" + datasetName + ".csv"

    trainFeatures.to_csv(os.path.join(trainOutputDir, outputTrainFeatsFile), index_label="patientID")
    testFeatures.to_csv(os.path.join(testOutputDir, outputTestFeatsFile), index_label="patientID")
    

## Demographic analysis

In [None]:
# Data Demographic Analysis
# Count patients per value of the 'Sex' column; dropna=False also counts missing values
filteredClinical['Sex'].value_counts(dropna=False)

In [None]:
# Summary statistics and age-band counts for the filtered cohort
ageVariable = "Age"
ages = filteredClinical[ageVariable]
print("Median:", ages.median())
print("Min:", ages.min())
print("Max:", ages.max())
print("")
# Bands are half-open [low, high) so every patient with a known age is counted
# exactly once. The first band previously used `<= 40`, which counted patients
# aged exactly 40 in both "<40" and "40-60". The last band includes age 80.
# Missing ages fail every comparison and are therefore not counted in any band.
print("  <40:", int((ages < 40).sum()))
print("40-60:", int(((ages >= 40) & (ages < 60)).sum()))
print("60-80:", int(((ages >= 60) & (ages < 80)).sum()))
print("  >80:", int((ages >= 80).sum()))

# OLD CODE

In [None]:
# Legacy settings: name of the negative control used to build file names in the cells below
negControlName = 'randomized_sampled_full'
# NOTE(review): hard-coded absolute local path; this rebinds the config-derived
# outputDir set earlier in the notebook
outputDir = "/Users/katyscott/Documents/HNC Project/RADCURE/aerts_signature_features"

In [None]:
# Set paths to clinical and radiomic data files
# NOTE(review): hard-coded absolute local paths; clinicalDataPath rebinds the
# config-derived value set earlier in the notebook
clinicalDataPath = "/Users/katyscott/Documents/RADCURE/RADCURE-DA-CLINICAL-2.xlsx"
# The radiomic file name embeds negControlName
radiomicDataPath = "/Users/katyscott/Documents/HNC Project/RADCURE/radiomic_features/RADCURE_complete_" + negControlName + "_radiomics_features.csv"

In [None]:
# Set any conditions for variables to drop from the analysis.
# Maps a clinical column name to the values passed to filterDataSetup as exclusions.
# NOTE(review): rebinds the config-derived subsetExcludeVariables that dataSetup reads.
subsetExcludeVariables = {'RADCURE-challenge': [0],
                          'Ds Site': ['Sarcoma', 'Unknown', 'Paraganglioma', 'Salivary Glands', 'Other', 'benign tumor', 'Orbit', 'Lacrimal gland'] }

In [None]:
# Load in clinical and radiomic data from the legacy paths set above
# NOTE(review): both names are also bound by earlier cells of this notebook; running this cell replaces them
completeClinicalData = pd.read_excel(clinicalDataPath)
completeRadiomicData = pd.read_csv(radiomicDataPath)

In [None]:
# Find the patient ID column in the clinical and radiomic data to set these as the index for all the modelling_prep functionality
# (both tables must still carry the ID as a regular column at this point)
clinicalPatID = getPatientIdentifierLabel(completeClinicalData)
radiomicPatID = getPatientIdentifierLabel(completeRadiomicData)

In [None]:
# Set patient ID as index in clinical and radiomic data for modelling_prep functions
# NOTE(review): each name is rebound to its indexed version, so this cell is not
# re-runnable on its own — after one run the ID column has become the index
completeClinicalData = completeClinicalData.set_index(clinicalPatID)
completeRadiomicData = completeRadiomicData.set_index(radiomicPatID)

In [None]:
# Drop any of the excluded variables
# filterDataSetup returns the clinical and radiomic tables after applying subsetExcludeVariables
filteredClinical, filteredRadiomics = filterDataSetup(completeClinicalData, completeRadiomicData, subsetExcludeVariables=subsetExcludeVariables)

In [None]:
# Get only radiomic features, remove Pyradiomics diagnostics columns
# (the result is the feature table that the signature subsets below are selected from)
featsOnlyRadiomics = dropPyradiomicsDiagnostics(filteredRadiomics)

In [None]:
# Set up the outcome labels for the R script that applies the CPH models
# Event column 'Status' with values 'Alive'/'Dead'; follow-up column "Length FU"
# NOTE(review): a later cell sums a "Status_bool" column, presumably produced by this helper — confirm
outcomeLabels = outcomeLabelSetup(filteredClinical, statusLabel='Status', statusValues=['Alive', 'Dead'], followupLabel="Length FU")

In [None]:
# Save out the filtered clinical data that removed non-RADCURE challenge and low instance disease sites
# NOTE(review): hard-coded absolute local path
filteredClinical.to_csv("/Users/katyscott/Documents/HNC Project/RADCURE/clinical_data/updated_filtered_clinical_data_RADCURE.csv")

In [None]:
# Count remaining patients per value of 'Ds Site' after filtering (missing values are not counted)
filteredClinical['Ds Site'].value_counts()

## Set up Aerts signature

In [None]:
# Get subset of radiomic features optionally
# Column names of the four features making up the Aerts signature
aertsSignature = ['original_firstorder_Energy',
                  'original_shape_Compactness1',
                  'original_glrlm_GrayLevelNonUniformity',
                  'wavelet-HLH_glrlm_GrayLevelNonUniformity']

# Keep only the signature columns
aertsRadiomics = featsOnlyRadiomics[aertsSignature]

# Index-on-index merge of outcome labels with the signature features
aertsLabeledFeats = pd.merge(outcomeLabels, aertsRadiomics, left_index=True, right_index=True)

In [None]:
# Separate clinical into train/test based on column
# Separate radiomic into train/test based on clinical

# Split column and the values that name each split
trainTestSplitInfo = {"RADCURE-challenge": ["training", "test"]}

# Both results are later indexed by split name ("training" / "test")
splitClinical, splitAertsLabeledFeats = splitDataSetup(filteredClinical, aertsLabeledFeats, splitVariables=trainTestSplitInfo)

In [None]:
# Pull the training and test portions out of the split results
trainClinical = splitClinical["training"]
testClinical = splitClinical["test"]

trainAertsLabeledFeats = splitAertsLabeledFeats['training']
testAertsLabeledFeats = splitAertsLabeledFeats['test']

In [None]:
# Save out train/test
# NOTE(review): hard-coded absolute local paths. The training file name embeds
# negControlName while the test file name is fixed to "test_nc_random_..." —
# confirm this mismatch is intended.
trainAertsLabeledFeats.to_csv("/Users/katyscott/Documents/RADCURE/RADCURE_updated_data/uhn_radcure_plus_aerts/training/" + negControlName + "_aerts_w_labels.csv")
testAertsLabeledFeats.to_csv("/Users/katyscott/Documents/RADCURE/RADCURE_updated_data/uhn_radcure_plus_aerts/test/test_nc_random_aerts_w_labels.csv")

In [None]:
# Display the shape of the training clinical split
trainClinical.shape

In [None]:
# Display the shape of the test clinical split
testClinical.shape

In [None]:
# Sum of the "Status_bool" column in the training set
# (presumably the number of events — confirm the column's encoding)
trainAertsLabeledFeats["Status_bool"].sum()

## Set up volume only data

In [None]:
# Volume-only signature: a single column is selected (single-key indexing, so not a list of columns)
volumeRadiomics = featsOnlyRadiomics["original_shape_MeshVolume"]
# Index-on-index merge of outcome labels with the volume feature
volumeLabeledFeats = pd.merge(outcomeLabels, volumeRadiomics, left_index=True, right_index=True)

In [None]:
# Same split definition as used for the Aerts signature above
trainTestSplitInfo = {"RADCURE-challenge": ["training", "test"]}

# NOTE: rebinds splitClinical from the Aerts split
splitClinical, splitVolumeLabeledFeats = splitDataSetup(filteredClinical, volumeLabeledFeats, splitVariables=trainTestSplitInfo)

In [None]:
# Pull the training and test portions out of the volume split results
# (trainClinical / testClinical are rebound here)
trainClinical = splitClinical["training"]
testClinical = splitClinical["test"]

trainVolumeLabeledFeats = splitVolumeLabeledFeats['training']
testVolumeLabeledFeats = splitVolumeLabeledFeats['test']

In [None]:
# Display the shape of the training clinical split produced by the volume split
trainClinical.shape

In [None]:
# Save out the volume-only train/test sets
# NOTE(review): hard-coded absolute local paths
trainVolumeLabeledFeats.to_csv("/Users/katyscott/Documents/RADCURE/RADCURE_updated_data/uhn_radcure_plus_aerts/training/volume_w_labels.csv")
testVolumeLabeledFeats.to_csv("/Users/katyscott/Documents/RADCURE/RADCURE_updated_data/uhn_radcure_plus_aerts/test/test_volume_w_labels.csv")

## Oropharynx Data Only (Kwan model)

In [None]:
# Get OPC-1 patient list to confirm there's no overlap with the test set from RADCURE
# Later cells read the 'RADCURE' and 'OPC-Radiomics' columns of this mapping table
# NOTE(review): hard-coded absolute local path
opc1_patient_list = pd.read_csv("/Users/katyscott/Documents/RADCURE/RADCURE patient id to OPC-Radiomics patient id mapping.csv")

In [None]:
# Select the Oropharynx subset of the test split by 'Ds Site'
opcIdentifier = {'Ds Site': ["Oropharynx"]}
# The split definition is passed positionally here (keyword splitVariables= elsewhere)
testOPCClinical, testOPCAerts = splitDataSetup(testClinical, testAertsLabeledFeats, opcIdentifier)

In [None]:
# Same Oropharynx selection for the volume-only features (testOPCClinical is rebound)
testOPCClinical, testOPCVolume = splitDataSetup(testClinical, testVolumeLabeledFeats, opcIdentifier)

In [None]:
# Pull the 'Oropharynx' entry out of each split result
dfTestOPClinical = testOPCClinical['Oropharynx']
dfTestOPCVolume = testOPCVolume['Oropharynx']
dfTestOPCAerts = testOPCAerts['Oropharynx']

In [None]:
# Save out the Oropharynx-only test sets
# NOTE(review): hard-coded absolute local paths; the Aerts file name is fixed to
# "nc_random" rather than derived from negControlName
dfTestOPCAerts.to_csv("/Users/katyscott/Documents/RADCURE/RADCURE_updated_data/uhn_radcure_plus_aerts/test/OPC_test_nc_random_aerts_w_labels.csv")
dfTestOPCVolume.to_csv("/Users/katyscott/Documents/RADCURE/RADCURE_updated_data/uhn_radcure_plus_aerts/test/OPC_test_volume_w_labels.csv")

### Confirming Test OPC set doesn't overlap with the OPC-Radiomics dataset

In [None]:
# Index values (patient IDs) of the Oropharynx test clinical table as a list
test_opc_pat_list = dfTestOPClinical.index.tolist()

In [None]:
# IDs present in both the mapping's 'RADCURE' column and the OPC test set; an empty list means no overlap
list(set(opc1_patient_list['RADCURE']).intersection(test_opc_pat_list))

## Get training OPC-1 dataset

In [None]:
# Radiomic rows whose index appears in the mapping's 'RADCURE' column
opc1_train_radiomic = completeRadiomicData[completeRadiomicData.index.isin(opc1_patient_list['RADCURE'])]
# Clinical rows for exactly those patients (taken from the unfiltered clinical table)
opc1_train_clinical = completeClinicalData[completeClinicalData.index.isin(opc1_train_radiomic.index)]

In [None]:
# Outcome labels for the OPC-1 training patients, using the same column names and event values as the main cohort
opc1_outcome_labels = outcomeLabelSetup(opc1_train_clinical, statusLabel='Status', statusValues=['Alive', 'Dead'], followupLabel="Length FU")

In [None]:
# Remove the Pyradiomics diagnostics columns, then keep only the Aerts signature columns
opc1_train_featsOnly = dropPyradiomicsDiagnostics(opc1_train_radiomic)
opc1_train_aerts_radiomics = opc1_train_featsOnly[aertsSignature]

# Index-on-index merge of outcome labels with the signature features
opc1AertsLabeledFeats = pd.merge(opc1_outcome_labels, opc1_train_aerts_radiomics, left_index=True, right_index=True)

In [None]:
# Save out the OPC-1 training set with Aerts features and labels
# NOTE(review): hard-coded absolute local path
opc1AertsLabeledFeats.to_csv("/Users/katyscott/Documents/RADCURE/RADCURE_updated_data/uhn_radcure_plus_aerts/training/OPC1_train_aerts_w_labels.csv")

### Getting actual OPC-Radiomics Kwan subset from p16 status

In [None]:
# Load the OPC-1 p16-positive subset and take its 'Trial PatientID' column
# NOTE(review): hard-coded absolute local path
opc1_p16_positive_clinical = pd.read_excel("/Users/katyscott/Documents/RADCURE/OPC1_p16_positive_subset.xlsx")
pat_ids_opc1_p16_positive = opc1_p16_positive_clinical['Trial PatientID']

In [None]:
# Mapping rows whose 'OPC-Radiomics' ID is in the p16-positive subset
pat_ids_radcure_opc1_p16_pos = opc1_patient_list[opc1_patient_list['OPC-Radiomics'].isin(pat_ids_opc1_p16_positive)]

In [None]:
# Load the list of multiple-lesion patient IDs
# NOTE(review): hard-coded absolute path into a local Downloads folder
multi_lesion = pd.read_csv("/Users/katyscott/Downloads/multiple-lesions-ids.csv")

# p16-positive mapping rows whose 'RADCURE' ID also appears in the multi-lesion 'USUBJID' column
pat_id_multilesion_p16_pos = pat_ids_radcure_opc1_p16_pos[pat_ids_radcure_opc1_p16_pos['RADCURE'].isin(multi_lesion['USUBJID'])]