In [1]:
import pandas as pd
import yaml
import os
from modelling_prep import *
import numpy as np 

  from pandas.core import (
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Helper Functions

In [2]:
def makeSignature(featsOnlyRadiomics, outcomeLabels, features=None):
    """Join outcome labels with radiomic features, optionally limited to a signature.

    Parameters
    ----------
    featsOnlyRadiomics : pd.DataFrame
        Radiomic feature table; joined on its index.
    outcomeLabels : pd.DataFrame
        Outcome columns; joined on its index.
    features : list-like of column labels, optional
        Columns of ``featsOnlyRadiomics`` to keep. ``None`` or an empty
        collection keeps every column.

    Returns
    -------
    pd.DataFrame
        ``pd.merge`` of the two tables on their indexes (default inner join),
        with the outcome columns first.
    """
    if features is None:
        useAllFeatures = True
    else:
        # An explicit length check avoids the "truth value is ambiguous" error
        # that `not features` raises for pd.Index / multi-element numpy arrays.
        try:
            useAllFeatures = len(features) == 0
        except TypeError:
            # Unsized scalar label: keep the original truthiness behaviour.
            useAllFeatures = not features

    selectedRadiomics = featsOnlyRadiomics if useAllFeatures else featsOnlyRadiomics[features]
    return pd.merge(outcomeLabels, selectedRadiomics, left_index=True, right_index=True)

In [3]:
def setupOutcome(filteredClinicalData, outcomeStatus):
    """Build the outcome-label table (event status and follow-up time).

    Parameters
    ----------
    filteredClinicalData : pd.DataFrame
        Clinical table containing the event and follow-up columns.
    outcomeStatus : mapping
        Must provide ``'event_label'``, ``'event_values'`` and ``'time_label'``.

    Returns
    -------
    The two outcome columns selected directly when the first event value is an
    ``int``; otherwise whatever ``outcomeLabelSetup`` returns for them.
    """
    eventColumn = outcomeStatus['event_label']
    eventCodes = outcomeStatus['event_values']
    timeColumn = outcomeStatus['time_label']

    # Integer event values are used as-is: just pick the two outcome columns.
    if isinstance(eventCodes[0], int):
        return filteredClinicalData[[eventColumn, timeColumn]]

    # Non-integer event values go through outcomeLabelSetup, which prepares the
    # outcome labels for the R script that applies the CPH models.
    return outcomeLabelSetup(filteredClinicalData, eventColumn, eventCodes, timeColumn)

In [4]:
def dataSetup(completeRadiomicData):
    """Index, filter and strip the radiomic data alongside the clinical data.

    Reads the notebook-level globals ``completeClinicalData`` and
    ``subsetExcludeVariables``, so both must be defined before this is called.

    Parameters
    ----------
    completeRadiomicData : pd.DataFrame
        Radiomic table that still carries its patient identifier as a column.

    Returns
    -------
    tuple
        ``(filteredClinicalData, filteredRadiomicData, featsOnlyRadiomics)``.
    """
    # Use the patient identifier column as the index expected by the prep functions.
    patientIdColumn = getPatientIdentifierLabel(completeRadiomicData)
    indexedRadiomicData = completeRadiomicData.set_index(patientIdColumn)

    # Forward the exclusion variables only when some are configured; otherwise
    # filterDataSetup is called with its own default for that argument.
    filterOptions = {}
    if subsetExcludeVariables:
        filterOptions['subsetExcludeVariables'] = subsetExcludeVariables
    filteredClinicalData, filteredRadiomicData = filterDataSetup(
        completeClinicalData, indexedRadiomicData, **filterOptions
    )

    # Keep only the radiomic features by dropping the Pyradiomics diagnostics columns.
    featsOnlyRadiomics = dropPyradiomicsDiagnostics(filteredRadiomicData)

    return filteredClinicalData, filteredRadiomicData, featsOnlyRadiomics

## Config setup

In [5]:
# LOAD IN CONFIGURATION FILE AND SET UP VARIABLES
configFile = "../config/HEAD-NECK-RADIOMICS-HN1_config.yaml"
# Use a context manager so the file handle is closed as soon as parsing is done
# (a bare open() inside the call leaves the handle open).
with open(configFile) as configStream:
    config = yaml.safe_load(configStream)

# Input locations
clinicalDataPath = config['clinical_data_path']
radiomicDirPath = config['radiomic_data_dir_path']

# Dataset name, negative control names and outcome definition
datasetName = config["dataset_name"]
negControlNames = config['negative_control_names']
outcomeStatus = config['outcome_status']

# Exclusion, split and train/test settings
subsetExcludeVariables = config['exclusion_variables']
splitVariables = config['split_variables']
trainTestSplit = config['train_test_split']

# Output location
outputDir = config['output_dir_path']

## Data setup

In [6]:
# Read the clinical spreadsheet from the configured path
completeClinicalData = pd.read_excel(io=clinicalDataPath)
# Look up the patient identifier column and use it as the index,
# which is what the prep functions expect
clinicalPatID = getPatientIdentifierLabel(completeClinicalData)
completeClinicalData = completeClinicalData.set_index(keys=clinicalPatID)

In [7]:
# Read the original radiomic features from
# <radiomicDirPath>/radiomicfeatures_<datasetName>.csv
radiomicDataPath = os.path.join(
    radiomicDirPath,
    "".join(["radiomicfeatures_", datasetName, ".csv"]),
)
completeRadiomicData = pd.read_csv(radiomicDataPath)

In [8]:
# RUN DATA SETUP
# 1) filter the clinical/radiomic tables, 2) derive the outcome labels,
# 3) join the outcome labels with every radiomic feature.
(
    filteredClinical,
    filteredOriginalRadData,
    featsOnlyOriginalRadiomics,
) = dataSetup(completeRadiomicData)
outcomeLabels = setupOutcome(
    filteredClinicalData=filteredClinical,
    outcomeStatus=outcomeStatus,
)
allFeatures = makeSignature(
    featsOnlyRadiomics=featsOnlyOriginalRadiomics,
    outcomeLabels=outcomeLabels,
)

Multiple patient identifier labels found. Using the first one.


## Demographic analysis

In [9]:
# Demographics: number of patients per value of the configured sex column
# (missing values are counted as their own category)
sexVariable = config["sex_variable"]
filteredClinical.loc[:, sexVariable].value_counts(dropna=False)

biological_sex
male      111
female     26
Name: count, dtype: int64

In [10]:
# Demographics: summary statistics and age-bin counts for the configured age column
ageVariable = config["age_variable"]
ages = filteredClinical[ageVariable]

print("Median:", ages.median())
print("Min:", ages.min())
print("Max:", ages.max())
print("")
# Bins are half-open so every patient is counted exactly once:
#   <40, [40, 60), [60, 80) and 80-or-older (printed under the ">80" label).
# The first bin previously used `<= 40`, which double-counted 40-year-olds
# in both the "<40" and "40-60" rows.
print("  <40:", int((ages < 40).sum()))
print("40-60:", int(((ages >= 40) & (ages < 60)).sum()))
print("60-80:", int(((ages >= 60) & (ages < 80)).sum()))
print("  >80:", int((ages >= 80).sum()))
print("Quantiles:", np.quantile(ages, [0, 0.25, 0.5, 0.75, 1]))

Median: 61.0
Min: 44
Max: 83

  <40: 0
40-60: 55
60-80: 78
  >80: 4
Quantiles: [44. 56. 61. 67. 83.]
