In [1]:
import sys
import pandas as pd
import os

# Resolve the directory two levels above the current working directory and
# treat it as the project root.
project_root = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))

# Append the project root to sys.path.
sys.path.append(project_root)

# Read the processed clinical CSV from <project_root>/data, using the first
# CSV column as the DataFrame index.
data_dir = os.path.join(project_root, 'data')
data_file_path = os.path.join(data_dir, 'RADCURE_processed_clinical.csv')
df = pd.read_csv(data_file_path, index_col=0)

In [2]:
from jarvais.analyzer import Analyzer
from pprint import pprint

# Drop the "Study ID" column and rename the outcome columns to 'time' and
# 'event'.
# The frame is rebuilt with a method chain instead of being mutated in place,
# and errors="ignore" tolerates an already-removed "Study ID", so re-running
# this cell does not raise KeyError.
df = (
    df
    .drop(columns=["Study ID"], errors="ignore")
    .rename(columns={'survival_time': 'time', 'death': 'event'})
)

# Ask the analyzer for a proposed configuration for this frame and display it
# for review.
config = Analyzer.dry_run(df)

pprint(config)

  from .autonotebook import tqdm as notebook_tqdm


Config file not found. Creating custom...
Used a heuristic to define categorical and continuous columns. Please review!


Feature Types:
  - Categorical: ['N Stage', 'Smoking Status', 'Sex', 'T Stage', 'Disease Site', 'Stage', 'Dose', 'HPV Combined', 'event', 'Chemotherapy']
  - Continuous: ['time', 'age at dx']


Outlier Analysis:
  - Outliers found in N Stage: ['N3b: 28 out of 3333', 'N3a: 13 out of 3333', 'NX: 1 out of 3333']
  - No Outliers found in Smoking Status
  - No Outliers found in Sex
  - Outliers found in T Stage: ['T2b: 5 out of 3334', 'T2a: 4 out of 3334', 'TX: 4 out of 3334', 'T3 (2): 3 out of 3334', 'T2 (2): 1 out of 3334', 'rT0: 1 out of 3334', 'T1 (2): 1 out of 3334']
  - Outliers found in Disease Site: ['paranasal sinus: 28 out of 3346', 'skin: 24 out of 3346', 'sarcoma: 20 out of 3346', 'paraganglioma: 7 out of 3346', 'salivary glands: 4 out of 3346', 'other: 2 out of 3346', 'benign tumor: 1 out of 3346', 'orbit: 1 out of 3346', 'lacrimal gland: 1 out of 3346']
  -

In [3]:
import yaml
from pathlib import Path

# Move 'Dose' from the categorical list to the continuous list.
# Both steps are guarded by membership checks so that re-running this cell
# neither raises ValueError (list.remove on a missing item) nor adds a
# duplicate 'Dose' entry.
if 'Dose' in config['columns']['categorical']:
    config['columns']['categorical'].remove('Dose')
if 'Dose' not in config['columns']['continuous']:
    config['columns']['continuous'].append('Dose')

pprint(config)

# Create the analyzer output directory if needed (no error if it already
# exists) and write the edited configuration there as config.yaml.
analyzer_path = Path('radcure_outputs/analyzer')
analyzer_path.mkdir(parents=True, exist_ok=True)

with open(analyzer_path / 'config.yaml', 'w') as f:
    yaml.dump(config, f)

{'columns': {'categorical': ['N Stage',
                             'Smoking Status',
                             'Sex',
                             'T Stage',
                             'Disease Site',
                             'Stage',
                             'HPV Combined',
                             'event',
                             'Chemotherapy'],
             'continuous': ['time', 'age at dx', 'Dose'],
             'date': [],
             'other': []},
 'mapping': {'Chemotherapy': {'0': '0', '1': '1'},
             'Disease Site': {'benign tumor': 'Other',
                              'esophagus': 'esophagus',
                              'hypopharynx': 'hypopharynx',
                              'lacrimal gland': 'Other',
                              'larynx': 'larynx',
                              'lip & oral cavity': 'lip & oral cavity',
                              'nasal cavity': 'nasal cavity',
                              'nasopharynx': 'nasoph

In [4]:
from jarvais.analyzer import Analyzer

# Configure the analyzer for a survival task on `df`, pointing it at the
# config.yaml stored under the analyzer output directory, then run it.
analyzer = Analyzer(
    df,
    task='survival',
    target_variable='event',
    output_dir='./radcure_outputs/analyzer',
    one_hot_encode=True,
    config='radcure_outputs/analyzer/config.yaml',
)

analyzer.run()

Feature Types:
  - Categorical: ['N Stage', 'Smoking Status', 'Sex', 'T Stage', 'Disease Site', 'Stage', 'HPV Combined', 'event', 'Chemotherapy']
  - Continuous: ['time', 'age at dx', 'Dose']


Outlier Analysis:
  - Outliers found in N Stage: ['N3b: 28 out of 3333', 'N3a: 13 out of 3333', 'NX: 1 out of 3333']
  - No Outliers found in Smoking Status
  - No Outliers found in Sex
  - Outliers found in T Stage: ['T2b: 5 out of 3334', 'T2a: 4 out of 3334', 'TX: 4 out of 3334', 'T3 (2): 3 out of 3334', 'T2 (2): 1 out of 3334', 'rT0: 1 out of 3334', 'T1 (2): 1 out of 3334']
  - Outliers found in Disease Site: ['paranasal sinus: 28 out of 3346', 'skin: 24 out of 3346', 'sarcoma: 20 out of 3346', 'paraganglioma: 7 out of 3346', 'salivary glands: 4 out of 3346', 'other: 2 out of 3346', 'benign tumor: 1 out of 3346', 'orbit: 1 out of 3346', 'lacrimal gland: 1 out of 3346']
  - Outliers found in Stage: ['IV: 12 out of 3319', 'X: 6 out of 3319', 'IIIC: 2 out of 3319', 'IIA: 2 out of 3319', 'IIIA: 2

In [5]:
from jarvais.trainer import TrainerSupervised

# Load updated_data.csv from the analyzer output directory (presumably written
# by the analyzer run -- confirm) with the first CSV column as the index, and
# map any 'survival_time' / 'death' columns to 'time' / 'event'.
df = pd.read_csv(
    './radcure_outputs/analyzer/updated_data.csv',
    index_col=0,
).rename(columns={'survival_time': 'time', 'death': 'event'})

# Set up a supervised trainer for the survival task and run it on the frame,
# passing 'event' and 'time' as the target columns.
trainer = TrainerSupervised(
    task='survival',
    output_dir='./radcure_outputs/ED_trainer_explainer',
)
trainer.run(df, ['event', 'time'])

Training MTLR...


[W 2025-02-06 15:47:30,167] Trial 6 failed with parameters: {'C1': 0.01, 'dropout': 0.322887614468196, 'dims': [512, 512]} because of the following error: The value nan is not acceptable.
[W 2025-02-06 15:47:30,170] Trial 6 failed with value nan.


  Best trial:
    Params: 
      C1: 0.01
      dropout: 0.35324720434582946
      dims: [64, 64]
Training DeepSurv...
  Best trial:
    Params: 
      l2_reg: 0.00256531183447817
      dropout: 0.2830909947364929
      dims: [256, 256, 256]
Training CoxPH...
Training GradientBoosting...
Training RandomForest...
Training SVM...

Consolidated C-index Scores:
MTLR: 0.6674
DeepSurv: 0.7045
CoxPH: 0.7224
GradientBoosting: 0.7158
RandomForest: 0.7126
SVM: 0.7185


In [6]:
from jarvais.explainer import Explainer

# Construct an explainer from the existing `trainer` object and execute it.
# NOTE(review): the captured output shows a subgroup / bias report being
# printed by this step; confirm what else run() produces.
exp = Explainer.from_trainer(trainer)
exp.run()

⚠️  **Possible Bias Detected in Disease Site** ⚠️
=== Subgroup Analysis for 'Disease Site' Using Cox Proportional Hazards Model ===

Model Statistics:
    AIC (Partial):               2393.09
    Log-Likelihood:              -1187.55
    Log-Likelihood Ratio p-value: 0.0003
    Concordance Index (C-index):   0.59
Model Coefficients:
    ╒════════════════════════════════╤═══════════════╤══════════════════╕
    │ Feature                        │   Coefficient │   Standard Error │
    ╞════════════════════════════════╪═══════════════╪══════════════════╡
    │ Disease Site_Other             │         0.424 │            4.571 │
    ├────────────────────────────────┼───────────────┼──────────────────┤
    │ Disease Site_esophagus         │         0.795 │            4.577 │
    ├────────────────────────────────┼───────────────┼──────────────────┤
    │ Disease Site_hypopharynx       │         0.746 │            4.556 │
    ├────────────────────────────────┼───────────────┼──────────────────┤