In [1]:
import sys
import pandas as pd
import os

# Resolve the directory two levels above the current working directory and
# add it to the module search path.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../"))
sys.path.append(project_root)

# Path of the processed clinical CSV under <project_root>/data.
data_dir = os.path.join(project_root, 'data')
data_file_path = os.path.join(data_dir, 'RADCURE_processed_clinical.csv')

# Read the CSV using its first column as the index, then discard the
# "Study ID" column so it is not carried into the analysis below.
df = pd.read_csv(data_file_path, index_col=0).drop(columns=["Study ID"])

In [2]:
from jarvais.analyzer import Analyzer
from rich import print

# Column groupings passed to the Analyzer below.
categorical_columns = [
    "Sex",
    "T Stage",
    "N Stage",
    "Stage",
    "Smoking Status",
    "Disease Site",
    "death",
    "HPV Combined",
    "Chemotherapy",
]
continuous_columns = ["survival_time", "age at dx", "Dose"]

# Configure the analysis with 'death' as the target of a classification task;
# outputs are directed to ./survival_outputs/analyzer.
analyzer = Analyzer(
    data=df,
    output_dir='./survival_outputs/analyzer',
    categorical_columns=categorical_columns,
    continuous_columns=continuous_columns,
    target_variable='death',
    task='classification',
)

# `print` here is rich's print (imported above), not the builtin.
print(analyzer)

analyzer.run()

  from .autonotebook import tqdm as notebook_tqdm


[2m        [0m [[32m[1minfo     [0m] [1mPerforming missingness analysis...[0m [[0m[1m[34mjarvais[0m][0m [36mcall[0m=[35mmissingness.__call__:43[0m
[2m        [0m [[32m[1minfo     [0m] [1mPerforming outlier analysis...[0m [[0m[1m[34mjarvais[0m][0m [36mcall[0m=[35moutlier.__call__:53[0m
[2m        [0m [[32m[1minfo     [0m] [1mPlotting Correlation Matrix...[0m [[0m[1m[34mjarvais[0m][0m [36mcall[0m=[35mvisualization.__call__:115[0m


+--------------------------+-------------------+-----------+-------------+
|                          |                   | Missing   | Overall     |
| n                        |                   |           | 3346        |
+--------------------------+-------------------+-----------+-------------+
| survival_time, mean (SD) |                   | 0         | 4.1 (2.7)   |
+--------------------------+-------------------+-----------+-------------+
| age at dx, mean (SD)     |                   | 0         | 62.3 (11.6) |
+--------------------------+-------------------+-----------+-------------+
| Dose, mean (SD)          |                   | 0         | 66.7 (5.8)  |
+--------------------------+-------------------+-----------+-------------+
| Sex, n (%)               | Female            |           | 686 (20.5)  |
+--------------------------+-------------------+-----------+-------------+
|                          | Male              |           | 2660 (79.5) |
+------------------------

[2m        [0m [[32m[1minfo     [0m] [1mPlotting Pairplot...          [0m [[0m[1m[34mjarvais[0m][0m [36mcall[0m=[35mvisualization.__call__:118[0m
[2m14:41:04[0m [[32m[1minfo     [0m] [1mPlotting UMAP...              [0m [[0m[1m[34mjarvais[0m][0m [36mcall[0m=[35mvisualization.__call__:124[0m
[2m14:41:13[0m [[32m[1minfo     [0m] [1mPlotting Frequency Table...   [0m [[0m[1m[34mjarvais[0m][0m [36mcall[0m=[35mvisualization.__call__:121[0m
[2m14:41:22[0m [[32m[1minfo     [0m] [1mPlotting Multiplot...         [0m [[0m[1m[34mjarvais[0m][0m [36mcall[0m=[35mvisualization.__call__:136[0m
Font MPDFAA+Inter28ptBold is missing the following glyphs: '
' (\n)


In [3]:
from jarvais.trainer import TrainerSupervised

# Load updated_data.csv from the analyzer output directory (presumably written
# by analyzer.run() — verify) and rename the survival columns to the
# 'time' / 'event' names that are passed to the trainer below.
# NOTE(review): unlike the initial load, no index_col is given here — confirm
# updated_data.csv has no index column that would end up as a feature.
df = (
    pd.read_csv('./survival_outputs/analyzer/updated_data.csv')
    .rename(columns={'survival_time': 'time', 'death': 'event'})
)

# NOTE(review): the trainer writes under ./radcure_outputs/ while the analyzer
# used ./survival_outputs/ — confirm the differing roots are intentional.
trainer = TrainerSupervised(
    task='survival',
    output_dir='./radcure_outputs/ED_trainer_explainer',
)
trainer.run(df, ['event', 'time'])

Training MTLR...
  Best trial:
    Params: 
      C1: 0.01
      dropout: 0.3292835147016483
      dims: [64, 64]
Training DeepSurv...
  Best trial:
    Params: 
      l2_reg: 0.006451742136969566
      dropout: 0.360636124048913
      dims: [256, 256, 256]
Training CoxPH...
Training GradientBoosting...
Training RandomForest...
Training SVM...

Consolidated C-index Scores:
MTLR: 0.6482
DeepSurv: 0.6863
CoxPH: 0.7212
GradientBoosting: 0.7152
RandomForest: 0.7156
SVM: 0.7156


In [4]:
from jarvais.explainer import Explainer

# Build an Explainer from the trainer that was run in the previous cell and
# execute it. The captured output below includes a subgroup bias report for
# 'Disease Site'.
exp = Explainer.from_trainer(trainer)
exp.run()

⚠️  **Possible Bias Detected in Disease Site** ⚠️
=== Subgroup Analysis for 'Disease Site' Using Cox Proportional Hazards Model ===

Model Statistics:
    AIC (Partial):               2393.09
    Log-Likelihood:              -1187.55
    Log-Likelihood Ratio p-value: 0.0003
    Concordance Index (C-index):   0.59
Model Coefficients:
    +--------------------------------+---------------+------------------+
    | Feature                        |   Coefficient |   Standard Error |
    | Disease Site_Other             |         0.424 |            4.571 |
    +--------------------------------+---------------+------------------+
    | Disease Site_esophagus         |         0.795 |            4.577 |
    +--------------------------------+---------------+------------------+
    | Disease Site_hypopharynx       |         0.746 |            4.556 |
    +--------------------------------+---------------+------------------+
    | Disease Site_larynx            |         0.202 |            4.552 |