In [1]:
import sys

import os

# Resolve the project root, taken to be two directory levels above the
# notebook's current working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../"))

# Make project-local packages importable. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
import pandas as pd


# Recompute the project root (two levels above the working directory) so this
# cell does not rely on the value left behind by the setup cell.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../"))

# Input tables are kept in <project_root>/data.
data_dir = os.path.join(project_root, 'data')

# Full path to the RADCURE clinical table, then load it into a DataFrame.
data_file_path = os.path.join(project_root, 'data', 'RADCURE_challenge_clinical.csv')
df = pd.read_csv(data_file_path)

In [None]:
from jarvais.analyzer import Analyzer
from pprint import pprint

# Strip the bookkeeping columns and rename the survival columns to the
# 'time' / 'event' names used by the later cells of this notebook.
# errors="ignore" keeps the cell re-runnable: the previous in-place drop raised
# KeyError on a second execution because the columns were already gone.
df = df.drop(columns=["Study ID", "split"], errors="ignore")
df = df.rename(columns={'survival_time': 'time', 'death': 'event'})

# Ask the Analyzer for a proposed configuration without running the full
# analysis, then print it so the inferred column types can be reviewed.
config = Analyzer.dry_run(df)

pprint(config)

Config file not found. Creating custom...
Used a heuristic to define categorical and continuous columns. Please review!


Feature Types:
  - Categorical: ['T Stage', 'Stage', 'Disease Site', 'Sex', 'N Stage', 'Dose', 'EGFRI', 'event', 'Chemotherapy', 'HPV Combined']
  - Continuous: ['time', 'age at dx']


Outlier Analysis:
  - Outliers found in T Stage: ['T2 (2): 1 out of 2552', 'TX: 1 out of 2552', 'T3 (2): 1 out of 2552']
  - Outliers found in Stage: ['IV: 6 out of 2549', 'IIIC: 2 out of 2549', 'IIIA: 2 out of 2549', 'IIA: 1 out of 2549']
  - Outliers found in Disease Site: ['esophagus: 22 out of 2552', 'salivary glands: 4 out of 2552']
  - No Outliers found in Sex
  - Outliers found in N Stage: ['N3a: 14 out of 2552', 'NX: 1 out of 2552']
  - Outliers found in Dose: ['50.0: 9 out of 2552', '69.96: 2 out of 2552', '50.8: 1 out of 2552', '55.0: 1 out of 2552', '53.55: 1 out of 2552', '59.4: 1 out of 2552']
  - No Outliers found in EGFRI
  - No Outliers found in event
  - No Outliers f

In [4]:
import yaml
from pathlib import Path

# The dry run listed 'Dose' under the categorical columns; move it to the
# continuous ones. The membership checks make this cell safe to re-run:
# list.remove() would raise ValueError the second time, and an unconditional
# append() would list 'Dose' twice.
if 'Dose' in config['columns']['categorical']:
    config['columns']['categorical'].remove('Dose')
if 'Dose' not in config['columns']['continuous']:
    config['columns']['continuous'].append('Dose')

pprint(config)

# Write the reviewed configuration to the analyzer output directory,
# creating the directory first if it does not exist yet.
analyzer_path = Path('radcure_outputs/analyzer')
analyzer_path.mkdir(parents=True, exist_ok=True)

with open(analyzer_path / 'config.yaml', 'w') as f:
    yaml.dump(config, f)

{'columns': {'categorical': ['T Stage',
                             'Stage',
                             'Disease Site',
                             'Sex',
                             'N Stage',
                             'EGFRI',
                             'event',
                             'Chemotherapy',
                             'HPV Combined'],
             'continuous': ['time', 'age at dx', 'Dose'],
             'date': [],
             'other': []},
 'mapping': {'Chemotherapy': {'0': '0', '1': '1'},
             'Disease Site': {'esophagus': 'Other',
                              'hypopharynx': 'hypopharynx',
                              'larynx': 'larynx',
                              'lip & oral cavity': 'lip & oral cavity',
                              'nasal cavity': 'nasal cavity',
                              'nasopharynx': 'nasopharynx',
                              'oropharynx': 'oropharynx',
                              'paranasal sinus': 'paranasal

In [None]:
from jarvais.analyzer import Analyzer

# Configure the Analyzer for the survival task, pointing it at the YAML config
# saved under the analyzer output directory, then run the analysis.
analyzer = Analyzer(
    df,
    task='survival',
    target_variable='event',
    output_dir='./radcure_outputs/analyzer',
    one_hot_encode=True,
    config='radcure_outputs/analyzer/config.yaml',
)

analyzer.run()

Feature Types:
  - Categorical: ['T Stage', 'Stage', 'Disease Site', 'Sex', 'N Stage', 'EGFRI', 'event', 'Chemotherapy', 'HPV Combined']
  - Continuous: ['time', 'age at dx', 'Dose']


Outlier Analysis:
  - Outliers found in T Stage: ['T2 (2): 1 out of 2552', 'TX: 1 out of 2552', 'T3 (2): 1 out of 2552']
  - Outliers found in Stage: ['IV: 6 out of 2549', 'IIIC: 2 out of 2549', 'IIIA: 2 out of 2549', 'IIA: 1 out of 2549']
  - Outliers found in Disease Site: ['esophagus: 22 out of 2552', 'salivary glands: 4 out of 2552']
  - No Outliers found in Sex
  - Outliers found in N Stage: ['N3a: 14 out of 2552', 'NX: 1 out of 2552']
  - No Outliers found in EGFRI
  - No Outliers found in event
  - No Outliers found in Chemotherapy
  - No Outliers found in HPV Combined

Applying changes from config...

╒══════════════════════╤═══════════════════╤═══════════╤═════════════╕
│                      │                   │ Missing   │ Overall     │
╞══════════════════════╪═══════════════════╪═══════════╪

In [3]:
from jarvais.trainer import TrainerSupervised

# Load the table stored in the analyzer output directory (presumably written by
# the Analyzer run above; confirm) using its first column as the index, and
# make sure the survival columns carry the 'time' / 'event' names.
df = (
    pd.read_csv('./radcure_outputs/analyzer/updated_data.csv', index_col=0)
    .rename(columns={'survival_time': 'time', 'death': 'event'})
)

# Train the survival models; 'event' and 'time' are passed as the target columns.
trainer = TrainerSupervised(
    task='survival',
    output_dir='./radcure_outputs/ED_trainer_explainer',
)
trainer.run(df, ['event', 'time'])

  from .autonotebook import tqdm as notebook_tqdm


Training MTLR...
  Best trial:
    Params: 
      C1: 0.01
      dropout: 0.48010344307101943
      dims: [16, 16]
Training DeepSurv...
  Best trial:
    Params: 
      l2_reg: 0.0067966243797537955
      dropout: 0.4685764374782581
      dims: [256, 256, 256]
Training CoxPH...
Training GradientBoosting...
Training RandomForest...
Training SVM...

Consolidated C-index Scores:
MTLR: 0.6201
DeepSurv: 0.6046
CoxPH: 0.7785
GradientBoosting: 0.7793
RandomForest: 0.7684
SVM: 0.7745


In [4]:
from jarvais.explainer import Explainer

# Build an Explainer from the `trainer` object created and run in the training
# cell, then execute it. Requires that cell to have been run in this kernel.
exp = Explainer.from_trainer(trainer)
exp.run()