In [1]:
import sys

import os

# Resolve the directory two levels above the notebook's current working
# directory; the rest of the notebook treats it as the project root.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../"))

# Put the project root on the import path. The membership check keeps the
# cell idempotent: re-running it no longer appends a duplicate entry.
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:

import pandas as pd


# Directory two levels above the notebook's working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../"))

# Input files are kept in the `data` folder directly under that directory.
data_dir = os.path.join(project_root, 'data')

# Full path of the clinical CSV that this notebook analyses.
data_file_path = os.path.join(
    data_dir,
    'RADCURE_challenge_clinical.csv',
)

# Load the table into the DataFrame consumed by the following cells.
df = pd.read_csv(data_file_path)

In [None]:
from jarvais.analyzer import Analyzer
from pprint import pprint

# Remove the columns that should not take part in the analysis.
# A non-in-place drop with errors="ignore" keeps this cell idempotent: the
# original in-place drop raised KeyError when the cell was run a second time
# because the columns were already gone.
# NOTE(review): errors="ignore" also hides a misspelled column name — confirm
# the four names below against the CSV header.
df = df.drop(
    columns=["Study ID", "split", "survival_time", "death"],
    errors="ignore",
)

# Dry run: build the analyzer configuration for the remaining columns
# (the printed report lists the inferred feature types and outliers).
config = Analyzer.dry_run(df)

# Show the generated configuration so it can be reviewed before it is saved.
pprint(config)

Config file not found. Creating custom...
Used a heuristic to define categorical and continuous columns. Please review!


Feature Types:
  - Categorical: ['Sex', 'Disease Site', 'Stage', 'T Stage', 'N Stage', 'Dose', 'HPV Combined', 'EGFRI', 'Chemotherapy']
  - Continuous: ['age at dx']


Outlier Analysis:
  - No Outliers found in Sex
  - Outliers found in Disease Site: ['esophagus: 22 out of 2552', 'salivary glands: 4 out of 2552']
  - Outliers found in Stage: ['IV: 6 out of 2549', 'IIIC: 2 out of 2549', 'IIIA: 2 out of 2549', 'IIA: 1 out of 2549']
  - Outliers found in T Stage: ['T2 (2): 1 out of 2552', 'TX: 1 out of 2552', 'T3 (2): 1 out of 2552']
  - Outliers found in N Stage: ['N3a: 14 out of 2552', 'NX: 1 out of 2552']
  - Outliers found in Dose: ['50.0: 9 out of 2552', '69.96: 2 out of 2552', '50.8: 1 out of 2552', '55.0: 1 out of 2552', '53.55: 1 out of 2552', '59.4: 1 out of 2552']
  - No Outliers found in HPV Combined
  - No Outliers found in EGFRI
  - No Outliers found in Ch

In [4]:
import yaml
from pathlib import Path

# The dry-run heuristic listed 'Dose' as categorical although its values are
# numeric (50.0, 69.96, ...), so move it to the continuous columns.
# Both steps are guarded so that re-running the cell neither raises
# ValueError (value already removed) nor appends 'Dose' twice.
if 'Dose' in config['columns']['categorical']:
    config['columns']['categorical'].remove('Dose')
if 'Dose' not in config['columns']['continuous']:
    config['columns']['continuous'].append('Dose')

# Display the adjusted configuration for a final review.
pprint(config)

# Make sure the output directory exists before writing into it.
analyzer_path = Path('outputs/analyzer')
analyzer_path.mkdir(parents=True, exist_ok=True)

# Persist the reviewed configuration as YAML.
with open(analyzer_path / 'config.yaml', 'w') as f:
    yaml.dump(config, f)

{'columns': {'categorical': ['Sex',
                             'Disease Site',
                             'Stage',
                             'T Stage',
                             'N Stage',
                             'HPV Combined',
                             'EGFRI',
                             'Chemotherapy'],
             'continuous': ['age at dx', 'Dose'],
             'date': [],
             'other': []},
 'mapping': {'Chemotherapy': {'0': '0', '1': '1'},
             'Disease Site': {'esophagus': 'Other',
                              'hypopharynx': 'hypopharynx',
                              'larynx': 'larynx',
                              'lip & oral cavity': 'lip & oral cavity',
                              'nasal cavity': 'nasal cavity',
                              'nasopharynx': 'nasopharynx',
                              'oropharynx': 'oropharynx',
                              'paranasal sinus': 'paranasal sinus',
                              'saliva

In [None]:
from jarvais.analyzer import Analyzer

# Build the analyzer from the saved configuration, with 'Chemotherapy' as the
# target variable and results written under ./outputs/analyzer.
analyzer = Analyzer(
    df,
    target_variable='Chemotherapy',
    output_dir='./outputs/analyzer',
    config='outputs/analyzer/config.yaml',
)

# Execute the analysis; the report is printed below the cell.
analyzer.run()

Feature Types:
  - Categorical: ['Sex', 'Disease Site', 'Stage', 'T Stage', 'N Stage', 'HPV Combined', 'EGFRI', 'Chemotherapy']
  - Continuous: ['age at dx', 'Dose']


Outlier Analysis:
  - No Outliers found in Sex
  - Outliers found in Disease Site: ['esophagus: 22 out of 2552', 'salivary glands: 4 out of 2552']
  - Outliers found in Stage: ['IV: 6 out of 2549', 'IIIC: 2 out of 2549', 'IIIA: 2 out of 2549', 'IIA: 1 out of 2549']
  - Outliers found in T Stage: ['T2 (2): 1 out of 2552', 'TX: 1 out of 2552', 'T3 (2): 1 out of 2552']
  - Outliers found in N Stage: ['N3a: 14 out of 2552', 'NX: 1 out of 2552']
  - No Outliers found in HPV Combined
  - No Outliers found in EGFRI
  - No Outliers found in Chemotherapy

Applying changes from config...

╒══════════════════════╤═══════════════════╤═══════════╤═════════════╕
│                      │                   │ Missing   │ Overall     │
╞══════════════════════╪═══════════════════╪═══════════╪═════════════╡
│ n                    │         

In [None]:
from jarvais.trainer import TrainerSupervised

# Reload the table from the analyzer output directory, using the first CSV
# column as the index.
# NOTE(review): presumably this file is written by the Analyzer run above — confirm.
df = pd.read_csv(
    './outputs/analyzer/updated_data.csv',
    index_col=0,
)

# Train a binary model with 'Chemotherapy' as the target column; results go
# to ./outputs/trainer.
trainer = TrainerSupervised(
    task='binary',
    output_dir='./outputs/trainer',
)
trainer.run(df, 'Chemotherapy')

Training fold 1/5...
Fold 1 score: 0.97099667293505
Training fold 2/5...
Fold 2 score: 0.9475378178307049
Training fold 3/5...
Fold 3 score: 0.9642795830778663
Training fold 4/5...
Fold 4 score: 0.9587982832618026
Training fold 5/5...
Fold 5 score: 0.9623400639744102

Model Leaderboard (Displays values in "mean [min, max]" format across training folds)
------------------------------------------------------------------------------------
╒═══════════════════════╤══════════════════════════╤══════════════════════════╤══════════════════════════╕
│ model                 │ score_test               │ score_val                │ score_train              │
╞═══════════════════════╪══════════════════════════╪══════════════════════════╪══════════════════════════╡
│ WeightedEnsemble_L2   │ AUROC 0.94 [0.93, 0.94]  │ AUROC 0.96 [0.95, 0.97]  │ AUROC 0.98 [0.97, 0.99]  │
│                       │ F1: 0.85 [0.84, 0.86]    │ F1: 0.89 [0.88, 0.91]    │ F1: 0.92 [0.9, 0.94]     │
│                       │

In [None]:
from jarvais.explainer import Explainer

# Build an explainer from the trainer object created in the training cell.
exp = Explainer.from_trainer(trainer)
# Run the explainer; the output below the cell includes per-feature subgroup
# analysis tables.
exp.run()

Subgroup Analysis(Age At Dx)
╒══════════════════════════════╤════════╤════════╤════════╤════════╕
│                              │ 21.0   │ 22.2   │ 27.4   │ 27.6   │
╞══════════════════════════════╪════════╪════════╪════════╪════════╡
│ mean_prediction              │ 1.0    │ 1.0    │ 0.0    │ 1.0    │
├──────────────────────────────┼────────┼────────┼────────┼────────┤
│ false_positive_rate          │ 0.0    │ 0.0    │ 0.0    │ 0.0    │
├──────────────────────────────┼────────┼────────┼────────┼────────┤
│ Relative mean_prediction     │ 3.0 ✅ │ 3.0 ✅ │ 0.0 ✅ │ 3.0 ✅ │
├──────────────────────────────┼────────┼────────┼────────┼────────┤
│ Relative false_positive_rate │ 0.0 ✅ │ 0.0 ✅ │ 0.0 ✅ │ 0.0 ✅ │
╘══════════════════════════════╧════════╧════════╧════════╧════════╛

Subgroup Analysis(Sex)
╒══════════════════════════════╤═══════════════════════╤═════════════════════╕
│                              │ Female                │ Male                │
╞══════════════════════════════╪══════

  0%|          | 0/100 [00:00<?, ?it/s]