In [1]:
import sys

import os

# Resolve the project root as the directory two levels above the current
# working directory, so project-local packages can be imported below.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../"))

# Register the project root on the import path only once: re-running this
# cell must not keep appending duplicate entries to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:

import pandas as pd


# `project_root` is defined once in the first cell; it is reused here rather
# than recomputed so the two definitions cannot drift apart.

# Directory holding the input data, located under the project root
data_dir = os.path.join(project_root, 'data')

# Load the RADCURE challenge clinical table into a DataFrame
data_file_path = os.path.join(data_dir, 'RADCURE_challenge_clinical.csv')
df = pd.read_csv(data_file_path)

In [3]:
from AutoML.analyzer import Analyzer
from pprint import pprint

# Remove the columns that should not be analysed as features.
# Rebinding `df` (instead of `inplace=True`) together with errors="ignore"
# keeps this cell idempotent: re-running it after the columns are already
# gone no longer raises KeyError.
df = df.drop(
    columns=["Study ID", "split", "survival_time", "death"],
    errors="ignore",
)

# Dry run: builds a proposed config for the frame (see the printed output
# below) without needing an existing config file.
config = Analyzer.dry_run(df)

pprint(config)

Config file not found. Creating custom...
Used a heuristic to define categorical and continuous columns. Please review!


Feature Types:
  - Categorical: ['Disease Site', 'N Stage', 'T Stage', 'Sex', 'Stage', 'Chemotherapy', 'EGFRI', 'HPV Combined', 'Dose']
  - Continuous: ['age at dx']


Outlier Analysis:
  - Outliers found in Disease Site: ['esophagus: 22 out of 2552', 'salivary glands: 4 out of 2552']
  - Outliers found in N Stage: ['N3a: 14 out of 2552', 'NX: 1 out of 2552']
  - Outliers found in T Stage: ['T2 (2): 1 out of 2552', 'TX: 1 out of 2552', 'T3 (2): 1 out of 2552']
  - No Outliers found in Sex
  - Outliers found in Stage: ['IV: 6 out of 2549', 'IIIC: 2 out of 2549', 'IIIA: 2 out of 2549', 'IIA: 1 out of 2549']
  - No Outliers found in Chemotherapy
  - No Outliers found in EGFRI
  - No Outliers found in HPV Combined
  - Outliers found in Dose: ['50.0: 9 out of 2552', '69.96: 2 out of 2552', '50.8: 1 out of 2552', '55.0: 1 out of 2552', '53.55: 1 out of 2552', '59.4: 1 out

In [4]:
import yaml
from pathlib import Path

# The dry-run heuristic listed 'Dose' as categorical, but its values are
# numeric, so move it to the continuous columns. Both steps are guarded so
# that re-running this cell neither raises ValueError (on .remove) nor adds
# a duplicate 'Dose' entry (on .append).
categorical_cols = config['columns']['categorical']
continuous_cols = config['columns']['continuous']
if 'Dose' in categorical_cols:
    categorical_cols.remove('Dose')
if 'Dose' not in continuous_cols:
    continuous_cols.append('Dose')

pprint(config)

# Make sure the analyzer output directory exists before writing into it
analyzer_path = Path('outputs/analyzer')
analyzer_path.mkdir(parents=True, exist_ok=True)

# Persist the reviewed config as YAML so it can be passed to Analyzer
with open(analyzer_path / 'config.yaml', 'w') as f:
    yaml.dump(config, f)

{'columns': {'categorical': ['Disease Site',
                             'N Stage',
                             'T Stage',
                             'Sex',
                             'Stage',
                             'Chemotherapy',
                             'EGFRI',
                             'HPV Combined'],
             'continuous': ['age at dx', 'Dose'],
             'date': [],
             'other': []},
 'mapping': {'Chemotherapy': {'0': '0', '1': '1'},
             'Disease Site': {'esophagus': 'Other',
                              'hypopharynx': 'hypopharynx',
                              'larynx': 'larynx',
                              'lip & oral cavity': 'lip & oral cavity',
                              'nasal cavity': 'nasal cavity',
                              'nasopharynx': 'nasopharynx',
                              'oropharynx': 'oropharynx',
                              'paranasal sinus': 'paranasal sinus',
                              'saliva

In [5]:
from AutoML.analyzer import Analyzer

# Configure the analyzer with the reviewed YAML config, using 'Dose' as the
# target variable and writing results under ./outputs/analyzer.
analyzer = Analyzer(
    df,
    target_variable='Dose',
    output_dir='./outputs/analyzer',
    config='outputs/analyzer/config.yaml',
)

# Execute the analysis; the report is printed below.
analyzer.run()

Feature Types:
  - Categorical: ['Disease Site', 'N Stage', 'T Stage', 'Sex', 'Stage', 'Chemotherapy', 'EGFRI', 'HPV Combined']
  - Continuous: ['age at dx', 'Dose']


Outlier Analysis:
  - Outliers found in Disease Site: ['esophagus: 22 out of 2552', 'salivary glands: 4 out of 2552']
  - Outliers found in N Stage: ['N3a: 14 out of 2552', 'NX: 1 out of 2552']
  - Outliers found in T Stage: ['T2 (2): 1 out of 2552', 'TX: 1 out of 2552', 'T3 (2): 1 out of 2552']
  - No Outliers found in Sex
  - Outliers found in Stage: ['IV: 6 out of 2549', 'IIIC: 2 out of 2549', 'IIIA: 2 out of 2549', 'IIA: 1 out of 2549']
  - No Outliers found in Chemotherapy
  - No Outliers found in EGFRI
  - No Outliers found in HPV Combined

Applying changes from config...

╒══════════════════════╤═══════════════════╤═══════════╤═════════════╕
│                      │                   │ Missing   │ Overall     │
╞══════════════════════╪═══════════════════╪═══════════╪═════════════╡
│ n                    │         

In [6]:
from AutoML.trainer import TrainerSupervised

# Reload the dataset from the analyzer output directory (presumably written
# by analyzer.run() — confirm); the first CSV column is used as the index.
df = pd.read_csv(
    './outputs/analyzer/updated_data.csv',
    index_col=0,
)

# Set up a supervised regression trainer that writes to ./outputs/trainer,
# then train it on the frame with 'Dose' as the target.
trainer = TrainerSupervised(
    task='regression',
    output_dir='./outputs/trainer',
)
trainer.run(df, 'Dose')

Training fold 1/5...
Fold 1 score: 0.6415226629823407
Training fold 2/5...
Fold 2 score: 0.6354727494158782
Training fold 3/5...
Fold 3 score: 0.6784427999882303
Training fold 4/5...
Fold 4 score: 0.6340258425139446
Training fold 5/5...
Fold 5 score: 0.6137623216481571

Model Leaderboard (Displays values in "mean [min, max]" format across training folds)
------------------------------------------------------------------------------------
╒═══════════════════════╤════════════════════════════╤════════════════════════════╤════════════════════════════╕
│ model                 │ score_test                 │ score_val                  │ score_train                │
╞═══════════════════════╪════════════════════════════╪════════════════════════════╪════════════════════════════╡
│ WeightedEnsemble_L2   │ R2 0.62 [0.62, 0.63]       │ R2 0.64 [0.61, 0.68]       │ R2 0.7 [0.69, 0.74]        │
│                       │ RMSE: -3.48 [-3.49, -3.46] │ RMSE: -3.36 [-3.58, -3.19] │ RMSE: -3.05 [-3.15, -2

In [7]:
from AutoML.explainer import Explainer

# Build an Explainer from the `trainer` object created in the previous cell.
exp = Explainer.from_trainer(trainer)
# Run the explainer; its reports (e.g. the subgroup analysis tables) are
# printed below.
exp.run()

Subgroup Analysis(Age At Dx)
╒══════════════════════════╤══════════════════════╤═══════════════════════╤══════════════════════╤═══════════════════════╕
│                          │ 21.0                 │ 22.2                  │ 27.4                 │ 31.3                  │
╞══════════════════════════╪══════════════════════╪═══════════════════════╪══════════════════════╪═══════════════════════╡
│ mean_prediction          │ 68.77843475341797    │ 69.64189147949219     │ 67.40226745605469    │ 69.74332427978516     │
├──────────────────────────┼──────────────────────┼───────────────────────┼──────────────────────┼───────────────────────┤
│ Relative mean_prediction │ 1.013164638575229 ✅ │ 1.0258840879918192 ✅ │ 0.992892527884562 ✅ │ 1.0273782791117092 ✅ │
╘══════════════════════════╧══════════════════════╧═══════════════════════╧══════════════════════╧═══════════════════════╛

Subgroup Analysis(Disease Site)
╒══════════════════════════╤═══════════════════════╤═══════════════════════╤═════