In [1]:
import sys

# Make the modules under ../04_survival_models/src importable from this notebook
# (presumably where `uc2_functions` lives — confirm).
src_dir = "../04_survival_models/src"
# Guard the append so that re-running this cell does not add duplicate entries.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [2]:
import os
from collections import Counter

import mlflow
import papermill as pm
from azureml.core import Workspace, Experiment
from tqdm import tqdm
from uc2_functions import *

# Goal

The goal is to run the notebook located at `PATH_NOTEBOOK` multiple times, each time with a different seed (Monte Carlo simulations). Each seed drives the train-test data shuffling and initializes the imputers and models, so every experiment is fully reproducible.

# Parameters

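Run only one of the two cells below (Raw or Larcher), depending on the dataset to use. Both cells assign the same variables, so whichever cell is run last takes effect.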

## Raw

In [3]:
# Settings for the raw dataset.
DIR_PAPERMILL = "papermill"  # folder receiving the executed notebook copies
PATH_NOTEBOOK = "03_survival_feature_selection_raw_csm.ipynb"  # notebook executed once per seed
EXPERIMENT_NAME = "UC2_raw_2024_02"  # experiment under which the runs are tracked

## Larcher

In [4]:
# Settings for the Larcher dataset.
DIR_PAPERMILL = "papermill"  # folder receiving the executed notebook copies
PATH_NOTEBOOK = "03_survival_feature_selection_larcher_csm.ipynb"  # notebook executed once per seed
EXPERIMENT_NAME = "UC2_larcher_2024_02"  # experiment under which the runs are tracked

# Seeds: pre-sampled random numbers between 0 and 1000

In [5]:
# Fixed seeds for the Monte Carlo simulations: 100 distinct integers in
# ascending order, laid out ten per row.
random_numbers = [
    0, 1, 6, 8, 23, 25, 27, 30, 32, 40,
    42, 62, 73, 89, 90, 91, 95, 104, 114, 129,
    136, 142, 160, 163, 166, 178, 200, 203, 207, 209,
    217, 223, 225, 228, 237, 238, 250, 255, 269, 281,
    284, 342, 367, 376, 379, 391, 394, 395, 429, 432,
    433, 457, 459, 460, 462, 517, 533, 535, 539, 551,
    554, 558, 574, 586, 592, 597, 603, 604, 616, 619,
    654, 665, 667, 692, 694, 697, 704, 718, 733, 734,
    754, 755, 758, 759, 771, 775, 790, 805, 818, 825,
    826, 828, 885, 890, 914, 932, 963, 968, 975, 996,
]

In [6]:
# Sanity check: report how many seeds are defined.
n_seeds = len(random_numbers)
print(n_seeds)

100


# Exclude seeds with previous training

Some seeds may already have simulations logged in MLflow; those simulations do not need to be retrained.

In [7]:
# Load the Azure ML workspace handle.
workspace = Workspace.from_config()

# Reuse the experiment if it already exists, otherwise create it.
# `workspace.experiments` is read once and reused, instead of being accessed
# once for the membership test and again for the lookup.
existing_experiments = workspace.experiments
if EXPERIMENT_NAME in existing_experiments:
    experiment = existing_experiments[EXPERIMENT_NAME]
else:
    experiment = Experiment(workspace, EXPERIMENT_NAME)

# Point MLflow at the Azure ML workspace, then create the client used to
# read the logged runs.
mlflow.set_tracking_uri(workspace.get_mlflow_tracking_uri())
client = mlflow.tracking.MlflowClient()

In [8]:
# Number of times each seed is expected to appear among the logged runs
# (one entry per trained model — TODO confirm against the training notebook).
n_models = 3

# Collect the seed of every logged run (child runs included) that stored a model.
trained_seeds = []
for run in tqdm(experiment.get_runs(include_children=True)):
    # Read the run's logged params through the MLflow client
    run_params = client.get_run(run.id).data.params
    # Keep the run only if 'model_path' was logged and is non-empty
    if run_params.get("model_path"):
        trained_seeds.append(int(run_params["random_state"]))
trained_seeds.sort()

# Check if each unique value appears exactly n_models times
# (`find_problematic_values` comes from the `uc2_functions` star import).
problematic_seeds = find_problematic_values(trained_seeds, n_models)
assert problematic_seeds == [], "find_problematic_values flagged seeds: {}".format(
    problematic_seeds
)

# Seeds where to start training from scratch. The membership set is built
# once rather than being rebuilt for every candidate seed.
already_trained = set(trained_seeds)
random_states = sorted(seed for seed in random_numbers if seed not in already_trained)
print(len(random_states), "simulations left")

398it [01:21,  4.89it/s]

1 simulations left





# Run multiple notebooks

Run the same notebook once for each remaining seed.

In [None]:
# Folder that receives one executed copy of the notebook per seed.
os.makedirs(DIR_PAPERMILL, exist_ok=True)
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

# File-name stem for the executed copies, computed once outside the loop.
# basename + splitext (instead of splitting on ".ipynb") strips only the
# extension and keeps the output directly inside DIR_PAPERMILL even if
# PATH_NOTEBOOK contains a directory component.
notebook_stem = os.path.splitext(os.path.basename(PATH_NOTEBOOK))[0]

# A single parent run is opened for the whole batch; its id is handed to
# every notebook execution through the PARENT_RUN_ID parameter.
with mlflow.start_run() as parent_run:
    parent_run_id = parent_run.info.run_id
    for random_state in tqdm(random_states):
        # Parameters passed to the notebook by papermill
        params = {
            "RANDOM_STATE": int(random_state),
            "EXPERIMENT_NAME": str(EXPERIMENT_NAME),
            "PARENT_RUN_ID": str(parent_run_id),
        }
        # Executed copy is saved as <DIR_PAPERMILL>/<stem>_<seed>.ipynb
        output_path = os.path.join(
            DIR_PAPERMILL,
            "{}_{}.ipynb".format(notebook_stem, random_state),
        )
        # Execute the training notebook with papermill
        pm.execute_notebook(
            input_path=PATH_NOTEBOOK,
            output_path=output_path,
            parameters=params,
        )

  0%|          | 0/1 [00:00<?, ?it/s]

Executing:   0%|          | 0/93 [00:00<?, ?cell/s]

# End parent run

In [None]:
# End the currently active MLflow run, if any.
# NOTE(review): the parent run is opened with `with mlflow.start_run()` in the
# cell above, which should already close it on exit — this call looks like a
# safety net (e.g. after an interrupted loop); confirm it is still needed.
mlflow.end_run()