In [1]:
import sys

# Make the helper modules under ../04_survival_models/src importable
# (presumably where uc2_functions lives -- confirm). The path is relative,
# so it is resolved against the notebook's working directory.
# The membership check keeps sys.path free of duplicate entries when this
# cell is executed more than once in the same kernel.
UC2_SRC_DIR = "../04_survival_models/src"
if UC2_SRC_DIR not in sys.path:
    sys.path.append(UC2_SRC_DIR)

In [2]:
import os
import re
from collections import Counter

import mlflow
import pandas as pd
import seaborn as sns
import numpy as np
from azureml.core import Workspace
# Keep the star import before the tqdm import: a name imported after it
# takes precedence over any same-named export of uc2_functions.
from uc2_functions import *
from tqdm import tqdm

In [3]:
# Apply seaborn's "whitegrid" style to the figures drawn after this cell.
# NOTE(review): seaborn documents `sns.set` as an alias of `sns.set_theme`;
# consider switching once the installed seaborn version is confirmed (>= 0.11).
sns.set(style="whitegrid")

# Goal

The goal is to collect results from Monte Carlo simulations.

# Parameters

In [4]:
# Experiment whose runs are collected (looked up by this name in the workspace)
EXPERIMENT_NAME = "UC2_raw_survival_models_2024_07"

# Output file names, both tagged with the experiment name
PATH_METRICS = f"df_metrics_{EXPERIMENT_NAME}.csv"  # metrics
PATH_IMPORTANCES = f"df_importances_{EXPERIMENT_NAME}.csv"  # feature importance

# Directories
# "sc" folder located next to the current working directory
# (original note: "Legend" -- presumably holds legend files; TODO confirm)
DIR_SC = os.path.join(os.path.split(os.getcwd())[0], "sc")
DIR_ARTIFACTS = "artifacts"

# Number of models: only random_state groups with exactly this many rows
# are kept before the metrics are written
N_MODELS = 14

# Get results from MLflow

In [5]:
# Open the Azure ML workspace from its configuration, then look up the
# experiment by name.
workspace = Workspace.from_config()
experiment = workspace.experiments[EXPERIMENT_NAME]

# MLflow has to talk to the workspace's tracking server, so register its URI
# before creating the client.
tracking_uri = workspace.get_mlflow_tracking_uri()
mlflow.set_tracking_uri(tracking_uri)
client = mlflow.tracking.MlflowClient()

In [6]:
# Gather the simulation results through the project helper `collect_simulations`.
# This cell is slow: the progress bar recorded in its output shows roughly 13 minutes.
df_metrics = collect_simulations(
    experiment=experiment,
    client=client,
    dir_artifacts=DIR_ARTIFACTS,
)

# Quick size check of the collected table
print(df_metrics.shape)

1500it [12:43,  1.96it/s]

1400
(1400, 11)





# Spot useless runs

If some runs are present in MLflow but do not appear in the UI, remove them.

In [7]:
# Group by 'random_state' and check for multiple 'parent_run_id'
grouped = df_metrics.groupby('random_state', group_keys=False)

# Apply the function to each group and filter out None values
least_parent_run_ids = grouped.apply(find_least_parent_run_id).dropna()

# Convert the Series to a dictionary
least_parent_run_ids_dict = least_parent_run_ids.to_dict()
least_parent_run_ids_dict

  least_parent_run_ids = grouped.apply(find_least_parent_run_id).dropna()


{}

# Write

In [8]:
# Keep only the 'random_state' groups that contain exactly N_MODELS rows
# (groups with fewer or more rows are dropped), then renumber the index.
df_metrics = (
    df_metrics
    .groupby('random_state')
    .filter(lambda group: len(group) == N_MODELS)
    .reset_index(drop=True)
)
print(df_metrics.shape)

# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing the metrics file.
os.makedirs(DIR_ARTIFACTS, exist_ok=True)
df_metrics.to_csv(os.path.join(DIR_ARTIFACTS, PATH_METRICS), index=False)

(1400, 11)
