In [None]:
import sys

# Make the helper modules under the sibling project directory importable.
# The path is relative, so it is resolved against the notebook's working directory.
# NOTE(review): presumably this is where `uc2_functions` lives — confirm.
SRC_DIR = "../04_survival_models/src"

# Guard against piling up duplicate entries when the cell is re-run in the same kernel.
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [None]:
import datetime
import json
import os
import pickle
import pprint
import time
import warnings

import joblib
import kaplanmeier as km
import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
from azureml.core import Dataset, Workspace
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import make_scorer
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    ParameterGrid,
    RandomizedSearchCV,
    StratifiedKFold,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sksurv.functions import StepFunction
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.metrics import (
    concordance_index_censored,
    concordance_index_ipcw,
    cumulative_dynamic_auc,
    integrated_brier_score,
)
from sksurv.nonparametric import kaplan_meier_estimator
from uc2_functions import *
from tqdm import tqdm

In [None]:
# Lift pandas' display limits: show every row and never truncate cell contents.
for display_option in ("display.max_rows", "display.max_colwidth"):
    pd.set_option(display_option, None)

In [None]:
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Goal

The goal is to fine-tune a Cox model using GRANT features as the baseline.

# Parameters

In [None]:
# File name of the variable legend workbook
PATH_LEGEND = "Legenda_Variabili_Uri_Larcher.xlsx"

# Directory holding the legend: the "sc" folder next to the current working directory
_PARENT_OF_CWD = os.path.dirname(os.getcwd())
DIR_SC = os.path.join(_PARENT_OF_CWD, "sc")

# Directory where fitted model weights are written for later inference
DIR_MODEL_PKL = "../models_pkl"

In [None]:
# Seed shared by the train/test split and the imputer; also used as the
# top-level MLflow run name and in the saved model's file name.
RANDOM_STATE = 42
# MLflow experiment under which this notebook's runs are logged.
EXPERIMENT_NAME = "UC2_raw_survival_grant_finetune_2024_05"
# Optional id of an enclosing run; when truthy it is recorded as the
# "parent_run_id" tag on the top-level run.
PARENT_RUN_ID = None

# Data ingestion

## One-hot encoding version

In [None]:
# Requirements:
#   azureml-core >= 1.0.72
#   azureml-dataprep[pandas] >= 1.1.34

# NOTE(review): workspace identifiers are hard-coded; consider loading them
# from configuration or environment variables instead.
subscription_id = "753a0b42-95dc-4871-b53e-160ceb0e6bc1"
resource_group = "rg-s-race-aml-dev-we"
workspace_name = "amlsraceamldevwe01"

workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

# Fetch the registered one-hot-encoded survival dataset and materialise it as a DataFrame.
dataset = Dataset.get_by_name(workspace, name="UC2_raw_survival_csm_ohe_5yrs")
df_ohe = dataset.to_pandas_dataframe()
print(df_ohe.shape)
df_ohe.head()

### Use schema

Recreate the schema from tags:

In [None]:
# The dataset tags carry two JSON documents: column -> dtype name, and
# column -> whether a categorical column is ordinal.
tags = dataset.tags

dtypes = json.loads(tags["dtypes_json"])
is_ordinal = json.loads(tags["is_ordinal_json"])

for col, dtype_name in dtypes.items():
    # Non-categorical columns only need a plain cast.
    if dtype_name != "category":
        df_ohe[col] = df_ohe[col].astype(dtype_name)
        continue

    ordered = is_ordinal[col]
    # Categories are the observed non-null values; ordinal columns get them sorted
    # so that the category order is meaningful.
    categories = df_ohe[col].dropna().unique()
    if ordered:
        categories = sorted(categories)
    df_ohe[col] = pd.Categorical(df_ohe[col], categories=categories, ordered=ordered)

In [None]:
# Helper not defined in this notebook (presumably from the `uc2_functions` star import).
# NOTE(review): by its name it summarises column counts per dtype — confirm against the helper.
count_columns_by_dtype(df_ohe)

## `.xlsx` Legend

In [None]:
# Load the variable legend sheet from the workbook in DIR_SC and preview it.
legend_file = os.path.join(DIR_SC, PATH_LEGEND)
df_legend = pd.read_excel(legend_file, sheet_name="Variabili Etichette DBURI")
df_legend.head()

### Create dictionary

In [None]:
# Lookup table: variable name ("Variabile") -> label ("Etichetta").
dict_legend = df_legend.set_index("Variabile")["Etichetta"].to_dict()

# Start mlflow run

In [None]:
# Select the experiment first so the run below is created under it.
mlflow.set_experiment(EXPERIMENT_NAME)
# Top-level run, named after the seed. It is not wrapped in a `with` block, so it
# stays active across cells until the final mlflow.end_run() of the notebook.
mlflow.start_run(run_name=str(RANDOM_STATE))
# Optionally record the id of an enclosing run as a plain tag.
if PARENT_RUN_ID:
    mlflow.set_tag("parent_run_id", PARENT_RUN_ID)

# Drop na on target columns

In [None]:
# Columns excluded from the feature set: the record identifier plus the
# event ("death") and time ("ttdeath") columns used to build the target.
# NOTE(review): "csm"/"ocm" are presumably cause-specific outcome flags — confirm.
not_features = ["P_1_id", "death", "csm", "ocm", "ttdeath"]

In [None]:
# Rows without a survival time or an event flag cannot be used; report the
# row count before and after removing them.
target_columns = ["ttdeath", "death"]
print(len(df_ohe))
df_ohe = df_ohe.dropna(subset=target_columns)
print(len(df_ohe))

# Train test split

## List features

In [None]:
# Every column except the identifier/outcome ones, in a deterministic (sorted) order.
features_all = sorted(set(df_ohe.columns).difference(not_features))
print(len(features_all))

## Train test split

In [None]:
# Define features and target
X = df_ohe[features_all]
# Structured array with one (event, time) record per row.
y = np.array(
    list(zip(df_ohe["death"], df_ohe["ttdeath"])),
    dtype=[("event", bool), ("time", float)],
)
ids = df_ohe["P_1_id"]

# Log the share of observed events in the full cohort, before splitting.
event_share = pd.Series(y["event"]).value_counts(sort=True, normalize=True)
mlflow.log_param("death_perc_5yrs", event_share[True])

# Split data and IDs into training and testing sets (80/20), stratified on the
# event flag so both parts keep a similar event rate.
X_train_missing, X_test_missing, y_train, y_test, ids_train, ids_test = train_test_split(
    X,
    y,
    ids,
    test_size=0.2,
    stratify=y["event"],
    random_state=RANDOM_STATE,
)
del X, y, ids

# Check distributions of death event on train and test
for y_split in (y_train, y_test):
    print(pd.Series(y_split["event"]).value_counts(sort=True, normalize=True))

# Imputation

## Fit and transform on train

In [None]:
# Work on a copy so the raw training split stays untouched until it is deleted below.
X_train = X_train_missing.copy()

# The imputer is fitted on the training split only; the fitted object is reused
# unchanged on the test split to avoid leaking test information.
imputer = IterativeImputer(
    max_iter=25, initial_strategy="median", random_state=RANDOM_STATE
)
imputer = imputer.fit(X_train)

# transform() returns a bare ndarray. Restore the column labels AND the original
# row index, otherwise the frame gets a fresh RangeIndex and can no longer be
# label-aligned with ids_train.
X_train = pd.DataFrame(
    imputer.transform(X_train),
    columns=X_train_missing.columns,
    index=X_train_missing.index,
)

# Assert
assert X_train.index.equals(X_train_missing.index)
assert set(X_train.columns) == set(X_train_missing.columns)

del X_train_missing

## Transform on test

In [None]:
# Work on a copy so the raw test split stays untouched until it is deleted below.
X_test = X_test_missing.copy()

# Apply the already-fitted imputer (no refit on test data). transform() returns a
# bare ndarray, so restore the column labels AND the original row index; without
# the index the frame can no longer be label-aligned with ids_test.
X_test = pd.DataFrame(
    imputer.transform(X_test),
    columns=X_test_missing.columns,
    index=X_test_missing.index,
)

# Assert
assert X_test.index.equals(X_test_missing.index)
assert set(X_test.columns) == set(X_test_missing.columns)

del X_test_missing

# Cox model - GRANT fine-tune

In [None]:
# Name used for the nested MLflow run and for the saved model file.
model_name = "CoxPHSurvivalAnalysis_grant_finetune_T1"
# Nested run: logged as a child of the run that is already active.
mlflow.start_run(run_name=model_name, nested=True)
mlflow.log_param("random_state", RANDOM_STATE)

As a baseline, we train a Cox model on the features of the GRANT prognostic model (Table 6.3 at https://uroweb.org/guidelines/renal-cell-carcinoma/chapter/prognostic-factors):

1. Age > 60 years

2. T classification = T3b, pT3c or pT4

3. N classification = pN1

4. (Fuhrman) grade = G3 or G4



0-1 factors: favourable-risk disease

2 or more factors: unfavourable-risk disease

## Binarize variables

Our mappings from preprocessing:

Used for `IST_1_kidney1PathologicalStage2009`:
```
mapping_t_8lev = {
    "T1a": 1.0,
    "T1b": 2.0,
    "T2a": 3.0,
    "T2b": 4.0,
    "T3a": 5.0,
    "T3b": 6.0,
    "T3c": 7.0,
    "T4": 8.0,
    "Tx": np.nan,
}  # Rare event
```

Used for `IST_1_kidney1PN2009`:
```
mapping_n = {
    "No": 0.0,
    "N1": 1.0,
    "Nx": 2.0,
}  # Using np.nan would lead to more than 25% of nans
```
Already binary due to one-hot encoding

Used for `IST_1_kidney1Grading`:
```
mapping_grade = {"G1": 1.0, "G2": 2.0, "G3": 3.0, "G4": 4.0}
```

In [None]:
## Binarize age
# GRANT risk factor: age strictly above 60 years. Applied identically to both splits.
for frame in (X_train, X_test):
    frame["ANM_1_age_binary"] = frame["ANM_1_age"].gt(60).astype("boolean")

In [None]:
## Binarize pT
# GRANT risk factor: stage code 6 or higher (T3b, T3c, T4 in the 8-level mapping).
# Applied identically to both splits.
for frame in (X_train, X_test):
    frame["IST_1_kidney1PathologicalStage2009_binary"] = (
        frame["IST_1_kidney1PathologicalStage2009"].ge(6).astype("boolean")
    )

In [None]:
## Binarize grading
# GRANT risk factor: grade code 3 or higher (G3, G4 in the grading mapping).
# Applied identically to both splits.
for frame in (X_train, X_test):
    frame["IST_1_kidney1Grading_binary"] = (
        frame["IST_1_kidney1Grading"].ge(3).astype("boolean")
    )

In [None]:
# Model inputs for the GRANT baseline: the three thresholded "*_binary" columns
# (age, pathological stage, grading) plus one one-hot column for the N classification.
# NOTE(review): "IST_1_kidney1PN2009_1_0" is presumably the indicator for code 1.0 (N1) — confirm.
features_grant = [
    "ANM_1_age_binary",
    "IST_1_kidney1PathologicalStage2009_binary",
    "IST_1_kidney1PN2009_1_0",
    "IST_1_kidney1Grading_binary",
]

## Train

In [None]:
# Fit a Cox proportional-hazards model (default settings) on the GRANT features only.
X_train_grant = X_train[features_grant]
cox_grant = CoxPHSurvivalAnalysis()
cox_grant.fit(X_train_grant, y_train)

# Record which inputs the fitted model actually saw.
mlflow.log_param("feature_names_in", cox_grant.feature_names_in_)
mlflow.log_param("n_features_in", cox_grant.n_features_in_)

## Save model weights to pkl

In [None]:
# Persist the fitted model under DIR_MODEL_PKL; the file name encodes the model
# name and the seed.
os.makedirs(DIR_MODEL_PKL, exist_ok=True)
model_filename = f"raw_{model_name}_{RANDOM_STATE}.pkl"
model_path = os.path.join(DIR_MODEL_PKL, model_filename)
joblib.dump(cox_grant, model_path)

# Attach the file to the active run and record where it was written.
mlflow.log_artifact(model_path)
mlflow.log_param("model_path", model_path)

## Validate

In [None]:
# Evaluate the fitted model on the held-out split, restricted to the GRANT features.
# validate_sksurv_model is a project helper not defined in this notebook.
# NOTE(review): tau=60 is presumably the evaluation horizon in months (5 years) — confirm.
result_censored, result_ipcw, score_brier, mean_auc, fig = validate_sksurv_model(
    model=cox_grant,
    y_train=y_train,
    X_test=X_test[features_grant],
    y_test=y_test,
    tau=60,
)

# Print each metric rounded for reading, but log the unrounded value to MLflow.
print("concordance_index_censored", round(result_censored, 3))
mlflow.log_metric("concordance_index_censored", result_censored)
print("concordance_index_ipcw", round(result_ipcw, 3))
mlflow.log_metric("concordance_index_ipcw", result_ipcw)
print("integrated_brier_score", round(score_brier, 3))
mlflow.log_metric("integrated_brier_score", score_brier)
print("mean_cumulative_dynamic_auc", round(mean_auc, 3))
mlflow.log_metric("mean_cumulative_dynamic_auc", mean_auc)
mlflow.log_figure(fig, "time_dependent_auc.png")

# pyplot.show() does not take a figure: its only parameter is the keyword `block`.
# Passing `fig` positionally is only tolerated by the inline backend, so call it bare.
plt.show()

In [None]:
# Close the nested run opened for this model.
mlflow.end_run()

# End mlflow run

In [None]:
# Close the top-level run opened at the start of the notebook.
mlflow.end_run()