In [1]:
import sys

# Add the parent directory to the system path
sys.path.append("../04_survival_models/src")

In [2]:
import json
import os

import joblib
import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
from azureml.core import Dataset, Workspace
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import (
    GridSearchCV,
    StratifiedKFold,
)
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.metrics import (
    concordance_index_censored,
)
from uc2_functions import count_columns_by_dtype

# Goal

The goal is to train the SSIGN model that was validated internally to 100% of the internal dataset, to create a final model for external valiation.
Page 116 https://www.ncbi.nlm.nih.gov/books/NBK543527/pdf/Bookshelf_NBK543527.pdf

# Parameters

In [3]:
RANDOM_STATE = 42
PARENT_RUN_ID = None
EXPERIMENT_NAME = "UC2_review_ssign_final_2025_09_1"
DIR_MODEL_PKL = "../models_pkl_review"  # Weights for the models used during inference
DIR_ARTIFACTS = "artifacts"
S = 100
DIR_SC = os.path.join(os.path.dirname(os.getcwd()), "sc")
PATH_EXTERNAL = "External_Validation_of_AI_model_Florence_MAP.xlsx"
PATH_EXTERNAL_INTEGRATION = "External validation Florence - Missing variables.xlsx"
PATH_EXTERNAL_INTEGRATION_NECROSIS = "External validation Florence - Missing variable necrosis.xlsx"

# Functions

In [4]:
def calculate_ssign_score(df: pd.DataFrame) -> pd.DataFrame:
    """
    Computes the SSIGN score based on pre-processed features.

    This function calculates the score for each component of the SSIGN score
    (Pathological T, N, Metastasis, Tumor Size, Grade, Necrosis) and
    sums them to get the total score.

    Args:
        df: A pandas DataFrame containing the necessary columns:
            - 'IST_1_kidney1PathologicalStage2009' (numeric, mapped)
            - 'IST_1_kidney1PN2009_1_0' (boolean, from one-hot encoding)
            - 'IST_1_kidney1TumorDimension' (numeric, in cm)
            - 'IST_1_kidney1Grading' (numeric, mapped)
            - 'IST_1_kidney1Necrosis' (boolean)

    Returns:
        A new pandas DataFrame with added columns for each score component
        and the total 'ssign_score'.
    """
    # Make a copy to avoid modifying the original dataframe
    df_scores = df.copy()

    # 1. Pathological T category score
    # pT1 (1.0, 2.0) -> 0; pT2 (3.0, 4.0) -> +1; pT3 (5.0, 6.0, 7.0) -> +2; pT4 (8.0) -> +4
    pt_score_map = {1.0: 0, 2.0: 0, 3.0: 1, 4.0: 1, 5.0: 2, 6.0: 2, 7.0: 2, 8.0: 4}
    df_scores["ssign_component_pT"] = (
        df_scores["IST_1_kidney1PathologicalStage2009"].map(pt_score_map).fillna(0)
    )

    # 2. Regional lymph node status score
    # pNx/pN0 -> 0; pN1/pN2 -> +2
    # Based on your data, 'IST_1_kidney1PN2009_1_0' being True corresponds to pN1.
    df_scores["ssign_component_pN"] = np.where(
        df_scores["IST_1_kidney1PN2009_1_0"], 2, 0
    )

    # 3. Metastasis category score
    # M0 -> 0; M1 -> +4. Per your inclusion criteria, all are M0.
    df_scores["ssign_component_M"] = 0

    # 4. Tumor size score
    # <5 cm -> 0; >=5 cm -> +2
    df_scores["ssign_component_size"] = np.where(
        df_scores["IST_1_kidney1TumorDimension"] >= 5, 2, 0
    )

    # 5. Tumor (nuclear) grade score
    # Grade 1/2 -> 0; Grade 3 -> +1; Grade 4 -> +3
    grade_score_map = {1.0: 0, 2.0: 0, 3.0: 1, 4.0: 3}
    df_scores["ssign_component_grade"] = (
        df_scores["IST_1_kidney1Grading"].map(grade_score_map).fillna(0)
    )

    # 6. Tumor necrosis present score
    # No -> 0; Yes -> +2
    df_scores["ssign_component_necrosis"] = np.where(
        df_scores["IST_1_kidney1Necrosis"], 2, 0
    )

    # 7. Calculate the total SSIGN score
    score_components = [
        "ssign_component_pT",
        "ssign_component_pN",
        "ssign_component_M",
        "ssign_component_size",
        "ssign_component_grade",
        "ssign_component_necrosis",
    ]
    df_scores["ssign_score"] = df_scores[score_components].sum(axis=1)

    return df_scores


def plot_correlation_heatmap(df, method="pearson", threshold=0.5):
    """
    Plots a correlation heatmap for features in the DataFrame that have
    at least one correlation coefficient above the specified threshold.

    Parameters:
    -----------
    df : pandas.DataFrame
        The input DataFrame containing numerical features.

    method : str, optional (default='pearson')
        The method to use for calculating correlations.
        Options include 'pearson', 'spearman', and 'kendall'.

    threshold : float, optional (default=0.5)
        The minimum absolute correlation coefficient required to include a feature
        in the heatmap. Features with any correlation above this threshold
        (excluding self-correlation) will be included.

    Returns:
    --------
    None
        Displays the correlation heatmap plot.
    """

    # Validate the correlation method
    if method not in ["pearson", "spearman", "kendall"]:
        raise ValueError(
            "Invalid correlation method. Choose from 'pearson', 'spearman', or 'kendall'."
        )

    # Compute the correlation matrix using the specified method
    corr_matrix = df.corr(method=method)

    # Identify columns to keep: those with at least one correlation above the threshold
    # (excluding self-correlation)
    to_keep = [
        col
        for col in corr_matrix.columns
        if (corr_matrix[col].abs().drop(labels=col) >= threshold).any()
    ]

    # Check if any features meet the threshold criteria
    if not to_keep:
        print(f"No features found with correlation above the threshold of {threshold}.")
        return

    # Subset the correlation matrix to include only the selected features
    selected_corr = corr_matrix.loc[to_keep, to_keep]

    # Create a mask for the upper triangle
    mask = np.triu(np.ones_like(selected_corr, dtype=bool))

    # Set up the matplotlib figure
    plt.figure(figsize=(12, 10))

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # Create the heatmap
    sns.heatmap(
        selected_corr,
        mask=mask,  # Apply the mask to show only lower triangle
        annot=True,  # Show correlation coefficients
        fmt=".2f",  # Format for the annotations
        cmap=cmap,  # Color map
        vmax=1.0,  # Maximum value for the color scale
        vmin=-1.0,  # Minimum value for the color scale
        center=0,  # Center of the colormap
        square=True,  # Make cells square-shaped
        linewidths=0.5,  # Width of the lines that divide the cells
        cbar_kws={"shrink": 0.5},  # Color bar settings
    )

    # Customize the plot with titles and labels
    plt.title(
        f"Correlation Heatmap (Method: {method.capitalize()}, Threshold: {threshold})",
        fontsize=16,
    )
    plt.xticks(rotation=45, ha="right")  # Rotate x-axis labels for better readability
    plt.yticks(rotation=0)  # Keep y-axis labels horizontal

    # Adjust layout to make room for the rotated x-axis labels
    plt.tight_layout()

    # Display the heatmap
    plt.show()

# Data ingestion

## One-hot encoding version

In [5]:
# azureml-core of version 1.0.72 or higher is required
# azureml-dataprep[pandas] of version 1.1.34 or higher is required

subscription_id = "753a0b42-95dc-4871-b53e-160ceb0e6bc1"
resource_group = "rg-s-race-aml-dev-we"
workspace_name = "amlsraceamldevwe01"

workspace = Workspace(subscription_id, resource_group, workspace_name)

dataset = Dataset.get_by_name(workspace, name="UC2_raw_survival_csm_ohe_5yrs", version=23)
df_ohe = dataset.to_pandas_dataframe()
print(df_ohe.shape)
df_ohe.head()

{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe'}
{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe', 'activityApp': 'TabularDataset'}
(2536, 211)


Unnamed: 0,P_1_id,ANM_1_previousAbdominalOperations,ANM_1_moduliOK,ANM_1_performanceStatus,ANM_1_asa,ANM_1_patientBMI,ANM_1_hypertension,ANM_1_TerapiaDiabeteIpoglicemizzanti,ANM_1_charlsonIndexComorbiditiesMalattiaPolmonare,ANM_1_charlsonIndexComorbiditiesUlceraPeptica,...,DEG_1_examEmCreatininemiaRange_14-18,DEG_1_examEmCalcioRange_14-18,"DEG_1_examEmCalcioRange_2,10-2,60",DEG_1_examEmCalcioRange_2_1-2_6,DEG_1_examEmCalcioRange_2_10-2_60,DEG_1_examEmCalcioRange_2_25-2_75,death,csm,ocm,ttdeath
0,1.0,False,False,0.0,1.0,24.82,False,,True,False,...,False,False,False,False,True,False,False,False,True,60.0
1,4.0,False,False,1.0,2.0,,True,False,False,False,...,False,False,False,False,True,False,False,,,60.0
2,5.0,True,False,1.0,2.0,,True,False,False,False,...,False,False,False,False,True,False,False,False,False,60.0
3,6.0,False,False,1.0,2.0,29.76,False,,False,False,...,False,False,False,False,True,False,False,False,False,60.0
4,7.0,True,False,1.0,2.0,26.57,False,False,False,False,...,False,False,False,False,False,False,False,False,False,60.0


### Use schema

Recreate the schema from tags:

In [6]:
tags = dataset.tags

dtypes = json.loads(tags["dtypes_json"])
is_ordinal = json.loads(tags["is_ordinal_json"])

for col in dtypes.keys():
    if dtypes[col] == "category":
        categories = (
            sorted(df_ohe[col].dropna().unique())
            if is_ordinal[col]
            else df_ohe[col].dropna().unique()
        )
        df_ohe[col] = pd.Categorical(
            df_ohe[col], categories=categories, ordered=is_ordinal[col]
        )
    else:
        df_ohe[col] = df_ohe[col].astype(dtypes[col])

In [7]:
count_columns_by_dtype(df_ohe)

float64: 31
boolean: 171
ordinal category: 9
non ordinal category: 0


## External dataet

Used only for getting common variables

### Original

First file shared by the external center, was missing 2 key variables

In [8]:
df_external_original = pd.read_excel(os.path.join(DIR_SC, PATH_EXTERNAL))
print(df_external_original.shape)
df_external_original.head(2)

(720, 44)


Unnamed: 0,ANM_1_age,ANM_1_asa,ANM_1_charlsonIndex,ANM_1_cciAge,ANM_1_performanceStatus,ANM_1_patientBMI,ANM_1_examEmCreatininemia,ANM_1_examEmeGFR,ANM_1_examEmEmoglobina,ANM_1_examEmEmoglobinaRange_14_0-18_0,...,IST_1_kidney1PN2009_1_0,IST_1_kidney1Grading,IST_1_kidney1MayoPN,IST_1_kidney1MayoGrading,IST_1_kidney1MayoScore,IST_1_kidney1MayoRisk,ocm,csm,death,ttdeath
0,32,3,4,4,0,22.9,14.5,4.2,12.1,0.0,...,pNx,1,pNx,1,0,Low,,,0.0,29
1,82,2,5,9,0,28.7,1.21,72.0,14.7,1.0,...,0,4,pN0,4,3,Intermediate,,,0.0,51


Add progressive index `ID` (manually checked by principal investigator that it matches the `ID` from the integration)

In [9]:
df_external_original = df_external_original.reset_index(drop=False).rename(
    {"index": "ID"}, axis=1
)

Remove space for variable name (data entry)

In [10]:
df_external_original = df_external_original.rename(
    {"IST_1_kidney1PN2009_1_0 ": "IST_1_kidney1PN2009_1_0"}, axis=1
)

### Integration

Integration with `INT_1_examinationTime` and `DEG_1_examEmEmoglobina`

In [11]:
df_external_integration = pd.read_excel(os.path.join(DIR_SC, PATH_EXTERNAL_INTEGRATION))
print(df_external_integration.shape)
df_external_integration.head(2)

(720, 3)


Unnamed: 0,ID,Mean operative time (min),Haemoglobinat discharge (g/dL)
0,1,141.0,12.8
1,2,155.0,12.8


Map column names to DBURI

In [12]:
dict_integration = {
    "Mean operative time (min)": "INT_1_examinationTime",
    "Haemoglobinat discharge (g/dL)": "DEG_1_examEmEmoglobina",
}
df_external_integration = df_external_integration.rename(dict_integration, axis=1)
df_external_integration.head(2)

Unnamed: 0,ID,INT_1_examinationTime,DEG_1_examEmEmoglobina
0,1,141.0,12.8
1,2,155.0,12.8


In [13]:
assert len(df_external_original) == len(df_external_integration)

Integration with `IST_1_kidney1Necrosis`

In [14]:
df_external_integration_necrosis = pd.read_excel(os.path.join(DIR_SC, PATH_EXTERNAL_INTEGRATION_NECROSIS))
print(df_external_integration_necrosis.shape)
df_external_integration_necrosis.head(2)

(720, 2)


Unnamed: 0,ID,Necrosis
0,1,0
1,2,0


Map column names to DBURI

In [15]:
dict_integration_necrosis = {
    "Necrosis": "IST_1_kidney1Necrosis",
}
df_external_integration_necrosis = df_external_integration_necrosis.rename(dict_integration_necrosis, axis=1)
df_external_integration_necrosis.head(2)

Unnamed: 0,ID,IST_1_kidney1Necrosis
0,1,0
1,2,0


In [16]:
assert len(df_external_original) == len(df_external_integration_necrosis)

### Merge external and delete individual dataframes

In [17]:
# Two integrations
df_temp = pd.merge(
    df_external_integration, df_external_integration_necrosis, how="left", on="ID"
)
# Original external + two integrations
df_external = pd.merge(
    df_external_original, df_temp, how="left", on="ID"
)
del df_external_original, df_external_integration, df_external_integration_necrosis, df_temp
print(df_external.shape)
df_external.head(2)

(720, 48)


Unnamed: 0,ID,ANM_1_age,ANM_1_asa,ANM_1_charlsonIndex,ANM_1_cciAge,ANM_1_performanceStatus,ANM_1_patientBMI,ANM_1_examEmCreatininemia,ANM_1_examEmeGFR,ANM_1_examEmEmoglobina,...,IST_1_kidney1MayoGrading,IST_1_kidney1MayoScore,IST_1_kidney1MayoRisk,ocm,csm,death,ttdeath,INT_1_examinationTime,DEG_1_examEmEmoglobina,IST_1_kidney1Necrosis
0,0,32,3,4,4,0,22.9,14.5,4.2,12.1,...,1,0,Low,,,0.0,29,,,
1,1,82,2,5,9,0,28.7,1.21,72.0,14.7,...,4,3,Intermediate,,,0.0,51,141.0,12.8,0.0


### Drop columns with more than 50% of missing

In [18]:
not_features = ["P_1_id", "death", "csm", "ocm", "ttdeath"]
threshold = 0.5 * len(df_external)


# Print the original shape
print("Original shape:", df_external.shape)
# Get the list of original column names
original_columns = df_external.columns
# Create a list of columns to drop based on the threshold, excluding the 'not_features' list
columns_to_drop = [
    col
    for col in original_columns
    if col not in not_features and df_external[col].isna().sum() > threshold
]
# Drop the identified columns
df_external = df_external.drop(columns=columns_to_drop)
# Print the new shape after dropping columns
print("New shape:", df_external.shape)
# Display the dropped columns
print("Dropped columns:", columns_to_drop)

Original shape: (720, 48)
New shape: (720, 47)
Dropped columns: ['DEG_1_examEmCalcio']


##### Get external columns and delete external dataframe

In [19]:
col_external = df_external.columns
del df_external

# Start mlflow run

In [20]:
mlflow.set_experiment(EXPERIMENT_NAME)
mlflow.start_run(run_name=str(RANDOM_STATE))
if PARENT_RUN_ID:
    mlflow.set_tag("parent_run_id", PARENT_RUN_ID)

# Drop na on target columns

In [21]:
print(df_ohe.shape[0])
df_ohe = df_ohe.dropna(subset=["ttdeath", "death"])
print(df_ohe.shape[0])

2536
2536


# List features

In [22]:
features_all = sorted(set(df_ohe.columns.tolist()) - set(not_features))
print(len(features_all))

206


# Features and target

In [23]:
# Define features and target
X_missing = df_ohe[features_all]
y = np.array(
    [(event, time) for event, time in zip(df_ohe["death"], df_ohe["ttdeath"])],
    dtype=[("event", bool), ("time", float)],
)
ids = df_ohe["P_1_id"]
mlflow.log_param(
    "death_perc_5yrs",
    pd.Series(y["event"]).value_counts(sort=True, normalize=True)[True],
)

0.050867507886435334

# Imputation

## Fit and trasform

In [24]:
X = X_missing.copy()

imputer = IterativeImputer(
    max_iter=25, initial_strategy="median", random_state=RANDOM_STATE
)
imputer = imputer.fit(X)
X = imputer.transform(X)
X = pd.DataFrame(X, columns=X_missing.columns)

# Assert
assert set(X.columns) == set(X_missing.columns)

del X_missing

# Infer rule-based SSIGN prognostic score

In [25]:
X = calculate_ssign_score(X)

# GRANT fine-tune `CoxPHSurvivalAnalysis_ssign_univariate_T1`

In [26]:
model_name = "CoxPHSurvivalAnalysis_ssign_univariate_T1"
mlflow.start_run(run_name=model_name, nested=True)
mlflow.log_param("random_state", RANDOM_STATE)

42

## Train univariate Cox model on the SSIGN points

In [27]:
# Train the model
cox_ssign_univariate = CoxPHSurvivalAnalysis()
cox_ssign_univariate.fit(X[["ssign_score"]], y)
mlflow.log_param("feature_names_in", cox_ssign_univariate.feature_names_in_)
mlflow.log_param("n_features_in", cox_ssign_univariate.n_features_in_)

1

## Save model weights to pkl

In [28]:
# Save model weights to pkl
os.makedirs(DIR_MODEL_PKL, exist_ok=True)
model_path = os.path.join(
    DIR_MODEL_PKL, "raw_{}_{}.pkl".format(model_name, RANDOM_STATE)
)
joblib.dump(cox_ssign_univariate, model_path)
mlflow.log_artifact(model_path)
mlflow.log_param("model_path", model_path)

'../models_pkl_review/raw_CoxPHSurvivalAnalysis_ssign_univariate_T1_42.pkl'

In [29]:
mlflow.end_run()

# End mlflow run

In [30]:
mlflow.end_run()