In [None]:
import sys

# Make the project helpers importable. The folder is the `src` directory of the
# sibling project `04_survival_models`, resolved relative to the kernel's
# working directory (presumably where `uc2_functions` lives - confirm).
UC2_SRC_DIR = "../04_survival_models/src"

# Re-running the cell must not keep appending the same entry to `sys.path`
if UC2_SRC_DIR not in sys.path:
    sys.path.append(UC2_SRC_DIR)

In [None]:
import json
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from azureml.core import Dataset, Datastore, Workspace
from pandas.api.types import CategoricalDtype
from scipy import stats
from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.linear_model import BayesianRidge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from uc2_functions import *
from tqdm import tqdm

In [None]:
# Suppress warnings from scipy
warnings.filterwarnings("ignore", category=UserWarning, module="scipy")
warnings.filterwarnings("ignore", category=RuntimeWarning, module="scipy")

# Goal

The goal is to preprocess the data and prepare it for machine learning modeling.

# Parameters

In [None]:
# File name of the Excel legend
PATH_LEGEND = "Legenda_Variabili_Uri_Larcher.xlsx"

# Directory that contains the legend: the `sc` folder located next to the
# current working directory (i.e. inside its parent folder)
DIR_SC = os.path.join(
    os.path.dirname(os.getcwd()),
    "sc",
)

# Data ingestion

## `Dataset` srace_urologia_nonmeta_q1_all_survival

In [None]:
# Azure ML workspace coordinates. Each value can be overridden through an
# environment variable so the notebook can target another workspace without
# editing the code; the defaults are the values originally hard-coded here.
subscription_id = os.environ.get(
    "UC2_AML_SUBSCRIPTION_ID", "753a0b42-95dc-4871-b53e-160ceb0e6bc1"
)
resource_group = os.environ.get("UC2_AML_RESOURCE_GROUP", "rg-s-race-aml-dev-we")
workspace_name = os.environ.get("UC2_AML_WORKSPACE_NAME", "amlsraceamldevwe01")

workspace = Workspace(subscription_id, resource_group, workspace_name)

# Fetch the registered dataset by name and load it into a pandas dataframe
dataset = Dataset.get_by_name(workspace, name="srace_urologia_nonmeta_q1_all_survival")
df_all = dataset.to_pandas_dataframe()
print(df_all.shape)
df_all.head()

## `.xlsx` Legend

In [None]:
# Read the legend sheet from the Excel file stored in DIR_SC
legend_file = os.path.join(DIR_SC, PATH_LEGEND)
df_legend = pd.read_excel(legend_file, sheet_name="Variabili Etichette DBURI")
df_legend.head()

### Create dictionary

In [None]:
# Lookup table: value of `Variabile` -> value of `Etichetta`
dict_legend = df_legend.set_index("Variabile")["Etichetta"].to_dict()

# Cast empty cells as np.nan

In [None]:
# Textual spellings of a missing value that must become a real NaN
missing_markers = ["NaN", "NA", "None", "nan"]
df_all = df_all.replace(missing_markers, np.nan)

# Feature selection

## Target variables

In [None]:
# Is the `acm` column an exact duplicate of `death`?
acm_same_as_death = df_all["acm"].equals(df_all["death"])
print(acm_same_as_death)

`acm` and `death` are identical: we'll use `death`.

In [None]:
# Compare each alternative time-to-event column against `ttdeath`
for tt_col in ("ttacm", "ttcsm", "ttocm"):
    print(df_all[tt_col].equals(df_all["ttdeath"]))

All time to event columns are identical: we'll use `ttdeath`.

Both `ocm` and `csm` True should be impossible:

In [None]:
# No row may be flagged with both causes of death at the same time
both_causes = (df_all["ocm"] == True) & (df_all["csm"] == True)
assert df_all[both_causes].shape[0] == 0

In [None]:
# Target columns: the three event flags followed by the time-to-event column
col_target = [
    "death",
    "csm",
    "ocm",
    "ttdeath",
]

## Wide selection (all raw columns)

In [None]:
# Every variable name listed in the legend
col_raw = df_legend["Variabile"].to_list()
print(len(col_raw))

### Columns from `df_legend` not in `df_all`

Some columns are in the legend but not in the actual dataframe (pseudo-anonymized), we drop them from the list:

In [None]:
# Keep only the legend variables that actually exist in the dataframe
col_raw_anonymous = [name for name in col_raw if name in df_all.columns]
del col_raw
print(len(col_raw_anonymous))

### Irrelevant features for ml models

- Columns at t2
- Columns with id
- Columns with dates
- Columns with unit of measure
- Columns with clinical notes
- Columns with location (hospital name)

In [None]:
# Ask the project helper for the full list of columns to drop
irrelevant_features = IrrelevantFeatures(columns=col_raw_anonymous)
col_drop = irrelevant_features.spotall()

In [None]:
# Remove the irrelevant features, then put the patient id back as first column
relevant_columns = [name for name in col_raw_anonymous if name not in col_drop]
col_raw_anonymous = ["P_1_id"] + relevant_columns
print(len(col_raw_anonymous))

## Select and cast columns of interest

In [None]:
# Dtype overview of the selected features plus the targets
columns_of_interest = col_raw_anonymous + col_target
count_columns_by_dtype(df_all[columns_of_interest])

In [None]:
# Restrict to the columns of interest (on a copy) and let the project caster
# infer and apply the dtypes
caster = DataFrameCaster(
    df_all[col_raw_anonymous + col_target].copy(),
)
df_selected = caster.infer_and_cast()
print(df_selected.shape)

# The raw dataframe is not used after this point
del df_all

In [None]:
# Dtype overview after casting (result is shown as the cell output)
count_columns_by_dtype(df_selected)

# Univariate analysis

We perform univariate analysis as initial feature selection (based on `csm`):

In [None]:
COL_NAME = "col_name"
DTYPE = "dtype"
TST_KIND = "tst_kind"
SIGNIFIC = "signific"
REJECTED = "rejected"

# Instantiate the feature selector (target: `csm`)
ufs = UnivariateFeatureSelector(
    df_selected[col_raw_anonymous + ["csm"]], "csm", p_threshold=0.2, verbose=False
)

# One record per (column, test); categorical/boolean columns get one test,
# float columns get two (Wilcoxon and t-test)
univariate_rows = []
for col in tqdm(df_selected[col_raw_anonymous].columns):
    dtype_name = df_selected[col].dtype.name

    if dtype_name in ("category", "boolean"):
        significance, rejected = ufs.compute_stat_importance_categorical(col)
        univariate_rows.append(
            {
                COL_NAME: col,
                DTYPE: dtype_name,
                TST_KIND: "CHISQUARE",
                SIGNIFIC: significance,
                REJECTED: rejected,
            }
        )

    elif dtype_name == "float64":
        numerical_tests = (
            ("WILCOXON", ufs.compute_stat_importance_numerical),
            ("T-TEST", ufs.compute_stat_importance_t_test),
        )
        for test_kind, run_test in numerical_tests:
            significance, rejected = run_test(col)
            univariate_rows.append(
                {
                    COL_NAME: col,
                    DTYPE: dtype_name,
                    TST_KIND: test_kind,
                    SIGNIFIC: significance,
                    # A test that returns no decision is stored as NaN
                    REJECTED: rejected if rejected is not None else np.nan,
                }
            )

    elif ufs.verbose:
        print(f"Column {col} has an unsupported data type: {dtype_name}")

# Create dataframe from the records (explicit columns keep the schema stable
# even when no column was tested)
df_univariate = pd.DataFrame(
    univariate_rows, columns=[COL_NAME, DTYPE, TST_KIND, SIGNIFIC, REJECTED]
)
del univariate_rows

# Keep a column if at least one of its tests rejected the null hypothesis
# or returned no decision (missing value)
df_univariate["rejected_or_na"] = (
    df_univariate[REJECTED].isna() | df_univariate[REJECTED]
)
# Use the string alias: passing the builtin `any` is deprecated in recent pandas
df_univariate["keep"] = df_univariate.groupby(COL_NAME)["rejected_or_na"].transform(
    "any"
)

In [None]:
# Unique names of the columns flagged as `keep`
is_kept = df_univariate["keep"] == True
col_keep = df_univariate[is_kept]["col_name"].unique().tolist()
print(len(col_keep))

# Split features and target

In [None]:
# The patient id is needed later as merge key, so it must have been kept
assert "P_1_id" in col_keep

# Features: kept columns only, without the targets
df_features = df_selected.drop(columns=col_target)[col_keep]
print(df_features.shape)

# Targets: patient id plus the target columns
target_columns = ["P_1_id"] + col_target
df_target = df_selected[target_columns]
print(df_target.shape)

del df_selected

# Data selection (remove outliers, balancing for rare events)

[Data scientists and domain experts] Survey data for outliers which may distort the Cox model. Statistically identified outliers should be evaluated by domain experts for scientific relevance. Consider the necessity of balancing for rare events, which may involve merging similar categories. Decisions should leverage both data-driven insight and domain knowledge.

In [None]:
PLOTS_OUTLIERS = True
PLOTS_RARE_EVENTS = False


def _with_definition(column_names):
    """Return the names, each followed by its legend definition when the legend has one."""
    return [
        f"{x}   {dict_legend.get(x, '')}" if x in dict_legend else x
        for x in column_names
    ]


# Select only numeric columns
df_numeric = df_features.select_dtypes(include=[np.number])

# Absolute z-scores, column by column. `nan_policy="omit"` computes mean and
# standard deviation on the non-missing values only: with the default
# ("propagate") a single NaN turns the whole column into NaN, so a column with
# missing values could never be reported as containing outliers.
z_scores = np.abs(
    stats.zscore(df_numeric.astype(float).to_numpy(), nan_policy="omit")
)

threshold = 3  # common choice for detecting outliers
outliers = (z_scores > threshold).any(axis=0)  # one flag per column

# Columns with outliers
cols_with_outliers = df_numeric.columns[outliers]

# Map definitions from dict
print("Columns with outliers:")
print(json.dumps(_with_definition(cols_with_outliers), indent=4))

# Detect columns with rare events
rare_threshold = 0.05  # threshold to consider an event as rare
cols_with_rare = [
    col
    for col in df_features.select_dtypes(include=["object", "bool", "category"]).columns
    if (df_features[col].value_counts(normalize=True) < rare_threshold).any()
]

# Map definitions from dict
print("Columns with rare events:")
print(json.dumps(_with_definition(cols_with_rare), indent=4))

l = []  # columns that have no definition in the legend
figsize = (5, 2)
for col in df_features.columns:
    definition = dict_legend.get(col, "")
    if definition == "":
        l.append(col)
    # f-string instead of `+`: a non-string legend entry (e.g. an empty Excel
    # cell read as NaN) must not break the plotting loop
    title = f"{col}   {definition}"

    if (
        df_features[col].dtype in ["int64", float]
        and col in cols_with_outliers
        and PLOTS_OUTLIERS is True
    ):
        # Numeric column with outliers: box plot with 20 evenly spaced ticks
        plt.figure(figsize=figsize)
        sns.boxplot(x=df_features[col])
        plt.title(title)

        xticks = np.linspace(df_features[col].min(), df_features[col].max(), 20)
        plt.xticks(xticks, rotation=45)

        plt.show()

    elif (
        df_features[col].dtype.name == "category"
        and col in cols_with_rare
        and PLOTS_RARE_EVENTS is True
    ):
        # Categorical column with rare levels: count plot (only when < 10 levels)
        if len(df_features[col].cat.categories) < 10:
            plt.figure(figsize=figsize)
            sns.countplot(x=df_features[col], data=df_features)
            plt.title(title)

            xticks = np.arange(len(df_features[col].cat.categories))
            plt.xticks(xticks, df_features[col].cat.categories, rotation=45)

            plt.show()

    elif (
        # NOTE(review): this matches the numpy `bool` dtype only; columns with
        # the nullable `boolean` dtype are not plotted - confirm this is intended
        df_features[col].dtype == "bool"
        and col in cols_with_rare
        and PLOTS_RARE_EVENTS is True
    ):
        # Boolean column with a rare value: horizontal bar chart of the counts
        plt.figure(figsize=figsize)
        df_features[col].value_counts().plot(kind="barh", stacked=True)
        plt.title(title)

        yticks = np.arange(len(df_features[col].value_counts().index))
        plt.yticks(yticks, df_features[col].value_counts().index)

        plt.show()

    else:
        continue

### Variable `ANM_1_age`

In [None]:
# Drop the rows whose `ANM_1_age` is recorded as 0 (row count printed before and after)
print(len(df_features))
age_is_not_zero = df_features["ANM_1_age"] != 0
df_features = df_features[age_is_not_zero]
print(len(df_features))

### Variable `INT_1_examinationAge`

In [None]:
# Drop the rows whose `INT_1_examinationAge` is recorded as 0
# (row count printed before and after)
print(len(df_features))
examination_age_is_not_zero = df_features["INT_1_examinationAge"] != 0
df_features = df_features[examination_age_is_not_zero]
print(len(df_features))

### Variable `bmi`

`bmi` under 10 and over 100 represent clear outliers, we remove them:

In [None]:
# Define thresholds (exclusive bounds of the plausible BMI range)
bmi_lower = 10
bmi_upper = 100

# Keep rows whose BMI lies strictly between the two thresholds; rows with a
# missing BMI are kept as well
print(df_features.shape[0])
bmi = df_features["ANM_1_patientBMI"]
bmi_is_plausible = (bmi > bmi_lower) & (bmi < bmi_upper)
df_features = df_features[bmi_is_plausible | bmi.isna()]
print(df_features.shape[0])

### Variable `INT_1_conservativeTherapy`

In [None]:
def _altro_to_no(value):
    """Map the label 'Altro...' to 'No'; return every other value unchanged."""
    return "No" if value == "Altro..." else value


# Convert 'Altro...' to 'No' and store the result as a categorical column
conservative_therapy = df_features["INT_1_conservativeTherapy"].apply(_altro_to_no)
df_features["INT_1_conservativeTherapy"] = conservative_therapy.astype("category")

### Variables `pT`, `cT`, `pN`, `cN`, `grade`

In [None]:
# Stage and grade columns inspected and recoded in the following cells
col_stages_grade = [
    "IST_1_kidney1PathologicalStage2009",  # pT
    "ANM_1_kidney1ClinicalStage2009",  # cT
    "IST_1_kidney1PN2009",  # pN
    "IST_1_kidney1Grading",  # grade
]

In [None]:
# Frequency table (missing values included) of every stage/grade column
for col in col_stages_grade:
    level_counts = df_features[col].value_counts(dropna=False)
    print(col, level_counts, "", sep="\n")

Replace values using a dictionary:

In [None]:
# T stage collapsed to 4 levels: sub-stages (a/b/c) share the code of their main stage
mapping_t_4lev = {
    "T1a": 1.0,
    "T1b": 1.0,
    "T2a": 2.0,
    "T2b": 2.0,
    "T3a": 3.0,
    "T3b": 3.0,
    "T3c": 3.0,
    "T4": 4.0,
    "Tx": np.nan,
}  # "Tx" is a rare event, mapped to missing

# T stage with 8 levels: every sub-stage gets its own increasing code
mapping_t_8lev = {
    "T1a": 1.0,
    "T1b": 2.0,
    "T2a": 3.0,
    "T2b": 4.0,
    "T3a": 5.0,
    "T3b": 6.0,
    "T3c": 7.0,
    "T4": 8.0,
    "Tx": np.nan,
}  # "Tx" is a rare event, mapped to missing

# N stage: "Nx" keeps its own code instead of becoming missing
# NOTE(review): the first key is "No" (letter o), not "N0" - confirm it matches the raw data
mapping_n = {
    "No": 0.0,
    "N1": 1.0,
    "Nx": 2.0,
}  # Using np.nan would lead to more than 25% of nans

# Grade: G1-G4 mapped to 1-4
mapping_grade = {"G1": 1.0, "G2": 2.0, "G3": 3.0, "G4": 4.0}

In [None]:
# (target column, source column, recoding) triples, applied in this order.
# The order matters: each `_4lev` column is derived from the raw labels
# before the source column itself is overwritten with the 8-level codes.
stage_recodings = [
    # pT with 4 levels
    (
        "IST_1_kidney1PathologicalStage2009_4lev",
        "IST_1_kidney1PathologicalStage2009",
        mapping_t_4lev,
    ),
    # pT with 8 levels
    (
        "IST_1_kidney1PathologicalStage2009",
        "IST_1_kidney1PathologicalStage2009",
        mapping_t_8lev,
    ),
    # cT with 4 levels
    (
        "ANM_1_kidney1ClinicalStage2009_4lev",
        "ANM_1_kidney1ClinicalStage2009",
        mapping_t_4lev,
    ),
    # cT with 8 levels
    (
        "ANM_1_kidney1ClinicalStage2009",
        "ANM_1_kidney1ClinicalStage2009",
        mapping_t_8lev,
    ),
    # pN
    ("IST_1_kidney1PN2009", "IST_1_kidney1PN2009", mapping_n),
    # Grade
    ("IST_1_kidney1Grading", "IST_1_kidney1Grading", mapping_grade),
]

for target_col, source_col, recoding in stage_recodings:
    df_features.loc[:, target_col] = df_features.loc[:, source_col].replace(recoding)

Casting:

In [None]:
# (column, ordered?) pairs: every stage/grade column is ordinal except pN
stage_castings = [
    ("IST_1_kidney1PathologicalStage2009_4lev", True),  # pT with 4 levels
    ("IST_1_kidney1PathologicalStage2009", True),  # pT with 8 levels
    ("ANM_1_kidney1ClinicalStage2009_4lev", True),  # cT with 4 levels
    ("ANM_1_kidney1ClinicalStage2009", True),  # cT with 8 levels
    ("IST_1_kidney1PN2009", False),  # pN, non ordinal
    ("IST_1_kidney1Grading", True),  # Grade
]

for col, is_ordered in stage_castings:
    # Categories are the sorted non-missing values observed in the column
    unique_values = sorted(df_features[col].dropna().unique())
    df_features[col] = pd.Categorical(
        df_features[col], categories=unique_values, ordered=is_ordered
    )

### Variable `IST_1_kidney1MayoRisk`

In [None]:
# Recode the Mayo risk labels to numbers (labels outside the mapping become NaN)
col = "IST_1_kidney1MayoRisk"
mapping = {"Basso": 1, "Intermedio": 2, "Alto": 3}
df_features[col] = df_features[col].map(mapping)

# Store the column as an ordered categorical over the observed values
unique_values = sorted(df_features[col].dropna().unique())
mayo_risk_ordinal = pd.Categorical(
    df_features[col], categories=unique_values, ordered=True
)
df_features[col] = mayo_risk_ordinal

### Variables with 2 unique values

Cases not handled by the class `DataFrameCaster` (the class focuses on yes/no columns, but some columns use another two-valued coding, such as M/F or similar).

In [None]:
(
    bool_cols,
    float_cols,
    ordinal_cols,
    non_ordinal_cols,
    other_cols,
) = count_columns_by_dtype(df_features, return_lists=True)

# Non-ordinal columns with exactly two distinct non-missing values
col_ohe = []
for col in non_ordinal_cols:
    if len(df_features[col].dropna().unique()) == 2:
        print(col)
        # `.get` (as in the other legend lookups of this notebook): a column
        # without a legend entry must not abort the loop with a KeyError
        print(dict_legend.get(col, ""))
        print(df_features[col].unique())
        print()
        col_ohe.append(col)

In [None]:
print(df_features.shape)
# Cast columns with 2 unique values as boolean (one-hot encoding).
# The result is assigned back to `df_features`: it used to be stored in `df`,
# which is overwritten later without being read, so the encoding was discarded.
df_features = one_hot_encoding(df=df_features, cols=col_ohe)
print(df_features.shape)

# Zero variance and near zero variance

[Data scientists] Identify and remove predictors with zero variance and near zero variance as they do not contribute any useful information in the Cox model.

Near-zero variance predictors are predictors that have one unique value (i.e., are zero variance predictors) or predictors that have both of the following characteristics:
- They have very few unique values relative to the number of samples, and
- The ratio of the frequency of the most common value to the frequency of the second most common value is large.

In [None]:
# Columns flagged by the project helper as (near) zero variance
col_zero_variance = identify_near_zero_variance(
    df_features,
    prevalence_threshold=0.99,
    unique_ratio_threshold=0.01,
)
print(len(col_zero_variance))

# For each flagged column: name (with legend definition when available)
# followed by the relative frequency of its values
for col in col_zero_variance:
    label = f"{col}   {dict_legend.get(col, '')}" if col in dict_legend else col
    frequencies = df_features[col].value_counts(normalize=True)
    print(label, frequencies, "\n", sep="\n")

The near-zero variance function is typically applied to both numeric and categorical columns, identifying columns where the majority of values are the same (high prevalence) and there is a low proportion of unique values.

The rare events approach is primarily applied to categorical columns and identifies columns that have one or more categories which occur infrequently.

It's worth noting that while a column with a rare event might not necessarily have near zero variance, a column with near zero variance could potentially be identified as having rare events, depending on the prevalence of the most common category and the specified threshold for "rarity".

In [None]:
# Remove the near zero variance columns (shape printed before and after)
print(df_features.shape)
df_features = df_features.drop(columns=col_zero_variance)
print(df_features.shape)

# Missing data

## Remove columns with a lot of missing values (hard to impute)

In [None]:
df_missing = df_features.copy()
print(df_missing.shape)
# Keep a column only if at least `thresh` (75%) of its values are non-missing,
# i.e. drop it when more than 25% of its values are missing
thresh = 0.75
# `dropna(thresh=...)` is documented as an integer count of required non-NA
# values; rounding up keeps the "at least 75%" rule unchanged
min_non_na = int(np.ceil(thresh * df_missing.shape[0]))
df_missing = df_missing.dropna(thresh=min_non_na, axis=1)
print(df_missing.shape)

# Delete df_features
del df_features

# Separation

While perfect separation is more critical in models like logistic regression, it is still important to be aware of it in the Cox model. [Data scientists] should monitor separation and address it when it occurs. Domain experts could provide input on how to handle these variables.

In [None]:
# Attach the `csm` target to the features, matching rows on the patient id
csm_by_patient = df_target[["P_1_id", "csm"]]
df_separation = df_missing.merge(csm_by_patient, how="left", on="P_1_id")

In [None]:
# The helper returns two collections: perfect and almost perfect separators of `csm`
separation_result = check_separation(df_separation, "csm")
perfect_separators, almost_perfect_separators = separation_result
print(perfect_separators, almost_perfect_separators)
del df_separation

In [None]:
# Drop the perfect separators first, then the almost perfect ones
for separators in (perfect_separators, almost_perfect_separators):
    df_missing.drop(columns=separators, inplace=True)

# Collinearity

[Data scientists, in consultation with domain experts] Manage collinearity in the Cox model by addressing high correlation between predictors. Techniques could include removing variables, combining them, or using regularization techniques like ridge regression. The choice should be informed by both statistical criteria and domain knowledge, as collinearity can complicate interpretation and the stability of the model.

In [None]:
# Absolute correlation above which two columns count as strongly correlated
strong_corr_threshold = 0.95

# Column lists by dtype; the non-ordinal ones are excluded from the correlation
(
    bool_cols,
    float_cols,
    ordinal_cols,
    non_ordinal_cols,
    other_cols,
) = count_columns_by_dtype(df_missing, return_lists=True)
print()
col_corr = [x for x in df_missing.columns.tolist() if x not in non_ordinal_cols]

# Correlation matrix of the remaining columns
corr_matrix = df_missing[col_corr].corr()

processed_pairs = set()  # pairs already handled
col_drop_auto = []  # least significant column of each pair (automatic choice)
col_drop_pairs = []  # pairs for which no automatic choice is possible

# Walk the lower triangle of the matrix (each unordered pair once)
corr_columns = corr_matrix.columns
for i in range(len(corr_columns)):
    for j in range(i):
        corr_value = corr_matrix.iloc[i, j]
        is_strong = (corr_value > strong_corr_threshold) or (
            corr_value < -strong_corr_threshold
        )
        if not is_strong:
            continue

        col_i = corr_columns[i]
        col_j = corr_columns[j]
        if (col_i, col_j) in processed_pairs or (col_j, col_i) in processed_pairs:
            continue

        least_significative = find_least_significative(df_univariate, col_i, col_j)
        if least_significative is not None:
            # Drop least significative
            print("___Highly correlated pair:")
            print(
                f"{col_i} --- {col_j}\nDropping {least_significative}   {dict_legend.get(least_significative)}"
            )
            col_drop_auto.append(least_significative)
            print()
        else:
            # Need manual selection on which one to drop
            col_drop_pairs.append((col_i, col_j))
            print("___Need manual selection on which one to drop:")
            print(
                f"{col_i}   {dict_legend.get(col_i)}\n{col_j}   {dict_legend.get(col_j)}\n{corr_value}"
            )
            print()
        processed_pairs.add((col_i, col_j))

print(
    len(col_drop_auto), "columns to drop (least significative in higly correlated pair)"
)
print(
    len(col_drop_pairs),
    "columns that need manual selection on which one to drop (same univariate rank)",
)

## Drop highly correlated features

In [None]:
# Columns chosen by hand among the highly correlated pairs
col_drop_manual = [
    "ANM_1_patientHeight",  # We use BMI
    "ANM_1_patientWeight",  # We use BMI
    "DEG_1_transfusionsHomologous",
    "INT_1_HomologousTransfusions",
    "INT_1_examinationAge",
    "IST_1_kidney1MayoNecrosis",
]

# Automatic choices first, manual choices after
col_drop = [*col_drop_auto, *col_drop_manual]

df_missing.drop(columns=col_drop, inplace=True)

# Final database

We propose 6 versions of the database:
- Categorical version (non ordinal category features as single column)
    - All events (cancer-specific mortality + other cause mortality) --> `UC2_raw_survival_acm_cat`
    - Only cancer-specific mortality --> `UC2_raw_survival_csm_cat`
    - Only cancer-specific mortality (cut at 5 years) --> `UC2_raw_survival_csm_cat_5yrs`
- One-hot encoding version (non ordinal category features as dummy)
    - All events (cancer-specific mortality + other cause mortality) --> `UC2_raw_survival_acm_ohe`
    - Only cancer-specific mortality --> `UC2_raw_survival_csm_ohe`
    - Only cancer-specific mortality (cut at 5 years) --> `UC2_raw_survival_csm_ohe_5yrs`

In [None]:
# Shape of the feature table that the final datasets are built from
print(df_missing.shape)

## Categorical version

### All events (cancer-specific mortality + other cause mortality)

In [None]:
# Features joined with all the target columns, matching rows on the patient id
df = pd.merge(df_missing, df_target, how="left", on="P_1_id")
print(df.shape)

# Share of each `death` value (missing values included)
death_shares = df["death"].value_counts(normalize=True, dropna=False)
print(death_shares)

count_columns_by_dtype(df)
df.head()

Write

The parquet format does not support the dtype `category`. That's why we cast columns of that type to `object` with the function `cast_category_to_object` before writing the dataset. The original dtype of every column and, for each categorical column, whether it is ordered are stored as JSON in the dataset tags (`dtypes_json` and `is_ordinal_json`).

In [None]:
df_write = df.copy()

# JSON descriptions stored as dataset tags: dtype name of every column and,
# for each categorical column, whether it is ordered
dtypes_json = df_write.dtypes.apply(lambda dtype: dtype.name).to_json()
is_ordinal_json = (
    df_write.select_dtypes("category")
    .apply(lambda column: column.cat.ordered)
    .to_json()
)

# Notebook location: the part of the working directory that follows "/code"
notebook_dir = os.getcwd().split("/code")[1]
tags = dict(
    source=os.path.join(notebook_dir, "00_preprocessing_raw.ipynb"),
    processing="Preprocessing Raw",
    description="Categorical version (non ordinal category features as single column)",
    events="All events (cancer-specific mortality + other cause mortality)",
    dtypes_json=dtypes_json,
    is_ordinal_json=is_ordinal_json,
)

# Register the dataframe (categorical columns cast by the project helper first)
target_datastore = Datastore.get(workspace, "urologia")
dataset = Dataset.Tabular.register_pandas_dataframe(
    cast_category_to_object(df_write),
    target=target_datastore,
    name="UC2_raw_survival_acm_cat",
    tags=tags,
    show_progress=True,
)

del df_write, dtypes_json, is_ordinal_json

### Only cancer-specific mortality

In [None]:
print(df.shape)
# Keep only the rows where `ocm` is False (no death from other causes)
df_csm = df.copy()
no_other_cause_death = df_csm["ocm"] == False
df_csm = df_csm[no_other_cause_death]
################################################## New possible approach (alternative)
# Deaths due to other causes than cancer (`ocm`) become censored → when `ocm` is True, leave `ttdeath` as it is and `death` becomes False
# df_csm.loc[df_csm['ocm'] == True, 'death'] = False
##################################################
print(df_csm.shape)

# Share of each `death` value (missing values included)
death_shares = df_csm["death"].value_counts(normalize=True, dropna=False)
print(death_shares)

count_columns_by_dtype(df_csm)
df_csm.head()

Write

In [None]:
df_write = df_csm.copy()

# JSON descriptions stored as dataset tags: dtype name of every column and,
# for each categorical column, whether it is ordered
dtypes_json = df_write.dtypes.apply(lambda dtype: dtype.name).to_json()
is_ordinal_json = (
    df_write.select_dtypes("category")
    .apply(lambda column: column.cat.ordered)
    .to_json()
)

# Notebook location: the part of the working directory that follows "/code"
notebook_dir = os.getcwd().split("/code")[1]
tags = dict(
    source=os.path.join(notebook_dir, "00_preprocessing_raw.ipynb"),
    processing="Preprocessing Raw",
    description="Categorical version (non ordinal category features as single column)",
    events="Only cancer-specific mortality",
    dtypes_json=dtypes_json,
    is_ordinal_json=is_ordinal_json,
)

# Register the dataframe (categorical columns cast by the project helper first)
target_datastore = Datastore.get(workspace, "urologia")
dataset = Dataset.Tabular.register_pandas_dataframe(
    cast_category_to_object(df_write),
    target=target_datastore,
    name="UC2_raw_survival_csm_cat",
    tags=tags,
    show_progress=True,
)

del df_write, dtypes_json, is_ordinal_json

### Only cancer-specific mortality (cut at 5 years)

In [None]:
print(df.shape)
df_csm_5yrs = df.copy()
del df, df_csm
cut_months = 60
# Keep the rows without a death from other causes, plus the rows whose follow-up
# reaches the cut (they are censored at `cut_months` below). `.copy()` makes the
# filtered frame independent, so the assignments below do not write to a slice.
keep_rows = (df_csm_5yrs["ocm"] == False) | (df_csm_5yrs["ttdeath"] >= cut_months)
df_csm_5yrs = df_csm_5yrs[keep_rows].copy()
################################################## New possible approach (alternative)
# Deaths due to other causes than cancer (`ocm`) become censored → when `ocm` is True, leave `ttdeath` as it is and `death` becomes False
# df_csm_5yrs.loc[df_csm_5yrs['ocm'] == True, 'death'] = False
##################################################
# Rows whose follow-up reaches the cut, identified BEFORE clipping. A missing
# `ttdeath` is not flagged: it stays missing instead of being turned into a
# fabricated 60-month censored follow-up.
reaches_cut = (df_csm_5yrs["ttdeath"] >= cut_months).fillna(False)
# Clip the ttdeath column at `cut_months` (missing values are left untouched)
df_csm_5yrs["ttdeath"] = df_csm_5yrs["ttdeath"].clip(upper=cut_months)
# Update the death column: rows reaching the cut are censored
df_csm_5yrs.loc[reaches_cut, "death"] = False
print(df_csm_5yrs.shape)

Write

In [None]:
df_write = df_csm_5yrs.copy()

# JSON descriptions stored as dataset tags: dtype name of every column and,
# for each categorical column, whether it is ordered
dtypes_json = df_write.dtypes.apply(lambda dtype: dtype.name).to_json()
is_ordinal_json = (
    df_write.select_dtypes("category")
    .apply(lambda column: column.cat.ordered)
    .to_json()
)

# Notebook location: the part of the working directory that follows "/code"
notebook_dir = os.getcwd().split("/code")[1]
tags = dict(
    source=os.path.join(notebook_dir, "00_preprocessing_raw.ipynb"),
    processing="Preprocessing Raw",
    description="Categorical version (non ordinal category features as single column)",
    events="Only cancer-specific mortality, cut at 5 years",
    dtypes_json=dtypes_json,
    is_ordinal_json=is_ordinal_json,
)

# Register the dataframe (categorical columns cast by the project helper first)
target_datastore = Datastore.get(workspace, "urologia")
dataset = Dataset.Tabular.register_pandas_dataframe(
    cast_category_to_object(df_write),
    target=target_datastore,
    name="UC2_raw_survival_csm_cat_5yrs",
    tags=tags,
    show_progress=True,
)

del df_write, dtypes_json, is_ordinal_json

## One-hot encoding version (non ordinal category features as dummy)

### All events (cancer-specific mortality + other cause mortality)

In [None]:
(
    bool_cols,
    float_cols,
    ordinal_cols,
    non_ordinal_cols,
    other_cols,
) = count_columns_by_dtype(df_missing, return_lists=True)

# One-hot encode the non-ordinal columns, then attach the targets by patient id
df_ohe = one_hot_encoding(df_missing, non_ordinal_cols).merge(
    df_target, how="left", on="P_1_id"
)
# Replace "." with "_" in column names. `regex=False` makes the dot a literal
# character whatever the pandas version: as a regular expression "." matches
# any character, and the default of `regex` differs between pandas releases.
df_ohe.columns = df_ohe.columns.str.replace(".", "_", regex=False)
print(df_ohe.shape)
print(df_ohe["death"].value_counts(normalize=True, dropna=False))
count_columns_by_dtype(df_ohe)
df_ohe.head()

Write

In [None]:
df_write = df_ohe.copy()

# JSON descriptions stored as dataset tags: dtype name of every column and,
# for each categorical column, whether it is ordered
dtypes_json = df_write.dtypes.apply(lambda dtype: dtype.name).to_json()
is_ordinal_json = (
    df_write.select_dtypes("category")
    .apply(lambda column: column.cat.ordered)
    .to_json()
)

# Notebook location: the part of the working directory that follows "/code"
notebook_dir = os.getcwd().split("/code")[1]
tags = dict(
    source=os.path.join(notebook_dir, "00_preprocessing_raw.ipynb"),
    processing="Preprocessing Raw",
    description="One-hot encoding version (non ordinal category features as dummy)",
    events="All events (cancer-specific mortality + other cause mortality)",
    dtypes_json=dtypes_json,
    is_ordinal_json=is_ordinal_json,
)

# Register the dataframe (categorical columns cast by the project helper first)
target_datastore = Datastore.get(workspace, "urologia")
dataset = Dataset.Tabular.register_pandas_dataframe(
    cast_category_to_object(df_write),
    target=target_datastore,
    name="UC2_raw_survival_acm_ohe",
    tags=tags,
    show_progress=True,
)

del df_write, dtypes_json, is_ordinal_json

### Only cancer-specific mortality

In [None]:
print(df_ohe.shape)
# Keep only the rows where `ocm` is False (no death from other causes)
df_ohe_csm = df_ohe.copy()
no_other_cause_death = df_ohe_csm["ocm"] == False
df_ohe_csm = df_ohe_csm[no_other_cause_death]
################################################## New possible approach (alternative)
# Deaths due to other causes than cancer (`ocm`) become censored → when `ocm` is True, leave `ttdeath` as it is and `death` becomes False
# df_ohe_csm.loc[df_ohe_csm['ocm'] == True, 'death'] = False
##################################################
print(df_ohe_csm.shape)

# Share of each `death` value (missing values included)
death_shares = df_ohe_csm["death"].value_counts(normalize=True, dropna=False)
print(death_shares)

count_columns_by_dtype(df_ohe_csm)
df_ohe_csm.head()

Write

In [None]:
df_write = df_ohe_csm.copy()

# JSON descriptions stored as dataset tags: dtype name of every column and,
# for each categorical column, whether it is ordered
dtypes_json = df_write.dtypes.apply(lambda dtype: dtype.name).to_json()
is_ordinal_json = (
    df_write.select_dtypes("category")
    .apply(lambda column: column.cat.ordered)
    .to_json()
)

# Notebook location: the part of the working directory that follows "/code"
notebook_dir = os.getcwd().split("/code")[1]
tags = dict(
    source=os.path.join(notebook_dir, "00_preprocessing_raw.ipynb"),
    processing="Preprocessing Raw",
    description="One-hot encoding version (non ordinal category features as dummy)",
    events="Only cancer-specific mortality",
    dtypes_json=dtypes_json,
    is_ordinal_json=is_ordinal_json,
)

# Register the dataframe (categorical columns cast by the project helper first)
target_datastore = Datastore.get(workspace, "urologia")
dataset = Dataset.Tabular.register_pandas_dataframe(
    cast_category_to_object(df_write),
    target=target_datastore,
    name="UC2_raw_survival_csm_ohe",
    tags=tags,
    show_progress=True,
)

del df_write, dtypes_json, is_ordinal_json

### Only cancer-specific mortality (cut at 5 years)

In [None]:
print(df_ohe.shape)
df_ohe_csm_5yrs = df_ohe.copy()
del df_ohe, df_ohe_csm
cut_months = 60
# Keep the rows without a death from other causes, plus the rows whose follow-up
# reaches the cut (they are censored at `cut_months` below). `.copy()` makes the
# filtered frame independent, so the assignments below do not write to a slice.
keep_rows = (df_ohe_csm_5yrs["ocm"] == False) | (
    df_ohe_csm_5yrs["ttdeath"] >= cut_months
)
df_ohe_csm_5yrs = df_ohe_csm_5yrs[keep_rows].copy()
################################################## New possible approach (alternative)
# Deaths due to other causes than cancer (`ocm`) become censored → when `ocm` is True, leave `ttdeath` as it is and `death` becomes False
# df_ohe_csm_5yrs.loc[df_ohe_csm_5yrs['ocm'] == True, 'death'] = False
##################################################
# Rows whose follow-up reaches the cut, identified BEFORE clipping. A missing
# `ttdeath` is not flagged: it stays missing instead of being turned into a
# fabricated 60-month censored follow-up.
reaches_cut = (df_ohe_csm_5yrs["ttdeath"] >= cut_months).fillna(False)
# Clip the ttdeath column at `cut_months` (missing values are left untouched)
df_ohe_csm_5yrs["ttdeath"] = df_ohe_csm_5yrs["ttdeath"].clip(upper=cut_months)
# Update the death column: rows reaching the cut are censored
df_ohe_csm_5yrs.loc[reaches_cut, "death"] = False
print(df_ohe_csm_5yrs.shape)

Write

In [None]:
df_write = df_ohe_csm_5yrs.copy()

# JSON descriptions stored as dataset tags: dtype name of every column and,
# for each categorical column, whether it is ordered
dtypes_json = df_write.dtypes.apply(lambda dtype: dtype.name).to_json()
is_ordinal_json = (
    df_write.select_dtypes("category")
    .apply(lambda column: column.cat.ordered)
    .to_json()
)

# Notebook location: the part of the working directory that follows "/code"
notebook_dir = os.getcwd().split("/code")[1]
tags = dict(
    source=os.path.join(notebook_dir, "00_preprocessing_raw.ipynb"),
    processing="Preprocessing Raw",
    description="One-hot encoding version (non ordinal category features as dummy)",
    events="Only cancer-specific mortality, cut at 5 years",
    dtypes_json=dtypes_json,
    is_ordinal_json=is_ordinal_json,
)

# Register the dataframe (categorical columns cast by the project helper first)
target_datastore = Datastore.get(workspace, "urologia")
dataset = Dataset.Tabular.register_pandas_dataframe(
    cast_category_to_object(df_write),
    target=target_datastore,
    name="UC2_raw_survival_csm_ohe_5yrs",
    tags=tags,
    show_progress=True,
)

del df_write, dtypes_json, is_ordinal_json