# 0. Installing packages

In [None]:
!pip uninstall scikit-learn scikit-survival -y

!pip install scikit-learn
!pip install scikit-survival

!pip install lifelines

!pip install joblib

In [None]:
import sksurv
import lifelines

import os
import subprocess
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline
from sksurv.linear_model import CoxnetSurvivalAnalysis
from sksurv.metrics import concordance_index_censored
from sksurv.ensemble import GradientBoostingSurvivalAnalysis
from sksurv.util import Surv

import itertools

from lifelines import CoxPHFitter
from lifelines.utils import concordance_index

from sklearn.metrics import log_loss

import joblib

import warnings
from sklearn.exceptions import FitFailedWarning

# 1. Utils

## 1.1 Elastic net (EN) model training

In [None]:
def train_opt_EN(train_data, train_labels,
                 l1_ratios=np.linspace(0.1, 1.0, 10), max_iter=100, alpha_min_ratio=0.01,
                 cv_folds=5, verbose=True, n_alphas=5):
    """
    Tune and fit an elastic-net penalised Cox model (``CoxnetSurvivalAnalysis``).

    Steps:
        1. Fit an initial model with ``l1_ratio=0.5`` to obtain a data-driven
           grid of ``n_alphas`` penalty strengths (``alphas_``).
        2. Run a shuffled K-fold grid search over every (l1_ratio, alpha) pair.
           No ``scoring`` is passed, so the estimator's own ``score`` method
           is used to rank the candidates.
        3. Return the estimator that ``GridSearchCV`` refits on all of
           ``train_data`` with the best pair, together with the CV table.

    Parameters
    ----------
    train_data : pandas.DataFrame or array-like
        Predictor matrix, one row per individual.
    train_labels : pandas.DataFrame
        Row-aligned with ``train_data``. Column 0 is the event indicator,
        column 1 the follow-up time (accessed positionally, not by name).
    l1_ratios : array-like of float
        Elastic-net mixing values to try.
    max_iter : int
        Maximum number of iterations for every Coxnet fit.
    alpha_min_ratio : float
        Passed to the initial model that estimates the alpha grid.
    cv_folds : int
        Number of folds for the K-fold cross-validation.
    verbose : bool
        If True, print progress messages and enable GridSearchCV output.
        If False, the function prints nothing itself.
    n_alphas : int
        Number of alphas estimated by the initial model (default 5).

    Returns
    -------
    best_model : CoxnetSurvivalAnalysis
        Model refitted on the full training data with the best parameters and
        ``fit_baseline_model=True``.
    cv_results : pandas.DataFrame
        ``GridSearchCV.cv_results_`` as a data frame.
    """
    # Build the structured (event, time) array expected by scikit-survival.
    # Field assignment converts whole columns at once instead of looping over
    # the rows in Python.
    labels_array = np.empty(len(train_labels), dtype=[('event', '?'), ('time', '<f8')])
    labels_array['event'] = train_labels.iloc[:, 0].to_numpy()
    labels_array['time'] = train_labels.iloc[:, 1].to_numpy()

    # NOTE(review): these filters are installed process-wide and stay active
    # after the function returns (kept as-is so downstream output is unchanged).
    warnings.simplefilter("ignore", UserWarning)
    warnings.simplefilter("ignore", FitFailedWarning)

    if verbose:
        print("estimating alphas with lambda=0.5...")

    # The initial fit is only used to obtain a sensible alpha grid.
    initial_model = CoxnetSurvivalAnalysis(
        l1_ratio=0.5,
        alpha_min_ratio=alpha_min_ratio,
        max_iter=max_iter,
        n_alphas=n_alphas,
    )
    initial_model.fit(train_data, labels_array)
    estimated_alphas = initial_model.alphas_

    if verbose:
        print(f"estimated {len(estimated_alphas)} alphas ranging from {estimated_alphas.min():.5f} to {estimated_alphas.max():.5f}.")

    # Each candidate is a single-element alpha list, so every grid point fits
    # exactly one penalty strength.
    param_grid = {
        'l1_ratio': l1_ratios,
        'alphas': [[alpha] for alpha in estimated_alphas]
    }

    cv = KFold(n_splits=cv_folds, shuffle=True, random_state=42)

    grid_search = GridSearchCV(
        CoxnetSurvivalAnalysis(max_iter=max_iter, fit_baseline_model=True),
        param_grid=param_grid,
        cv=cv,
        n_jobs=-1,
        verbose=1 if verbose else 0
    )

    grid_search.fit(train_data, labels_array)

    # best_estimator_ is already refitted on the whole training data.
    best_model = grid_search.best_estimator_

    if verbose:
        best_l1_ratio = grid_search.best_params_['l1_ratio']
        best_alpha = grid_search.best_params_['alphas'][0]
        print(f"\nBest l1_ratio: {best_l1_ratio:.2f}, Best alpha: {best_alpha:.5f}")

    cv_results = pd.DataFrame(grid_search.cv_results_)

    return best_model, cv_results


## 1.2 Data Prep

In [None]:
def split_train_test(df_filtered, labels, testtrain_column='testtrain'):
    """
    Split predictors and labels into train and test parts.

    Rows are assigned according to the value ('train' or 'test') found in
    ``testtrain_column``; rows with any other value are dropped. The marker
    column itself is removed from all four returned frames.

    Returns
    -------
    (train_data, test_data, train_labels, test_labels)
    """

    def _take(frame, split_name):
        # Keep only the rows of one split and drop the marker column.
        in_split = frame[testtrain_column] == split_name
        return frame[in_split].drop(columns=[testtrain_column])

    train_data = _take(df_filtered, 'train')
    test_data = _take(df_filtered, 'test')
    train_labels = _take(labels, 'train')
    test_labels = _take(labels, 'test')

    return train_data, test_data, train_labels, test_labels

In [None]:
# Define the directory containing the files
# NOTE: `dir` shadows the builtin of the same name; the name is kept so that
# any other cell referring to it keeps working.
dir = "UKBRISK_ENModels/NHC/Initial_10y"

# List the remote directory. Arguments are passed as a list (no shell), so
# paths containing spaces or shell metacharacters are handled safely.
files = subprocess.check_output(["dx", "ls", dir]).decode("utf-8").splitlines()

# Filter the files to get only those that contain "_combined_10y.csv"
absrisk_files = [f for f in files if "_combined_10y.csv" in f]

# Download the files. check=True stops the cell as soon as a download fails,
# instead of later reading a missing or stale local copy.
for file_name in absrisk_files:
    subprocess.run(["dx", "download", f"{dir}/{file_name}", "--overwrite"], check=True)

merged_dataframes = {}

# Build one wide table per endpoint with one survival-probability column per
# predictor combination.
for file_name in absrisk_files:

    abs_risk_data = pd.read_csv(file_name)

    # Parse the file name on underscores: parts[1] is the endpoint (e.g. "RD")
    # and everything between it and the token "survival" is the combination name.
    parts = file_name.split("_")
    endpoint = parts[1]
    survival_index = parts.index("survival")
    combo_name = "_".join(parts[2:survival_index])

    # Rename on a new frame (no inplace edit of a column slice, which would
    # raise SettingWithCopyWarning) and before merging, so the merged table
    # never carries a temporary "survival_probability" column.
    combo_frame = abs_risk_data[["eid", "set", "survival_probability"]].rename(
        columns={"survival_probability": combo_name}
    )

    if endpoint in merged_dataframes:
        # Outer merge keeps individuals that are missing from either table.
        merged_dataframes[endpoint] = pd.merge(
            merged_dataframes[endpoint],
            combo_frame,
            on=["eid", "set"],
            how="outer"
        )
    else:
        merged_dataframes[endpoint] = combo_frame

print("All absolute risk data frames merged successfully.")

train_dataframes_absrisk_nhc_10y = {}
test_dataframes_absrisk_nhc_10y = {}

# Split each merged dataframe into train and test sets based on the 'set'
# column. The loop variable is deliberately not called `df`, so the name of
# the main analysis table is not clobbered.
for endpoint, merged_frame in merged_dataframes.items():
    train_dataframes_absrisk_nhc_10y[endpoint] = merged_frame[merged_frame["set"] == "train"].copy()
    test_dataframes_absrisk_nhc_10y[endpoint] = merged_frame[merged_frame["set"] == "test"].copy()


In [None]:
# Download the processed analysis table and the NHSHC exclusion table.
# subprocess with an argument list replaces the `!` shell magic: the remote
# path containing spaces needs no manual quoting, and check=True aborts the
# cell if a download fails instead of silently reading a stale or missing file.
for remote_path in (
    "UKBRISK_Processed/Processed_final_25112024.tsv",
    "Risk score dataframes/NHSHC_exclusion.tsv",
):
    subprocess.run(["dx", "download", remote_path, "--overwrite"], check=True)

df = pd.read_csv("Processed_final_25112024.tsv", sep="\t")

# Harmonise the exclusion column names in a single, non-inplace rename.
exclusion = pd.read_csv("NHSHC_exclusion.tsv", sep="\t").rename(
    columns={"DM_at_base.x": "DM_at_base", "CKD_at_base": "RD_at_base"}
)

## 1.3 Saving & Uploading

In [None]:
def upload_model(model, endpoint, combo_name, cvresults, directory="UKBRISK_ENModels/NHC/Secondary_10y"):
    """
    Serialise a fitted model and its CV results, upload both with ``dx upload``
    and delete the local copies.

    Parameters
    ----------
    model : object
        Fitted estimator, written with ``joblib.dump`` to
        ``EN_{endpoint}_{combo_name}.pkl``.
    endpoint, combo_name : str
        Used to build the file names.
    cvresults : pandas.DataFrame
        Written tab-separated to ``EN_{endpoint}_{combo_name}_cvresults.tsv``.
    directory : str
        Remote destination folder.

    Raises
    ------
    subprocess.CalledProcessError
        If an upload fails. The local file of the failed upload (and any file
        not yet uploaded) is left on disk so that the artefact is not lost.
    """
    filename_model = f"EN_{endpoint}_{combo_name}.pkl"
    filename_cvresults = f"EN_{endpoint}_{combo_name}_cvresults.tsv"

    joblib.dump(model, filename_model)
    cvresults.to_csv(filename_cvresults, sep='\t', index=False)

    for filename in (filename_model, filename_cvresults):
        # Argument list instead of a shell string: no quoting issues, and the
        # function no longer depends on IPython's `!` magic.
        subprocess.run(
            ["dx", "upload", filename, "--path", f"{directory}/{filename}"],
            check=True,
        )
        # Only reached after a successful upload.
        os.remove(filename)

In [None]:
def save_and_upload_lps(model, train_data, test_data, train_labels, test_labels, endpoint, combo_name, directory="UKBRISK_ENModels/NHC/Secondary_10y"):
    """
    Write the model predictions for the train and test sets to TSV files and
    upload them with ``dx upload``.

    ``model.predict`` is called on ``train_data`` and ``test_data``; the
    results are stored in column ``LP`` next to an ``eid`` column taken from
    the index of the corresponding labels frame.

    NOTE: the labels must be row-aligned with the data (same order and
    length), because the ids come from the labels while the predictions come
    from the data.

    The local files ``{endpoint}_{combo_name}_train_LP.tsv`` and
    ``{endpoint}_{combo_name}_test_LP.tsv`` are left on disk after the upload.

    Raises
    ------
    subprocess.CalledProcessError
        If an upload fails.
    """
    train_lp = model.predict(train_data)
    test_lp = model.predict(test_data)

    lp_tables = {
        f"{endpoint}_{combo_name}_train_LP.tsv": pd.DataFrame({"eid": train_labels.index, "LP": train_lp}),
        f"{endpoint}_{combo_name}_test_LP.tsv": pd.DataFrame({"eid": test_labels.index, "LP": test_lp}),
    }

    # Write both files first, then upload, so a failed upload never leaves a
    # half-written set of local results.
    for lp_filename, lp_df in lp_tables.items():
        lp_df.to_csv(lp_filename, sep='\t', index=False)

    for lp_filename in lp_tables:
        # Argument list + check=True: no dependency on IPython's `!` magic and
        # upload failures are reported instead of being ignored.
        subprocess.run(
            ["dx", "upload", lp_filename, "--path", f"{directory}/{lp_filename}"],
            check=True,
        )


In [None]:
def save_and_upload_coefficients(model, train_data, endpoint, combo_name, directory="UKBRISK_ENModels/NHC/Secondary_10y"):
    """
    Write the model coefficients to ``{endpoint}_{combo_name}_coefficients.tsv``
    (one row per column of ``train_data``) and upload the file with ``dx upload``.

    The local file is left on disk after the upload.

    Raises
    ------
    subprocess.CalledProcessError
        If the upload fails.
    """
    coeff_filename = f"{endpoint}_{combo_name}_coefficients.tsv"

    # A single "Coefficient" column is declared, so model.coef_ must hold one
    # value per feature (presumably a model fitted with a single alpha --
    # verify against the caller).
    coef_df = pd.DataFrame(model.coef_, index=train_data.columns, columns=["Coefficient"])
    coef_df.to_csv(coeff_filename, sep='\t')

    # Argument list + check=True: no dependency on IPython's `!` magic and
    # upload failures are reported instead of being ignored.
    subprocess.run(
        ["dx", "upload", coeff_filename, "--path", f"{directory}/{coeff_filename}"],
        check=True,
    )

In [None]:
def calculate_and_upload_survival_probs(best_model, train_data, test_data, endpoint, combo_name, directory="UKBRISK_ENModels/NHC/Secondary_10y", time_horizon=10):
    """
    Predict survival probabilities at a fixed time horizon for the train and
    test sets, write them to one CSV file and upload it with ``dx upload``.

    The horizon is matched to the closest entry of ``best_model.unique_times_``;
    the selected index and time are printed. The output has the columns
    ``eid`` (index of the data frames), ``survival_probability`` and ``set``
    ('train' / 'test'). The local file is removed after a successful upload.

    Parameters
    ----------
    best_model : object
        Fitted model exposing ``unique_times_`` and
        ``predict_survival_function(X, return_array=True)``.
    train_data, test_data : pandas.DataFrame
        Predictor matrices; their index supplies the ``eid`` column.
    endpoint, combo_name : str
        Used to build the file name.
    directory : str
        Remote destination folder.
    time_horizon : int or float
        Time at which survival is evaluated, in the unit of the follow-up
        times (default 10). It also appears in the file name and the log
        message, e.g. ``..._combined_10y.csv``.

    Raises
    ------
    subprocess.CalledProcessError
        If the upload fails; the local file is then kept.
    """
    unique_times = best_model.unique_times_
    # Index of the observed time closest to the requested horizon.
    time_point_index = (np.abs(unique_times - time_horizon)).argmin()

    print(f"Selected time point index: {time_point_index}, Time: {unique_times[time_point_index]}")

    def _survival_at_horizon(data):
        # Assumes the columns of the returned array are ordered like
        # unique_times_ -- TODO confirm against the scikit-survival docs.
        return best_model.predict_survival_function(data, return_array=True)[:, time_point_index]

    surv_probs_train = _survival_at_horizon(train_data)
    surv_probs_test = _survival_at_horizon(test_data)

    surv_train_df = pd.DataFrame({
        'eid': train_data.index,
        'survival_probability': surv_probs_train,
        'set': 'train'
    })

    surv_test_df = pd.DataFrame({
        'eid': test_data.index,
        'survival_probability': surv_probs_test,
        'set': 'test'
    })

    combined_df = pd.concat([surv_train_df, surv_test_df], ignore_index=True)

    filename_combined = f"EN_{endpoint}_{combo_name}_survival_probs_combined_{time_horizon}y.csv"
    combined_df.to_csv(filename_combined, index=False)

    # Argument list (no shell) and check=True: the local file is only removed
    # once the upload has actually succeeded.
    subprocess.run(
        ["dx", "upload", filename_combined, "--path", f"{directory}/{filename_combined}"],
        check=True,
    )
    os.remove(filename_combined)

    print(f"Combined survival probabilities at {time_horizon} years uploaded for {endpoint} - {combo_name}")



## 1.4 Define Predictors for Initial and Secondary models

In [None]:
# Secondary models: columns that are part of every predictor set. "eid" and
# "testtrain" are not predictors themselves; they are carried along so the
# table can be indexed by eid and split into train/test.
always_include = [
    "clinicalrisk_Age.at.recruitment",
    "clinicalrisk_Sex_0",
    "clinicalrisk_Sex_1",
    "eid",
    "testtrain",
]

# Secondary models: combination name -> column-name prefixes whose columns
# are added to `always_include`.
predictor_combinations = dict(
    nhc_pmh_ts=["nhc", "pmh_", "ts_"],
    nhc_prs_metabolomics_pmh_ts=["nhc", "prs_", "metabolomics_", "pmh_", "ts_"],
)

In [None]:
# Initial models: names of the survival-probability columns in the
# absolute-risk tables that are used to define the high-risk populations.
risk_columns = ["pmh_ts", "prs_metabolomics_pmh_ts"]

## 1.5 Subset endpoints for NHS HC predictions

In [None]:
# Endpoints for which NHS HC prediction models are fitted; each name is the
# prefix of the corresponding "<endpoint>_status" / "<endpoint>_followup" /
# "<endpoint>_at_base" columns.
endpoint_names_nhc = [
    "DM",
    "CVD",
    "RD",
]

## 1.7 Exclude participants based on NHC inclusion criteria

In [None]:
# An individual is excluded as soon as any column other than "eid" is truthy;
# `inclusion` holds the eids of everyone without a single exclusion flag.
exclusion_flags = exclusion.drop(columns=["eid"])
is_excluded = exclusion_flags.any(axis=1)
inclusion = exclusion[~is_excluded]['eid']

# Table size before, and number of individuals retained after, the filter.
print(exclusion.shape)
print(inclusion.shape)

In [None]:
print(df.shape)
# Keep only individuals that pass the inclusion criteria. The explicit .copy()
# makes the result an independent frame, so the in-place status / follow-up
# edits performed on `df` further down neither raise pandas'
# SettingWithCopyWarning nor risk acting on a temporary slice.
df = df[df['eid'].isin(inclusion)].copy()
print(df.shape)

In [None]:
# Names of all columns that contain at least one missing value.
has_missing = df.isna().any()
cols_with_nans = has_missing[has_missing].index.tolist()
cols_with_nans

## 1.8 Censor follow-up at 10 years

In [None]:
# Events with a follow-up time beyond 10 are reset to "no event".
# This has to run before the follow-up times are clipped to 10: afterwards no
# value exceeds 10 any more and nothing would be reset.
for outcome in ("CVD", "RD", "DM"):
    beyond_horizon = df[f"{outcome}_followup"] > 10
    df.loc[beyond_horizon, f"{outcome}_status"] = False

In [None]:
# Cap every follow-up time at 10 (values above 10 become exactly 10).
for followup_col in ("CVD_followup", "RD_followup", "DM_followup"):
    df[followup_col] = df[followup_col].clip(upper=10)

In [None]:
# True for rows where the CVD, RD and DM status are all equal to False.
event_free = [df[f"{outcome}_status"].eq(False) for outcome in ("CVD", "RD", "DM")]
status_mask = event_free[0] & event_free[1] & event_free[2]

# 2. Final Loop

In [None]:
# Threshold label -> survival-probability cut-off. A predicted survival
# probability below 0.95 / 0.90 corresponds to an absolute risk above 5% / 10%.
HIGH_RISK_SURVIVAL_CUTOFFS = {"5": 0.95, "10": 0.90}


def _high_risk_eids(absrisk_frames, risk_col, survival_cutoff):
    """Return the eids whose `risk_col` value is below `survival_cutoff` for
    at least one of the CVD, RD or DM tables (i.e. the UNION over endpoints)."""
    eids = set()
    for risk_endpoint in ("CVD", "RD", "DM"):
        frame = absrisk_frames[risk_endpoint]
        eids.update(frame.loc[frame[risk_col] < survival_cutoff, "eid"])
    return eids


for risk_col in risk_columns:
    print(f"Processing population based on risk column: {risk_col}")

    # High-risk populations per split and threshold. Individuals qualify if
    # they are high-risk for ANY endpoint (union, not intersection).
    high_risk_eids = {"train": {}, "test": {}}
    for split_name, absrisk_frames in (
        ("train", train_dataframes_absrisk_nhc_10y),
        ("test", test_dataframes_absrisk_nhc_10y),
    ):
        for pct_label, survival_cutoff in HIGH_RISK_SURVIVAL_CUTOFFS.items():
            selected_eids = _high_risk_eids(absrisk_frames, risk_col, survival_cutoff)
            high_risk_eids[split_name][pct_label] = selected_eids
            print(f"High-risk eids for {pct_label}% threshold ({split_name}): {len(selected_eids)}")

    # Proceed with model fitting for each endpoint...
    for endpoint in endpoint_names_nhc:
        print(f"Started with endpoint: {endpoint}")

        # Endpoint-specific exclusion: drop individuals who already had the
        # endpoint at baseline.
        eids_to_include = df[df[f"{endpoint}_at_base"] == False]["eid"]
        df_filtered = df[df["eid"].isin(eids_to_include)]
        print(f"Retained n = {len(eids_to_include)} individuals due to criteria: past occurrence of endpoint")

        # Remove near-constant boolean pmh_/ts_ columns (true in fewer than
        # 0.1% or more than 99.9% of the rows) to improve robustness.
        logical_cols = df_filtered[[col for col in df_filtered.columns if (col.startswith('pmh_') or col.startswith('ts_')) and df_filtered[col].dtype == 'bool']]
        cols_to_remove = [col for col in logical_cols.columns if logical_cols[col].mean() < 0.001 or logical_cols[col].mean() > 0.999]
        df_filtered = df_filtered.drop(columns=cols_to_remove)

        # Make labels: column 0 = status, column 1 = follow-up (the order
        # train_opt_EN relies on), indexed by eid.
        labels = df_filtered[[f"{endpoint}_status", f"{endpoint}_followup", "eid", "testtrain"]].copy()
        labels = labels.set_index("eid")

        for combo_name, prefixes in predictor_combinations.items():
            print(f"Analyzing combination: {combo_name} for {risk_col} and endpoint {endpoint}")

            selected_cols = always_include + [col for col in df_filtered.columns if any(col.startswith(prefix) for prefix in prefixes) and col not in always_include]
            df_filtered2 = df_filtered[selected_cols]
            # String-encoded booleans are mapped to 1/0.
            df_filtered2 = df_filtered2.set_index("eid").replace({'TRUE': 1, 'FALSE': 0})

            # Split the data into training and testing sets
            train_data, test_data, train_labels, test_labels = split_train_test(df_filtered2, labels)

            # One model per risk threshold, trained and evaluated on the
            # high-risk individuals only.
            for pct_label in HIGH_RISK_SURVIVAL_CUTOFFS:
                train_subset = train_data[train_data.index.isin(high_risk_eids["train"][pct_label])]
                test_subset = test_data[test_data.index.isin(high_risk_eids["test"][pct_label])]

                # Nothing to fit or evaluate for this threshold.
                if train_subset.empty or test_subset.empty:
                    continue

                model_suffix = f"risk_{risk_col}_model_{combo_name}_{pct_label}pct"
                # Labels are re-aligned to the subset rows via the eid index.
                train_subset_labels = train_labels.loc[train_subset.index]
                test_subset_labels = test_labels.loc[test_subset.index]

                best_model, results_df = train_opt_EN(train_subset, train_subset_labels)
                upload_model(best_model, endpoint, model_suffix, results_df)
                save_and_upload_lps(best_model, train_subset, test_subset, train_subset_labels, test_subset_labels, endpoint, model_suffix)
                save_and_upload_coefficients(best_model, train_subset, endpoint, model_suffix)
                calculate_and_upload_survival_probs(best_model, train_subset, test_subset, endpoint, model_suffix)
