# 0. Installing packages

In [None]:
!pip uninstall scikit-learn scikit-survival -y

!pip install scikit-learn
!pip install scikit-survival

!pip install lifelines

!pip install joblib

!pip install openpyxl

In [None]:
import itertools
import os
import subprocess
import sys
import warnings

import joblib
import lifelines
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sksurv

from lifelines import CoxPHFitter
from lifelines.utils import concordance_index

from sklearn.exceptions import FitFailedWarning
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline

from sksurv.ensemble import GradientBoostingSurvivalAnalysis
from sksurv.linear_model import CoxnetSurvivalAnalysis
from sksurv.metrics import concordance_index_censored
from sksurv.util import Surv

# 1. Utils

## 1.1 Elastic net (EN)

In [None]:
def train_opt_EN(train_data, train_labels, 
                      l1_ratios=np.linspace(0.1, 1.0, 10), max_iter=100, alpha_min_ratio=0.01, cv_folds=5, verbose = True,
                      n_alphas=5):
    """
    Tune and fit an elastic-net penalised Cox model (CoxnetSurvivalAnalysis).

    Steps:
        - fit an initial model with l1_ratio = 0.5 to obtain a path of
          ``n_alphas`` penalty strengths (``alphas_``)
        - run ``cv_folds``-fold cross-validation over the grid
          ``l1_ratios`` x estimated alphas (10 x 5 with the defaults)
        - GridSearchCV refits the best setting on the whole training split
        - return that refitted model and the CV results

    Parameters
    ----------
    train_data : pandas.DataFrame
        Predictor matrix of the training split.
    train_labels : pandas.DataFrame
        First column is the event indicator, second column the follow-up time.
    l1_ratios : array-like
        Candidate elastic-net mixing values.
    max_iter : int
        Passed to CoxnetSurvivalAnalysis as ``max_iter``.
    alpha_min_ratio : float
        Passed to the initial model that estimates the alpha path.
    cv_folds : int
        Number of shuffled KFold splits (random_state fixed to 42).
    verbose : bool
        When False, neither the progress messages nor the GridSearchCV log are printed.
    n_alphas : int
        Number of alphas estimated by the initial model.

    Returns
    -------
    best_model : CoxnetSurvivalAnalysis
        ``grid_search.best_estimator_``.
    cv_results : pandas.DataFrame
        ``grid_search.cv_results_`` as a table.
    """

    # Structured (event, time) array as expected by scikit-survival, filled column-wise
    # instead of building one Python tuple per row.
    labels_array = np.empty(len(train_labels), dtype=[('event', '?'), ('time', '<f8')])
    labels_array['event'] = train_labels.iloc[:, 0].to_numpy()
    labels_array['time'] = train_labels.iloc[:, 1].to_numpy()

    # Silence UserWarning / FitFailedWarning only while fitting; the filters that were
    # active before the call are restored on exit instead of being changed kernel-wide.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", UserWarning)
        warnings.simplefilter("ignore", FitFailedWarning)

        if verbose:
            print("estimating alphas with lambda=0.5...")

        initial_model = CoxnetSurvivalAnalysis(l1_ratio=0.5, alpha_min_ratio=alpha_min_ratio, max_iter=max_iter, n_alphas=n_alphas)
        initial_model.fit(train_data, labels_array)
        estimated_alphas = initial_model.alphas_

        if verbose:
            print(f"estimated {len(estimated_alphas)} alphas ranging from {estimated_alphas.min():.5f} to {estimated_alphas.max():.5f}.")

        # CV grid: each candidate model is fitted with exactly one alpha
        param_grid = {
            'l1_ratio': l1_ratios,
            'alphas': [[alpha] for alpha in estimated_alphas]
        }

        cv = KFold(n_splits=cv_folds, shuffle=True, random_state=42)

        # No `scoring` is passed, so candidates are ranked by the estimator's own score method.
        grid_search = GridSearchCV(
            CoxnetSurvivalAnalysis(max_iter=max_iter),
            param_grid=param_grid,
            cv=cv,
            n_jobs=-1,
            verbose=1 if verbose else 0
        )

        grid_search.fit(train_data, labels_array)

    # Best model (already refitted on the full training split by GridSearchCV)
    best_model = grid_search.best_estimator_
    best_l1_ratio = grid_search.best_params_['l1_ratio']
    best_alpha = grid_search.best_params_['alphas'][0]

    if verbose:
        print(f"\nBest l1_ratio: {best_l1_ratio:.2f}, Best alpha: {best_alpha:.5f}")

    cv_results = pd.DataFrame(grid_search.cv_results_)

    return best_model, cv_results


## 1.2 Data Prep

In [None]:
def split_train_test(df_filtered, labels, testtrain_column='testtrain'):
    """
    Split predictors and labels into their train and test parts.

    Rows are assigned by the value of ``testtrain_column`` ('train' or 'test');
    rows with any other value end up in neither part. The split column itself
    is dropped from all four returned frames.

    Returns
    -------
    tuple
        (train_data, test_data, train_labels, test_labels)
    """

    def _part(frame, split_name):
        # rows of one split, without the split-indicator column
        in_split = frame[testtrain_column] == split_name
        return frame[in_split].drop(columns=[testtrain_column])

    train_data = _part(df_filtered, 'train')
    test_data = _part(df_filtered, 'test')

    train_labels = _part(labels, 'train')
    test_labels = _part(labels, 'test')

    return train_data, test_data, train_labels, test_labels

In [None]:
# Download the processed table with the `dx` command-line tool (overwriting any
# local copy) and load the tab-separated file into `df`.
dl_cmd = f"dx download 'UKBRISK_Processed/Processed_final_25112024.tsv' --overwrite"
!{dl_cmd}
df = pd.read_csv("Processed_final_25112024.tsv", sep="\t")

In [None]:
# Endpoints modelled in this run; the final loop trains one set of models per entry.
endpoint_names = ["CVD", "HF", "CAD", "ISS", "PAD"]

In [None]:
# Longer endpoint list, disabled by wrapping the assignment in a string literal.
# The cell therefore assigns nothing; `endpoint_names` keeps the value set in the
# previous cell.
'''
endpoint_names = [
    "CVD", "HF", "BC", "DM", "LD", "RD", "AF",  "CAD", "VT", "ISS", 
    "AAA", "PAD", "AS", "COPD", "LC", "MEL", "CRC", "PC",  
    "PD", "OP", "CAT", "POAG", "HT", "AD" 
]
'''


In [None]:
# Show which endpoints are active for this run.
print(endpoint_names)


## Subset ts_ predictors to the EstBB questions

In [None]:
# Download the variable-mapping workbook with the `dx` command-line tool
# (overwriting any local copy) and read it with pandas' default sheet selection.
# NOTE(review): `mapping_EGCUT` is not referenced again in the visible part of the
# notebook; the next cell re-reads the same workbook -- confirm whether this read is needed.
dl_cmd = f"dx download 'UKBRISK/Variables_to_calculate_risk_scores_v2_250621_includedcol.xlsx' --overwrite"
!{dl_cmd}
mapping_EGCUT = pd.read_excel('Variables_to_calculate_risk_scores_v2_250621_includedcol.xlsx')

In [None]:
# Build the keep-list of ts_ predictors from the mapping workbook.
filemapping = 'Variables_to_calculate_risk_scores_v2_250621_includedcol.xlsx'

# Sheet index 3, first two columns only: column 0 is compared against 'x',
# column 1 supplies the variable names.
df_mapping = pd.read_excel(filemapping, sheet_name=3, usecols=[0, 1])

is_flagged = df_mapping.iloc[:, 0] == 'x'
name_column = df_mapping.columns[1]
flagged_names = df_mapping.loc[is_flagged, name_column].tolist()

# dict.fromkeys removes repeated names while keeping first-seen order;
# every name gets the 'ts_' prefix used by the columns of df.
included = ['ts_' + name for name in dict.fromkeys(flagged_names)]

In [None]:
# Input columns of the three risk scores, selected by column-name prefix.
# A few qrisk_ / prevent_ columns are left out explicitly by name.
_qrisk_left_out = [
    'qrisk_Townsend.deprivation.index.at.recruitment',
    'qrisk_Illnesses.of.relatives.0_1',
    'qrisk_SBP_sd',
]
_prevent_left_out = [
    'prevent_Townsend.deprivation.index.at.recruitment',
    'prevent_UACR',
    'prevent_Glycated.haemoglobin..HbA1c....Instance.0',
]

qrisk_cols = [
    col for col in df.columns
    if col.startswith('qrisk_') and col not in _qrisk_left_out
]
score2_cols = [
    col for col in df.columns
    if col.startswith('score_')
]
prevent_cols = [
    col for col in df.columns
    if col.startswith('prevent_') and col not in _prevent_left_out
]

In [None]:
# Add the risk-score input columns to the keep-list.
# The list is rebuilt through dict.fromkeys instead of being extended with `+=`,
# so re-running this cell does not append the same names a second time
# (order of first appearance is preserved).
included = list(dict.fromkeys([*included, *qrisk_cols, *score2_cols, *prevent_cols]))

In [None]:
# Reduce df to the columns used downstream: pmh_ predictors, everything on the
# keep-list, the id / split columns, and all columns whose name starts with one
# of the endpoint names.
included_lookup = set(included)
endpoint_prefixes = tuple(endpoint_names)

final_columns = []
for col in df.columns:
    keep = (
        col.startswith('pmh_')
        or col in included_lookup
        or col in ("eid", "testtrain")
        or col.startswith(endpoint_prefixes)
    )
    if keep:
        final_columns.append(col)

df = df[final_columns]

## 1.3 Saving & Uploading

In [None]:
def _dx_upload(local_path, remote_path):
    """Run `dx upload` for one file; return True only if the command exits with status 0."""
    try:
        result = subprocess.run(
            ["dx", "upload", local_path, "--path", remote_path],
            capture_output=True,
            text=True,
        )
    except OSError as exc:
        # e.g. the dx executable is not on PATH
        print(f"WARNING: could not run dx upload for {local_path}: {exc}")
        return False

    if result.stdout:
        print(result.stdout, end="")
    if result.returncode != 0:
        print(f"WARNING: dx upload of {local_path} failed with exit code {result.returncode}\n{result.stderr}")
        return False
    return True


def upload_model(model, endpoint, combo_name, cvresults, directory="UKBRISK_ENModels/ExtValEstBB"):
    """
    Serialise a fitted model and its CV results, upload both, then clean up.

    The model is written with joblib to ``EN_{endpoint}_{combo_name}.pkl`` and the
    CV results to ``EN_{endpoint}_{combo_name}_cvresults.tsv`` (tab-separated, no
    index). Each file is uploaded to ``directory`` with `dx upload`.

    A local file is deleted only after its upload succeeded. If an upload fails,
    a warning is printed and the local copy is kept, so a fitted model is never
    lost because of a failed transfer.

    Parameters
    ----------
    model : fitted estimator to serialise.
    endpoint, combo_name : str
        Used to build the file names.
    cvresults : pandas.DataFrame
        Cross-validation results table.
    directory : str
        Remote target folder.
    """

    filename_model = f"EN_{endpoint}_{combo_name}.pkl"
    filename_cvresults = f"EN_{endpoint}_{combo_name}_cvresults.tsv"

    joblib.dump(model, filename_model)
    model_uploaded = _dx_upload(filename_model, f"{directory}/{filename_model}")

    cvresults.to_csv(filename_cvresults, sep='\t', index=False)
    cvresults_uploaded = _dx_upload(filename_cvresults, f"{directory}/{filename_cvresults}")

    # Remove local copies only for files that are confirmed to be uploaded
    if model_uploaded:
        os.remove(filename_model)
    if cvresults_uploaded:
        os.remove(filename_cvresults)

In [None]:
def save_and_upload_lps(model, train_data, test_data, train_labels, test_labels, endpoint, combo_name, directory="UKBRISK_ENModels/ExtValEstBB"):

    train_lp = model.predict(train_data)
    test_lp = model.predict(test_data)
    
    train_lp_df = pd.DataFrame({"eid": train_labels.index, "LP": train_lp})
    test_lp_df = pd.DataFrame({"eid": test_labels.index, "LP": test_lp})

    train_lp_filename = f"{endpoint}_{combo_name}_train_LP.tsv"
    test_lp_filename = f"{endpoint}_{combo_name}_test_LP.tsv"
    train_lp_df.to_csv(train_lp_filename, sep='\t', index=False)
    test_lp_df.to_csv(test_lp_filename, sep='\t', index=False)
    
    upload_cmd_trainlp = f"dx upload {train_lp_filename} --path {directory}/{train_lp_filename}"
    upload_cmd_testlp = f"dx upload {test_lp_filename} --path {directory}/{test_lp_filename}"
    !{upload_cmd_trainlp}
    !{upload_cmd_testlp}


In [None]:
def save_and_upload_coefficients(model, train_data, endpoint, combo_name, directory="UKBRISK_ENModels/ExtValEstBB"):

    coeff_filename = f"{endpoint}_{combo_name}_coefficients.tsv"
    coef_df = pd.DataFrame(model.coef_, index=train_data.columns, columns=["Coefficient"])
    coef_df.to_csv(coeff_filename, sep='\t')
    
    upload_cmd_coef = f"dx upload {coeff_filename} --path {directory}/{coeff_filename}"
    !{upload_cmd_coef}

## 1.4 Predictor combos

In [None]:
# Columns that are part of every model input: age, the two sex indicator columns,
# plus the id and the train/test marker needed for indexing and splitting.
always_include = [
    "qrisk_Age.at.recruitment",
    "qrisk_Sex_0",
    "qrisk_Sex_1",
    "eid",
    "testtrain",
]

# Name of each predictor set -> column-name prefixes that are added on top of
# `always_include` (an empty list means only the always-included columns).
predictor_combinations = dict(
    agesex=[],
    pmh=["pmh_"],
    ts=["ts_"],
    pmh_ts=["pmh_", "ts_"],
    score=["score_"],
    qrisk=["qrisk_"],
    prevent=["prevent_"],
)

# Combinations that are currently disabled:
#   "metabolomics": ["metabolomics_"]
#   "prs": ["prs_"]
#   "clinicalrisk": ["clinicalrisk_"]
#   "prs_metabolomics": ["prs_", "metabolomics_"]
#   "prs_metabolomics_pmh_ts": ["prs_", "metabolomics_", "pmh_", "ts_"]
#   "clinicalrisk_pmh_ts": ["clinicalrisk_", "pmh_", "ts_"]
#   "clinicalrisk_prs_metabolomics": ["clinicalrisk_", "prs_", "metabolomics_"]
#   "everything": ["clinicalrisk_", "pmh_", "ts_", "prs_", "metabolomics_"]


# 2. Final Loop

## 2.1 Train and upload models for every endpoint and predictor combination

In [None]:
# Main training loop: for every endpoint, filter the cohort, build the labels, and
# then fit, upload and export one elastic-net model per predictor combination.
for endpoint in endpoint_names:
    print(f"started with: {endpoint}")
    
    # Endpoint-specific exclusions
    # Keep only rows whose "<endpoint>_at_base" value equals False
    eids_to_include = df[df[f"{endpoint}_at_base"] == False]["eid"]
    df_filtered = df[df["eid"].isin(eids_to_include)]
    print(f"retained n = {len(eids_to_include)} individuals due to criteria: past occurrence of endpoint")
    
    # Sex restriction for the "PC" and "BC" endpoints
    # NOTE(review): the column selection in section 1.2 keeps only pmh_ columns, the
    # `included` list, eid/testtrain and endpoint-prefixed columns, so the
    # clinicalrisk_Sex_* columns read below appear to be missing from df and these
    # branches would raise KeyError if "PC" or "BC" were added to endpoint_names -- confirm.
    if endpoint == "PC":
        eids_to_exclude = df[df["clinicalrisk_Sex_0"] == True]["eid"]
        df_filtered = df_filtered[~df_filtered["eid"].isin(eids_to_exclude)]
    elif endpoint == "BC":
        eids_to_exclude = df[df["clinicalrisk_Sex_1"] == True]["eid"]
        df_filtered = df_filtered[~df_filtered["eid"].isin(eids_to_exclude)]
        
    # Drop boolean pmh_/ts_ columns that are almost constant in the filtered cohort:
    # share of True below 0.1% or above 99.9%
    logical_cols = df_filtered[[col for col in df_filtered.columns if (col.startswith('pmh_') or col.startswith('ts_')) and df_filtered[col].dtype == 'bool']]
    cols_to_remove = [col for col in logical_cols.columns if logical_cols[col].mean() < 0.001 or logical_cols[col].mean() > 0.999]
    df_filtered = df_filtered.drop(columns=cols_to_remove)
    
    # Labels: status and follow-up columns of this endpoint, indexed by eid; the
    # testtrain column is kept so the labels can be split like the predictors
    labels = df_filtered[[f"{endpoint}_status",f"{endpoint}_followup","eid","testtrain"]].copy()
    labels = labels.set_index("eid")

    for combo_name, prefixes in predictor_combinations.items():
        
        print(f"Analyzing combination: {combo_name}")
        
        # Always-included columns first, then every column matching one of the
        # combination's prefixes (skipping names already in always_include)
        selected_cols = always_include + [col for col in df_filtered.columns if any(col.startswith(prefix) for prefix in prefixes) and col not in always_include]
        df_filtered2 = df_filtered[selected_cols]
        # Index by eid and map the strings 'TRUE'/'FALSE' to 1/0
        df_filtered2 = df_filtered2.set_index("eid").replace({'TRUE': 1, 'FALSE': 0})

        train_data, test_data, train_labels, test_labels = split_train_test(df_filtered2, labels)
        
        # Hyperparameter search and final fit on the training split (default settings)
        best_model, results_df = train_opt_EN(train_data, train_labels)
        
        # Persist the model with its CV results, the predictions and the coefficients
        upload_model(best_model, endpoint, combo_name, results_df)
        
        save_and_upload_lps(best_model, train_data, test_data, train_labels, test_labels, endpoint, combo_name)
        
        save_and_upload_coefficients(best_model, train_data, endpoint, combo_name)



# 3. Coef table

In [None]:
# Collect every uploaded coefficient file into one table:
# one column per "<endpoint>_<combo>" model, one row per predictor.
remote_dir = "UKBRISK_ENModels/ExtValEstBB"
coef_suffix = "_coefficients.tsv"
coef_dict = {}

# List the remote folder; a non-zero exit status raises CalledProcessError.
remote_listing = subprocess.check_output(["dx", "ls", remote_dir], text=True)

for fn in remote_listing.split():
    if not fn.endswith(coef_suffix):
        continue
    # check=True stops here if the download fails, instead of silently reading a
    # stale local file of the same name (or failing later inside read_csv).
    subprocess.run(["dx", "download", f"{remote_dir}/{fn}", "--overwrite"], check=True)
    # read the single Coefficient column as a Series indexed by predictor name
    s = pd.read_csv(fn, sep="\t", index_col=0)["Coefficient"]
    # key is the file name without the suffix, i.e. "<endpoint>_<combo>"
    coef_dict[fn[:-len(coef_suffix)]] = s

# Series are aligned on predictor name; predictors absent from a model become NaN.
coef_table = pd.DataFrame(coef_dict)

In [None]:
# Write the combined coefficient table (index included) to a tab-separated file
# and upload it with the `dx` command-line tool.
coef_table.to_csv("all_endpoint_combo_coefficients.tsv", sep="\t")
upload_cmd = f"dx upload 'all_endpoint_combo_coefficients.tsv' --path 'UKBRISK/all_endpoint_combo_coefficients.tsv'"
!{upload_cmd}