# Imports and configs

In [None]:
"""
To evaluate the equitable prediction of transplant survival outcomes,
we compute the concordance index (C-index) between the event times and
the predicted risk scores separately within each race group, and combine
the per-group values as their mean minus their standard deviation.

The C-index is a global assessment of the model's discrimination power:
the model's ability to provide a reliable ranking of the survival times
based on the individual risk scores.
 
The concordance index is a value between 0 and 1 where:
 
0.5 is the expected result from random predictions,
1.0 is perfect concordance (with no censoring, otherwise <1.0),
0.0 is perfect anti-concordance (with no censoring, otherwise >0.0)

"""

import pandas as pd
import pandas.api.types
import numpy as np
from lifelines.utils import concordance_index

class ParticipantVisibleError(Exception):
    """Raised when a submission fails validation (see `score`)."""


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    Return the mean minus the standard deviation of the per-race-group C-index.

    Args:
        solution: Ground truth holding `row_id_column_name`, 'efs' (event
            indicator), 'efs_time' (event/censoring time) and 'race_group'.
        submission: Predictions holding `row_id_column_name` and a numeric
            'prediction' column; a higher prediction means higher risk
            (the prediction is negated before being compared with the times).
        row_id_column_name: Name of the id column present in both frames.

    Returns:
        The population standard deviation of the per-group concordance
        indices subtracted from their mean.

    Raises:
        ParticipantVisibleError: If any submission column is not numeric.

    Neither input frame is modified. Rows of the two frames are paired by
    index alignment, so both are expected to share the same index.

    >>> import pandas as pd
    >>> row_id_column_name = "id"
    >>> y_pred = {'prediction': {0: 1.0, 1: 0.0, 2: 1.0}}
    >>> y_pred = pd.DataFrame(y_pred)
    >>> y_pred.insert(0, row_id_column_name, range(len(y_pred)))
    >>> y_true = { 'efs': {0: 1.0, 1: 0.0, 2: 0.0}, 'efs_time': {0: 25.1234,1: 250.1234,2: 2500.1234}, 'race_group': {0: 'race_group_1', 1: 'race_group_1', 2: 'race_group_1'}}
    >>> y_true = pd.DataFrame(y_true)
    >>> y_true.insert(0, row_id_column_name, range(len(y_true)))
    >>> score(y_true.copy(), y_pred.copy(), row_id_column_name)
    0.75
    """
    # drop() returns new frames, so the caller's DataFrames keep their id column.
    solution = solution.drop(columns=[row_id_column_name])
    submission = submission.drop(columns=[row_id_column_name])

    event_label = 'efs'
    interval_label = 'efs_time'
    prediction_label = 'prediction'
    for col in submission.columns:
        if not pandas.api.types.is_numeric_dtype(submission[col]):
            raise ParticipantVisibleError(f'Submission column {col} must be a number')

    # Side-by-side concatenation pairs rows by index label (the id column is
    # already gone at this point, so it plays no part in the alignment).
    merged_df = pd.concat([solution, submission], axis=1)

    metric_list = []
    # One concordance index per race group; rows keep their original order
    # inside each group.
    for _, merged_df_race in merged_df.groupby('race_group'):
        c_index_race = concordance_index(
            merged_df_race[interval_label],
            # Negated so that a higher predicted risk ranks as a shorter time.
            -merged_df_race[prediction_label],
            merged_df_race[event_label],
        )
        metric_list.append(c_index_race)

    # Penalise uneven performance across groups: mean minus population std.
    return float(np.mean(metric_list) - np.std(metric_list))

In [None]:
from sklearn.preprocessing import quantile_transform
from autogluon.tabular import TabularPredictor
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

In [None]:
class CFG:
    """Notebook-wide settings: input file locations, CV setup and training budget."""

    # Competition input files.
    train_path = (
        "/kaggle/input/equity-post-HCT-survival-predictions"
        "/train.csv"
    )
    test_path = (
        "/kaggle/input/equity-post-HCT-survival-predictions"
        "/test.csv"
    )
    sample_sub_path = (
        "/kaggle/input/equity-post-HCT-survival-predictions"
        "/sample_submission.csv"
    )

    # Cross-validation: number of folds and the shuffling seed.
    n_folds = 10
    seed = 42

    # Passed to `predictor.fit` as `time_limit` (8 * 3600).
    time_limit = 8 * 60 * 60

    # Name of the regression label column added to the training frame.
    target = "quantile"

# Loading and preprocessing data

In [None]:
# Read the competition train and test tables into DataFrames.
train, test = (
    pd.read_csv(csv_path) for csv_path in (CFG.train_path, CFG.test_path)
)

In [None]:
def create_target(time, event):
    """Build a regression target from survival times and event flags.

    Rows with `event == 1` get the quantile-transformed value of their
    negated time, so earlier events receive larger targets. Rows with
    `event == 0` all get one shared value placed 0.3 below the smallest
    event target. Rows whose flag is neither 0 nor 1 stay NaN.

    Approach taken from
    https://www.kaggle.com/code/ambrosm/esp-eda-which-makes-sense

    Args:
        time: pandas Series of times (`.values` is used on it).
        event: Series of event flags, same length as `time`.

    Returns:
        A float numpy array with one target value per row.
    """
    had_event = event == 1

    # quantile_transform expects a 2-D column, hence the reshape.
    event_times = time[had_event].values.reshape(-1, 1)
    event_targets = quantile_transform(-event_times).ravel()

    target = np.full(len(time), np.nan)
    target[had_event] = event_targets
    # Rows without an event sit strictly below every row with an event.
    target[event == 0] = event_targets.min() - 0.3
    return target

In [None]:
# Add the regression label column derived from the survival time and event flag.
train[CFG.target] = create_target(time=train["efs_time"], event=train["efs"])

In [None]:
# Tag every training row with the id of the fold in which it is held out.
kf = KFold(
    n_splits=CFG.n_folds,
    shuffle=True,
    random_state=CFG.seed,
)
split = kf.split(train, train[["efs", "efs_time"]])
for fold_id, (fit_rows, holdout_rows) in enumerate(split):
    # Positions double as labels here because `train` still has its default index.
    train.loc[holdout_rows, "fold"] = fold_id

In [None]:
# Remove the row id and the raw survival columns; the label column replaces them.
train = train.drop(columns=["ID", "efs", "efs_time"])

# Training the predictor

In [None]:
# Regression predictor on the transformed target, using the precomputed
# 'fold' column as its grouping column.
predictor = TabularPredictor(
    label=CFG.target,
    problem_type="regression",
    eval_metric="rmse",
    groups="fold",
    path="/ag_logs",
    verbosity=2,
)

In [None]:
# Train within the configured time budget, skipping KNN models and keeping
# only the best model afterwards.
predictor.fit(
    train_data=train,
    presets="best_quality",
    time_limit=CFG.time_limit,
    excluded_model_types=["KNN"],
    keep_only_best=True,
)

In [None]:
# Show the leaderboard with the validation score column colour-graded.
(
    predictor
    .leaderboard(silent=True)
    .style
    .background_gradient(cmap="RdYlGn", subset=["score_val"])
)

# Creating a submission file

In [None]:
# Fill the sample-submission template with the test-set predictions.
sub = pd.read_csv(CFG.sample_sub_path)
# Item assignment instead of `sub.prediction = ...`: attribute assignment only
# sets a plain Python attribute when the column does not exist, which would
# write a CSV without predictions (and the pandas warning is suppressed here).
sub["prediction"] = predictor.predict(test).values
sub.to_csv("submission.csv", index=False)
sub.head()

# Results

In [None]:
# Reload the raw training table: `train` no longer carries ID / efs / efs_time.
_train = pd.read_csv(CFG.train_path)

best_model = predictor.model_best

# Out-of-fold predictions of every model, keyed by model name.
oof_preds = {
    model: predictor.predict_oof(model).values
    for model in predictor.model_names()
}

# Competition metric of each model over the full set of out-of-fold predictions.
overall_scores = {}
for model, model_oof_preds in oof_preds.items():
    # Build fresh truth / prediction frames for every model.
    y_true = _train[['ID', 'efs', 'efs_time', 'race_group']].copy()
    y_pred = _train[['ID']].copy()
    y_pred['prediction'] = model_oof_preds

    overall_scores[model] = [score(y_true, y_pred, 'ID')]

In [None]:
# Competition metric of every model on each validation fold.
# Same KFold settings as the earlier fold assignment, so the validation rows
# are expected to coincide with the folds used for training.
scores = {}
split = KFold(
    n_splits=CFG.n_folds,
    shuffle=True,
    random_state=CFG.seed,
).split(train, train[CFG.target])
for fold_idx, (train_index, val_index) in enumerate(split):
    fold_rows = _train.iloc[val_index]
    for model in predictor.model_names():
        # Fresh frames for every (fold, model) pair.
        y_true_fold = fold_rows[['ID', 'efs', 'efs_time', 'race_group']].copy()
        y_pred_fold = fold_rows[['ID']].copy()
        y_pred_fold['prediction'] = oof_preds[model][val_index]

        fold_score = score(y_true_fold, y_pred_fold, 'ID')
        scores.setdefault(model, []).append(fold_score)

In [None]:
# Per-fold scores: one column per model, one row per fold.
scores_df = pd.DataFrame(scores)
# Overall scores are stored as one-element lists; unwrap them.
overall_scores_series = pd.Series({k: v[0] for k, v in overall_scores.items()})
# Models ordered from best to worst overall score.
order = overall_scores_series.sort_values(ascending=False).index.tolist()

# x-axis limits for the bar chart: the observed score range padded by half
# of its width on each side.
min_score = min(scores_df.min().min(), overall_scores_series.min())
max_score = max(scores_df.max().max(), overall_scores_series.max())
padding = (max_score - min_score) * 0.5
lower_limit = min_score - padding
upper_limit = max_score + padding

# Figure height grows with the number of models.
fig, axs = plt.subplots(1, 2, figsize=(15, len(scores) * 0.4))

# Left panel: distribution of the fold scores per model.
sns.boxplot(data=scores_df, order=order, ax=axs[0], orient='h', palette='RdYlGn_r')
axs[0].set_title('Fold CI')
axs[0].set_xlabel('')
axs[0].set_ylabel('')

# Right panel: overall out-of-fold score per model.
barplot = sns.barplot(x=overall_scores_series, y=overall_scores_series.index, ax=axs[1], palette='RdYlGn_r', order=order)
axs[1].set_title('Overall CI')
axs[1].set_xlabel('')
axs[1].set_xlim(left=lower_limit, right=upper_limit)
axs[1].set_ylabel('')

# Label each bar with its value. The loop variable must not be called `score`:
# that would rebind the module-level score() function and break any later
# re-run of the scoring cells.
for i, overall_ci in enumerate(overall_scores_series[order]):
    barplot.text(overall_ci, i, f'{overall_ci:.6f}', va='center')

plt.tight_layout()
plt.show()