#### ideas
1. Classification -> Regression?
2. 


In [None]:
import warnings
from pathlib import Path
warnings.filterwarnings('ignore')
import numpy as np
import polars as pl
import pandas as pd

In [None]:
import plotly.colors as pc
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# pio.renderers.default = 'iframe'
pio.renderers.default = 'notebook'

In [None]:
# Lift pandas' column display limit so wide frames are shown in full.
pd.options.display.max_columns = None

In [None]:
import lightgbm as lgb
import xgboost as xgb
from scipy.stats import rankdata
from sklearn.model_selection import KFold

In [None]:
from catboost import CatBoostRegressor
from lifelines import CoxPHFitter, KaplanMeierFitter, NelsonAalenFitter

In [None]:
class CFG:
    """Notebook-wide configuration: file locations, plot colour, training
    constants, ensemble weights and per-model hyperparameters."""

    # Input files, relative to the working directory.
    train_path = Path('input') / 'train.csv'
    test_path = Path('input') / 'test.csv'
    subm_path = Path('input') / 'sample_submission.csv'

    # Accent colour used for plot markers and fonts.
    color = '#EADDCA'

    batch_size = 32768   # forwarded to the CSV reader
    early_stop = 300     # early-stopping rounds for the boosted models
    penalizer = 0.01     # penalizer passed to the Cox PH fitter
    n_splits = 5         # number of cross-validation folds

    # Blend weights, applied positionally to the ranked model predictions.
    # NOTE(review): 8 weights are listed, but the ensemble section of this
    # notebook blends 11 prediction arrays - confirm the intended weights
    # and their order.
    weights = [0.5, 0.5, 4.0, 2.0, 4.0, 2.0, 3.0, 3.0]

    # CatBoost regressor trained with RMSE loss.
    ctb_params = dict(
        loss_function='RMSE',
        learning_rate=0.03,
        random_state=42,
        task_type='CPU',
        num_trees=6000,
        subsample=0.85,
        reg_lambda=8.0,
        depth=8,
    )

    # LightGBM regressor with the plain regression objective.
    lgb_params = dict(
        objective='regression',
        min_child_samples=32,
        num_iterations=6000,
        learning_rate=0.03,
        extra_trees=True,
        reg_lambda=8.0,
        reg_alpha=0.1,
        num_leaves=64,
        metric='rmse',
        max_depth=8,
        device='cpu',
        max_bin=128,
        verbose=-1,
        seed=42,
    )

    # Same LightGBM settings, only the objective differs.
    lgb_params_tweedie = {**lgb_params, 'objective': 'tweedie'}

    # CatBoost with Cox loss, depth-wise tree growth.
    cox1_params = dict(
        grow_policy='Depthwise',
        min_child_samples=8,
        loss_function='Cox',
        learning_rate=0.03,
        random_state=42,
        task_type='CPU',
        num_trees=6000,
        subsample=0.85,
        reg_lambda=8.0,
        depth=8,
    )

    # CatBoost with Cox loss, loss-guided tree growth.
    cox2_params = dict(
        grow_policy='Lossguide',
        loss_function='Cox',
        learning_rate=0.03,
        random_state=42,
        task_type='CPU',
        num_trees=6000,
        subsample=0.85,
        reg_lambda=8.0,
        num_leaves=32,
        depth=8,
    )

## Feature Engineering

In [None]:
class FE:
    """Load a CSV with polars, normalise column dtypes and return a pandas frame."""

    # Columns cast to Float32; every other column is treated as a string.
    _NUM_COLS = frozenset({
        'hla_high_res_8',
        'hla_low_res_8',
        'hla_high_res_6',
        'hla_low_res_6',
        'hla_high_res_10',
        'hla_low_res_10',
        'hla_match_dqb1_high',
        'hla_match_dqb1_low',
        'hla_match_drb1_high',
        'hla_match_drb1_low',
        'hla_nmdp_6',
        'year_hct',
        'hla_match_a_high',
        'hla_match_a_low',
        'hla_match_b_high',
        'hla_match_b_low',
        'hla_match_c_high',
        'hla_match_c_low',
        'donor_age',
        'age_at_hct',
        'comorbidity_score',
        'karnofsky_score',
        'efs',
        'efs_time',
    })

    def __init__(self, batch_size):
        """Store the ``batch_size`` that is forwarded to ``pl.read_csv``."""
        self.batch_size = batch_size

    def load_data(self, path):
        """Read the CSV at ``path`` into a polars DataFrame."""
        return pl.read_csv(path, batch_size=self.batch_size)

    def cast_datatypes(self, df):
        """Fill nulls and cast every column of a polars frame.

        Numeric columns get nulls replaced by -1 and become Float32; all other
        columns get nulls replaced by 'Unknown' and become String. ``ID`` is
        finally cast to Int32.
        """
        # Build one expression per column and apply them in a single
        # with_columns call instead of rebuilding the frame per column.
        casts = [
            pl.col(col).fill_null(-1).cast(pl.Float32)
            if col in self._NUM_COLS
            else pl.col(col).fill_null('Unknown').cast(pl.String)
            for col in df.columns
        ]
        df = df.with_columns(casts)

        # ID is not in the numeric list, so it was cast to String above.
        return df.with_columns(pl.col('ID').cast(pl.Int32))

    def info(self, df):
        """Print shape and memory usage (MB) of a pandas frame and show its head.

        Relies on the notebook-provided ``display`` function.
        """
        print(f'\nShape of dataframe: {df.shape}')

        mem = df.memory_usage().sum() / 1024**2
        print('Memory usage: {:.2f} MB\n'.format(mem))

        display(df.head())

    def apply_fe(self, path):
        """Load, cast and convert the file at ``path``.

        Returns:
            (df, cat_cols): the pandas DataFrame and the names of its
            string (categorical) columns, in column order.
        """
        frame = self.cast_datatypes(self.load_data(path))

        # Identify string columns while the frame is still a polars frame:
        # comparing pandas dtypes against a polars type after conversion
        # depends on how the two libraries happen to map string columns.
        cat_cols = [
            name
            for name, dtype in zip(frame.columns, frame.dtypes)
            if dtype == pl.String
        ]

        df = frame.to_pandas()
        self.info(df)

        return df, cat_cols

In [None]:
# Feature-engineering helper configured with the CSV reader batch size.
fe = FE(CFG.batch_size)

In [None]:
# Load the training file; cat_cols lists its string-typed columns.
train_data, cat_cols = fe.apply_fe(CFG.train_path)

In [None]:
# Load the test file; its column list is discarded.
test_data, _ = fe.apply_fe(CFG.test_path)

## EDA

In [None]:
class EDA:
    """Plotly helpers that share one accent colour and a grey background."""

    def __init__(self, color, data):
        self.data = data
        self._color = color

    def _template(self, fig, title):
        """Apply the shared title, background, font, margin and height to ``fig``."""
        background = 'rgba(74, 74, 74, 1)'
        layout = dict(
            title=title,
            title_x=0.5,
            plot_bgcolor=background,
            paper_bgcolor=background,
            font=dict(color=self._color),
            margin=dict(l=72, r=72, t=72, b=72),
            height=720,
        )
        fig.update_layout(**layout)
        return fig

    def distribution_plot(self, col, title):
        """Show a 100-bin histogram of column ``col`` of ``self.data``."""
        grid = 'grey'

        fig = px.histogram(
            self.data,
            x=col,
            nbins=100,
            color_discrete_sequence=[self._color],
        )
        fig.update_layout(
            xaxis_title='Values',
            yaxis_title='Count',
            bargap=0.1,
            xaxis=dict(gridcolor=grid),
            yaxis=dict(gridcolor=grid, zerolinecolor=grid),
        )
        fig.update_traces(hovertemplate='Value: %{x:.2f}<br>Count: %{y:,}')

        self._template(fig, f'{title}').show()

    def _plot_cv(self, scores, title, metric='C-Index'):
        """Plot one marker per fold score plus a dashed line at the mean score."""
        rounded = [round(value, 4) for value in scores]
        mean_score = round(np.mean(scores), 4)
        n_folds = len(rounded)
        background = 'rgba(74, 74, 74, 1)'
        grid = 'grey'

        # One diamond marker per fold.
        fold_trace = go.Scatter(
            x=list(range(1, n_folds + 1)),
            y=rounded,
            mode='markers',
            name='Fold Scores',
            marker=dict(size=27, color=self._color, symbol='diamond'),
            text=[f'{value:.3f}' for value in rounded],
            hovertemplate='Fold %{x}: %{text}<extra></extra>',
            hoverlabel=dict(font=dict(size=18)),
        )

        # Dashed horizontal line at the mean, spanning first to last fold.
        mean_trace = go.Scatter(
            x=[1, n_folds],
            y=[mean_score, mean_score],
            mode='lines',
            name=f'Mean: {mean_score:.3f}',
            line=dict(dash='dash', color='#FFBF00'),
            hoverinfo='none',
        )

        fig = go.Figure()
        fig.add_trace(fold_trace)
        fig.add_trace(mean_trace)

        fig.update_layout(
            title=f'{title} | Cross-validation Mean {metric} Score: {mean_score}',
            xaxis_title='Fold',
            yaxis_title=f'{metric} Score',
            plot_bgcolor=background,
            paper_bgcolor=background,
            font=dict(color=self._color),
            xaxis=dict(
                gridcolor=grid,
                tickmode='linear',
                tick0=1,
                dtick=1,
                range=[0.5, n_folds + 0.5],
                zerolinecolor=grid,
            ),
            yaxis=dict(gridcolor=grid, zerolinecolor=grid),
        )

        fig.show()

## Make Custom Targets

In [None]:
class Targets:
    """Derive regression targets from the ``efs`` / ``efs_time`` columns.

    Every ``create_target*`` method adds one column to ``self.data`` in place
    and returns that same frame.
    """

    def __init__(self, data, cat_cols, penalizer):
        self._penalizer = penalizer
        self.cat_cols = cat_cols
        self.data = data

    def create_target1(self):
        """Add ``target1``: partial hazard from a penalised Cox PH fit on the data."""
        # CoxPH accepts only numeric covariates, so one-hot encode the
        # categorical columns and leave the identifier out.
        encoded = pd.get_dummies(self.data, columns=self.cat_cols, drop_first=True)
        covariates = encoded.drop('ID', axis=1)

        model = CoxPHFitter(penalizer=self._penalizer)
        model.fit(covariates, duration_col='efs_time', event_col='efs')

        self.data['target1'] = model.predict_partial_hazard(covariates)
        return self.data

    def create_target2(self):
        """Add ``target2``: Kaplan-Meier survival function evaluated at each row's time."""
        durations = self.data['efs_time']

        estimator = KaplanMeierFitter()
        estimator.fit(durations=durations, event_observed=self.data['efs'])

        survival = estimator.survival_function_at_times(durations)
        self.data['target2'] = survival.values
        return self.data

    def create_target3(self):
        """Add ``target3``: negated Nelson-Aalen cumulative hazard at each row's time."""
        durations = self.data['efs_time']

        estimator = NelsonAalenFitter()
        estimator.fit(durations=durations, event_observed=self.data['efs'])

        cumulative_hazard = estimator.cumulative_hazard_at_times(durations)
        self.data['target3'] = -cumulative_hazard.values
        return self.data

    def create_target4(self):
        """Add ``target4``: ``efs_time`` with its sign flipped where ``efs`` is 0."""
        times = self.data['efs_time']
        has_event = self.data['efs'] != 0

        # Keep the time where an event was recorded, negate it otherwise.
        self.data['target4'] = times.where(has_event, -times)
        return self.data

## Make Models

In [None]:
class MD:
    """Cross-validated training and averaged inference for the boosted models."""

    # Title prefixes train_model can dispatch on.
    _SUPPORTED_PREFIXES = ('LightGBM', 'CatBoost')

    # Columns removed from the feature matrix (identifier, labels, targets).
    _NON_FEATURE_COLS = ['ID', 'efs', 'efs_time', 'target1', 'target2', 'target3', 'target4']

    # Columns handed to the metric as ground truth.
    _SCORE_COLS = ['ID', 'efs', 'efs_time', 'race_group']

    def __init__(self, color, data, cat_cols, penalizer, n_splits, early_stop):
        """Keep the training frame and build the plotting / target helpers on it."""
        self.eda = EDA(color, data)
        self.targets = Targets(data, cat_cols, penalizer)

        self.data = data
        self.cat_cols = cat_cols
        self._n_splits = n_splits
        self._early_stop = early_stop

    def create_targets(self):
        """Add target1..target4 to the training frame and return it."""
        self.data = self.targets.create_target1()
        self.data = self.targets.create_target2()
        self.data = self.targets.create_target3()
        self.data = self.targets.create_target4()

        return self.data

    def _fit_fold(self, params, title, X_train, y_train, X_valid, y_valid):
        """Fit one model on a fold, early-stopping on the validation split."""
        if title.startswith('LightGBM'):
            model = lgb.LGBMRegressor(**params)
            model.fit(
                X_train,
                y_train,
                eval_set=[(X_valid, y_valid)],
                eval_metric='rmse',
                callbacks=[lgb.early_stopping(self._early_stop, verbose=0), lgb.log_evaluation(0)]
            )
            return model

        # train_model has already validated the title, so this is CatBoost.
        model = CatBoostRegressor(**params, verbose=0, cat_features=self.cat_cols)
        model.fit(
            X_train,
            y_train,
            eval_set=(X_valid, y_valid),
            early_stopping_rounds=self._early_stop,
            verbose=0
        )
        return model

    def _c_index(self, rows, predictions):
        """Score ``predictions`` for the rows at positions ``rows`` (None = all rows).

        NOTE(review): ``score`` is not defined in this part of the notebook; it
        is presumably the competition metric imported elsewhere - confirm.
        """
        subset = self.data if rows is None else self.data.iloc[rows]

        y_true = subset[self._SCORE_COLS].copy()
        y_pred = subset[['ID']].copy()
        y_pred['prediction'] = predictions

        return score(y_true, y_pred, 'ID')

    def train_model(self, params, target, title):
        """Train one model per CV fold on ``target`` and report the scores.

        Args:
            params: keyword arguments for the model constructor.
            target: name of the target column in the training frame.
            title: label used in the plot / printout; its prefix selects the
                model type and must start with 'LightGBM' or 'CatBoost'.

        Returns:
            (models, oof_preds): the fitted fold models and the out-of-fold
            predictions aligned with the training rows.

        Raises:
            ValueError: if ``title`` does not select a supported model type.
        """
        # Fail before touching the data: without this check an unknown title
        # would leave `model` unbound (or reuse the previous fold's model).
        if not title.startswith(self._SUPPORTED_PREFIXES):
            raise ValueError(
                f"Unsupported model title {title!r}: "
                "expected it to start with 'LightGBM' or 'CatBoost'"
            )

        for col in self.cat_cols:
            self.data[col] = self.data[col].astype('category')

        X = self.data.drop(self._NON_FEATURE_COLS, axis=1)
        y = self.data[target]

        models, fold_scores = [], []
        oof_preds = np.zeros(len(X))

        cv = KFold(n_splits=self._n_splits, shuffle=True, random_state=42)

        for train_index, valid_index in cv.split(X, y):

            X_valid = X.iloc[valid_index]

            model = self._fit_fold(
                params,
                title,
                X.iloc[train_index],
                y.iloc[train_index],
                X_valid,
                y.iloc[valid_index],
            )
            models.append(model)

            oof_preds[valid_index] = model.predict(X_valid)
            fold_scores.append(self._c_index(valid_index, oof_preds[valid_index]))

        self.eda._plot_cv(fold_scores, title)

        c_index_score = self._c_index(None, oof_preds)
        print(f'\nOverall C-Index for {title}: {c_index_score:.4f}\n')

        return models, oof_preds

    def infer_model(self, data, models):
        """Return the mean prediction of ``models`` on ``data``.

        ``ID`` is dropped and the categorical columns are cast to 'category'
        on the resulting copy, so the caller's frame is left unchanged.
        """
        data = data.drop(['ID'], axis=1)

        for col in self.cat_cols:
            data[col] = data[col].astype('category')

        return np.mean([model.predict(data) for model in models], axis=0)

In [None]:
# Modelling helper bound to the training frame and its categorical columns.
md = MD(CFG.color, train_data, cat_cols, CFG.penalizer, CFG.n_splits, CFG.early_stop)

In [None]:
# Adds target1..target4 to the training frame.
train_data = md.create_targets()

In [None]:
# Histograms of the four engineered targets.
md.eda.distribution_plot('target1', 'Cox Target')

In [None]:
md.eda.distribution_plot('target2', 'Kaplan-Meier Target')

In [None]:
md.eda.distribution_plot('target3', 'Nelson-Aalen Target')

In [None]:
md.eda.distribution_plot('target4', 'Target for Cox-Loss Models')

In [None]:
# Shape, memory usage and head of the frame, now including the target columns.
fe.info(train_data)

### Models with Cox Target

In [None]:
# Train CatBoost (RMSE) on target1; returns the fold models and out-of-fold predictions.
ctb1_models, ctb1_oof_preds = md.train_model(CFG.ctb_params, target='target1', title='CatBoost')

In [None]:
# Train LightGBM (regression objective) on target1.
lgb1_models, lgb1_oof_preds = md.train_model(CFG.lgb_params, target='target1', title='LightGBM')

In [None]:
# Train LightGBM (tweedie objective) on target1.
lgb1_tweedie_models, lgb1_tweedie_oof_preds = md.train_model(CFG.lgb_params_tweedie, target='target1', title='LightGBM_tweedie')

In [None]:
# Test predictions: mean over the CatBoost fold models.
ctb1_preds = md.infer_model(test_data, ctb1_models)

In [None]:
# Test predictions: mean over the LightGBM fold models.
lgb1_preds = md.infer_model(test_data, lgb1_models)

In [None]:
# Test predictions: mean over the LightGBM tweedie fold models.
lgb1_tweedie_preds = md.infer_model(test_data, lgb1_tweedie_models)

### Models with Kaplan-Meier Target

In [None]:
# Same three model configurations as for target1, trained on target2.
ctb2_models, ctb2_oof_preds = md.train_model(CFG.ctb_params, target='target2', title='CatBoost')

In [None]:
lgb2_models, lgb2_oof_preds = md.train_model(CFG.lgb_params, target='target2', title='LightGBM')

In [None]:
lgb2_tweedie_models, lgb2_tweedie_oof_preds = md.train_model(CFG.lgb_params_tweedie, target='target2', title='LightGBM_tweedie')

In [None]:
# Test predictions, averaged over each model's folds.
ctb2_preds = md.infer_model(test_data, ctb2_models)

In [None]:
lgb2_preds = md.infer_model(test_data, lgb2_models)

In [None]:
lgb2_tweedie_preds = md.infer_model(test_data, lgb2_tweedie_models)

### Models with Nelson-Aalen Target

In [None]:
ctb3_models, ctb3_oof_preds = md.train_model(CFG.ctb_params, target='target3', title='CatBoost')

In [None]:
lgb3_models, lgb3_oof_preds = md.train_model(CFG.lgb_params, target='target3', title='LightGBM')

In [None]:
lgb3_tweedie_models, lgb3_tweedie_oof_preds = md.train_model(CFG.lgb_tweedie_params, target='target3', title='LightGBM_tweedie')

In [None]:
ctb3_preds = md.infer_model(test_data, ctb3_models)

In [None]:
lgb3_preds = md.infer_model(test_data, lgb3_models)

In [None]:
lgb3_tweedie_preds = md.infer_model(test_data, lgb3_tweedie_models)

### Cox-Loss Models

In [None]:
# CatBoost with Cox loss on target4, depth-wise growth.
# NOTE(review): both Cox runs reuse the title 'CatBoost', so their CV plot and
# printed score carry the same label as the RMSE CatBoost runs.
cox1_models, cox1_oof_preds = md.train_model(CFG.cox1_params, target='target4', title='CatBoost')

In [None]:
# CatBoost with Cox loss on target4, loss-guided growth.
cox2_models, cox2_oof_preds = md.train_model(CFG.cox2_params, target='target4', title='CatBoost')

In [None]:
# Test predictions, averaged over each model's folds.
cox1_preds = md.infer_model(test_data, cox1_models)

In [None]:
cox2_preds = md.infer_model(test_data, cox2_models)

### Ensemble Models

In [None]:
oof_preds = [
    ctb1_oof_preds, 
    lgb1_oof_preds,
    lgb1_tweedie_oof_preds,
    
    ctb2_oof_preds, 
    lgb2_oof_preds, 
    lgb2_tweedie_oof_preds,
    
    ctb3_oof_preds, 
    lgb3_oof_preds, 
    lgb3_tweedie_oof_preds,
    
    cox1_oof_preds,
    cox2_oof_preds
]

In [None]:
ranked_oof_preds = np.array([rankdata(p) for p in oof_preds])

In [None]:
ensemble_oof_preds = np.sum([w * p for w, p in zip(CFG.weights, ranked_oof_preds)], axis=0)

In [None]:
# Score the blended out-of-fold predictions against the training labels.
# NOTE(review): `score` is not defined in the visible part of the notebook;
# presumably the competition metric imported elsewhere - confirm.
y_true = train_data[['ID', 'efs', 'efs_time', 'race_group']].copy()
y_pred = train_data[['ID']].copy()
        
y_pred['prediction'] = ensemble_oof_preds
            
c_index_score = score(y_true.copy(), y_pred.copy(), 'ID')
print(f'\nOverall C-Index for Ensemble model: {c_index_score:.4f}')

In [None]:
preds = [
    ctb1_preds, 
    lgb1_preds, 
    lgb1_tweedie_preds,
    
    ctb2_preds, 
    lgb2_preds, 
    lgb2_tweedie_preds,
    
    ctb3_preds, 
    lgb3_preds,
    lgb3_tweedie_preds,
    
    cox1_preds,
    cox2_preds
]

In [None]:
ranked_preds = np.array([rankdata(p) for p in preds])

In [None]:
ensemble_preds = np.sum([w * p for w, p in zip(CFG.weights, ranked_preds)], axis=0)

In [None]:
# Fill the sample submission's prediction column with the blended test scores.
subm_data = pd.read_csv(CFG.subm_path)
subm_data['prediction'] = ensemble_preds

In [None]:
# Write the submission file without the index and preview it.
subm_data.to_csv('submission.csv', index=False)
display(subm_data.head())