In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sksurv.ensemble import GradientBoostingSurvivalAnalysis
from sksurv.util import Surv
from lifelines.utils import concordance_index
from sklearn.metrics import roc_auc_score

In [None]:
from joblib import load

# Restore the persisted model from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that come from a trusted source.
model = load(filename='./hyperopt_results/best_model.pkl')

# Read the test split that will be preprocessed further down.
test_df = pd.read_csv(filepath_or_buffer='./data/equity-post-HCT-survival-predictions/test.csv')


In [30]:
# Show how many input features the loaded model reports via `n_features_in_`;
# the preprocessed test matrix needs the same number of columns.
model.n_features_in_

213

In [None]:
def prepare_data(df, categorical_cols, id_col='ID', expected_columns=None):
    """Impute and one-hot encode a feature table and return the transformed matrix.

    The function works on a copy of ``df``; the caller's frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Input feature table.
    categorical_cols : list of str
        Columns that are imputed with the constant ``'Missing'`` and then
        one-hot encoded. Every other remaining column is treated as numerical
        and imputed with its median.
    id_col : str, default 'ID'
        Identifier column that is dropped before preprocessing when present.
    expected_columns : iterable of str, optional
        Raw (pre-encoding) column names that must exist in the table. Any that
        are absent are added and filled with 0 *before* the transformers run,
        so they are part of the returned matrix. Previously this step ran after
        the transform and therefore had no effect on the result.
        NOTE(review): assumes these are raw column names rather than one-hot
        feature names -- confirm against the caller.

    Returns
    -------
    The output of ``ColumnTransformer.fit_transform``: the one-hot encoded
    categorical columns first, followed by the imputed numerical columns.
    """
    # Work on a copy so the caller's dataframe stays untouched.
    data = df.copy()

    # Drop the ID column if it exists.
    if id_col in data.columns:
        data = data.drop(columns=[id_col])
        print(f"Dropped column: {id_col}")
    else:
        print(f"No column named '{id_col}' found in the dataset")

    # Back-fill expected-but-absent columns up front; doing this after the
    # transform (as before) meant the added columns never reached the output.
    if expected_columns is not None:
        missing_cols = [col for col in expected_columns if col not in data.columns]
        for col in missing_cols:
            data[col] = 0  # Fill with 0 as specified
            print(f"Added missing column: {col} (filled with 0)")

    X = data

    # Categorical columns: constant imputation followed by one-hot encoding.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # Everything that is not declared categorical is handled as numerical.
    numerical_cols = [col for col in X.columns if col not in categorical_cols]
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, categorical_cols),
            ('num', numerical_transformer, numerical_cols)
        ])

    # NOTE(review): the transformers are fitted on whatever frame is passed in.
    # The one-hot columns therefore depend on the categories present in *this*
    # frame and may not line up with the features a previously trained model
    # expects -- consider reusing the preprocessor fitted at training time.
    return preprocessor.fit_transform(X)

In [None]:
# Columns that prepare_data treats as categorical (constant-imputed, then
# one-hot encoded). The order is kept as-is because it determines the order
# of the encoded output columns.
categorical_cols = [
    'dri_score',
    'psych_disturb',
    'cyto_score',
    'diabetes',
    'tbi_status',
    'arrhythmia',
    'graft_type',
    'vent_hist',
    'renal_issue',
    'pulm_severe',
    'prim_disease_hct',
    'cmv_status',
    'tce_imm_match',
    'rituximab',
    'prod_type',
    'cyto_score_detail',
    'conditioning_intensity',
    'ethnicity',
    'obesity',
    'mrd_hct',
    'in_vivo_tcd',
    'tce_match',
    'hepatic_severe',
    'prior_tumor',
    'peptic_ulcer',
    'gvhd_proph',
    'rheum_issue',
    'sex_match',
    'race_group',
    'hepatic_mild',
    'tce_div_match',
    'donor_related',
    'melphalan_dose',
    'cardiac',
    'pulm_moderate',
]
id_col = 'ID'

# Run the preprocessing on the test split (no expected_columns supplied).
test_df_preprocessed = prepare_data(
    df=test_df,
    categorical_cols=categorical_cols,
    id_col=id_col,
)

In [None]:
# Inspect the shape of the preprocessed test matrix; its column count is worth
# comparing against model.n_features_in_ before predicting.
test_df_preprocessed.shape

In [None]:
# The previous `model.co` looked like an unfinished attribute access that fails
# with AttributeError on a clean "Restart & Run All"; display the estimator
# itself instead so its configuration can be inspected.
model

In [None]:
def create_submission(predictions, ids, output_path):
    """
    Build the two-column submission table, save it as CSV and return it.

    Parameters
    ----------
    predictions : values written to the 'prediction' column.
    ids : values written to the 'ID' column.
    output_path : destination passed to ``DataFrame.to_csv`` (index omitted).

    Returns
    -------
    pandas.DataFrame with the columns 'ID' and 'prediction'.
    """
    # Column order follows the dict insertion order: ID first, then prediction.
    columns = dict(ID=ids, prediction=predictions)
    table = pd.DataFrame(columns)

    # Persist without the index so the file holds only the two columns.
    table.to_csv(output_path, index=False)
    print(f"Submission file created at {output_path}")

    return table

In [None]:
# prepare_data returns the raw ColumnTransformer output, which by default is an
# array without a `.columns` attribute, so the bare attribute access fails.
# Wrapping it in a DataFrame exposes a column index whether the object is an
# array (positional labels) or already a frame (its own labels).
pd.DataFrame(test_df_preprocessed).columns