# CIBMTR 2025

In [137]:
import pandas as pd

from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.impute import KNNImputer

import textwrap

## Data loading

In [140]:
df_train = pd.read_csv('/content/drive/MyDrive/kaggle/CIBMTR_2025/data/train.csv',
                       index_col='ID')

In [141]:
df_train.head()

Unnamed: 0_level_0,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,graft_type,...,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,N/A - non-malignant indication,No,,No,,,No TBI,No,6.0,Bone marrow,...,,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,42.356
1,Intermediate,No,Intermediate,No,2.0,8.0,"TBI +- Other, >cGy",No,6.0,Peripheral blood,...,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,Yes,10.0,1.0,4.672
2,N/A - non-malignant indication,No,,No,2.0,8.0,No TBI,No,6.0,Bone marrow,...,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,19.793
3,High,No,Intermediate,No,2.0,8.0,No TBI,No,6.0,Bone marrow,...,Permissive mismatched,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,102.349
4,High,No,,No,2.0,8.0,No TBI,No,6.0,Peripheral blood,...,Permissive mismatched,Related,MEL,8.0,No,2.0,No,10.0,0.0,16.223


In [142]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28800 entries, 0 to 28799
Data columns (total 59 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   dri_score               28646 non-null  object 
 1   psych_disturb           26738 non-null  object 
 2   cyto_score              20732 non-null  object 
 3   diabetes                26681 non-null  object 
 4   hla_match_c_high        24180 non-null  float64
 5   hla_high_res_8          22971 non-null  float64
 6   tbi_status              28800 non-null  object 
 7   arrhythmia              26598 non-null  object 
 8   hla_low_res_6           25530 non-null  float64
 9   graft_type              28800 non-null  object 
 10  vent_hist               28541 non-null  object 
 11  renal_issue             26885 non-null  object 
 12  pulm_severe             26665 non-null  object 
 13  prim_disease_hct        28800 non-null  object 
 14  hla_high_res_6          23516 non-null  flo

In [143]:
df_test = pd.read_csv('/content/drive/MyDrive/kaggle/CIBMTR_2025/data/test.csv',
                      index_col='ID')

In [144]:
df_test.head()

Unnamed: 0_level_0,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,graft_type,...,karnofsky_score,hepatic_mild,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
28800,N/A - non-malignant indication,No,,No,,,No TBI,No,6.0,Bone marrow,...,90.0,No,,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0
28801,Intermediate,No,Intermediate,No,2.0,8.0,"TBI +- Other, >cGy",No,6.0,Peripheral blood,...,90.0,No,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,Yes,10.0
28802,N/A - non-malignant indication,No,,No,2.0,8.0,No TBI,No,6.0,Bone marrow,...,90.0,No,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,No,10.0


In [145]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, 28800 to 28802
Data columns (total 57 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   dri_score               3 non-null      object 
 1   psych_disturb           3 non-null      object 
 2   cyto_score              1 non-null      object 
 3   diabetes                3 non-null      object 
 4   hla_match_c_high        2 non-null      float64
 5   hla_high_res_8          2 non-null      float64
 6   tbi_status              3 non-null      object 
 7   arrhythmia              3 non-null      object 
 8   hla_low_res_6           3 non-null      float64
 9   graft_type              3 non-null      object 
 10  vent_hist               3 non-null      object 
 11  renal_issue             3 non-null      object 
 12  pulm_severe             3 non-null      object 
 13  prim_disease_hct        3 non-null      object 
 14  hla_high_res_6          3 non-null      flo

In [146]:
data_dictionary = pd.read_csv('/content/drive/MyDrive/kaggle/CIBMTR_2025/data/data_dictionary.csv')

In [147]:
data_dictionary

Unnamed: 0,variable,description,type,values
0,dri_score,Refined disease risk index,Categorical,['Intermediate' 'High' 'N/A - non-malignant in...
1,psych_disturb,Psychiatric disturbance,Categorical,['Yes' 'No' nan 'Not done']
2,cyto_score,Cytogenetic score,Categorical,['Intermediate' 'Favorable' 'Poor' 'TBD' nan '...
3,diabetes,Diabetes,Categorical,['No' 'Yes' nan 'Not done']
4,hla_match_c_high,Recipient / 1st donor allele level (high resol...,Numerical,
5,hla_high_res_8,Recipient / 1st donor allele-level (high resol...,Numerical,
6,tbi_status,TBI,Categorical,"['No TBI' 'TBI + Cy +- Other' 'TBI +- Other, <..."
7,arrhythmia,Arrhythmia,Categorical,['No' nan 'Yes' 'Not done']
8,hla_low_res_6,Recipient / 1st donor antigen-level (low resol...,Numerical,
9,graft_type,Graft type,Categorical,['Peripheral blood' 'Bone marrow']


In [148]:
sample_submission = pd.read_csv('/content/drive/MyDrive/kaggle/CIBMTR_2025/data/sample_submission.csv',
                                index_col='ID')

In [149]:
sample_submission

Unnamed: 0_level_0,prediction
ID,Unnamed: 1_level_1
28800,0.5
28801,0.5
28802,0.5


## Data preparing

Targets:

In [49]:
duration_col = 'efs_time'
event_col = 'efs'

Data types:

In [22]:
data_dictionary['type'].unique()

array(['Categorical', 'Numerical'], dtype=object)

In [44]:
categorical_cols = data_dictionary[data_dictionary['type'] == 'Categorical']['variable'].values
numerical_cols = data_dictionary[data_dictionary['type'] == 'Numerical']['variable'].values

print(f'CATEGORICAL COLUMNS:  ({len(categorical_cols)} items)')
print(textwrap.fill(', '.join(categorical_cols), width=80), '\n')
print(f'NUMERICAL COLUMNS:  ({len(numerical_cols)} items)')
print(textwrap.fill(', '.join(numerical_cols), width=80))

CATEGORICAL COLUMNS:  (36 items)
dri_score, psych_disturb, cyto_score, diabetes, tbi_status, arrhythmia,
graft_type, vent_hist, renal_issue, pulm_severe, prim_disease_hct, cmv_status,
tce_imm_match, rituximab, prod_type, cyto_score_detail, conditioning_intensity,
ethnicity, obesity, mrd_hct, in_vivo_tcd, tce_match, hepatic_severe,
prior_tumor, peptic_ulcer, gvhd_proph, rheum_issue, sex_match, race_group,
hepatic_mild, tce_div_match, donor_related, melphalan_dose, cardiac,
pulm_moderate, efs 

NUMERICAL COLUMNS:  (23 items)
hla_match_c_high, hla_high_res_8, hla_low_res_6, hla_high_res_6,
hla_high_res_10, hla_match_dqb1_high, hla_nmdp_6, hla_match_c_low,
hla_match_drb1_low, hla_match_dqb1_low, year_hct, hla_match_a_high, donor_age,
hla_match_b_low, age_at_hct, hla_match_a_low, hla_match_b_high,
comorbidity_score, karnofsky_score, hla_low_res_8, hla_match_drb1_high,
hla_low_res_10, efs_time


NA analysis:

In [28]:
def analyze_na(df):
    na_summary = pd.DataFrame({
        'NA count': df.isna().sum(),
        'NA percentage (%)': df.isna().mean() * 100
    })
    return na_summary

In [45]:
analyze_na(df_train[categorical_cols])

Unnamed: 0,NA count,NA percentage (%)
dri_score,154,0.534722
psych_disturb,2062,7.159722
cyto_score,8068,28.013889
diabetes,2119,7.357639
tbi_status,0,0.0
arrhythmia,2202,7.645833
graft_type,0,0.0
vent_hist,259,0.899306
renal_issue,1915,6.649306
pulm_severe,2135,7.413194


In [46]:
analyze_na(df_train[numerical_cols])

Unnamed: 0,NA count,NA percentage (%)
hla_match_c_high,4620,16.041667
hla_high_res_8,5829,20.239583
hla_low_res_6,3270,11.354167
hla_high_res_6,5284,18.347222
hla_high_res_10,7163,24.871528
hla_match_dqb1_high,5199,18.052083
hla_nmdp_6,4197,14.572917
hla_match_c_low,2800,9.722222
hla_match_drb1_low,2643,9.177083
hla_match_dqb1_low,4194,14.5625


Fill NA values in the categorical columns with the mode value of each column. Futher, encode the values with the `LabelEncoder`.

In [150]:
# Filling NAs with mode values
for col in categorical_cols:
    df_train[col].fillna(df_train[col].mode()[0], inplace=True)

# Data encoding
encoder = OrdinalEncoder()
df_train[categorical_cols] = encoder.fit_transform(df_train[categorical_cols])

In [114]:
analyze_na(df_train[categorical_cols])

Unnamed: 0,NA count,NA percentage (%)
dri_score,0,0.0
psych_disturb,0,0.0
cyto_score,0,0.0
diabetes,0,0.0
tbi_status,0,0.0
arrhythmia,0,0.0
graft_type,0,0.0
vent_hist,0,0.0
renal_issue,0,0.0
pulm_severe,0,0.0


In [115]:
df_train[categorical_cols].head()

Unnamed: 0_level_0,dri_score,psych_disturb,cyto_score,diabetes,tbi_status,arrhythmia,graft_type,vent_hist,renal_issue,pulm_severe,...,rheum_issue,sex_match,race_group,hepatic_mild,tce_div_match,donor_related,melphalan_dose,cardiac,pulm_moderate,efs
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,7.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,3.0,0.0,3.0,2.0,1.0,0.0,0.0,0.0
1,2.0,0.0,1.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,3.0,1.0,1.0,0.0,2.0,1.0
2,7.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,3.0,0.0,3.0,1.0,1.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,5.0,2.0,3.0,2.0,1.0,0.0,0.0,0.0
4,0.0,0.0,5.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0


And fill NAs in numerical columns by the Nearest neighbors imputation - `KNNImputer`:

In [151]:
imputer = KNNImputer(n_neighbors=5, weights="uniform")
train_imputed = imputer.fit_transform(df_train)
df_train = pd.DataFrame(train_imputed,
                        columns=df_train.columns,
                        index=df_train.index)

In [133]:
analyze_na(df_train[numerical_cols])

Unnamed: 0,NA count,NA percentage (%)
hla_match_c_high,0,0.0
hla_high_res_8,0,0.0
hla_low_res_6,0,0.0
hla_high_res_6,0,0.0
hla_high_res_10,0,0.0
hla_match_dqb1_high,0,0.0
hla_nmdp_6,0,0.0
hla_match_c_low,0,0.0
hla_match_drb1_low,0,0.0
hla_match_dqb1_low,0,0.0


Scale with `MinMaxScaler`

In [152]:
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(df_train)
df_train = pd.DataFrame(train_scaled,
                        columns=df_train.columns,
                        index=df_train.index)

Result:

In [155]:
df_train.head()

Unnamed: 0_level_0,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,graft_type,...,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.7,0.0,0.833333,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.268542
1,0.2,0.0,0.166667,0.0,1.0,1.0,0.857143,0.0,1.0,1.0,...,1.0,0.5,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.027728
2,0.7,0.0,0.833333,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.5,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.124356
3,0.0,0.0,0.166667,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.651918
4,0.0,0.0,0.833333,0.0,1.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.5,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.101543


In [154]:
df_train.describe()

Unnamed: 0,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,graft_type,...,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time
count,28800.0,28800.0,28800.0,28800.0,28800.0,28800.0,28800.0,28800.0,28800.0,28800.0,...,28800.0,28800.0,28800.0,28800.0,28800.0,28800.0,28800.0,28800.0,28800.0,28800.0
mean,0.377243,0.127083,0.591661,0.153108,0.883896,0.814005,0.131086,0.046389,0.785852,0.713403,...,0.906111,0.703854,0.747917,0.817567,0.055295,0.854361,0.185139,0.781056,0.539306,0.146369
std,0.315077,0.331164,0.346656,0.358394,0.203774,0.24475,0.256578,0.20788,0.289561,0.45218,...,0.234896,0.257643,0.434216,0.249148,0.225751,0.220915,0.386557,0.295335,0.498461,0.158479
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.2,0.0,0.166667,0.0,0.9,0.666667,0.0,0.0,0.5,0.0,...,1.0,0.5,0.0,0.666667,0.0,0.5,0.0,0.666667,0.0,0.033784
50%,0.2,0.0,0.833333,0.0,1.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.5,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.060475
75%,0.7,0.0,0.833333,0.0,1.0,1.0,0.142857,0.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.222173
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [156]:
df_train.to_csv('/content/drive/MyDrive/kaggle/CIBMTR_2025/data/train_prepared.csv')

## Model

## Git

In [157]:
!git config --global user.name "serjshul"
!git config --global user.email "shulginsergey0@gmail.com"

In [163]:
!cp "/content/drive/MyDrive/kaggle/CIBMTR_2025/CIBMTR_2025.ipynb" "/content/CIBMTR_Survival_Predictions"

%cd /content/CIBMTR_Survival_Predictions
!git add .
!git commit -m "Notebook init"
!git push origin main

/content/CIBMTR_Survival_Predictions
[main ace8613] Notebook init
 1 file changed, 1 insertion(+)
 create mode 100644 CIBMTR_2025.ipynb
Enumerating objects: 4, done.
Counting objects: 100% (4/4), done.
Delta compression using up to 2 threads
Compressing objects: 100% (3/3), done.
Writing objects: 100% (3/3), 20.00 KiB | 4.00 MiB/s, done.
Total 3 (delta 0), reused 0 (delta 0), pack-reused 0
remote: [1;31merror[m: GH013: Repository rule violations found for refs/heads/main.[K
remote: 
remote: - GITHUB PUSH PROTECTION[K
remote:   —————————————————————————————————————————[K
remote:     Resolve the following violations before pushing again[K
remote: 
remote:     - Push cannot contain secrets[K
remote: 
remote:     [K
remote:      (?) Learn how to resolve a blocked push[K
remote:      https://docs.github.com/code-security/secret-scanning/working-with-secret-scanning-and-push-protection/working-with-push-protection-from-the-command-line#resolving-a-blocked-push[K
remote:     [K
rem

In [164]:
!git reset --hard HEAD~1

HEAD is now at 3a2fc27 Initial commit
