<a href="https://colab.research.google.com/github/subikkshas/PREPARE-ALL/blob/main/New_data_matching.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.9.0 optuna-4.5.0


In [3]:
import pandas as pd
import numpy as np
import os
import optuna
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from typing import Dict, List, Tuple, Union
from sklearn.preprocessing import OneHotEncoder
from google.colab import drive
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score, precision_recall_curve, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from xgboost import XGBClassifier

In [3]:
def load_train_test_data(load_path='/content/drive/MyDrive/Prepare all data/'):
    # Mount Google Drive (if not already mounted)
    try:
        drive.mount('/content/drive')
        print("Google Drive mounted successfully.")
    except:
        print("Google Drive already mounted or mounting failed.")

    # Load the data files
    try:
        loaded_X_train = pd.read_csv(os.path.join(load_path, 'X_train_encoded.csv'))
        loaded_y_train = pd.read_csv(os.path.join(load_path, 'y_train_encoded.csv'))
        loaded_X_test = pd.read_csv(os.path.join(load_path, 'X_test_encoded.csv'))
        loaded_y_test = pd.read_csv(os.path.join(load_path, 'y_test_encoded.csv'))

        print("Data loaded successfully!")
        print(f"X_train shape: {loaded_X_train.shape}")
        print(f"y_train shape: {loaded_y_train.shape}")
        print(f"X_test shape: {loaded_X_test.shape}")
        print(f"y_test shape: {loaded_y_test.shape}")

        return loaded_X_train, loaded_y_train, loaded_X_test, loaded_y_test

    except FileNotFoundError as e:
        print(f"Error: File not found - {e}")
        return None, None, None, None
    except Exception as e:
        print(f"Error loading data: {e}")
        return None, None, None, None

In [4]:
X_Train, y_Train, X_Test, y_Test = load_train_test_data()

Mounted at /content/drive
Google Drive mounted successfully.
Data loaded successfully!
X_train shape: (1801, 28)
y_train shape: (1801, 1)
X_test shape: (451, 28)
y_test shape: (451, 1)


In [5]:
X_Train = X_Train.drop('Remission status _EOI', axis=1)
X_Test = X_Test.drop('Remission status _EOI', axis=1)

print(f"X_Train shape after dropping column: {X_Train.shape}")
print(f"X_Test shape after dropping column: {X_Test.shape}")

X_Train shape after dropping column: (1801, 27)
X_Test shape after dropping column: (451, 27)


## Data extract

In [131]:
file_path = "/content/Icicle Pre trial AI paper dataset_anonymised_April 2025.xlsx"
df_full = pd.read_excel(file_path)

In [132]:
print(df_full.columns)

Index(['Sl No', 'UPN_AI_dataset', 'Previous Treatment', 'Date of Birth',
       'NCI Risk', 'Sex', 'Date of Diagnosis', 'Age', 'Lineage',
       'Bulky Disease', 'Highest presenting WBC', 'Prednisolone Response',
       'CNS Disease', 'Cytogenetic groups ', 'Detail cytogenetics',
       'Provisional risk', 'Date of Remission_EOI', 'Remission status _EOI',
       'MRD Status_EOI', 'Final Risk ', 'Date of Completion of treatment',
       'Relapsed', 'Date of Relapse', 'Time-point of Relapse ',
       'Type of relapse', 'Site of Relapse ', 'Death', 'Date of Death ',
       'Treatment death phase',
       'Treatment abandonment/Disease progresssion/Off protocol',
       'Date of Treatment abandonment/ Disease progresssion/Off protocol',
       'Date of Last FU', 'Current status at Last Follow up '],
      dtype='object')


In [133]:
def remove_columns(df, feature_set='normal'):

    # Define base leakage columns to always remove
    base_leakage_cols = [
        "Sl No",
        "UPN_AI_dataset",
        # All Date columns
        "Date of Birth", "Date of Diagnosis", "Date of Remission_EOI",
        "Date of Completion of treatment", "Date of Relapse",
        "Date of Death ", "Date of Treatment abandonment/ Disease progresssion/Off protocol",
        "Date of Last FU",
        # Relapse & outcome details
        "Time-point of Relapse ", "Type of relapse", "Site of Relapse ",
        "Death", "Treatment death phase",
        "Treatment abandonment/Disease progresssion/Off protocol",
        "Current status at Last Follow up "
    ]

    # Additional columns to remove for minimal feature set
    minimal_cols = [
        "Previous Treatment", "NCI Risk", "Lineage", "Bulky Disease",
        "CNS Disease", "Detail cytogenetics", "Provisional risk",
    ]

    # Combine columns based on feature_set parameter
    if feature_set == 'minimal':
        cols_to_remove = base_leakage_cols + minimal_cols
    else:  # 'normal' or any other value
        cols_to_remove = base_leakage_cols

    # Remove only columns that exist in the dataframe
    existing_cols_to_remove = [c for c in cols_to_remove if c in df.columns]
    df_clean = df.drop(columns=existing_cols_to_remove)

    return df_clean



In [134]:
# df_full_clean = remove_columns(df_full, feature_set='normal')
df_full_clean = df_full
print(df_full_clean.columns.tolist())

['Sl No', 'UPN_AI_dataset', 'Previous Treatment', 'Date of Birth', 'NCI Risk', 'Sex', 'Date of Diagnosis', 'Age', 'Lineage', 'Bulky Disease', 'Highest presenting WBC', 'Prednisolone Response', 'CNS Disease', 'Cytogenetic groups ', 'Detail cytogenetics', 'Provisional risk', 'Date of Remission_EOI', 'Remission status _EOI', 'MRD Status_EOI', 'Final Risk ', 'Date of Completion of treatment', 'Relapsed', 'Date of Relapse', 'Time-point of Relapse ', 'Type of relapse', 'Site of Relapse ', 'Death', 'Date of Death ', 'Treatment death phase', 'Treatment abandonment/Disease progresssion/Off protocol', 'Date of Treatment abandonment/ Disease progresssion/Off protocol', 'Date of Last FU', 'Current status at Last Follow up ']


In [135]:
def clean_string_columns(df, columns_to_clean):
  existing_cols_to_clean = [col for col in columns_to_clean if col in df.columns]

  for col in existing_cols_to_clean:
      df[col] = df[col].astype("string").str.strip().str.lower()

  return df

In [136]:
df_full_strip = clean_string_columns(df_full_clean, ['Previous Treatment', 'NCI Risk', 'Sex', 'Age', 'Lineage', 'Bulky Disease', 'Highest presenting WBC', 'Prednisolone Response', 'CNS Disease', 'Cytogenetic groups ', 'Detail cytogenetics', 'Provisional risk', 'Remission status _EOI', 'MRD Status_EOI', 'Final Risk ', 'Relapsed'])

In [137]:
file_path = "/content/X_test for clinician.xlsx"
df_cl = pd.read_excel(file_path)

In [138]:
print(df_cl.columns)

Index(['Previous Treatment', 'NCI Risk', 'Sex', 'Age', 'Lineage',
       'Bulky Disease', 'Highest presenting WBC', 'Prednisolone Response',
       'CNS Disease', 'Cytogenetic groups ', 'Detail cytogenetics',
       'Provisional risk', 'MRD Status_EOI', 'Final Risk ',
       'Doctor Predictions'],
      dtype='object')


In [139]:
df_cl_strip = clean_string_columns(df_cl, ['Previous Treatment', 'NCI Risk', 'Sex', 'Age', 'Lineage',
       'Bulky Disease', 'Highest presenting WBC', 'Prednisolone Response',
       'CNS Disease', 'Cytogenetic groups ', 'Detail cytogenetics',
       'Provisional risk', 'MRD Status_EOI', 'Final Risk ',
       'Doctor Predictions'])

In [140]:
print(df_cl_strip.shape)

(467, 15)


In [141]:
# Define key columns
# key_cols = ['Previous Treatment', 'NCI Risk', 'Sex', 'Age', 'Lineage',
#             'Bulky Disease', 'Highest presenting WBC','Final Risk ']
key_cols = [
    'Previous Treatment',
    'NCI Risk',
    # 'Sex',
    'Age',
    'Lineage',
    'Bulky Disease',
    'Highest presenting WBC',
    'Prednisolone Response',
    'CNS Disease',
    'Cytogenetic groups ',
    'Detail cytogenetics',
    'Provisional risk',
    'MRD Status_EOI',
    'Final Risk '
    ]
# key_cols = ['Sex', 'Age','Highest presenting WBC']

df_cl_strip = df_cl_strip.reset_index().rename(columns={'index': 'clinical_idx'})


df_full_strip['Age'] = df_full_strip['Age'].astype(float).fillna(0).astype(int)
df_full_strip['Highest presenting WBC'] = df_full_strip['Highest presenting WBC'].astype(float).fillna(0).astype(int)
df_cl_strip['Age'] = df_cl_strip['Age'].astype(float).fillna(0).astype(int)
df_cl_strip['Highest presenting WBC'] = df_cl_strip['Highest presenting WBC'].astype(float).fillna(0).astype(int)

df_full_strip['Highest presenting WBC'] = df_full_strip['Highest presenting WBC'].round()
df_cl_strip['Highest presenting WBC'] = df_cl_strip['Highest presenting WBC'].round()

# Step 1: Inner merge on key columns to find matches
matches = pd.merge(df_full_strip,df_cl_strip, on=key_cols, how='right',indicator=True,suffixes=('_full', '_clinician'))

# Step 2: Count number of matching rows
print(f"Number of matching rows: {len(matches)}")

# Optional: preview some matching rows
# matches.head()

Number of matching rows: 473


In [142]:
# matches[matches["_merge"]=="right_only"]
matches["_merge"].value_counts()

Unnamed: 0_level_0,count
_merge,Unnamed: 1_level_1
both,473
left_only,0
right_only,0


In [143]:
matches

Unnamed: 0,Sl No,UPN_AI_dataset,Previous Treatment,Date of Birth,NCI Risk,Sex_full,Date of Diagnosis,Age,Lineage,Bulky Disease,Highest presenting WBC,Prednisolone Response,CNS Disease,Cytogenetic groups,Detail cytogenetics,Provisional risk,Date of Remission_EOI,Remission status _EOI,MRD Status_EOI,Final Risk,Date of Completion of treatment,Relapsed,Date of Relapse,Time-point of Relapse,Type of relapse,Site of Relapse,Death,Date of Death,Treatment death phase,Treatment abandonment/Disease progresssion/Off protocol,Date of Treatment abandonment/ Disease progresssion/Off protocol,Date of Last FU,Current status at Last Follow up,clinical_idx,Sex_clinician,Doctor Predictions,_merge
0,2177,PT-AI_2177,no,2012-02-26 00:00:00,high,female,2016-05-13,4,b,no,135000,good,no,non-high risk,b-other,intermediate,2016-06-25 00:00:00,in remission,,intermediate,2019-04-02 00:00:00,no,.,.,.,.,No,.,.,No,.,2022-05-10,Alive,0,female,,both
1,396,PT-AI_396,no,Not available,standard,male,2017-05-30,2,b,no,20140,good,no,non-high risk,high hyperdiploidy,standard,2017-07-11 00:00:00,in remission,negative,standard,.,no,.,.,.,.,No,.,.,Yes (Abandoned),2019-09-06 00:00:00,2019-09-06,Lost to follow up,1,male,,both
2,790,PT-AI_790,yes,Not available,standard,male,2014-04-26,7,b,yes,3830,good,no,,,high,2014-05-27 00:00:00,in remission,negative,high,2016-09-12 00:00:00,no,.,.,.,.,No,.,.,No,.,2021-02-01,Alive,2,male,,both
3,466,PT-AI_466,no,Not available,standard,male,2017-08-22,4,b,yes,6840,good,no,non-high risk,b-other,intermediate,2017-09-25 00:00:00,in remission,negative,intermediate,2019-11-04 00:00:00,no,.,.,.,.,No,.,.,No,.,2022-02-11,Alive,3,male,,both
4,62,PT-AI_62,no,2005-11-07 00:00:00,high,male,2014-11-07,8,b,no,70700,good,yes,non-high risk,b-other,high,2014-12-13 00:00:00,in remission,negative,high,.,yes,2015-10-05 00:00:00,Very Early,Isolated extramedullary,CNS,Yes,2015-12-22 00:00:00,.,No,.,2015-12-22,Dead,4,male,,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
468,982,PT-AI_982,no,Not available,standard,male,2015-01-23,6,b,yes,30950,good,no,non-high risk,high hyperdiploidy,intermediate,2015-02-23 00:00:00,in remission,negative,intermediate,2017-03-30 00:00:00,no,.,.,.,.,No,.,.,No,.,2018-08-16,Lost to follow up,462,male,,both
469,1963,PT-AI_1963,no,Not available,high,male,2015-08-17,11,t,yes,4500,poor,no,not required,not required,t,.,.,.,.,.,no,.,.,.,.,Yes,2015-12-24 00:00:00,Induction,No,.,2015-12-24,Dead,463,male,,both
470,1368,PT-AI_1368,no,Not available,standard,female,2016-04-19,3,b,yes,27000,good,no,non-high risk,b-other,intermediate,2016-05-20 00:00:00,in remission,negative,intermediate,2018-06-30 00:00:00,no,.,.,.,.,No,.,.,No,.,2022-06-01,Alive,464,female,,both
471,1893,PT-AI_1893,no,Not available,standard,male,2016-07-02,1,b,no,12900,good,no,non-high risk,b-other,standard,2016-08-16 00:00:00,in remission,negative,standard,2019-01-15 00:00:00,yes,2021-04-07 00:00:00,Late,Isolated medullary,Bone marrow,No,.,.,No,.,2021-10-18,Alive,465,male,,both


In [144]:
# Group by key columns and count occurrences
grouped = matches.groupby(key_cols).size().reset_index(name='count')

# Filter for groups with count > 1
duplicates = grouped[grouped['count'] > 1]

# Merge back with the original 'matches' dataframe to show the actual duplicate rows
duplicate_rows = matches.merge(duplicates[key_cols], on=key_cols, how='inner')

print("Rows with more than one match based on key columns:")
duplicate_rows = duplicate_rows.sort_values(by='Highest presenting WBC')
pd.set_option('display.max_columns', None)

print(len(duplicate_rows))
# display(duplicate_rows)

Rows with more than one match based on key columns:
12


In [145]:
duplicate_rows

Unnamed: 0,Sl No,UPN_AI_dataset,Previous Treatment,Date of Birth,NCI Risk,Sex_full,Date of Diagnosis,Age,Lineage,Bulky Disease,Highest presenting WBC,Prednisolone Response,CNS Disease,Cytogenetic groups,Detail cytogenetics,Provisional risk,Date of Remission_EOI,Remission status _EOI,MRD Status_EOI,Final Risk,Date of Completion of treatment,Relapsed,Date of Relapse,Time-point of Relapse,Type of relapse,Site of Relapse,Death,Date of Death,Treatment death phase,Treatment abandonment/Disease progresssion/Off protocol,Date of Treatment abandonment/ Disease progresssion/Off protocol,Date of Last FU,Current status at Last Follow up,clinical_idx,Sex_clinician,Doctor Predictions,_merge
7,1310,PT-AI_1310,no,Not available,standard,male,2016-02-19,5,b,no,2000,good,no,non-high risk,high hyperdiploidy,standard,2016-03-21 00:00:00,in remission,negative,standard,2018-06-06 00:00:00,no,.,.,.,.,No,.,.,No,.,2019-09-18,Lost to follow up,402,male,,both
6,1107,PT-AI_1107,no,Not available,standard,male,2015-06-05,5,b,no,2000,good,no,non-high risk,high hyperdiploidy,standard,2015-07-06 00:00:00,in remission,negative,standard,2017-09-17 00:00:00,no,.,.,.,.,No,.,.,No,.,2021-09-24,Alive,402,male,,both
5,1077,PT-AI_1077,no,Not available,high,female,2015-05-09,12,b,no,2000,good,no,non-high risk,etv6-runx1,intermediate,2015-06-09 00:00:00,in remission,negative,intermediate,2017-07-11 00:00:00,no,.,.,.,.,No,.,.,No,.,2022-04-06,Alive,306,female,,both
4,672,PT-AI_672,no,Not available,high,female,2013-12-23,12,b,no,2000,good,no,non-high risk,etv6-runx1,intermediate,2014-01-23 00:00:00,in remission,negative,intermediate,2016-02-28 00:00:00,no,.,.,.,.,No,.,.,No,.,2018-07-19,Lost to follow up,306,female,,both
1,1911,PT-AI_1911,no,Not available,standard,male,2016-08-13,2,b,no,2510,good,no,non-high risk,high hyperdiploidy,standard,2016-09-19 00:00:00,in remission,negative,standard,.,yes,2017-05-26 00:00:00,Very Early,Isolated medullary,Bone marrow,Yes,2018-01-16 00:00:00,.,No,.,2018-01-16,Dead,5,male,,both
0,521,PT-AI_521,no,Not available,standard,male,2017-10-20,2,b,no,2510,good,no,non-high risk,high hyperdiploidy,standard,2017-12-04 00:00:00,in remission,negative,standard,2020-02-02 00:00:00,no,.,.,.,.,No,.,.,No,.,2020-05-18,Lost to follow up,5,male,,both
9,1911,PT-AI_1911,no,Not available,standard,male,2016-08-13,2,b,no,2510,good,no,non-high risk,high hyperdiploidy,standard,2016-09-19 00:00:00,in remission,negative,standard,.,yes,2017-05-26 00:00:00,Very Early,Isolated medullary,Bone marrow,Yes,2018-01-16 00:00:00,.,No,.,2018-01-16,Dead,420,male,,both
8,521,PT-AI_521,no,Not available,standard,male,2017-10-20,2,b,no,2510,good,no,non-high risk,high hyperdiploidy,standard,2017-12-04 00:00:00,in remission,negative,standard,2020-02-02 00:00:00,no,.,.,.,.,No,.,.,No,.,2020-05-18,Lost to follow up,420,male,,both
10,972,PT-AI_972,no,Not available,standard,male,2015-01-07,4,b,no,2700,good,no,non-high risk,b-other,standard,2015-02-07 00:00:00,in remission,negative,standard,2017-04-26 00:00:00,no,.,.,.,.,No,.,.,No,.,2022-03-24,Alive,430,male,,both
11,1746,PT-AI_1746,no,Not available,standard,male,2015-05-16,4,b,no,2700,good,no,non-high risk,b-other,standard,2015-06-24 00:00:00,in remission,negative,standard,2017-10-25 00:00:00,yes,2019-03-06 00:00:00,Late,Combined,Bone marrow+Testes,No,.,.,No,.,2021-11-20,Alive,430,male,,both


In [146]:
# Check for duplicate rows within df_cl_strip
duplicates_within_cl_strip = df_cl_strip[df_cl_strip.duplicated()]

if not duplicates_within_cl_strip.empty:
    print("Duplicate rows found within df_cl_strip:")
    display(duplicates_within_cl_strip)
else:
    print("No duplicate rows found within df_cl_strip.")

No duplicate rows found within df_cl_strip.


In [103]:
df_cl_strip

Unnamed: 0,clinical_idx,Previous Treatment,NCI Risk,Sex,Age,Lineage,Bulky Disease,Highest presenting WBC,Prednisolone Response,CNS Disease,Cytogenetic groups,Detail cytogenetics,Provisional risk,MRD Status_EOI,Final Risk,Doctor Predictions
0,0,no,high,female,4,b,no,135000,good,no,non-high risk,b-other,intermediate,,intermediate,
1,1,no,standard,male,2,b,no,20140,good,no,non-high risk,high hyperdiploidy,standard,negative,standard,
2,2,yes,standard,male,7,b,yes,3830,good,no,,,high,negative,high,
3,3,no,standard,male,4,b,yes,6840,good,no,non-high risk,b-other,intermediate,negative,intermediate,
4,4,no,high,male,8,b,no,70700,good,yes,non-high risk,b-other,high,negative,high,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462,462,no,standard,male,6,b,yes,30950,good,no,non-high risk,high hyperdiploidy,intermediate,negative,intermediate,
463,463,no,high,male,11,t,yes,4500,poor,no,not required,not required,t,.,.,
464,464,no,standard,female,3,b,yes,27000,good,no,non-high risk,b-other,intermediate,negative,intermediate,
465,465,no,standard,male,1,b,no,12900,good,no,non-high risk,b-other,standard,negative,standard,


In [104]:
# Find duplicate clinical_idx within df_cl_strip
duplicate_clinical_idx = df_cl_strip[df_cl_strip['clinical_idx'].duplicated(keep=False)]

if not duplicate_clinical_idx.empty:
    print("Duplicate clinical_idx found within df_cl_strip:")
    display(duplicate_clinical_idx)
else:
    print("No duplicate clinical_idx found within df_cl_strip.")

No duplicate clinical_idx found within df_cl_strip.


In [147]:
# Get the clinical_idx values from the duplicate_rows
indices_to_remove = duplicate_rows['clinical_idx'].unique().tolist()

## Remove rows from df_cl_strip where clinical_idx is in the indices_to_remove list
df_cl_strip = df_cl_strip[~df_cl_strip['clinical_idx'].isin(indices_to_remove)]

print(indices_to_remove)

# # Remove rows from df_cl_strip where clinical_idx is in the indices_to_remove list
# df_cl_strip = df_cl_strip[~df_cl_strip['clinical_idx'].isin(indices_to_remove)]

# print(f"Removed {len(indices_to_remove)} duplicate rows from df_cl_strip.")

df_cl_strip[df_cl_strip['clinical_idx'] == 402]

[402, 306, 5, 420, 430, 257]


Unnamed: 0,clinical_idx,Previous Treatment,NCI Risk,Sex,Age,Lineage,Bulky Disease,Highest presenting WBC,Prednisolone Response,CNS Disease,Cytogenetic groups,Detail cytogenetics,Provisional risk,MRD Status_EOI,Final Risk,Doctor Predictions


In [148]:
df_cl_strip[df_cl_strip['clinical_idx'] == 257]

Unnamed: 0,clinical_idx,Previous Treatment,NCI Risk,Sex,Age,Lineage,Bulky Disease,Highest presenting WBC,Prednisolone Response,CNS Disease,Cytogenetic groups,Detail cytogenetics,Provisional risk,MRD Status_EOI,Final Risk,Doctor Predictions


In [149]:
print(df_cl_strip.shape)

(461, 16)


In [150]:
# Define key columns
# key_cols = ['Previous Treatment', 'NCI Risk', 'Sex', 'Age', 'Lineage',
#             'Bulky Disease', 'Highest presenting WBC','Final Risk ']
key_cols = [
    'Previous Treatment',
    'NCI Risk',
    # 'Sex',
    'Age',
    'Lineage',
    'Bulky Disease',
    'Highest presenting WBC',
    'Prednisolone Response',
    'CNS Disease',
    'Cytogenetic groups ',
    'Detail cytogenetics',
    'Provisional risk',
    'MRD Status_EOI',
    'Final Risk '
    ]
# key_cols = ['Sex', 'Age','Highest presenting WBC']

# df_cl_strip = df_cl_strip.reset_index().rename(columns={'index': 'clinical_idx'})


# df_full_strip['Age'] = df_full_strip['Age'].astype(float).fillna(0).astype(int)
# df_full_strip['Highest presenting WBC'] = df_full_strip['Highest presenting WBC'].astype(float).fillna(0).astype(int)
# df_cl_strip['Age'] = df_cl_strip['Age'].astype(float).fillna(0).astype(int)
# df_cl_strip['Highest presenting WBC'] = df_cl_strip['Highest presenting WBC'].astype(float).fillna(0).astype(int)

# df_full_strip['Highest presenting WBC'] = df_full_strip['Highest presenting WBC'].round()
# df_cl_strip['Highest presenting WBC'] = df_cl_strip['Highest presenting WBC'].round()

# Step 1: Inner merge on key columns to find matches
matches1 = pd.merge(df_full_strip,df_cl_strip, on=key_cols, how='right',indicator=True,suffixes=('_full', '_clinician'))

# Step 2: Count number of matching rows
print(f"Number of matching rows: {len(matches1)}")

# Optional: preview some matching rows
# matches.head()

Number of matching rows: 461


In [155]:
# Select 'clinical_idx' and 'Sl No' from matches1 for merging
sl_no_data = matches1[['clinical_idx', 'Sl No']].copy()

# Merge df_cl_strip with sl_no_data on clinical_idx
df_cl_strip = pd.merge(df_cl_strip, sl_no_data, on='clinical_idx', how='left')

print("Sl No column added to df_cl_strip.")
print(f"New shape of df_cl_strip: {df_cl_strip.shape}")

MergeError: Passing 'suffixes' which cause duplicate columns {'Sl No_x'} is not allowed.

In [156]:
# Reorder columns to put 'Sl No' as the first column
current_cols = df_cl_strip.columns.tolist()
# Remove the duplicate 'Sl No' column if it exists before reordering
if 'Sl No_x' in current_cols:
    current_cols.remove('Sl No_x')
if 'Sl No_y' in current_cols:
    current_cols.remove('Sl No_y')
if 'Sl No' in current_cols:
    current_cols.remove('Sl No')
new_cols = ['Sl No'] + current_cols
df_cl_strip = df_cl_strip[new_cols]

print("Sl No column reordered to be the first column.")
print(f"New shape of df_cl_strip: {df_cl_strip.shape}")

Sl No column reordered to be the first column.
New shape of df_cl_strip: (461, 17)


In [158]:
# Identify columns missing in df_cl_strip compared to df_full_strip (excluding 'Sl No')
missing_cols = [col for col in df_full_strip.columns if col not in df_cl_strip.columns and col != 'Sl No']

# Select 'Sl No' and the missing columns from df_full_strip
cols_to_merge_from_full = ['Sl No'] + missing_cols

# Merge df_cl_strip with the selected columns from df_full_strip on 'Sl No'
# Using a left merge to keep all rows from df_cl_strip
df_cl_strip = pd.merge(df_cl_strip, df_full_strip[cols_to_merge_from_full], on='Sl No', how='left')

print("Missing columns from df_full_strip appended to df_cl_strip.")
print(f"New shape of df_cl_strip: {df_cl_strip.shape}")
print(f"Columns added: {missing_cols}")

Missing columns from df_full_strip appended to df_cl_strip.
New shape of df_cl_strip: (461, 35)
Columns added: ['UPN_AI_dataset', 'Date of Birth', 'Date of Diagnosis', 'Date of Remission_EOI', 'Remission status _EOI', 'Date of Completion of treatment', 'Relapsed', 'Date of Relapse', 'Time-point of Relapse ', 'Type of relapse', 'Site of Relapse ', 'Death', 'Date of Death ', 'Treatment death phase', 'Treatment abandonment/Disease progresssion/Off protocol', 'Date of Treatment abandonment/ Disease progresssion/Off protocol', 'Date of Last FU', 'Current status at Last Follow up ']


In [159]:
df_cl_strip

Unnamed: 0,Sl No,clinical_idx,Previous Treatment,NCI Risk,Sex,Age,Lineage,Bulky Disease,Highest presenting WBC,Prednisolone Response,CNS Disease,Cytogenetic groups,Detail cytogenetics,Provisional risk,MRD Status_EOI,Final Risk,Doctor Predictions,UPN_AI_dataset,Date of Birth,Date of Diagnosis,Date of Remission_EOI,Remission status _EOI,Date of Completion of treatment,Relapsed,Date of Relapse,Time-point of Relapse,Type of relapse,Site of Relapse,Death,Date of Death,Treatment death phase,Treatment abandonment/Disease progresssion/Off protocol,Date of Treatment abandonment/ Disease progresssion/Off protocol,Date of Last FU,Current status at Last Follow up
0,2177,0,no,high,female,4,b,no,135000,good,no,non-high risk,b-other,intermediate,,intermediate,,PT-AI_2177,2012-02-26 00:00:00,2016-05-13,2016-06-25 00:00:00,in remission,2019-04-02 00:00:00,no,.,.,.,.,No,.,.,No,.,2022-05-10,Alive
1,396,1,no,standard,male,2,b,no,20140,good,no,non-high risk,high hyperdiploidy,standard,negative,standard,,PT-AI_396,Not available,2017-05-30,2017-07-11 00:00:00,in remission,.,no,.,.,.,.,No,.,.,Yes (Abandoned),2019-09-06 00:00:00,2019-09-06,Lost to follow up
2,790,2,yes,standard,male,7,b,yes,3830,good,no,,,high,negative,high,,PT-AI_790,Not available,2014-04-26,2014-05-27 00:00:00,in remission,2016-09-12 00:00:00,no,.,.,.,.,No,.,.,No,.,2021-02-01,Alive
3,466,3,no,standard,male,4,b,yes,6840,good,no,non-high risk,b-other,intermediate,negative,intermediate,,PT-AI_466,Not available,2017-08-22,2017-09-25 00:00:00,in remission,2019-11-04 00:00:00,no,.,.,.,.,No,.,.,No,.,2022-02-11,Alive
4,62,4,no,high,male,8,b,no,70700,good,yes,non-high risk,b-other,high,negative,high,,PT-AI_62,2005-11-07 00:00:00,2014-11-07,2014-12-13 00:00:00,in remission,.,yes,2015-10-05 00:00:00,Very Early,Isolated extramedullary,CNS,Yes,2015-12-22 00:00:00,.,No,.,2015-12-22,Dead
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456,982,462,no,standard,male,6,b,yes,30950,good,no,non-high risk,high hyperdiploidy,intermediate,negative,intermediate,,PT-AI_982,Not available,2015-01-23,2015-02-23 00:00:00,in remission,2017-03-30 00:00:00,no,.,.,.,.,No,.,.,No,.,2018-08-16,Lost to follow up
457,1963,463,no,high,male,11,t,yes,4500,poor,no,not required,not required,t,.,.,,PT-AI_1963,Not available,2015-08-17,.,.,.,no,.,.,.,.,Yes,2015-12-24 00:00:00,Induction,No,.,2015-12-24,Dead
458,1368,464,no,standard,female,3,b,yes,27000,good,no,non-high risk,b-other,intermediate,negative,intermediate,,PT-AI_1368,Not available,2016-04-19,2016-05-20 00:00:00,in remission,2018-06-30 00:00:00,no,.,.,.,.,No,.,.,No,.,2022-06-01,Alive
459,1893,465,no,standard,male,1,b,no,12900,good,no,non-high risk,b-other,standard,negative,standard,,PT-AI_1893,Not available,2016-07-02,2016-08-16 00:00:00,in remission,2019-01-15 00:00:00,yes,2021-04-07 00:00:00,Late,Isolated medullary,Bone marrow,No,.,.,No,.,2021-10-18,Alive


In [163]:
# Remove 'clinical_idx' and 'Doctor Predictions' columns from df_cl_strip
df_test = df_cl_strip.drop(columns=['clinical_idx', 'Doctor Predictions'])

print("Removed 'clinical_idx' and 'Doctor Predictions' columns from df_cl_strip.")
print(f"New shape of df_test: {df_test.shape}")

Removed 'clinical_idx' and 'Doctor Predictions' columns from df_cl_strip.
New shape of df_test: (461, 33)


In [167]:
not_in_remission_count = df_test[df_test['Remission status _EOI'] == 'not in remission'].shape[0]
print(f"Number of entries in df_test with 'not in remission' in 'Remission status _EOI': {not_in_remission_count}")

Number of entries in df_test with 'not in remission' in 'Remission status _EOI': 14


In [164]:
# Get the list of 'Sl No' values from df_cl_strip
sl_nos_to_remove = df_test['Sl No'].tolist()

# Remove rows from df_full_strip where 'Sl No' is in the list of Sl Nos from df_cl_strip
df_full_strip_remaining = df_full_strip[~df_full_strip['Sl No'].isin(sl_nos_to_remove)].copy()

print(f"Removed {len(sl_nos_to_remove)} rows from df_full_strip based on Sl No in df_test.")
print(f"New shape of df_full_strip_remaining: {df_full_strip_remaining.shape}")

Removed 461 rows from df_full_strip based on Sl No in df_test.
New shape of df_full_strip_remaining: (1870, 33)


In [168]:
not_in_remission_count1 = df_full_strip[df_full_strip['Remission status _EOI'] == 'not in remission'].shape[0]
print(f"Number of entries in df_test with 'not in remission' in 'Remission status _EOI': {not_in_remission_count1}")

Number of entries in df_test with 'not in remission' in 'Remission status _EOI': 79


In [169]:
not_in_remission_count2 = df_full_strip_remaining[df_full_strip_remaining['Remission status _EOI'] == 'not in remission'].shape[0]
print(f"Number of entries in df_test with 'not in remission' in 'Remission status _EOI': {not_in_remission_count2}")

Number of entries in df_test with 'not in remission' in 'Remission status _EOI': 65


In [170]:
df_train = df_full_strip_remaining

In [171]:
# Define the folder path in Google Drive
folder_path = '/content/drive/MyDrive/NIR dataset/'

# Create the directory if it doesn't exist
os.makedirs(folder_path, exist_ok=True)

# Define the file paths for df_train and df_test
save_path_train = folder_path + 'df_train.xlsx'
save_path_test = folder_path + 'df_test.xlsx'

# Save df_train to an Excel file
df_train.to_excel(save_path_train, index=False)
print(f"df_train saved successfully to {save_path_train}")

# Save df_test to an Excel file
df_test.to_excel(save_path_test, index=False)
print(f"df_test saved successfully to {save_path_test}")

df_train saved successfully to /content/drive/MyDrive/NIR dataset/df_train.xlsx
df_test saved successfully to /content/drive/MyDrive/NIR dataset/df_test.xlsx


In [None]:
#####################################                                   XXX                             ##############################################

In [122]:
matches1[matches1['Remission status _EOI']=="not in remission"][["Sl No","clinical_idx"]]["Sl No"].to_list()

[1514, 450, 530, 1973, 2284, 298, 1960, 1595, 2142, 291, 1631, 2230, 60, 2294]

In [109]:
matches1

Unnamed: 0,Sl No,UPN_AI_dataset,Previous Treatment,Date of Birth,NCI Risk,Sex_full,Date of Diagnosis,Age,Lineage,Bulky Disease,Highest presenting WBC,Prednisolone Response,CNS Disease,Cytogenetic groups,Detail cytogenetics,Provisional risk,Date of Remission_EOI,Remission status _EOI,MRD Status_EOI,Final Risk,Date of Completion of treatment,Relapsed,Date of Relapse,Time-point of Relapse,Type of relapse,Site of Relapse,Death,Date of Death,Treatment death phase,Treatment abandonment/Disease progresssion/Off protocol,Date of Treatment abandonment/ Disease progresssion/Off protocol,Date of Last FU,Current status at Last Follow up,clinical_idx,Sex_clinician,Doctor Predictions,_merge
0,2177,PT-AI_2177,no,2012-02-26 00:00:00,high,female,2016-05-13,4,b,no,135000,good,no,non-high risk,b-other,intermediate,2016-06-25 00:00:00,in remission,,intermediate,2019-04-02 00:00:00,no,.,.,.,.,No,.,.,No,.,2022-05-10,Alive,0,female,,both
1,396,PT-AI_396,no,Not available,standard,male,2017-05-30,2,b,no,20140,good,no,non-high risk,high hyperdiploidy,standard,2017-07-11 00:00:00,in remission,negative,standard,.,no,.,.,.,.,No,.,.,Yes (Abandoned),2019-09-06 00:00:00,2019-09-06,Lost to follow up,1,male,,both
2,790,PT-AI_790,yes,Not available,standard,male,2014-04-26,7,b,yes,3830,good,no,,,high,2014-05-27 00:00:00,in remission,negative,high,2016-09-12 00:00:00,no,.,.,.,.,No,.,.,No,.,2021-02-01,Alive,2,male,,both
3,466,PT-AI_466,no,Not available,standard,male,2017-08-22,4,b,yes,6840,good,no,non-high risk,b-other,intermediate,2017-09-25 00:00:00,in remission,negative,intermediate,2019-11-04 00:00:00,no,.,.,.,.,No,.,.,No,.,2022-02-11,Alive,3,male,,both
4,62,PT-AI_62,no,2005-11-07 00:00:00,high,male,2014-11-07,8,b,no,70700,good,yes,non-high risk,b-other,high,2014-12-13 00:00:00,in remission,negative,high,.,yes,2015-10-05 00:00:00,Very Early,Isolated extramedullary,CNS,Yes,2015-12-22 00:00:00,.,No,.,2015-12-22,Dead,4,male,,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456,982,PT-AI_982,no,Not available,standard,male,2015-01-23,6,b,yes,30950,good,no,non-high risk,high hyperdiploidy,intermediate,2015-02-23 00:00:00,in remission,negative,intermediate,2017-03-30 00:00:00,no,.,.,.,.,No,.,.,No,.,2018-08-16,Lost to follow up,462,male,,both
457,1963,PT-AI_1963,no,Not available,high,male,2015-08-17,11,t,yes,4500,poor,no,not required,not required,t,.,.,.,.,.,no,.,.,.,.,Yes,2015-12-24 00:00:00,Induction,No,.,2015-12-24,Dead,463,male,,both
458,1368,PT-AI_1368,no,Not available,standard,female,2016-04-19,3,b,yes,27000,good,no,non-high risk,b-other,intermediate,2016-05-20 00:00:00,in remission,negative,intermediate,2018-06-30 00:00:00,no,.,.,.,.,No,.,.,No,.,2022-06-01,Alive,464,female,,both
459,1893,PT-AI_1893,no,Not available,standard,male,2016-07-02,1,b,no,12900,good,no,non-high risk,b-other,standard,2016-08-16 00:00:00,in remission,negative,standard,2019-01-15 00:00:00,yes,2021-04-07 00:00:00,Late,Isolated medullary,Bone marrow,No,.,.,No,.,2021-10-18,Alive,465,male,,both


In [110]:
# Count entries with 'not in remission' in 'Remission status _EOI'
not_in_remission_count = matches1[matches1['Remission status _EOI'] == 'not in remission'].shape[0]

print(f"Number of entries in matches1 with 'not in remission' in 'Remission status _EOI': {not_in_remission_count}")

Number of entries in matches1 with 'not in remission' in 'Remission status _EOI': 14


In [111]:
# Get the clinical_idx for entries with 'not in remission' in matches1
not_in_remission_indices = matches1[matches1['Remission status _EOI'] == 'not in remission']['clinical_idx'].tolist()

print(not_in_remission_indices)

[102, 123, 182, 196, 216, 228, 232, 239, 291, 294, 393, 405, 416, 425]


In [112]:
# Print the matches1 entries from the not_in_remission_indices
display(matches1[matches1['clinical_idx'].isin(not_in_remission_indices)])

Unnamed: 0,Sl No,UPN_AI_dataset,Previous Treatment,Date of Birth,NCI Risk,Sex_full,Date of Diagnosis,Age,Lineage,Bulky Disease,Highest presenting WBC,Prednisolone Response,CNS Disease,Cytogenetic groups,Detail cytogenetics,Provisional risk,Date of Remission_EOI,Remission status _EOI,MRD Status_EOI,Final Risk,Date of Completion of treatment,Relapsed,Date of Relapse,Time-point of Relapse,Type of relapse,Site of Relapse,Death,Date of Death,Treatment death phase,Treatment abandonment/Disease progresssion/Off protocol,Date of Treatment abandonment/ Disease progresssion/Off protocol,Date of Last FU,Current status at Last Follow up,clinical_idx,Sex_clinician,Doctor Predictions,_merge
101,1514,PT-AI_1514,yes,Not available,standard,male,2016-09-21,3,b,no,18760,good,no,non-high risk,b-other,high,2016-11-08 00:00:00,not in remission,positive,high,,no,.,.,.,.,No,.,.,No,.,2022-02-18,Alive,102,male,,both
122,450,PT-AI_450,no,Not available,standard,female,2017-08-03,6,b,no,30100,good,no,non-high risk,b-other,standard,2017-09-08 00:00:00,not in remission,positive,high,.,no,.,.,.,.,Yes,2018-08-13 00:00:00,.,Yes (Off protocol),2017-09-08 00:00:00,2018-08-13,Dead,123,female,,both
181,530,PT-AI_530,no,Not available,high,male,2017-10-31,13,b,no,18870,poor,no,non-high risk,high hyperdiploidy,high,2017-12-13 00:00:00,not in remission,positive,high,.,no,.,.,.,.,Yes,2019-07-31 00:00:00,.,Yes (Off protocol),2017-12-13 00:00:00,2019-07-31,Dead,182,male,,both
195,1973,PT-AI_1973,no,Not available,high,female,2015-12-16,5,b,no,280000,,no,non-high risk,,high,2016-01-22 00:00:00,not in remission,positive,high,2018-12-20 00:00:00,no,.,.,.,.,No,.,.,No,.,2020-06-13,Lost to follow up,196,female,,both
215,2284,PT-AI_2284,no,2006-06-03 00:00:00,standard,male,2015-03-04,8,b,yes,1100,good,no,non-high risk,high hyperdiploidy,intermediate,2015-04-21 00:00:00,not in remission,positive,high,2017-10-26 00:00:00,no,.,.,.,.,No,.,.,No,.,2022-05-21,Alive,216,male,,both
227,298,PT-AI_298,yes,Not available,standard,male,2017-02-27,3,b,no,2970,good,no,high risk,kmt2a rearranged,high,2017-04-10 00:00:00,not in remission,positive,high,.,no,.,.,.,.,Yes,2017-06-30 00:00:00,.,No,.,2017-06-30,Dead,228,male,,both
231,1960,PT-AI_1960,no,Not available,high,female,2015-07-29,13,b,yes,850000,,no,non-high risk,,high,2015-10-01 00:00:00,not in remission,positive,high,.,no,.,.,.,.,No,.,.,Yes (Off protocol),2015-10-01 00:00:00,2015-10-05,Lost to follow up,232,female,,both
238,1595,PT-AI_1595,no,Not available,standard,female,2016-12-17,9,b,no,3730,good,no,non-high risk,b-other,standard,2017-01-30 00:00:00,not in remission,positive,high,.,no,.,.,.,.,No,.,.,Yes (Off protocol),2017-01-30 00:00:00,2019-02-28,Lost to follow up,239,female,,both
289,2142,PT-AI_2142,no,2005-03-06 00:00:00,high,male,2015-02-07,9,t,no,420000,poor,no,not required,not required,t,2015-03-10 00:00:00,not in remission,not required,.,.,no,.,.,.,.,Yes,2015-07-02 00:00:00,.,Yes (Off protocol),2015-03-10 00:00:00,2015-07-02,Dead,291,male,,both
292,291,PT-AI_291,no,Not available,standard,female,2017-02-27,7,b,no,2970,good,no,non-high risk,high hyperdiploidy,standard,2017-04-11 00:00:00,not in remission,positive,high,2019-07-16 00:00:00,no,.,.,.,.,No,.,.,No,.,2022-02-15,Alive,294,female,,both


In [113]:
# Check for duplicate rows within matches1
duplicates_within_matches1 = matches1[matches1.duplicated()]

if not duplicates_within_matches1.empty:
    print("Duplicate rows found within matches1:")
    display(duplicates_within_matches1)
else:
    print("No duplicate rows found within matches1.")

No duplicate rows found within matches1.


In [73]:
# Remove rows from df_cl_strip where clinical_idx is in the not_in_remission_indices list
df_cl_strip = df_cl_strip[~df_cl_strip['clinical_idx'].isin(not_in_remission_indices)]

print(f"Removed {len(not_in_remission_indices)} rows from df_cl_strip based on 'not in remission' status in matches1.")
print(f"New shape of df_cl_strip: {df_cl_strip.shape}")

Removed 14 rows from df_cl_strip based on 'not in remission' status in matches1.
New shape of df_cl_strip: (447, 16)


In [74]:
df_cl_strip.columns

Index(['clinical_idx', 'Previous Treatment', 'NCI Risk', 'Sex', 'Age',
       'Lineage', 'Bulky Disease', 'Highest presenting WBC',
       'Prednisolone Response', 'CNS Disease', 'Cytogenetic groups ',
       'Detail cytogenetics', 'Provisional risk', 'MRD Status_EOI',
       'Final Risk ', 'Doctor Predictions'],
      dtype='object')

In [75]:
# Define key columns
# key_cols = ['Previous Treatment', 'NCI Risk', 'Sex', 'Age', 'Lineage',
#             'Bulky Disease', 'Highest presenting WBC','Final Risk ']
key_cols = [
    'Previous Treatment',
    'NCI Risk',
    # 'Sex',
    'Age',
    'Lineage',
    'Bulky Disease',
    'Highest presenting WBC',
    'Prednisolone Response',
    'CNS Disease',
    'Cytogenetic groups ',
    'Detail cytogenetics',
    'Provisional risk',
    'MRD Status_EOI',
    'Final Risk '
    ]
# key_cols = ['Sex', 'Age','Highest presenting WBC']

# df_cl_strip = df_cl_strip.reset_index().rename(columns={'index': 'clinical_idx'})


# df_full_strip['Age'] = df_full_strip['Age'].astype(float).fillna(0).astype(int)
# df_full_strip['Highest presenting WBC'] = df_full_strip['Highest presenting WBC'].astype(float).fillna(0).astype(int)
# df_cl_strip['Age'] = df_cl_strip['Age'].astype(float).fillna(0).astype(int)
# df_cl_strip['Highest presenting WBC'] = df_cl_strip['Highest presenting WBC'].astype(float).fillna(0).astype(int)

# df_full_strip['Highest presenting WBC'] = df_full_strip['Highest presenting WBC'].round()
# df_cl_strip['Highest presenting WBC'] = df_cl_strip['Highest presenting WBC'].round()

# Step 1: Inner merge on key columns to find matches
matches2 = pd.merge(df_full_strip,df_cl_strip, on=key_cols, how='right',indicator=True,suffixes=('_full', '_clinician'))

# Step 2: Count number of matching rows
print(f"Number of matching rows: {len(matches2)}")

# Optional: preview some matching rows
# matches.head()

Number of matching rows: 447


In [76]:
# Select the necessary columns from matches1 for merging
relapse_data = matches1[['clinical_idx', 'Relapsed']].copy()

# Merge df_cl_strip with relapse_data on clinical_idx
df_cl_strip = pd.merge(df_cl_strip, relapse_data, on='clinical_idx', how='left')

# Drop the clinical_idx column from df_cl_strip
df_cl_strip = df_cl_strip.drop('clinical_idx', axis=1)

print("Relapsed column added to df_cl_strip and clinical_idx column removed.")
print(f"New shape of df_cl_strip: {df_cl_strip.shape}")

Relapsed column added to df_cl_strip and clinical_idx column removed.
New shape of df_cl_strip: (447, 16)


In [77]:
df_cl_strip = df_cl_strip.reset_index().rename(columns={'index': 'clinical_idx'})

In [78]:
df_cl_strip

Unnamed: 0,clinical_idx,Previous Treatment,NCI Risk,Sex,Age,Lineage,Bulky Disease,Highest presenting WBC,Prednisolone Response,CNS Disease,Cytogenetic groups,Detail cytogenetics,Provisional risk,MRD Status_EOI,Final Risk,Doctor Predictions,Relapsed
0,0,no,high,female,4,b,no,135000,good,no,non-high risk,b-other,intermediate,,intermediate,,no
1,1,no,standard,male,2,b,no,20140,good,no,non-high risk,high hyperdiploidy,standard,negative,standard,,no
2,2,yes,standard,male,7,b,yes,3830,good,no,,,high,negative,high,,no
3,3,no,standard,male,4,b,yes,6840,good,no,non-high risk,b-other,intermediate,negative,intermediate,,no
4,4,no,high,male,8,b,no,70700,good,yes,non-high risk,b-other,high,negative,high,,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
442,442,no,standard,male,6,b,yes,30950,good,no,non-high risk,high hyperdiploidy,intermediate,negative,intermediate,,no
443,443,no,high,male,11,t,yes,4500,poor,no,not required,not required,t,.,.,,no
444,444,no,standard,female,3,b,yes,27000,good,no,non-high risk,b-other,intermediate,negative,intermediate,,no
445,445,no,standard,male,1,b,no,12900,good,no,non-high risk,b-other,standard,negative,standard,,yes


In [79]:
# Define the path to save the file in your Google Drive
folder_path = '/content/drive/MyDrive/NIR dataset/'
save_path = folder_path + 'df_cl_strip.xlsx'

# Create the directory if it doesn't exist
os.makedirs(folder_path, exist_ok=True)

# Save the DataFrame to an Excel file
df_cl_strip.to_excel(save_path, index=False)

print(f"DataFrame saved successfully to {save_path}")

DataFrame saved successfully to /content/drive/MyDrive/NIR dataset/df_cl_strip.xlsx


In [114]:
df_full_strip['Remission status _EOI'].value_counts()

# matches1[matches1['Remission status _EOI']=="not in remission"][["Sl No","clinical_idx"]]["Sl No"].to_list()

Unnamed: 0_level_0,count
Remission status _EOI,Unnamed: 1_level_1
in remission,2118
.,118
not in remission,79


In [130]:
df_full_strip.columns

Index(['Sl No', 'UPN_AI_dataset', 'Previous Treatment', 'Date of Birth',
       'NCI Risk', 'Sex', 'Date of Diagnosis', 'Age', 'Lineage',
       'Bulky Disease', 'Highest presenting WBC', 'Prednisolone Response',
       'CNS Disease', 'Cytogenetic groups ', 'Detail cytogenetics',
       'Provisional risk', 'Date of Remission_EOI', 'Remission status _EOI',
       'MRD Status_EOI', 'Final Risk ', 'Date of Completion of treatment',
       'Relapsed', 'Date of Relapse', 'Time-point of Relapse ',
       'Type of relapse', 'Site of Relapse ', 'Death', 'Date of Death ',
       'Treatment death phase',
       'Treatment abandonment/Disease progresssion/Off protocol',
       'Date of Treatment abandonment/ Disease progresssion/Off protocol',
       'Date of Last FU', 'Current status at Last Follow up '],
      dtype='object')

In [80]:
# Identify the rows in df_full_strip that match the current df_cl_strip based on key_cols
# We'll use a left merge and keep only the rows that match
matched_in_full_strip = pd.merge(df_full_strip, df_cl_strip[key_cols], on=key_cols, how='inner')

# Get the indices of these matched rows in the original df_full_strip
indices_to_remove_from_full = matched_in_full_strip.index.tolist()

# Remove these rows from df_full_strip
df_full_strip_remaining = df_full_strip.drop(indices_to_remove_from_full)

print(f"Removed {len(indices_to_remove_from_full)} rows from df_full_strip.")
print(f"New shape of df_full_strip_remaining: {df_full_strip_remaining.shape}")

Removed 447 rows from df_full_strip.
New shape of df_full_strip_remaining: (1884, 33)


In [127]:
# df_full_strip_remaining['Remission status _EOI'].value_counts()
sl_no_removed = matches1[matches1['Remission status _EOI']=="not in remission"][["Sl No","clinical_idx"]]["Sl No"].to_list()
# Check if "any" df_full_strip_remaining contains sl_no_removed then print those sl no
print(sl_no_removed)
print(df_full_strip_remaining[df_full_strip_remaining['Sl No'].isin(sl_no_removed)])
# if not all(df_full_strip_remaining['Sl No'].isin(sl_no_removed)):
#     print("Not all elements in sl_no_removed are present in df_full_strip_remaining.")
# if all(df_full_strip_remaining['Sl No'].isin(sl_no_removed)):
#     print("All elements in sl_no_removed are present in df_full_strip_remaining.")

[1514, 450, 530, 1973, 2284, 298, 1960, 1595, 2142, 291, 1631, 2230, 60, 2294]
      Sl No UPN_AI_dataset Previous Treatment        Date of Birth  NCI Risk  \
449     450      PT-AI_450                 no        Not available  standard   
529     530      PT-AI_530                 no        Not available      high   
1513   1514     PT-AI_1514                yes        Not available  standard   
1594   1595     PT-AI_1595                 no        Not available  standard   
1630   1631     PT-AI_1631                yes        Not available      high   
1959   1960     PT-AI_1960                 no        Not available      high   
1972   1973     PT-AI_1973                 no        Not available      high   
2141   2142     PT-AI_2142                 no  2005-03-06 00:00:00      high   
2229   2230     PT-AI_2230                 no  2011-08-10 00:00:00      high   
2283   2284     PT-AI_2284                 no  2006-06-03 00:00:00  standard   
2293   2294     PT-AI_2294               

In [81]:
df_full_strip_remaining.columns

Index(['Sl No', 'UPN_AI_dataset', 'Previous Treatment', 'Date of Birth',
       'NCI Risk', 'Sex', 'Date of Diagnosis', 'Age', 'Lineage',
       'Bulky Disease', 'Highest presenting WBC', 'Prednisolone Response',
       'CNS Disease', 'Cytogenetic groups ', 'Detail cytogenetics',
       'Provisional risk', 'Date of Remission_EOI', 'Remission status _EOI',
       'MRD Status_EOI', 'Final Risk ', 'Date of Completion of treatment',
       'Relapsed', 'Date of Relapse', 'Time-point of Relapse ',
       'Type of relapse', 'Site of Relapse ', 'Death', 'Date of Death ',
       'Treatment death phase',
       'Treatment abandonment/Disease progresssion/Off protocol',
       'Date of Treatment abandonment/ Disease progresssion/Off protocol',
       'Date of Last FU', 'Current status at Last Follow up '],
      dtype='object')

In [82]:
# Count entries with 'not in remission' in 'Remission status _EOI' in df_full_strip_remaining
not_in_remission_remaining_count = df_full_strip_remaining[df_full_strip_remaining['Remission status _EOI'] == 'not in remission'].shape[0]

print(f"Number of entries in df_full_strip_remaining with 'not in remission' in 'Remission status _EOI': {not_in_remission_remaining_count}")

Number of entries in df_full_strip_remaining with 'not in remission' in 'Remission status _EOI': 66


In [84]:
# Display entries with 'not in remission' in 'Remission status _EOI' in df_full_strip_remaining
nir_df_full = (df_full_strip_remaining[df_full_strip_remaining['Remission status _EOI'] == 'not in remission'])

In [85]:
nir_df_full

Unnamed: 0,Sl No,UPN_AI_dataset,Previous Treatment,Date of Birth,NCI Risk,Sex,Date of Diagnosis,Age,Lineage,Bulky Disease,Highest presenting WBC,Prednisolone Response,CNS Disease,Cytogenetic groups,Detail cytogenetics,Provisional risk,Date of Remission_EOI,Remission status _EOI,MRD Status_EOI,Final Risk,Date of Completion of treatment,Relapsed,Date of Relapse,Time-point of Relapse,Type of relapse,Site of Relapse,Death,Date of Death,Treatment death phase,Treatment abandonment/Disease progresssion/Off protocol,Date of Treatment abandonment/ Disease progresssion/Off protocol,Date of Last FU,Current status at Last Follow up
449,450,PT-AI_450,no,Not available,standard,female,2017-08-03,6,b,no,30100,good,no,non-high risk,b-other,standard,2017-09-08 00:00:00,not in remission,positive,high,.,no,.,.,.,.,Yes,2018-08-13 00:00:00,.,Yes (Off protocol),2017-09-08 00:00:00,2018-08-13,Dead
493,494,PT-AI_494,no,Not available,standard,male,2017-09-18,5,b,yes,5130,poor,no,non-high risk,high hyperdiploidy,high,2017-10-27 00:00:00,not in remission,positive,high,.,no,.,.,.,.,No,.,.,Yes (Off protocol),2017-10-27 00:00:00,2020-01-02,Lost to follow up
503,504,PT-AI_504,no,Not available,high,female,2017-10-03,9,b,yes,137800,poor,no,non-high risk,high hyperdiploidy,high,2017-11-14 00:00:00,not in remission,positive,high,.,no,.,.,.,.,Yes,2018-04-01 00:00:00,.,Yes (Off protocol),2017-11-14 00:00:00,2018-04-01,Dead
529,530,PT-AI_530,no,Not available,high,male,2017-10-31,13,b,no,18870,poor,no,non-high risk,high hyperdiploidy,high,2017-12-13 00:00:00,not in remission,positive,high,.,no,.,.,.,.,Yes,2019-07-31 00:00:00,.,Yes (Off protocol),2017-12-13 00:00:00,2019-07-31,Dead
572,573,PT-AI_573,no,Not available,standard,male,2017-12-08,8,b,yes,40000,poor,no,non-high risk,tcf3-pbx1,high,2018-01-11 00:00:00,not in remission,positive,high,.,no,.,.,.,.,No,.,.,Yes (Off protocol),2018-01-17 00:00:00,2018-03-03,Lost to follow up
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2283,2284,PT-AI_2284,no,2006-06-03 00:00:00,standard,male,2015-03-04,8,b,yes,1100,good,no,non-high risk,high hyperdiploidy,intermediate,2015-04-21 00:00:00,not in remission,positive,high,2017-10-26 00:00:00,no,.,.,.,.,No,.,.,No,.,2022-05-21,Alive
2287,2288,PT-AI_2288,no,1998-05-20 00:00:00,high,male,2014-10-04,16,t,no,135000,good,no,not required,not required,t,2014-11-13 00:00:00,not in remission,not required,t,2017-06-22 00:00:00,no,.,.,.,.,Yes,2020-04-01 00:00:00,.,No,.,2020-04-01,Dead
2293,2294,PT-AI_2294,no,2011-11-14 00:00:00,standard,male,2017-03-22,5,b,no,7500,good,no,non-high risk,high hyperdiploidy,standard,2017-04-26 00:00:00,not in remission,positive,high,2019-11-06 00:00:00,no,.,.,.,.,No,.,.,No,.,2022-04-15,Alive
2298,2299,PT-AI_2299,no,2011-12-13 00:00:00,standard,male,2017-02-22,5,b,no,47600,good,no,non-high risk,b-other,standard,2017-03-29 00:00:00,not in remission,positive,high,2019-11-04 00:00:00,no,.,.,.,.,No,.,.,No,.,2022-04-26,Alive


In [86]:
# Find duplicate rows within nir_df_full
duplicates_in_nir_df_full = nir_df_full[nir_df_full.duplicated()]

if not duplicates_in_nir_df_full.empty:
    print("Duplicate rows found within nir_df_full:")
    display(duplicates_in_nir_df_full)
else:
    print("No duplicate rows found within nir_df_full.")

No duplicate rows found within nir_df_full.


In [None]:
df_full_strip.iloc[[1309,1106]]

Unnamed: 0,Sl No,UPN_AI_dataset,Previous Treatment,Date of Birth,NCI Risk,Sex,Date of Diagnosis,Age,Lineage,Bulky Disease,Highest presenting WBC,Prednisolone Response,CNS Disease,Cytogenetic groups,Detail cytogenetics,Provisional risk,Date of Remission_EOI,Remission status _EOI,MRD Status_EOI,Final Risk,Date of Completion of treatment,Relapsed,Date of Relapse,Time-point of Relapse,Type of relapse,Site of Relapse,Death,Date of Death,Treatment death phase,Treatment abandonment/Disease progresssion/Off protocol,Date of Treatment abandonment/ Disease progresssion/Off protocol,Date of Last FU,Current status at Last Follow up
1309,1310,PT-AI_1310,no,Not available,standard,male,2016-02-19,5,b,no,2000,good,no,non-high risk,high hyperdiploidy,standard,2016-03-21 00:00:00,in remission,negative,standard,2018-06-06 00:00:00,no,.,.,.,.,No,.,.,No,.,2019-09-18,Lost to follow up
1106,1107,PT-AI_1107,no,Not available,standard,male,2015-06-05,5,b,no,2000,good,no,non-high risk,high hyperdiploidy,standard,2015-07-06 00:00:00,in remission,negative,standard,2017-09-17 00:00:00,no,.,.,.,.,No,.,.,No,.,2021-09-24,Alive


In [None]:
# Step 1: Define key columns for matching
key_cols = ['Previous Treatment', 'NCI Risk', 'Sex', 'Age', 'Lineage',
            'Bulky Disease', 'Highest presenting WBC']

# Step 2: Filter full dataset for 'not in remission'
df_not_in_remission = df_full_strip[df_full_strip['Remission status _EOI'] == 'not in remission']

# Step 2.5: Clean 'Age' column in df_cl to match df_full_strip
df_cl['Age'] = df_cl['Age'].astype(str)

# Step 2.6: Clean 'Highest presenting WBC' column in df_cl to match df_full_strip
df_cl['Highest presenting WBC'] = df_cl['Highest presenting WBC'].astype(str)

# Step 3: Merge with clinician dataset on key columns to see matches
df_matches = df_not_in_remission.merge(
    df_cl,
    on=key_cols,
    how='inner',  # only rows that match on all key_cols
    suffixes=('_full', '_clinician')
)

# Step 4: See results
print(f"Total rows in full dataset with 'not in remission': {len(df_not_in_remission)}")
print(f"Rows matching clinician dataset on key columns: {len(df_matches)}")
display(df_matches.head())

Total rows in full dataset with 'not in remission': 79
Rows matching clinician dataset on key columns: 13


Unnamed: 0,Previous Treatment,NCI Risk,Sex,Age,Lineage,Bulky Disease,Highest presenting WBC,Prednisolone Response_full,CNS Disease_full,Cytogenetic groups _full,...,Prednisolone Response_clinician,CNS Disease_clinician,Cytogenetic groups _clinician,Detail cytogenetics_clinician,Provisional risk_clinician,MRD Status_EOI_clinician,Final Risk _clinician,Doctor Predictions - Gargi,Doctor's Preference - Prasanth,Doctor Predictions
0,no,standard,female,7.0,b,no,2970.0,good,no,non-high risk,...,good,no,non-high risk,high hyperdiploidy,standard,positive,high,No,Yes,no
1,yes,standard,male,3.0,b,no,2970.0,good,no,high risk,...,good,no,high risk,kmt2a rearranged,high,positive,high,Yes,Yes,yes
2,no,standard,female,6.0,b,no,30100.0,good,no,non-high risk,...,good,no,non-high risk,b-other,standard,positive,high,Yes,Yes,yes
3,no,high,male,13.0,b,no,18870.0,poor,no,non-high risk,...,poor,no,non-high risk,high hyperdiploidy,high,positive,high,Yes,Yes,no
4,yes,standard,male,3.0,b,no,18760.0,good,no,non-high risk,...,good,no,non-high risk,b-other,high,positive,high,Yes,Yes,no


In [None]:
# Step 2: Remove matched rows from df_cl
df_remaining = df_cl.merge(df_matches[key_cols], on=key_cols, how='left', indicator=True)
df_remaining = df_remaining[df_remaining['_merge'] == 'left_only'].drop(columns=['_merge'])

# Step 3: Save remaining rows to Excel
df_remaining.to_excel('remaining_rows.xlsx', index=False)

print(f"Rows removed: {len(df_matches)}")
print(f"Remaining rows saved: {len(df_remaining)}")

Rows removed: 13
Remaining rows saved: 454


In [None]:
file_path = "/content/new x test.xlsx"
new_x_test = pd.read_excel(file_path)

In [None]:
print(df_full_strip['Remission status _EOI'].value_counts(dropna=False))
print(df_full_strip.shape)

Remission status _EOI
in remission        2118
.                    118
not in remission      79
<NA>                  16
Name: count, dtype: Int64
(2331, 16)


In [None]:
# True where exactly 'not in remission', False elsewhere (NA -> <NA>), then fill NA -> False
mask = df_full_strip['Remission status _EOI'].eq('not in remission').fillna(False)

# keep rows where mask is False (i.e. not 'not in remission')
df_rem = df_full_strip.loc[~mask].copy()

In [None]:
print(df_rem['Remission status _EOI'].value_counts(dropna=False))
print(df_rem.shape)

Remission status _EOI
in remission    2118
.                118
<NA>              16
Name: count, dtype: Int64
(2252, 16)


In [None]:
df_rem = df_rem.drop('Remission status _EOI', axis=1)


KeyError: "['Remission status _EOI'] not found in axis"

In [None]:
print(df_rem.shape)

(2252, 15)


## Data Preprocessing

### Load data

In [5]:
from google.colab import drive
import os

# Mount Google Drive
try:
    drive.mount('/content/drive')
    print("Google Drive mounted successfully.")
except Exception as e:
    print(f"Error mounting Google Drive: {e}")
    print("Attempting to remount or it might be already mounted.")
    # Optional: add a check here if already mounted
    if not os.path.exists('/content/drive/MyDrive'):
        print("Google Drive not mounted. Please check permissions or try mounting manually.")

Mounted at /content/drive
Google Drive mounted successfully.


In [6]:
# Define the folder path in Google Drive
folder_path = '/content/drive/MyDrive/NIR dataset/'

# Load df_train and df_test from the specified folder
try:
    df_train = pd.read_excel(os.path.join(folder_path, 'df_train.xlsx'))
    df_test = pd.read_excel(os.path.join(folder_path, 'df_test.xlsx'))

    print("Data loaded successfully!")
    print(f"df_train shape: {df_train.shape}")
    print(f"df_test shape: {df_test.shape}")

except FileNotFoundError as e:
    print(f"Error: File not found - {e}")
except Exception as e:
    print(f"Error loading data: {e}")

Data loaded successfully!
df_train shape: (1870, 33)
df_test shape: (461, 33)


### Remove leakage columns

In [7]:
def remove_columns(df, feature_set='normal'):

    # Define base leakage columns to always remove
    base_leakage_cols = [
        "Sl No",
        "UPN_AI_dataset",
        # All Date columns
        "Date of Birth", "Date of Diagnosis", "Date of Remission_EOI",
        "Date of Completion of treatment", "Date of Relapse",
        "Date of Death ", "Date of Treatment abandonment/ Disease progresssion/Off protocol",
        "Date of Last FU",
        # Relapse & outcome details
        "Time-point of Relapse ", "Type of relapse", "Site of Relapse ",
        "Death", "Treatment death phase",
        "Treatment abandonment/Disease progresssion/Off protocol",
        "Current status at Last Follow up "
    ]

    # Additional columns to remove for minimal feature set
    minimal_cols = [
        "Previous Treatment", "NCI Risk", "Lineage", "Bulky Disease",
        "CNS Disease", "Detail cytogenetics", "Provisional risk",
    ]

    # Combine columns based on feature_set parameter
    if feature_set == 'minimal':
        cols_to_remove = base_leakage_cols + minimal_cols
    else:  # 'normal' or any other value
        cols_to_remove = base_leakage_cols

    # Remove only columns that exist in the dataframe
    existing_cols_to_remove = [c for c in cols_to_remove if c in df.columns]
    df_clean = df.drop(columns=existing_cols_to_remove)

    return df_clean

In [8]:
df_train_clean = remove_columns(df_train, feature_set='normal')
print(df_train_clean.columns.tolist())

['Previous Treatment', 'NCI Risk', 'Sex', 'Age', 'Lineage', 'Bulky Disease', 'Highest presenting WBC', 'Prednisolone Response', 'CNS Disease', 'Cytogenetic groups ', 'Detail cytogenetics', 'Provisional risk', 'Remission status _EOI', 'MRD Status_EOI', 'Final Risk ', 'Relapsed']


In [9]:
df_test_clean = remove_columns(df_test, feature_set='normal')
print(df_test_clean.columns.tolist())

['Previous Treatment', 'NCI Risk', 'Sex', 'Age', 'Lineage', 'Bulky Disease', 'Highest presenting WBC', 'Prednisolone Response', 'CNS Disease', 'Cytogenetic groups ', 'Detail cytogenetics', 'Provisional risk', 'MRD Status_EOI', 'Final Risk ', 'Remission status _EOI', 'Relapsed']


### Clean Column Strings

In [10]:
def clean_string_columns(df, columns_to_clean):
  existing_cols_to_clean = [col for col in columns_to_clean if col in df.columns]

  for col in existing_cols_to_clean:
      df[col] = df[col].astype("string").str.strip().str.lower()

  return df

In [11]:
df_train_clean = clean_string_columns(df_train_clean, ['Previous Treatment', 'NCI Risk', 'Sex', 'Age', 'Lineage', 'Bulky Disease', 'Highest presenting WBC', 'Prednisolone Response', 'CNS Disease', 'Cytogenetic groups ', 'Detail cytogenetics', 'Provisional risk', 'Remission status _EOI', 'MRD Status_EOI', 'Final Risk ', 'Relapsed'])

In [12]:
df_test_clean = clean_string_columns(df_test_clean, ['Previous Treatment', 'NCI Risk', 'Sex', 'Age', 'Lineage', 'Bulky Disease', 'Highest presenting WBC', 'Prednisolone Response', 'CNS Disease', 'Cytogenetic groups ', 'Detail cytogenetics', 'Provisional risk', 'Remission status _EOI', 'MRD Status_EOI', 'Final Risk ', 'Relapsed'])

### Dealing 'Not in remission' Entries

In [13]:
print(df_train_clean['Remission status _EOI'].value_counts(dropna=False))
print(f"\n")
print(df_test_clean['Remission status _EOI'].value_counts(dropna=False))

Remission status _EOI
in remission        1703
.                     90
not in remission      65
<NA>                  12
Name: count, dtype: Int64


Remission status _EOI
in remission        415
.                    28
not in remission     14
<NA>                  4
Name: count, dtype: Int64


In [14]:
# True where exactly 'not in remission', False elsewhere (NA -> <NA>), then fill NA -> False
mask = df_train_clean['Remission status _EOI'].eq('not in remission').fillna(False)

# keep rows where mask is False (i.e. not 'not in remission')
df_train_rem = df_train_clean.loc[~mask].copy()



# True where exactly 'not in remission', False elsewhere (NA -> <NA>), then fill NA -> False
mask1 = df_test_clean['Remission status _EOI'].eq('not in remission').fillna(False)

# keep rows where mask is False (i.e. not 'not in remission')
df_test_rem = df_test_clean.loc[~mask1].copy()

In [15]:
print(df_train_rem['Remission status _EOI'].value_counts(dropna=False))
print(f"\n")
print(df_test_rem['Remission status _EOI'].value_counts(dropna=False))

Remission status _EOI
in remission    1703
.                 90
<NA>              12
Name: count, dtype: Int64


Remission status _EOI
in remission    415
.                28
<NA>              4
Name: count, dtype: Int64


### Removing Remission status col

In [16]:
# Remove 'Remission status _EOI' col from df_train_rem and df_test_rem
df_train_remain = df_train_rem.drop('Remission status _EOI', axis=1)
df_test_remain = df_test_rem.drop('Remission status _EOI', axis=1)

In [17]:
print(df_train_remain.columns.tolist())
print(df_test_remain.columns.tolist())

['Previous Treatment', 'NCI Risk', 'Sex', 'Age', 'Lineage', 'Bulky Disease', 'Highest presenting WBC', 'Prednisolone Response', 'CNS Disease', 'Cytogenetic groups ', 'Detail cytogenetics', 'Provisional risk', 'MRD Status_EOI', 'Final Risk ', 'Relapsed']
['Previous Treatment', 'NCI Risk', 'Sex', 'Age', 'Lineage', 'Bulky Disease', 'Highest presenting WBC', 'Prednisolone Response', 'CNS Disease', 'Cytogenetic groups ', 'Detail cytogenetics', 'Provisional risk', 'MRD Status_EOI', 'Final Risk ', 'Relapsed']


### Split X and Y data

In [18]:
# Split df_train_remain into features (X_train) and target (y_train)
X_train = df_train_remain.drop('Relapsed', axis=1)
y_train = df_train_remain['Relapsed']

# Split df_test_remain into features (X_test) and target (y_test)
X_test = df_test_remain.drop('Relapsed', axis=1)
y_test = df_test_remain['Relapsed']

print("Data split into features (X) and target (y) for training and testing.")
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

Data split into features (X) and target (y) for training and testing.
X_train shape: (1805, 14)
y_train shape: (1805,)
X_test shape: (447, 14)
y_test shape: (447,)


### Missing values

In [19]:
print("Missing values in X_train:")
print(X_train.isnull().sum())
print("\nPercentage of missing values in X_train:")
print(X_train.isnull().sum() / len(X_train) * 100)


print("\nMissing values in X_test:")
print(X_test.isnull().sum())
print("\nPercentage of missing values in X_test:")
print(X_test.isnull().sum() / len(X_test) * 100)

Missing values in X_train:
Previous Treatment          0
NCI Risk                    1
Sex                         0
Age                         0
Lineage                     0
Bulky Disease              51
Highest presenting WBC      0
Prednisolone Response      79
CNS Disease                77
Cytogenetic groups         49
Detail cytogenetics       106
Provisional risk            0
MRD Status_EOI             56
Final Risk                  0
dtype: int64

Percentage of missing values in X_train:
Previous Treatment        0.000000
NCI Risk                  0.055402
Sex                       0.000000
Age                       0.000000
Lineage                   0.000000
Bulky Disease             2.825485
Highest presenting WBC    0.000000
Prednisolone Response     4.376731
CNS Disease               4.265928
Cytogenetic groups        2.714681
Detail cytogenetics       5.872576
Provisional risk          0.000000
MRD Status_EOI            3.102493
Final Risk                0.000000
dtype: f

In [20]:
total_missing_train = X_train.isnull().sum().sum()
total_missing_test = X_test.isnull().sum().sum()
total_missing = total_missing_train + total_missing_test
total_values = X_train.size + X_test.size
percentage_missing = (total_missing / total_values) * 100

print(f"Total missing values in X_train: {total_missing_train}")
print(f"Total values in X_train: {X_train.size}")
print(f"% of Total missing values in X_train: {(total_missing_test/X_train.size)*100}")

print(f"\nTotal missing values in X_test: {total_missing_test}")
print(f"Total values in X_test: {X_test.size}")
print(f"% of Total missing values in X_test: {(total_missing_test/X_test.size)*100}")

print(f"\nTotal missing values in the dataset (X_train + X_test): {total_missing}")
print(f"Percentage of total missing values in the dataset: {percentage_missing:.2f}%")

Total missing values in X_train: 419
Total values in X_train: 25270
% of Total missing values in X_train: 0.4946576968737634

Total missing values in X_test: 125
Total values in X_test: 6258
% of Total missing values in X_test: 1.9974432726110578

Total missing values in the dataset (X_train + X_test): 544
Percentage of total missing values in the dataset: 1.73%


### Imputation

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from typing import List, Tuple, Dict
import numpy as np

def impute_train_test(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    categorical_cols: List[str],
    numeric_cols: List[str],
    numeric_strategy: str = "median",
) -> Tuple[pd.DataFrame, pd.DataFrame, Dict[str, SimpleImputer]]:
    """
    Fit imputers on X_train and transform both X_train and X_test.
    - Categorical: most_frequent
    - Numeric: numeric_strategy (default median)
    """

    # Safety: work on copies
    X_train_imp = X_train.copy()
    X_test_imp  = X_test.copy()

    # Convert pd.NA to np.nan everywhere
    X_train_imp = X_train_imp.replace({pd.NA: np.nan, None: np.nan})
    X_test_imp  = X_test_imp.replace({pd.NA: np.nan, None: np.nan})

    # Filter to existing columns
    cat_cols_exist = [c for c in categorical_cols if c in X_train.columns]
    num_cols_exist = [c for c in numeric_cols if c in X_train.columns]

    # Define imputers
    cat_imputer = SimpleImputer(strategy="most_frequent", missing_values=np.nan)
    num_imputer = SimpleImputer(strategy=numeric_strategy, missing_values=np.nan)

    # --- Fit on TRAIN ---
    if cat_cols_exist:
        # Ensure categorical columns are string type to avoid issues
        for c in cat_cols_exist:
            X_train_imp[c] = X_train_imp[c].astype(str).replace('nan', np.nan)
            X_test_imp[c] = X_test_imp[c].astype(str).replace('nan', np.nan) if c in X_test_imp.columns else X_test_imp[c]

        X_train_imp[cat_cols_exist] = cat_imputer.fit_transform(X_train_imp[cat_cols_exist])
        # Convert back to object type after imputation
        for c in cat_cols_exist:
            X_train_imp[c] = X_train_imp[c].astype(object)

    if num_cols_exist:
        # Convert numeric columns to numeric type, coercing errors to NaN
        for c in num_cols_exist:
            X_train_imp[c] = pd.to_numeric(X_train_imp[c], errors="coerce")
            if c in X_test_imp.columns:
                X_test_imp[c] = pd.to_numeric(X_test_imp[c], errors="coerce")

        X_train_imp[num_cols_exist] = num_imputer.fit_transform(X_train_imp[num_cols_exist])
        # No need to convert back to numeric type after imputation as it should already be numeric

    # --- Transform TEST ---
    cat_cols_test = [c for c in cat_cols_exist if c in X_test_imp.columns]
    num_cols_test = [c for c in num_cols_exist if c in X_test_imp.columns]

    if cat_cols_test:
        X_test_imp[cat_cols_test] = cat_imputer.transform(X_test_imp[cat_cols_test])
        for c in cat_cols_test:
            X_test_imp[c] = X_test_imp[c].astype(object)

    if num_cols_test:
        X_test_imp[num_cols_test] = num_imputer.transform(X_test_imp[num_cols_test])
        # No need to convert back to numeric type after imputation

    imputers = {"cat": cat_imputer, "num": num_imputer}
    return X_train_imp, X_test_imp, imputers

In [22]:
# Define column types
categorical_cols = ['Previous Treatment', 'NCI Risk', 'Sex', 'Lineage', 'Bulky Disease', 'Prednisolone Response', 'CNS Disease', 'Cytogenetic groups ', 'Detail cytogenetics', 'Provisional risk', 'Remission status _EOI', 'MRD Status_EOI', 'Final Risk ']
numeric_cols = ['Age', 'Highest presenting WBC']

# Apply safe imputation
X_train_imputed, X_test_imputed, fitted_imputers = impute_train_test(
    X_train,
    X_test,
    categorical_cols,
    numeric_cols,
    numeric_strategy="median"
)

In [23]:
print("Missing values in X_train:")
print(X_train_imputed.isnull().sum())

print("\nMissing values in X_test:")
print(X_test_imputed.isnull().sum())

Missing values in X_train:
Previous Treatment        0
NCI Risk                  0
Sex                       0
Age                       0
Lineage                   0
Bulky Disease             0
Highest presenting WBC    0
Prednisolone Response     0
CNS Disease               0
Cytogenetic groups        0
Detail cytogenetics       0
Provisional risk          0
MRD Status_EOI            0
Final Risk                0
dtype: int64

Missing values in X_test:
Previous Treatment        0
NCI Risk                  0
Sex                       0
Age                       0
Lineage                   0
Bulky Disease             0
Highest presenting WBC    0
Prednisolone Response     0
CNS Disease               0
Cytogenetic groups        0
Detail cytogenetics       0
Provisional risk          0
MRD Status_EOI            0
Final Risk                0
dtype: int64


### Encoding: Label & One-hot

In [24]:
def encode_data_simple(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: pd.Series = None,
    y_test: pd.Series = None,
    custom_mapping: Dict[str, Dict[str, int]] = None
):
    """
    Simplified version using manual one-hot encoding to avoid sklearn issues.
    """

    # Default mapping
    default_mapping = {
        'Previous Treatment': {'no': 0, 'yes': 1},
        'NCI Risk': {'standard': 0, 'high': 1},
        'Sex': {'male': 0, 'female': 1},
        'Lineage': {'b': 0, 't': 1},
        'Bulky Disease': {'no': 0, 'yes': 1},
        'Prednisolone Response': {'good': 0, 'poor': 1},
        'CNS Disease': {'no': 0, 'yes': 1},
        'Cytogenetic groups ': {'non-high risk': 1, 'not required': 2, 'high risk': 3},
        'Provisional risk': {'standard': 0, 'intermediate': 1, 'high': 2, 't': 3},
        'Remission status _EOI': {'not in remission': 0, 'in remission': 1},
        'MRD Status_EOI': {'negative': 1, 'not required': 2, 'positive': 3},
        'Final Risk ': {'standard': 0, 'intermediate': 1, 'high': 2, 't': 3},
        'Relapsed': {'no': 0, 'yes': 1}
    }

    mapping = custom_mapping if custom_mapping is not None else default_mapping
    X_train_encoded = X_train.copy()
    X_test_encoded = X_test.copy()
    encoding_info = {'label_mapping': {}}

    # Encode target
    y_train_encoded, y_test_encoded = None, None
    if y_train is not None and y_test is not None:
        if 'Relapsed' in mapping:
            # Create mapping function
            relapsed_map = mapping['Relapsed']
            y_train_encoded = y_train.map(lambda x: relapsed_map.get(str(x).lower(), 0)).astype(int)
            y_test_encoded = y_test.map(lambda x: relapsed_map.get(str(x).lower(), 0)).astype(int)
            encoding_info['label_mapping']['Relapsed'] = mapping['Relapsed']

    # Manual one-hot encoding for 'Detail cytogenetics'
    if 'Detail cytogenetics' in X_train_encoded.columns:
        try:
            # Get all unique categories from training data
            unique_categories = X_train_encoded['Detail cytogenetics'].fillna('unknown').astype(str).unique()

            # Create one-hot encoded columns for training data
            for category in unique_categories:
                col_name = f"cytogenetics_{category}"
                X_train_encoded[col_name] = (X_train_encoded['Detail cytogenetics'].fillna('unknown').astype(str) == category).astype(int)

            # Create same columns for test data (some might be all zeros)
            for category in unique_categories:
                col_name = f"cytogenetics_{category}"
                X_test_encoded[col_name] = (X_test_encoded['Detail cytogenetics'].fillna('unknown').astype(str) == category).astype(int)

            # Drop original column
            X_train_encoded = X_train_encoded.drop('Detail cytogenetics', axis=1)
            X_test_encoded = X_test_encoded.drop('Detail cytogenetics', axis=1)

            encoding_info['onehot_features'] = [f"cytogenetics_{cat}" for cat in unique_categories]

        except Exception as e:
            print(f"Manual one-hot encoding failed: {e}")
            # Fallback to label encoding
            X_train_encoded['Detail cytogenetics'] = X_train_encoded['Detail cytogenetics'].astype('category').cat.codes
            X_test_encoded['Detail cytogenetics'] = X_test_encoded['Detail cytogenetics'].astype('category').cat.codes

    # Apply label encoding to other columns
    for column, value_map in mapping.items():
        if column != 'Relapsed' and column in X_train_encoded.columns:
            # Create a mapping function for this specific column
            def map_value(x, mapping_dict=value_map):
                x_str = str(x).lower().strip() if pd.notna(x) else 'unknown'
                return mapping_dict.get(x_str, 0)  # Default to 0 for unknown values

            X_train_encoded[column] = X_train_encoded[column].apply(map_value)
            if column in X_test_encoded.columns:
                X_test_encoded[column] = X_test_encoded[column].apply(map_value)
            encoding_info['label_mapping'][column] = value_map

    # Handle any remaining categorical columns
    categorical_cols = X_train_encoded.select_dtypes(include=['object', 'category']).columns
    for col in categorical_cols:
        if col not in encoding_info['label_mapping']:
            X_train_encoded[col] = X_train_encoded[col].astype('category').cat.codes
            X_test_encoded[col] = X_test_encoded[col].astype('category').cat.codes
            encoding_info['label_mapping'][col] = 'categorical_encoding_auto'

    # Ensure all columns are numeric
    X_train_encoded = X_train_encoded.apply(pd.to_numeric, errors='coerce').fillna(0)
    X_test_encoded = X_test_encoded.apply(pd.to_numeric, errors='coerce').fillna(0)

    print("Encoding completed successfully!")
    print(f"X_train shape after encoding: {X_train_encoded.shape}")
    print(f"X_test shape after encoding: {X_test_encoded.shape}")

    return X_train_encoded, X_test_encoded, y_train_encoded, y_test_encoded, encoding_info

In [25]:
X_train_encoded, X_test_encoded, y_train_encoded, y_test_encoded, encoding_info = encode_data_simple(
    X_train_imputed,
    X_test_imputed,
    y_train,
    y_test,
    custom_mapping=None
)

Encoding completed successfully!
X_train shape after encoding: (1805, 26)
X_test shape after encoding: (447, 26)


In [26]:
print(y_train_encoded.unique())
print(y_test_encoded.unique())

[0 1]
[0 1]


In [27]:
print(X_train_encoded.dtypes)
print("\n")
print(y_train_encoded.dtypes)
print("\n")
print(X_test_encoded.dtypes)
print("\n")
print(y_test_encoded.dtypes)

Previous Treatment                   int64
NCI Risk                             int64
Sex                                  int64
Age                                float64
Lineage                              int64
Bulky Disease                        int64
Highest presenting WBC             float64
Prednisolone Response                int64
CNS Disease                          int64
Cytogenetic groups                   int64
Provisional risk                     int64
MRD Status_EOI                       int64
Final Risk                           int64
cytogenetics_not required            int64
cytogenetics_b-other                 int64
cytogenetics_high hyperdiploidy      int64
cytogenetics_etv6-runx1              int64
cytogenetics_<NA>                    int64
cytogenetics_bcr-abl1                int64
cytogenetics_iamp21                  int64
cytogenetics_tcf3-pbx1               int64
cytogenetics_kmt2a rearranged        int64
cytogenetics_low hypodiploidy        int64
cytogenetic

### Save Encoded data

In [28]:
# Define the folder path in Google Drive
folder_path = '/content/drive/MyDrive/NIR dataset/'

# Create the directory if it doesn't exist
os.makedirs(folder_path, exist_ok=True)

# Define the file paths for the encoded data
save_path_X_train_encoded = os.path.join(folder_path, 'X_train_encoded.csv')
save_path_y_train_encoded = os.path.join(folder_path, 'y_train_encoded.csv')
save_path_X_test_encoded = os.path.join(folder_path, 'X_test_encoded.csv')
save_path_y_test_encoded = os.path.join(folder_path, 'y_test_encoded.csv')

# Save the encoded DataFrames/Series to CSV files
X_train_encoded.to_csv(save_path_X_train_encoded, index=False)
print(f"X_train_encoded saved successfully to {save_path_X_train_encoded}")

y_train_encoded.to_csv(save_path_y_train_encoded, index=False)
print(f"y_train_encoded saved successfully to {save_path_y_train_encoded}")

X_test_encoded.to_csv(save_path_X_test_encoded, index=False)
print(f"X_test_encoded saved successfully to {save_path_X_test_encoded}")

y_test_encoded.to_csv(save_path_y_test_encoded, index=False)
print(f"y_test_encoded saved successfully to {save_path_y_test_encoded}")

X_train_encoded saved successfully to /content/drive/MyDrive/NIR dataset/X_train_encoded.csv
y_train_encoded saved successfully to /content/drive/MyDrive/NIR dataset/y_train_encoded.csv
X_test_encoded saved successfully to /content/drive/MyDrive/NIR dataset/X_test_encoded.csv
y_test_encoded saved successfully to /content/drive/MyDrive/NIR dataset/y_test_encoded.csv


## Model training

### Optuna tune function

In [210]:
def tune_with_optuna_cv(X_train, y_train, n_trials=50, n_splits=5, random_state=42):
    """
    Optuna-tune XGBoost with 5-fold Stratified CV on PR-AUC (Average Precision).
    Handles class imbalance via scale_pos_weight computed per fold.
    Returns: best_model, best_params, best_cv_ap
    """

    # Ensure y is 1-D Series/array
    if isinstance(y_train, pd.DataFrame):
        y_train = y_train.iloc[:, 0]
    y = np.asarray(y_train).reshape(-1)
    X = np.asarray(X_train)

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    def objective(trial):
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 150, 900),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "gamma": trial.suggest_float("gamma", 0.0, 5.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True),
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 10.0, log=True),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),

            # fixed settings
            "random_state": random_state,
            "n_jobs": -1,
            "eval_metric": "aucpr",          # track PR-AUC during training logs
            "objective": "binary:logistic",
            "base_score": 0.5, # Set base_score to 0.5 to avoid error with single-class folds
            # tip: you can add tree_method="hist" for speed on CPU
        }

        aps = []
        for tr_idx, va_idx in cv.split(X, y):
            X_tr, X_va = X[tr_idx], X[va_idx]
            y_tr, y_va = y[tr_idx], y[va_idx]

            # scale_pos_weight per fold
            pos = (y_tr == 1).sum()
            neg = (y_tr == 0).sum()
            spw = (neg / max(pos, 1)) if pos > 0 else 1.0

            model = XGBClassifier(**params, scale_pos_weight=spw)
            model.fit(X_tr, y_tr, verbose=False)

            y_va_prob = model.predict_proba(X_va)[:, 1]
            aps.append(average_precision_score(y_va, y_va_prob))

        return float(np.mean(aps))

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    # Train best model on full training data (scale_pos_weight from full y)
    best_params = study.best_params.copy()
    pos_full = int((y == 1).sum())
    neg_full = int((y == 0).sum())
    spw_full = (neg_full / max(pos_full, 1)) if pos_full > 0 else 1.0

    best_model = XGBClassifier(
        **best_params,
        random_state=random_state,
        n_jobs=-1,
        eval_metric="aucpr",
        objective="binary:logistic",
        scale_pos_weight=spw_full,
        base_score=0.5, # Set base_score to 0.5 for the final model as well
    )
    best_model.fit(X, y, verbose=False)

    return best_model, best_params

In [221]:
xgb_model, xgb_params = tune_with_optuna_cv(X_train_encoded, y_train_encoded, n_trials=100)

[I 2025-09-29 21:32:12,752] A new study created in memory with name: no-name-f1be3d0b-d04e-4682-992f-f5fc7f62eb9c
[I 2025-09-29 21:32:18,099] Trial 0 finished with value: 0.37136309660738037 and parameters: {'n_estimators': 582, 'max_depth': 3, 'learning_rate': 0.07423412753938873, 'subsample': 0.6973157983373572, 'colsample_bytree': 0.9954152094575881, 'gamma': 0.654563448777537, 'reg_lambda': 0.0346259325892464, 'reg_alpha': 3.242698509422461, 'min_child_weight': 1}. Best is trial 0 with value: 0.37136309660738037.
[I 2025-09-29 21:32:20,196] Trial 1 finished with value: 0.385363019801512 and parameters: {'n_estimators': 900, 'max_depth': 8, 'learning_rate': 0.004033243210805995, 'subsample': 0.5847039687193727, 'colsample_bytree': 0.7506559612080177, 'gamma': 4.052267813666568, 'reg_lambda': 0.0056208323337075166, 'reg_alpha': 3.5976703634804257, 'min_child_weight': 1}. Best is trial 1 with value: 0.385363019801512.
[I 2025-09-29 21:32:20,704] Trial 2 finished with value: 0.38560292

In [222]:
print(xgb_params)

{'n_estimators': 781, 'max_depth': 4, 'learning_rate': 0.04328991961554912, 'subsample': 0.5131436938731899, 'colsample_bytree': 0.531722906302823, 'gamma': 3.8101068964654874, 'reg_lambda': 0.016373977131130917, 'reg_alpha': 0.3149315576971052, 'min_child_weight': 4}


### Threshold Tuning

In [218]:
def tune_threshold(model, X_val, y_val, beta = 1):
    y_probs = model.predict_proba(X_val)[:, 1]
    precisions, recalls, thresholds = precision_recall_curve(y_val, y_probs)

    f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-6)
    #metric = (1 + beta) * (precisions * recalls) / (precisions + (recalls * beta) + 1e-6)
    best_idx = np.argmax(f1_scores)
    best_threshold = thresholds[best_idx]

    return best_threshold, f1_scores[best_idx]

In [223]:
xgb_threshold, xgb_f1 = tune_threshold(xgb_model, X_train_encoded, y_train_encoded, 0.1)
print(f"Best Threshold: {xgb_threshold}")
print(f"Best F1 Score: {xgb_f1}")

Best Threshold: 0.5192791223526001
Best F1 Score: 0.5917998746957573


### Evaluate function

In [216]:
def evaluate_model(model, X_test, y_test, threshold=0.5):
    # Probabilities
    y_probs = model.predict_proba(X_test)[:, 1]

    # Apply threshold
    y_pred = (y_probs >= threshold).astype(int)

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    auc = roc_auc_score(y_test, y_probs)
    cm = confusion_matrix(y_test, y_pred)

    # Print neatly
    print("=== Evaluation Metrics (Summary) ===")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1-score : {f1:.4f}")
    print(f"ROC AUC  : {auc:.4f}")
    print("\nConfusion Matrix:")
    print(cm)
    return

In [220]:
# 50 trials
evaluate_model(xgb_model, X_test_encoded, y_test_encoded, xgb_threshold)

=== Evaluation Metrics (Summary) ===
Accuracy : 0.5861
Precision: 0.3011
Recall   : 0.5045
F1-score : 0.3771
ROC AUC  : 0.6273

Confusion Matrix:
[[206 130]
 [ 55  56]]


In [225]:
# 100 trials
evaluate_model(xgb_model, X_test_encoded, y_test_encoded, xgb_threshold)

=== Evaluation Metrics (Summary) ===
Accuracy : 0.6398
Precision: 0.3457
Recall   : 0.5045
F1-score : 0.4103
ROC AUC  : 0.6252

Confusion Matrix:
[[230 106]
 [ 55  56]]


In [232]:
evaluate_model(xgb_model, X_test_encoded, y_test_encoded, threshold = 0.40)

=== Evaluation Metrics (Summary) ===
Accuracy : 0.5145
Precision: 0.2913
Recall   : 0.6667
F1-score : 0.4055
ROC AUC  : 0.6252

Confusion Matrix:
[[156 180]
 [ 37  74]]


In [231]:
evaluate_model(xgb_model, X_test_encoded, y_test_encoded, threshold = 0.395)

=== Evaluation Metrics (Summary) ===
Accuracy : 0.5123
Precision: 0.2918
Recall   : 0.6757
F1-score : 0.4076
ROC AUC  : 0.6252

Confusion Matrix:
[[154 182]
 [ 36  75]]


In [238]:
evaluate_model(xgb_model, X_test_encoded, y_test_encoded, threshold = 0.3885)

=== Evaluation Metrics (Summary) ===
Accuracy : 0.5078
Precision: 0.2928
Recall   : 0.6937
F1-score : 0.4118
ROC AUC  : 0.6252

Confusion Matrix:
[[150 186]
 [ 34  77]]


In [235]:
evaluate_model(xgb_model, X_test_encoded, y_test_encoded, threshold = 0.45)

=== Evaluation Metrics (Summary) ===
Accuracy : 0.5615
Precision: 0.3077
Recall   : 0.6126
F1-score : 0.4096
ROC AUC  : 0.6252

Confusion Matrix:
[[183 153]
 [ 43  68]]
