<a href="https://colab.research.google.com/github/subikkshas/PREPARE-ALL/blob/main/clinician_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import pandas as pd
from google.colab import drive
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [2]:
# Mount Google Drive at /content/drive so the dataset folder is readable from this runtime.
drive.mount('/content/drive')
# Load the df_test spreadsheet from the mounted Drive folder.
df_test = pd.read_excel('/content/drive/MyDrive/NIR dataset/df_test.xlsx')
# Print (rows, columns) as a quick check that the file loaded.
print(df_test.shape)

Mounted at /content/drive
(461, 33)


In [4]:
# Load the sheet holding the clinicians' predictions.
# NOTE(review): this path is under /content, not the mounted Drive folder, so the
# file is presumably uploaded to the runtime by hand before this cell runs — confirm.
df_cl_pred = pd.read_excel('/content/X_test for drs.xlsx')
# Print (rows, columns) as a quick check that the file loaded.
print(df_cl_pred.shape)

(467, 17)


In [5]:
# Define key columns
# Together these columns act as a composite key for pairing df_test rows with
# the clinicians' sheet. 'Sex' is currently excluded from the key.
# Note that 'Cytogenetic groups ' and 'Final Risk ' end with a space.
key_cols = [
    'Previous Treatment',
    'NCI Risk',
    'Age',
    'Lineage',
    'Bulky Disease',
    'Highest presenting WBC',
    'Prednisolone Response',
    'CNS Disease',
    'Cytogenetic groups ',
    'Detail cytogenetics',
    'Provisional risk',
    'MRD Status_EOI',
    'Final Risk '
    ]

# Normalise the numeric key columns the same way in both frames so the equality
# join below compares like with like. Missing values become 0 so the integer
# cast cannot fail.
for _frame in (df_test, df_cl_pred):
    # Age is truncated to a whole number.
    _frame['Age'] = _frame['Age'].astype(float).fillna(0).astype(int)
    # WBC is rounded to the nearest integer. The rounding has to happen BEFORE
    # the integer cast: casting first truncates, after which round() does nothing.
    _frame['Highest presenting WBC'] = (
        _frame['Highest presenting WBC'].astype(float).fillna(0).round().astype(int)
    )

# Step 1: Left merge on the key columns. Every df_test row is kept; the
# '_merge' indicator column records whether the clinicians' sheet had the key.
matches = pd.merge(df_test, df_cl_pred, on=key_cols, how='left', indicator=True, suffixes=('_full', '_clinician'))

# Step 2: Count the rows that actually found a partner. len(matches) cannot be
# used for this because a left merge never returns fewer rows than df_test.
matched_rows = matches['_merge'].eq('both').sum()
print(f"Number of matching rows: {matched_rows}")
# More merged rows than df_test rows would mean a key is duplicated in df_cl_pred.
print(f"Rows after left merge: {len(matches)} (df_test: {len(df_test)})")

# Optional: preview some matching rows
# matches.head()

Number of matching rows: 461


In [6]:
# Keep only rows in df_cl_pred whose key combination also occurs in df_test.
# The df_test keys are de-duplicated first so this acts as a pure filter: an
# inner merge against repeated keys would otherwise emit one copy of the
# clinician row per repetition.
df_cl_pred_filtered = df_cl_pred.merge(
    df_test[key_cols].drop_duplicates(),
    on=key_cols,
    how='inner',
)

print(f"Original df_cl_pred: {len(df_cl_pred)}")
print(f"Filtered df_cl_pred: {len(df_cl_pred_filtered)}")
print(f"df_test: {len(df_test)}")

Original df_cl_pred: 467
Filtered df_cl_pred: 461
df_test: 461


In [7]:
# Outcome columns copied from df_test onto the clinicians' rows.
cols_to_append = ['Remission status _EOI', 'Relapsed']

# This cell overwrites its own input, so make it safe to re-run: remove any
# outcome columns left by a previous execution before merging. Without this, a
# second run would produce '_x'/'_y' suffixed duplicates and later lookups of
# 'Relapsed' would fail.
df_cl_pred_filtered = pd.merge(
    df_cl_pred_filtered.drop(columns=cols_to_append, errors='ignore'),
    df_test[key_cols + cols_to_append],
    on=key_cols,
    how='left',
)



In [8]:
# Shapes before and after filtering/appending, one per line.
print(df_cl_pred.shape, df_cl_pred_filtered.shape, sep="\n")

(467, 17)
(461, 19)


In [9]:
# Frequency of each 'Remission status _EOI' value, with missing values counted too.
print(df_cl_pred_filtered['Remission status _EOI'].value_counts(dropna=False))

Remission status _EOI
in remission        415
.                    28
not in remission     14
NaN                   4
Name: count, dtype: int64


In [10]:
# Flag rows whose 'Remission status _EOI' is exactly 'not in remission'.
# Missing statuses are filled with False, i.e. they are not flagged.
remission_status = df_cl_pred_filtered['Remission status _EOI']
mask = remission_status.eq('not in remission').fillna(False)

# Every unflagged row is retained, as an independent copy of the filtered frame.
df_cl_pred_rem = df_cl_pred_filtered[~mask].copy()

In [11]:
# Same frequency table after removing the 'not in remission' rows.
print(df_cl_pred_rem['Remission status _EOI'].value_counts(dropna=False))

Remission status _EOI
in remission    415
.                28
NaN               4
Name: count, dtype: int64


In [12]:
# List the column labels of the remission-filtered frame.
print(df_cl_pred_rem.columns)

Index(['Previous Treatment', 'NCI Risk', 'Sex', 'Age', 'Lineage',
       'Bulky Disease', 'Highest presenting WBC', 'Prednisolone Response',
       'CNS Disease', 'Cytogenetic groups ', 'Detail cytogenetics',
       'Provisional risk', 'MRD Status_EOI', 'Final Risk ',
       'Doctor Predictions - Gargi', 'Doctor's Predictions - Prasanth',
       'Doctor Predictions - Balaji', 'Remission status _EOI', 'Relapsed'],
      dtype='object')


In [13]:
def clean_string_columns(df, columns_to_clean):
  """Trim and lower-case the requested columns of ``df``.

  Each requested column that exists in ``df`` is cast to the pandas "string"
  dtype, stripped of surrounding whitespace and lower-cased. Requested columns
  that are absent from ``df`` are ignored.

  The frame is modified in place and the same object is returned.
  """
  for name in columns_to_clean:
    if name not in df.columns:
      continue
    as_text = df[name].astype("string")
    df[name] = as_text.str.strip().str.lower()

  return df

In [14]:
# Columns to normalise (trim + lower-case). The list includes 'Age' and
# 'Highest presenting WBC', so those columns are converted to strings as well.
columns_to_normalise = [
    'Previous Treatment', 'NCI Risk', 'Sex', 'Age', 'Lineage',
    'Bulky Disease', 'Highest presenting WBC', 'Prednisolone Response',
    'CNS Disease', 'Cytogenetic groups ', 'Detail cytogenetics',
    'Provisional risk', 'MRD Status_EOI', 'Final Risk ',
    'Doctor Predictions - Gargi', "Doctor's Predictions - Prasanth",
    'Doctor Predictions - Balaji', 'Remission status _EOI', 'Relapsed',
]
df_cl_pred_rem = clean_string_columns(df_cl_pred_rem, columns_to_normalise)

In [15]:
# One Series per clinician, taken from the cleaned frame in the order
# Gargi, Prasanth, Balaji.
cl1_pred, cl2_pred, cl3_pred = (
    df_cl_pred_rem[column]
    for column in (
        "Doctor Predictions - Gargi",
        "Doctor's Predictions - Prasanth",
        "Doctor Predictions - Balaji",
    )
)

In [16]:
# Display the first clinician's predictions (the notebook renders the last expression).
cl1_pred

Unnamed: 0,Doctor Predictions - Gargi
0,yes
1,yes
2,yes
3,yes
4,no
...,...
456,no
457,no
458,yes
459,no


In [17]:
# Ground-truth outcome: the 'Relapsed' column of the cleaned frame.
y_true = df_cl_pred_rem["Relapsed"]

In [18]:
# Encode 'yes' as 1 and 'no' as 0
def _encode_yes_no(labels):
    """Return a 0/1 integer Series: 1 where the label is 'yes', 0 otherwise.

    Any label other than 'yes' or 'no' (including a missing value) is still
    encoded as 0, but it is reported so that typos or blanks are not silently
    counted as negative predictions.
    """
    normalised = labels.str.lower()
    unexpected = normalised[~normalised.isin(['yes', 'no'])]
    if len(unexpected) > 0:
        print(
            f"WARNING: {len(unexpected)} value(s) in '{labels.name}' are neither "
            f"'yes' nor 'no' and are encoded as 0: "
            f"{unexpected.value_counts(dropna=False).to_dict()}"
        )
    # fillna(False) covers nullable string columns, where comparing a missing
    # value yields <NA> rather than False.
    return normalised.eq('yes').fillna(False).astype(int)


cl1_pred_encoded = _encode_yes_no(cl1_pred)
cl2_pred_encoded = _encode_yes_no(cl2_pred)
cl3_pred_encoded = _encode_yes_no(cl3_pred)
y_true_encoded = _encode_yes_no(y_true)

In [20]:
# Destination folder on the mounted Drive.
save_dir = '/content/drive/MyDrive/NIR dataset'

# Write each encoded Series to '<save_dir>/<stem>.csv' without the index column.
for _stem, _encoded in (
    ('cl1_pred_encoded', cl1_pred_encoded),
    ('cl2_pred_encoded', cl2_pred_encoded),
    ('cl3_pred_encoded', cl3_pred_encoded),
    ('y_true_encoded', y_true_encoded),
):
    _encoded.to_csv(f'{save_dir}/{_stem}.csv', index=False)

print(f"Saved encoded predictions and true values to {save_dir}")

Saved encoded predictions and true values to /content/drive/MyDrive/NIR dataset


In [19]:
# Class balance of each clinician's encoded predictions and of the true labels,
# printed as four tables separated by a blank line.
print(
    *(
        encoded.value_counts()
        for encoded in (cl1_pred_encoded, cl2_pred_encoded, cl3_pred_encoded, y_true_encoded)
    ),
    sep="\n\n",
)

Doctor Predictions - Gargi
0    313
1    134
Name: count, dtype: int64

Doctor's Predictions - Prasanth
0    297
1    150
Name: count, dtype: int64

Doctor Predictions - Balaji
0    305
1    142
Name: count, dtype: int64

Relapsed
0    336
1    111
Name: count, dtype: int64


In [21]:
# Function to calculate and print metrics
def print_metrics(y_true, y_pred, predictor_name):
    """Print classification metrics for one set of hard 0/1 predictions.

    Parameters
    ----------
    y_true : array-like of 0/1 ground-truth labels.
    y_pred : array-like of 0/1 predicted labels (not probabilities).
    predictor_name : str used in the printed header.

    Returns
    -------
    None. Everything is written to stdout.
    """
    header = f"--- Metrics for {predictor_name} ---"
    print(header)
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    # zero_division=0 matches evaluate_model: a predictor that never says "yes"
    # scores 0 instead of emitting an undefined-metric warning.
    print(f"Precision: {precision_score(y_true, y_pred, zero_division=0):.4f}")
    print(f"Recall: {recall_score(y_true, y_pred, zero_division=0):.4f}")
    print(f"F1 Score: {f1_score(y_true, y_pred, zero_division=0):.4f}")
    # y_pred holds hard labels rather than probabilities, so this is the AUC of
    # a single operating point, which equals (sensitivity + specificity) / 2.
    # It is not directly comparable to an AUC computed from model probabilities.
    print(f"ROC AUC: {roc_auc_score(y_true, y_pred):.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    # Closing rule is exactly as wide as the header.
    print("-" * len(header))

# Report the metrics for each clinician in turn, using the encoded data.
for _label, _encoded_pred in (
    ("Clinician 1 (Gargi)", cl1_pred_encoded),
    ("Clinician 2 (Prasanth)", cl2_pred_encoded),
    ("Clinician 3 (Balaji)", cl3_pred_encoded),
):
    print_metrics(y_true_encoded, _encoded_pred, _label)

--- Metrics for Clinician 1 (Gargi) ---
Accuracy: 0.6085
Precision: 0.2612
Recall: 0.3153
F1 Score: 0.2857
ROC AUC: 0.5103
Confusion Matrix:
[[237  99]
 [ 76  35]]
-------------------------------------
--- Metrics for Clinician 2 (Prasanth) ---
Accuracy: 0.6264
Precision: 0.3133
Recall: 0.4234
F1 Score: 0.3602
ROC AUC: 0.5584
Confusion Matrix:
[[233 103]
 [ 64  47]]
----------------------------------------
--- Metrics for Clinician 3 (Balaji) ---
Accuracy: 0.6040
Precision: 0.2676
Recall: 0.3423
F1 Score: 0.3004
ROC AUC: 0.5164
Confusion Matrix:
[[232 104]
 [ 73  38]]
--------------------------------------


In [22]:
# Encoded feature matrix and labels, read from the mounted Drive folder.
X_test_encoded = pd.read_csv('/content/drive/MyDrive/NIR dataset/X_test_encoded.csv')
y_test_encoded = pd.read_csv('/content/drive/MyDrive/NIR dataset/y_test_encoded.csv')
# Both shapes, one per line.
print(X_test_encoded.shape, y_test_encoded.shape, sep="\n")

(447, 26)
(447, 1)


In [23]:
import joblib

# Define the path to the saved model
model_path = '/content/drive/MyDrive/NIR dataset/xgb_model.joblib'

# Load the model
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code — only load model files that come from a trusted source.
xgb_model = joblib.load(model_path)

print("XGBoost model loaded successfully.")

XGBoost model loaded successfully.


In [24]:
def evaluate_model(model, X_test, y_test, threshold=0.5):
    """Print a metric summary for a probabilistic binary classifier.

    The positive-class probability (column 1 of ``model.predict_proba``) is
    turned into a 0/1 prediction by comparing it with ``threshold`` (a
    probability equal to the threshold counts as positive). Accuracy,
    precision, recall, F1 and the confusion matrix use those 0/1 predictions;
    ROC AUC uses the raw probabilities.

    Returns None; the summary is written to stdout.
    """
    positive_scores = model.predict_proba(X_test)[:, 1]
    hard_calls = (positive_scores >= threshold).astype(int)

    # All metrics are computed before anything is printed.
    accuracy = accuracy_score(y_test, hard_calls)
    precision = precision_score(y_test, hard_calls, zero_division=0)
    recall = recall_score(y_test, hard_calls, zero_division=0)
    f1 = f1_score(y_test, hard_calls, zero_division=0)
    auc = roc_auc_score(y_test, positive_scores)
    matrix = confusion_matrix(y_test, hard_calls)

    report_lines = [
        "=== Evaluation Metrics (Summary) ===",
        f"Accuracy : {accuracy:.4f}",
        f"Precision: {precision:.4f}",
        f"Recall   : {recall:.4f}",
        f"F1-score : {f1:.4f}",
        f"ROC AUC  : {auc:.4f}",
        "\nConfusion Matrix:",
        str(matrix),
    ]
    print("\n".join(report_lines))
    return None

In [25]:
# Score the loaded model on the encoded test set at a decision threshold of 0.437.
# NOTE(review): where 0.437 comes from is not shown in this notebook; the same
# literal is repeated below when model_pred_encoded is built — keep them in sync.
evaluate_model(xgb_model, X_test_encoded, y_test_encoded, threshold = 0.437)

=== Evaluation Metrics (Summary) ===
Accuracy : 0.5481
Precision: 0.3128
Recall   : 0.6847
F1-score : 0.4294
ROC AUC  : 0.6321

Confusion Matrix:
[[169 167]
 [ 35  76]]


In [28]:
# Get the model's predictions using the same threshold as before
# (0.437 is the threshold passed to evaluate_model above).
y_probs = xgb_model.predict_proba(X_test_encoded)[:, 1]
model_pred_encoded = (y_probs >= 0.437).astype(int)

# Wrap the numpy array in a Series indexed like the feature matrix.
model_pred_encoded = pd.Series(model_pred_encoded, index=X_test_encoded.index)

# The model and the clinician are paired purely by row position, so both must
# cover the same number of patients.
if len(cl1_pred_encoded) != len(model_pred_encoded):
    raise ValueError(
        f"Length mismatch: {len(cl1_pred_encoded)} clinician predictions vs "
        f"{len(model_pred_encoded)} model predictions"
    )

# Positional pairing is only meaningful if both sources list the patients in
# the same order. The two ground-truth vectors (one loaded with the model's
# features, one taken from the clinicians' sheet) give a cheap check of that.
_labels_model_side = y_test_encoded.iloc[:, 0].to_numpy()
_labels_clinician_side = y_true_encoded.to_numpy()
if (len(_labels_model_side) != len(_labels_clinician_side)
        or (_labels_model_side != _labels_clinician_side).any()):
    print("WARNING: y_test_encoded and y_true_encoded differ row by row; "
          "model and clinician predictions may refer to different patients.")

# Compare the model's predictions and Clinician 1's predictions by value, row
# by row. Comparing the underlying arrays avoids overwriting the clinician
# Series' index, which would discard its link to the rows of df_cl_pred_rem.
comparison = pd.Series(
    model_pred_encoded.to_numpy() == cl1_pred_encoded.to_numpy(),
    index=model_pred_encoded.index,
)

# Calculate concordance (where they are the same)
concordance_count = comparison.sum()

# Calculate discordance (where they are different)
discordance_count = (~comparison).sum()

total_count = len(comparison)

print("Comparison between Model and Clinician 1 (Gargi):")
print(f"Concordance (Agreement): {concordance_count} out of {total_count} ({concordance_count/total_count:.2%})")
print(f"Discordance (Disagreement): {discordance_count} out of {total_count} ({discordance_count/total_count:.2%})")

Comparison between Model and Clinician 1 (Gargi):
Concordance (Agreement): 220 out of 447 (49.22%)
Discordance (Disagreement): 227 out of 447 (50.78%)


In [30]:
# The model and the clinician are paired purely by row position, so both must
# cover the same number of patients.
if len(cl2_pred_encoded) != len(model_pred_encoded):
    raise ValueError(
        f"Length mismatch: {len(cl2_pred_encoded)} clinician predictions vs "
        f"{len(model_pred_encoded)} model predictions"
    )

# Compare the model's predictions and Clinician 2's predictions by value, row
# by row. Comparing the underlying arrays avoids overwriting the clinician
# Series' index, which would discard its link to the rows of df_cl_pred_rem.
comparison1 = pd.Series(
    model_pred_encoded.to_numpy() == cl2_pred_encoded.to_numpy(),
    index=model_pred_encoded.index,
)

# Calculate concordance (where they are the same)
concordance_count1 = comparison1.sum()

# Calculate discordance (where they are different)
discordance_count1 = (~comparison1).sum()

total_count1 = len(comparison1)

print("Comparison between Model and Clinician 2 (Prasanth):")
print(f"Concordance (Agreement): {concordance_count1} out of {total_count1} ({concordance_count1/total_count1:.2%})")
print(f"Discordance (Disagreement): {discordance_count1} out of {total_count1} ({discordance_count1/total_count1:.2%})")

Comparison between Model and Clinician 2 (Prasanth):
Concordance (Agreement): 230 out of 447 (51.45%)
Discordance (Disagreement): 217 out of 447 (48.55%)


In [31]:
# The model and the clinician are paired purely by row position, so both must
# cover the same number of patients.
if len(cl3_pred_encoded) != len(model_pred_encoded):
    raise ValueError(
        f"Length mismatch: {len(cl3_pred_encoded)} clinician predictions vs "
        f"{len(model_pred_encoded)} model predictions"
    )

# Compare the model's predictions and Clinician 3's predictions by value, row
# by row. Comparing the underlying arrays avoids overwriting the clinician
# Series' index, which would discard its link to the rows of df_cl_pred_rem.
comparison2 = pd.Series(
    model_pred_encoded.to_numpy() == cl3_pred_encoded.to_numpy(),
    index=model_pred_encoded.index,
)

# Calculate concordance (where they are the same)
concordance_count2 = comparison2.sum()

# Calculate discordance (where they are different)
discordance_count2 = (~comparison2).sum()

total_count2 = len(comparison2)

print("Comparison between Model and Clinician 3 (Balaji):")
print(f"Concordance (Agreement): {concordance_count2} out of {total_count2} ({concordance_count2/total_count2:.2%})")
print(f"Discordance (Disagreement): {discordance_count2} out of {total_count2} ({discordance_count2/total_count2:.2%})")

Comparison between Model and Clinician 3 (Balaji):
Concordance (Agreement): 224 out of 447 (50.11%)
Discordance (Disagreement): 223 out of 447 (49.89%)


In [33]:
from sklearn.metrics import cohen_kappa_score

# Pairwise Cohen's kappa between the model and the three clinicians.
# Every Series passed in must hold the same patients in the same order, since
# the ratings are paired row by row.

# Model against each clinician (Gargi, Prasanth, Balaji).
kappa_model_gargi, kappa_model_prasanth, kappa_model_balaji = (
    cohen_kappa_score(model_pred_encoded, clinician)
    for clinician in (cl1_pred_encoded, cl2_pred_encoded, cl3_pred_encoded)
)

print("Cohen's Kappa (Model vs Clinicians):")
print(f"Model vs Gargi: {kappa_model_gargi:.4f}")
print(f"Model vs Prasanth: {kappa_model_prasanth:.4f}")
print(f"Model vs Balaji: {kappa_model_balaji:.4f}")

# Each pair of clinicians against one another.
kappa_gargi_prasanth, kappa_gargi_balaji, kappa_prasanth_balaji = (
    cohen_kappa_score(first, second)
    for first, second in (
        (cl1_pred_encoded, cl2_pred_encoded),
        (cl1_pred_encoded, cl3_pred_encoded),
        (cl2_pred_encoded, cl3_pred_encoded),
    )
)

print("\nCohen's Kappa (Between Clinicians):")
print(f"Gargi vs Prasanth: {kappa_gargi_prasanth:.4f}")
print(f"Gargi vs Balaji: {kappa_gargi_balaji:.4f}")
print(f"Prasanth vs Balaji: {kappa_prasanth_balaji:.4f}")

Cohen's Kappa (Model vs Clinicians):
Model vs Gargi: 0.0186
Model vs Prasanth: 0.0562
Model vs Balaji: 0.0330

Cohen's Kappa (Between Clinicians):
Gargi vs Prasanth: 0.6496
Gargi vs Balaji: -0.0688
Prasanth vs Balaji: -0.0168
