# Cancer Classification
This notebook trains classifiers to detect cancer in MRI scans, using automated labels from the LLM pipeline.

In [33]:
import os
import pickle
import misvm
import torch
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, f1_score, balanced_accuracy_score, roc_curve

# Directory holding the pickled sample dictionaries, arrays and encodings.
ivd_arrays_path = '/work/robinpark/AutoLabelClassifier/data/ncimi_ivd_arrays'

# SEED
# Seed every random number generator used in this notebook so that a fresh
# "Restart & Run All" reproduces the same results.
seed=0
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
# benchmark must be off for reproducibility: with it on, cuDNN auto-tunes and
# may select a different convolution algorithm from one run to the next.
torch.backends.cudnn.benchmark = False

In [34]:
# Load the pickled samples dictionary from disk.
# NOTE: pickle.load can execute arbitrary code, so only load files from a trusted source.
with open(f'{ivd_arrays_path}/ncimi_samples_dict.pkl', 'rb') as handle:
    samples = pickle.load(handle)

In [35]:
# Pull the per-split entries out of the loaded dictionary.
# Each one is passed to sum_samples, which reads it as a dict keyed by an
# underscore-separated sample id with a label as the value.
train_samples = samples['train_samples']
val_samples = samples['val_samples']
test_samples = samples['test_samples']

In [36]:
# Summarise splits
def sum_samples(samples):
    """Print patient/study counts and display the number of samples per label.

    Args:
        samples: Dict mapping a sample key to its label. Each key is split on
            '_' into exactly five parts, read as
            (pat_id, stu_id, ser_id, level1, level2).

    Returns:
        None. The summary is printed, and the per-label totals are rendered
        with ``display``.
    """
    frame = pd.DataFrame.from_dict(samples, orient='index', columns=['results']).reset_index()

    # The dict key encodes the identifiers; break it into one column per part.
    id_columns = ['pat_id', 'stu_id', 'ser_id', 'level1', 'level2']
    frame[id_columns] = frame['index'].str.split('_', expand=True)

    # One row per (identifiers, label) combination with its occurrence count.
    counted = (
        frame.groupby(id_columns + ['results'])
        .size()
        .reset_index(name='counts')
    )

    # A study is identified by the patient id together with the study id.
    counted['pat_stu_id'] = counted['pat_id'] + '_' + counted['stu_id']

    n_patients = len(counted[['pat_id']].drop_duplicates())
    n_studies = len(counted[['pat_stu_id']].drop_duplicates())
    print('unique pat:', n_patients)
    print('unique studies:', n_studies)

    display(counted.groupby(['results'])[['counts']].sum())

In [37]:
# Training split: unique patients/studies and sample count per label.
sum_samples(train_samples)

unique pat: 1048
unique studies: 1081


Unnamed: 0_level_0,counts
results,Unnamed: 1_level_1
0,4777
1,14313


In [38]:
# Validation split: unique patients/studies and sample count per label.
sum_samples(val_samples)

unique pat: 269
unique studies: 277


Unnamed: 0_level_0,counts
results,Unnamed: 1_level_1
0,1400
1,3628


In [39]:
# Test split: unique patients/studies and sample count per label.
sum_samples(test_samples)

unique pat: 392
unique studies: 393


Unnamed: 0_level_0,counts
results,Unnamed: 1_level_1
0,2236
1,4760


## ResNet18 Encodings + SVC

In [40]:
# Load the pre-computed encodings and their labels for each split.
# NOTE: pickle.load can execute arbitrary code, so only load files from a trusted source.
with open(f'{ivd_arrays_path}/ncimi_resnet_encodings.pkl', 'rb') as handle:
    ncimi_resnet_encodings = pickle.load(handle)

# Training features and labels
train_features_cpu = ncimi_resnet_encodings['train_features_cpu']
label_train_array = ncimi_resnet_encodings['label_train_array']

# Validation features and labels
val_features_cpu = ncimi_resnet_encodings['val_features_cpu']
label_val_array = ncimi_resnet_encodings['label_val_array']

# Test features and labels
test_features_cpu = ncimi_resnet_encodings['test_features_cpu']
label_test_array = ncimi_resnet_encodings['label_test_array']

In [41]:
# Load pickled arrays
# NOTE: pickle.load can execute arbitrary code, so only load files from a trusted source.
with open(f'{ivd_arrays_path}/ncimi_arrays_dict.pkl', 'rb') as handle:
    ncimi_array_dict = pickle.load(handle)

# Per-instance report metadata for the test split (used by mil_level_report).
label_test_report = ncimi_array_dict['label_test_report']
label_test_scores = ncimi_array_dict['label_test_scores']
label_test_con = ncimi_array_dict['label_test_con']
compare_test_array = ncimi_array_dict['compare_test_array']

# Per-instance grouping keys: instances sharing a key are pooled into one bag.
# Presumably '<patient id>_<date>' strings, going by the names; confirm upstream.
test_pat_id_date = ncimi_array_dict['test_pat_id_date']
val_pat_id_date = ncimi_array_dict['val_pat_id_date']
train_pat_id_date = ncimi_array_dict['train_pat_id_date']

In [42]:
# Standardise
# Fit the scaler on the training features only, then apply that same fitted
# transform to validation and test so no statistics leak from held-out data.
# NOTE(review): the inputs are overwritten in place, so the unscaled features
# are gone after this cell; re-run the loading cell before re-running this one.
scaler = StandardScaler()
train_features_cpu = scaler.fit_transform(train_features_cpu)
val_features_cpu = scaler.transform(val_features_cpu)
test_features_cpu = scaler.transform(test_features_cpu)

In [43]:
# Turn each feature row into its own torch.Tensor so that every instance can be
# stored as a single DataFrame cell and stacked per bag by the bag-building helpers.
train_features_cpu = [torch.Tensor(i) for i in train_features_cpu]
val_features_cpu = [torch.Tensor(i) for i in val_features_cpu]
test_features_cpu = [torch.Tensor(i) for i in test_features_cpu]

### SVC Using Max Bag

In [44]:
def get_max_bag(features, labels, patient_slices):
    """Pool instance features into one max-pooled vector per bag.

    Args:
        features: Sequence of equally-shaped torch tensors, one per instance.
        labels: Sequence of instance labels aligned with ``features``.
        patient_slices: Sequence of grouping keys aligned with ``features``;
            instances sharing a key form one bag.

    Returns:
        tuple: ``(max_bags, mil_label)``. ``max_bags[i]`` is the element-wise
        maximum over the stacked tensors of bag ``i`` and ``mil_label[i]`` is
        the label of that bag's first instance. Bags follow the iteration
        order of ``DataFrame.groupby`` on the grouping key.
    """
    instances = pd.DataFrame({'Features': features, 'Labels': labels, 'Patient_Slices': patient_slices})

    max_bags = []
    mil_label = []
    for _, bag in instances.groupby('Patient_Slices'):
        # The first instance's label stands in for the whole bag; instances of
        # one bag are assumed (not checked) to share the same label.
        mil_label.append(bag['Labels'].iloc[0])

        # Stack the bag's instance tensors and keep the per-feature maximum.
        stacked = torch.stack(bag['Features'].tolist())
        max_bags.append(torch.max(stacked, dim=0).values)

    return max_bags, mil_label

In [45]:
# Build one max-pooled feature vector and one label per bag for each split.
# Labels stay as loaded here (not remapped to -1/1), which is what the SVC cells use.
train_max_bags, train_labels = get_max_bag(train_features_cpu, label_train_array, train_pat_id_date)
test_max_bags, test_labels = get_max_bag(test_features_cpu, label_test_array, test_pat_id_date)
val_max_bags, val_labels = get_max_bag(val_features_cpu, label_val_array, val_pat_id_date)

In [46]:
# Initialize SVM classifier
# Linear kernel; probability=True enables predict_proba, which the evaluation
# cell uses for ROC / EER computation.
svm_classifier = svm.SVC(kernel='linear', C=4, probability=True)

# Train the classifier on one max-pooled feature vector per bag
svm_classifier.fit(train_max_bags, train_labels)

In [47]:
# NOTE(review): this cell expects val_labels / test_labels as returned by
# get_max_bag. The MIL cell further down rebinds those names to -1/1 labels,
# so re-run the get_max_bag cell before re-running this one.

# Predict the validation set
val_pred = svm_classifier.predict(val_max_bags)
val_prob = svm_classifier.predict_proba(val_max_bags)[:, 1]

# Evaluate using AUROC, F1, balanced accuracy and plain accuracy
val_auc = roc_auc_score(val_labels, val_prob)
val_f1 = f1_score(val_labels, val_pred)
val_bal_acc = balanced_accuracy_score(val_labels, val_pred)
val_acc = np.mean(val_pred == np.asarray(val_labels))

fpr, tpr, thresholds = roc_curve(val_labels, val_prob)

# Equal error rate: the ROC point where FPR is closest to FRR (= 1 - TPR).
# Locate it once and reuse the index for both the rate and its threshold.
val_eer_idx = np.nanargmin(np.absolute((1 - tpr) - fpr))
val_eer_threshold = thresholds[val_eer_idx]
val_eer = fpr[val_eer_idx]  # Approximately equal to FRR at this point

print(f'Val Accuracy: {val_acc:.3f}')
print(f'Val AUC: {val_auc:.3f}')
print(f'Validation F1: {val_f1:.3f}')
print(f'Validation Balanced Accuracy: {val_bal_acc:.3f}')
print(f'EER: {val_eer:.3f}')
print(f'EER Threshold: {val_eer_threshold:.3f}\n')

# Predict test set
test_prob = svm_classifier.predict_proba(test_max_bags)[:, 1]

fpr, tpr, thresholds = roc_curve(test_labels, test_prob)

# EER on the test ROC curve (reported only; not used to pick the threshold)
test_eer_idx = np.nanargmin(np.absolute((1 - tpr) - fpr))
test_eer_threshold = thresholds[test_eer_idx]
test_eer = fpr[test_eer_idx]  # Approximately equal to FRR at this point

# Apply the validation-selected EER threshold to the test probabilities.
# roc_curve counts a sample as positive when score >= threshold, so use >=
# to reproduce the operating point that was chosen on the validation set.
test_pred = test_prob >= val_eer_threshold

# Evaluate using AUROC, F1, balanced accuracy and plain accuracy
test_auc = roc_auc_score(test_labels, test_prob)
test_f1 = f1_score(test_labels, test_pred)
test_bal_acc = balanced_accuracy_score(test_labels, test_pred)
test_acc = np.mean(test_pred == np.asarray(test_labels))

print(f'Test Accuracy: {test_acc:.3f}')
print(f'Test AUC: {test_auc:.3f}')
print(f'Test F1: {test_f1:.3f}')
print(f'Test Balanced Accuracy: {test_bal_acc:.3f}')
print(f'EER: {test_eer:.3f}')
print(f'EER Threshold: {test_eer_threshold:.3f}')

Val Accuracy: 0.665
Val AUC: 0.642
Validation F1: 0.760
Validation Balanced Accuracy: 0.606
EER: 0.409
EER Threshold: 0.708

Test Accuracy: 0.639
Test AUC: 0.692
Test F1: 0.687
Test Balanced Accuracy: 0.663
EER: 0.353
EER Threshold: 0.695


### MIL-SVM

In [48]:
def create_mil_bags(features, labels, patient_slices):
    """Group instances into multiple-instance-learning bags with -1/1 labels.

    Args:
        features: Sequence of per-instance feature objects.
        labels: Sequence of instance labels aligned with ``features``.
        patient_slices: Sequence of grouping keys aligned with ``features``;
            instances sharing a key form one bag.

    Returns:
        tuple: ``(mil_bags, mil_label)``. ``mil_bags[i]`` is the list of
        instance features in bag ``i``; ``mil_label[i]`` is the label of that
        bag's first instance, with 0 remapped to -1. Bags follow the iteration
        order of ``DataFrame.groupby`` on the grouping key.
    """
    instances = pd.DataFrame({'Features': features, 'Labels': labels, 'Patient_Slices': patient_slices})

    mil_bags, mil_label = [], []
    for _, bag in instances.groupby('Patient_Slices'):
        # The first instance's label stands in for the whole bag; instances of
        # one bag are assumed (not checked) to share the same label.
        first_label = bag['Labels'].iloc[0]

        mil_bags.append(bag['Features'].tolist())
        # Negative bags are encoded as -1 rather than 0.
        mil_label.append(-1 if first_label == 0 else first_label)

    return mil_bags, mil_label

In [49]:
# Build the bags (lists of instance tensors) and -1/1 bag labels for each split.
# NOTE(review): this rebinds train_labels / test_labels / val_labels, replacing
# the labels produced by get_max_bag; the SVC cells above must not be re-run
# after this cell without first re-running the get_max_bag cell.
train_bags, train_labels = create_mil_bags(train_features_cpu, label_train_array, train_pat_id_date)
test_bags, test_labels = create_mil_bags(test_features_cpu, label_test_array, test_pat_id_date)
val_bags, val_labels = create_mil_bags(val_features_cpu, label_val_array, val_pat_id_date)

In [50]:
# MI-SVM with a linear kernel, trained on bags with -1/1 bag labels.
# max_iters presumably caps the number of "Iteration N..." rounds shown in the
# training log; confirm against the misvm documentation.
mi_svm = misvm.MISVM(kernel='linear', C=2, max_iters=30)
mi_svm.fit(train_bags, train_labels)

Non-random start...

Iteration 1...
Training SVM...
     pcost       dcost       gap    pres   dres
 0: -8.1366e+02 -1.3717e+00  3e+04  2e+02  2e-12
 1: -9.4948e+00 -1.3668e+00  4e+02  2e+00  2e-12
 2: -7.8504e-01 -1.2584e+00  2e+01  1e-01  1e-13
 3: -3.4353e-01 -1.0535e+00  2e+00  8e-03  8e-15
 4: -2.8851e-01 -5.0310e-01  2e-01  3e-04  1e-14
 5: -2.9403e-01 -3.4956e-01  6e-02  5e-05  4e-15
 6: -2.9769e-01 -3.1409e-01  2e-02  9e-17  1e-14
 7: -2.9927e-01 -3.0375e-01  4e-03  2e-16  2e-14
 8: -2.9959e-01 -3.0239e-01  3e-03  2e-16  1e-13
 9: -2.9986e-01 -3.0056e-01  7e-04  3e-16  1e-13
10: -2.9994e-01 -3.0016e-01  2e-04  1e-16  6e-13
11: -2.9996e-01 -3.0001e-01  5e-05  1e-16  1e-12
12: -2.9997e-01 -2.9997e-01  9e-06  2e-16  4e-12
13: -2.9997e-01 -2.9997e-01  3e-07  4e-16  5e-12
Optimal solution found.
Recomputing classes...
Selector differences: 5974
Updating QP...

Iteration 2...
Training SVM...
     pcost       dcost       gap    pres   dres
 0: -5.9856e+02 -1.5274e+00  3e+04  2e+02  2e

In [51]:
# Real-valued bag scores on the validation set
mil_val_scores = mi_svm.predict(val_bags)
# Threshold the scores at 0. np.sign would emit 0 for a score of exactly 0,
# which is not one of the -1/1 bag labels and would break the binary F1 score.
mil_val_pred = np.where(np.asarray(mil_val_scores) >= 0, 1, -1)

fpr, tpr, thresholds = roc_curve(val_labels, mil_val_scores)

# Equal error rate: the ROC point where FPR is closest to FRR (= 1 - TPR).
# Locate it once and reuse the index for both the rate and its threshold.
val_eer_idx = np.nanargmin(np.absolute((1 - tpr) - fpr))
val_eer_threshold = thresholds[val_eer_idx]
val_eer = fpr[val_eer_idx]  # Approximately equal to FRR at this point

print('Validation Balanced Accuracy:', balanced_accuracy_score(val_labels, mil_val_pred))
print('Validation F1 Score:', f1_score(val_labels, mil_val_pred))
print('Validation AUC:', roc_auc_score(val_labels, mil_val_scores))
print('Validation EER:', val_eer)
print('Validation EER Threshold:', val_eer_threshold)

mil_test_scores = mi_svm.predict(test_bags)
# Apply the validation-selected EER threshold to the test scores.
# roc_curve counts a sample as positive when score >= threshold, so use >=
# to reproduce the operating point that was chosen on the validation set.
mil_test_pred = np.where(np.asarray(mil_test_scores) >= val_eer_threshold, 1, -1)

fpr, tpr, thresholds = roc_curve(test_labels, mil_test_scores)

# EER on the test ROC curve (reported only; not used to pick the threshold)
test_eer_idx = np.nanargmin(np.absolute((1 - tpr) - fpr))
test_eer_threshold = thresholds[test_eer_idx]
test_eer = fpr[test_eer_idx]  # Approximately equal to FRR at this point

print('\nTest Balanced Accuracy:', balanced_accuracy_score(test_labels, mil_test_pred))
print('Test F1 Score:', f1_score(test_labels, mil_test_pred))
print('Test AUC:', roc_auc_score(test_labels, mil_test_scores))
print('Test EER:', test_eer)
print('Test EER Threshold:', test_eer_threshold)

Validation Balanced Accuracy: 0.6892208326900314
Validation F1 Score: 0.768141592920354
Validation AUC: 0.7489578508568782
Validation EER: 0.30708661417322836
Validation EER Threshold: 0.0072417440674475975

Test Balanced Accuracy: 0.6957564383275988
Test F1 Score: 0.7247956403269755
Test AUC: 0.7681043570940349
Test EER: 0.3140096618357488
Test EER Threshold: -0.007328331197580529


### NSK-SVM

In [56]:
# NSK (normalized set kernel) bag-level SVM from misvm, with a linear kernel,
# trained on the same bags and -1/1 bag labels as the MI-SVM above.
nsk_svm = misvm.NSK(kernel='linear', C=1)
nsk_svm.fit(train_bags, train_labels)

Setup QP...
Solving QP...
     pcost       dcost       gap    pres   dres
 0: -3.2159e+02 -2.3754e+00  1e+04  1e+02  1e-10
 1: -4.2930e+00 -2.3404e+00  1e+02  1e+00  1e-10
 2: -6.2005e-01 -2.1709e+00  9e+00  7e-02  7e-12
 3: -3.8982e-01 -1.5925e+00  2e+00  1e-02  1e-12
 4: -3.4010e-01 -1.0322e+00  1e+00  4e-03  4e-13
 5: -3.5563e-01 -4.8956e-01  1e-01  4e-04  1e-13
 6: -3.8237e-01 -4.3542e-01  6e-02  1e-04  9e-14
 7: -3.9452e-01 -4.1266e-01  2e-02  2e-05  1e-13
 8: -3.9909e-01 -4.0446e-01  5e-03  3e-06  1e-13
 9: -4.0059e-01 -4.0201e-01  1e-03  4e-07  1e-13
10: -4.0111e-01 -4.0123e-01  1e-04  1e-08  1e-13
11: -4.0116e-01 -4.0116e-01  7e-06  7e-10  1e-13
12: -4.0116e-01 -4.0116e-01  7e-07  4e-11  1e-13
13: -4.0116e-01 -4.0116e-01  8e-08  4e-12  1e-13
Optimal solution found.


In [57]:
# Real-valued bag scores on the validation set
mil_val_scores = nsk_svm.predict(val_bags)
# Threshold the scores at 0. np.sign would emit 0 for a score of exactly 0,
# which is not one of the -1/1 bag labels and would break the binary F1 score.
mil_val_pred = np.where(np.asarray(mil_val_scores) >= 0, 1, -1)

fpr, tpr, thresholds = roc_curve(val_labels, mil_val_scores)

# Equal error rate: the ROC point where FPR is closest to FRR (= 1 - TPR).
# Locate it once and reuse the index for both the rate and its threshold.
val_eer_idx = np.nanargmin(np.absolute((1 - tpr) - fpr))
val_eer_threshold = thresholds[val_eer_idx]
val_eer = fpr[val_eer_idx]  # Approximately equal to FRR at this point

print('Validation Balanced Accuracy:', balanced_accuracy_score(val_labels, mil_val_pred))
print('Validation F1 Score:', f1_score(val_labels, mil_val_pred))
print('Validation AUC:', roc_auc_score(val_labels, mil_val_scores))
print('Validation EER:', val_eer)
print('Validation EER Threshold:', val_eer_threshold)

mil_test_scores = nsk_svm.predict(test_bags)
# Apply the validation-selected EER threshold (rather than 0) to the test scores.
# roc_curve counts a sample as positive when score >= threshold, so use >=
# to reproduce the operating point that was chosen on the validation set.
mil_test_pred = np.where(np.asarray(mil_test_scores) >= val_eer_threshold, 1, -1)

fpr, tpr, thresholds = roc_curve(test_labels, mil_test_scores)

# EER on the test ROC curve (reported only; not used to pick the threshold)
test_eer_idx = np.nanargmin(np.absolute((1 - tpr) - fpr))
test_eer_threshold = thresholds[test_eer_idx]
test_eer = fpr[test_eer_idx]  # Approximately equal to FRR at this point

print('\nTest Balanced Accuracy:', balanced_accuracy_score(test_labels, mil_test_pred))
print('Test F1 Score:', f1_score(test_labels, mil_test_pred))
print('Test AUC:', roc_auc_score(test_labels, mil_test_scores))
print('Test EER:', test_eer)
print('Test EER Threshold:', test_eer_threshold)


Validation Balanced Accuracy: 0.7039910452369924
Validation F1 Score: 0.8449367088607594
Validation AUC: 0.8015542174875199
Validation EER: 0.2755905511811024
Validation EER Threshold: 0.5104375800373984

Test Balanced Accuracy: 0.7102144371459354
Test F1 Score: 0.7356948228882834
Test AUC: 0.7836976795375294
Test EER: 0.28019323671497587
Test EER Threshold: 0.3521497942331151


## Visualise ResNet18 Encodings + NSK-SVM Results

In [54]:
# Load pickled arrays
# NOTE(review): this re-reads the same ncimi_arrays_dict.pkl loaded earlier and
# rebinds label_train_array / label_val_array / label_test_array, which were
# previously taken from the ResNet encodings pickle. This cell's execution
# count is also lower than the NSK-SVM cells above it, so the saved outputs
# were not produced by a top-to-bottom run.
# NOTE: pickle.load can execute arbitrary code, so only load files from a trusted source.
with open(f'{ivd_arrays_path}/ncimi_arrays_dict.pkl', 'rb') as handle:
    ncimi_array_dict = pickle.load(handle)

# Training split: per-instance arrays and labels
ivd_train_array = ncimi_array_dict['ivd_train_array']
label_train_array = ncimi_array_dict['label_train_array']

# Validation split: per-instance arrays and labels
ivd_val_array = ncimi_array_dict['ivd_val_array']
label_val_array = ncimi_array_dict['label_val_array']

# Test split: per-instance arrays, labels, report metadata and instance names
ivd_test_array = ncimi_array_dict['ivd_test_array']
label_test_array = ncimi_array_dict['label_test_array']
label_test_report = ncimi_array_dict['label_test_report']
label_test_scores = ncimi_array_dict['label_test_scores']
label_test_con = ncimi_array_dict['label_test_con']
compare_test_array = ncimi_array_dict['compare_test_array']
ivd_test_array_names = ncimi_array_dict['ivd_test_array_names']

In [55]:
def mil_level_report(
    label_test_report, label_test_con, label_test_scores,
    compare_test_array, ivd_test_array, ivd_test_array_names,
    patient_slices):
    """Collapse per-instance report metadata into one entry per bag.

    All arguments are sequences aligned element-by-element; instances that
    share a value in ``patient_slices`` belong to the same bag.

    Returns:
        tuple: six lists with one element per bag, in the iteration order of
        ``DataFrame.groupby`` on the grouping key:
        ``(bag_report, bag_con, bag_score, bag_pred_label,
        bag_ivd_test_array, bag_names)``. The first four hold the value of
        the bag's first instance; the last two hold the list of all instance
        values in the bag.
    """
    # Columns reduced to the first instance's value for each bag.
    first_value_cols = ('Reports', 'Conclusions', 'Label Scores', 'Predicted Label')
    # Columns kept as the full per-bag list of instance values.
    whole_bag_cols = ('IVD Test Array', 'IVD Names')

    frame = pd.DataFrame({
        'Reports': label_test_report,
        'Conclusions': label_test_con,
        'Label Scores': label_test_scores,
        'Predicted Label': compare_test_array,
        'IVD Test Array': ivd_test_array,
        'IVD Names': ivd_test_array_names,
        'Patient Slices': patient_slices,
    })

    collected = {col: [] for col in first_value_cols + whole_bag_cols}
    for _, bag in frame.groupby('Patient Slices'):
        for col in first_value_cols:
            collected[col].append(bag[col].iloc[0])
        for col in whole_bag_cols:
            collected[col].append(bag[col].tolist())

    return tuple(collected[col] for col in first_value_cols + whole_bag_cols)

# Collapse the per-instance test metadata into one entry per bag, grouping by
# the same test_pat_id_date keys used to build the test bags, so the entries
# should line up with test_labels and the bag-level predictions.
(df_lab_test_report, 
 df_lab_test_con, 
 df_lab_test_scores, 
 df_lab_pred_labels, 
 bag_ivd_test_array, 
 bag_of_names) = mil_level_report(label_test_report, 
                                  label_test_con, 
                                  label_test_scores, 
                                  compare_test_array, 
                                  ivd_test_array, 
                                  ivd_test_array_names, 
                                  test_pat_id_date)

In [None]:
# # print outputs at each step 
# counter = 0
# counter_spec = 3
# zipped_results = list(zip(df_lab_test_report, df_lab_test_con, df_lab_test_scores, df_lab_pred_labels, test_labels, mil_test_pred, mil_test_scores, bag_ivd_test_array, bag_of_names))

# for report, pred_con, pred_report_score, pred_report_label, true_report_label, pred_scan_label, pred_scan_score, ivd, ivd_name in zipped_results:
#     counter += 1
#     if counter > 7:
#         if pred_report_label == 1 and true_report_label == 1 and pred_scan_label == 1:
#             print(f'Report: {report}')
#             print(f'Conclusion: {pred_con}')
#             print(f'Predicted Report Score: {pred_report_score}')
#             print(f'Predicted Report Label: {pred_report_label}')
#             print(f'True Report Label: {true_report_label}')
#             print(f'Predicted Scan Label: {pred_scan_label}')
#             print(f'Predicted Scan Score: {pred_scan_score}')
#             # Plot ivd
#             for i in range(len(ivd)):
#                 print(f'IVD: {ivd_name[i]}')
#                 plt.imshow(ivd[i][3,:,:], cmap='gray')
#                 plt.show()
#             print('\n')
#             counter_spec-=1
#     elif counter_spec == 0:
#         break