**Model Instantiation and Loading Saved Weights**

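Restores the best trained multi-task model checkpoint from Google Drive, moves the model to the available device (GPU if present, otherwise CPU), and switches it to evaluation mode for testing.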

In [None]:
# Pick the compute device first: CUDA when one is visible, CPU otherwise
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the multi-task patient classifier and place it on that device
model = CNN_RNN_PatientClassifier_MTL(in_channels=3).to(device)

# Restore the best multi-task checkpoint saved during training;
# map_location remaps the stored tensors onto the device chosen above
ckpt_path = '/content/drive/MyDrive/MSc_project/best_patient_model_mtl.pth'
model.load_state_dict(torch.load(ckpt_path, map_location=device))

# Put the model into evaluation mode (kept as the final expression of the cell)
model.eval()

**Run Test Set Evaluation**

In [None]:
@torch.no_grad()
def evaluate_test_set(model, loader, device, *, threshold=0.5):
    """
    Evaluate the model on the test set, print metrics, plot a confusion matrix,
    and return the collected arrays for further analysis.

    Args:
        model: Network called as ``model(batch_x)``. It must return a pair whose
            first element holds the classification logits; the second element
            is ignored here.
        loader: Iterable yielding ``(batch_x, batch_y, batch_mask, lengths)``
            tuples. Only ``batch_x`` and ``batch_y`` are used.
        device: Device the input batch is moved to before the forward pass.
        threshold: Probability cut-off; samples whose class-1 probability is
            greater than or equal to it are predicted as class 1.
            Defaults to 0.5.

    Returns:
        Tuple ``(all_y_true, all_y_pred, all_y_prob)`` of NumPy arrays holding
        the true labels, the thresholded predictions, and the class-1
        probabilities, in loader order.
    """
    model.eval()
    all_y_true, all_y_prob = [], []

    # The mask and lengths are part of each batch but are not needed here.
    for batch_x, batch_y, _batch_mask, _lengths in tqdm(
        loader, desc="Evaluating on Test Set", leave=False
    ):
        logits, _ = model(batch_x.to(device))

        # One label per sample: the maximum of batch_y along dim 1.
        # NOTE(review): presumably batch_y holds per-slice labels, so a patient
        # is positive if any slice is positive -- confirm against the dataset.
        # The labels never feed the model, so they are not moved to `device`.
        true_patient_labels = torch.max(batch_y, dim=1).values.long()

        # Probability of class 1 (column 1 of the softmax over dim 1).
        probs = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()

        all_y_true.extend(true_patient_labels.cpu().numpy())
        all_y_prob.extend(probs)

    all_y_true = np.array(all_y_true)
    all_y_prob = np.array(all_y_prob)
    all_y_pred = (all_y_prob >= threshold).astype(int)

    # Print metrics
    acc, sens, spec, auc = compute_metrics(all_y_true, all_y_pred, all_y_prob)
    print("\nTest Set Evaluation Results:")
    print(f"Test Patient AUC: {auc:.4f}")
    print(f"Test Patient Accuracy: {acc:.4f}")
    print(f"Test Patient Sensitivity (Recall): {sens:.4f}")
    print(f"Test Patient Specificity: {spec:.4f}")

    # Confusion Matrix Visualization
    cm = confusion_matrix(all_y_true, all_y_pred, labels=[0, 1])
    cm_sum = np.sum(cm)
    # Guard the percentage computation: an all-zero matrix would otherwise
    # divide by zero and annotate every cell with NaN.
    if cm_sum:
        cm_percent = cm / cm_sum * 100
    else:
        cm_percent = np.zeros(cm.shape)

    plt.figure(figsize=(6, 5))
    ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                     xticklabels=['No Bleed', 'Bleed'],
                     yticklabels=['No Bleed', 'Bleed'],
                     cbar=False)
    # Write each cell's share of all samples just below its raw count.
    for (i, j), pct in np.ndenumerate(cm_percent):
        ax.text(j + 0.5, i + 0.8, f"{pct:.1f}%",
                ha='center', va='center', color='gray', fontsize=10)
    plt.title('Confusion Matrix on Test Set')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.tight_layout()
    plt.show()

    return all_y_true, all_y_pred, all_y_prob

# Main execution: rebuild the classifier from the best checkpoint and score the test set
ckpt_path = '/content/drive/MyDrive/MSc_project/best_patient_model_mtl.pth'

print("Loading the best model for final evaluation...")

# Fresh model instance, weights restored from the checkpoint, then placed on the device
evaluation_model = CNN_RNN_PatientClassifier_MTL(in_channels=3)
evaluation_model.load_state_dict(
    torch.load(ckpt_path, map_location=device)
)
evaluation_model = evaluation_model.to(device)

# Run the test-set evaluation and keep the labels, predictions and probabilities
test_outputs = evaluate_test_set(evaluation_model, test_loader, device)
y_true, y_pred, y_prob = test_outputs