# Mental Health Crisis Detection Evaluation
This notebook handles the evaluation of the trained model. 

## Key Metrics
- **F1 Score**: Balanced measure of precision and recall, crucial for imbalanced datasets
- **Recall**: Particularly important to minimize false negatives in suicide detection
- **Batch Processing** (implementation detail, not a metric): inference runs in batches (default size 256) for efficient GPU utilization

In [1]:
from transformers import pipeline
from sklearn.metrics import f1_score, recall_score
import torch
import numpy as np
import pandas as pd

In [2]:
def evaluate_model(model, tokenizer, X_test, y_test, label_encoder, batch_size=256):
    """Evaluate a fine-tuned sequence-classification model on test data.

    Runs batched inference over ``X_test``, takes the argmax of the logits as
    the predicted class, and scores the predictions against ``y_test``.

    Args:
        model: Fine-tuned BERT model; called as ``model(**inputs)`` and must
            return an object exposing ``.logits``.
        tokenizer: BERT tokenizer matching ``model``.
        X_test: Text samples. May be a plain list or any sliceable container
            (e.g. a pandas Series or NumPy array).
        y_test: Ground truth labels, in the form ``label_encoder`` was fitted on.
        label_encoder: Fitted LabelEncoder for class mapping.
        batch_size: Number of samples per forward pass (must be >= 1).

    Returns:
        f1: Macro-averaged F1 score.
        recall: Recall score. With two classes this is the recall of the
            class encoded as 1 (sklearn's ``average="binary"``); with more
            than two classes it is macro-averaged.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    on_gpu = device.type == "cuda"

    predictions = []

    # Batch processing to handle large datasets and prevent GPU OOM
    for start in range(0, len(X_test), batch_size):
        batch = X_test[start:start + batch_size]
        # pandas/NumPy slices expose .tolist(); plain Python lists do not.
        batch_texts = batch.tolist() if hasattr(batch, "tolist") else list(batch)

        # Tokenization with truncation/padding for BERT input
        inputs = tokenizer(
            batch_texts,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512,  # Matches BERT's max sequence length
        ).to(device)

        # Disable gradient calculation for inference
        with torch.no_grad():
            logits = model(**inputs).logits

        # Get class predictions from logits
        predictions.extend(torch.argmax(logits, dim=-1).cpu().tolist())

        # Release cached GPU blocks between batches; pointless on CPU.
        if on_gpu:
            torch.cuda.empty_cache()

    # Convert string labels to encoded integers
    y_test_encoded = label_encoder.transform(y_test)

    # sklearn's default average="binary" raises on multiclass targets and is
    # not the macro F1 this function documents, so the averaging is explicit.
    f1 = f1_score(y_test_encoded, predictions, average="macro")
    # Keep positive-class recall for two-class problems (the false-negative
    # signal); fall back to macro averaging when there are more classes.
    recall_average = "binary" if len(label_encoder.classes_) <= 2 else "macro"
    recall = recall_score(y_test_encoded, predictions, average=recall_average)

    print(f"F1 Score: {f1}")
    print(f"Recall: {recall}")
    return f1, recall