# AI Model Evaluation Notebook

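This notebook evaluates the TherapyBeagle model through its local API endpoint, scoring its responses with perplexity and BERTScore on the Kaggle therapy-conversations dataset and the DAIC-WOZ dataset.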

In [1]:
# Install required libraries
# NOTE(review): none of these installs pin a version, and matplotlib/seaborn
# (imported by the visualization cell) are not installed here - confirm the
# environment already provides them.
# LangChain packages
%pip install langchain langchain_community
# Metric libraries plus the HTTP and dataframe helpers used by the evaluation cells
%pip install evaluate rouge_score bert-score requests pandas
# PyTorch packages, resolved against the CUDA 12.4 wheel index
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
# Hugging Face Transformers (source of the tokenizer/model classes imported below)
%pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://download.pytorch.org/whl/cu124
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
import requests
import pandas as pd
import evaluate
import os
from transformers import AutoTokenizer, GPT2LMHeadModel

# Device Configuration
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if DEVICE.type == "cuda":
    # Select GPU 0 only when CUDA is actually usable: calling
    # torch.cuda.set_device on a machine without CUDA raises instead of
    # honouring the CPU fallback chosen above.
    torch.cuda.set_device(0)
print(f"Using device: {DEVICE}")

Using device: cuda


In [6]:
# Load tokenizer and language model for perplexity calculation.
# The tokenizer and the scoring model are loaded from the same checkpoint so
# that the token ids produced by the tokenizer match the model's vocabulary.
# NOTE(review): GPT-2 is chosen because GPT2LMHeadModel is the only model class
# this notebook imports; change PERPLEXITY_MODEL_NAME (and the model class) if a
# different scoring model is intended.
PERPLEXITY_MODEL_NAME = "gpt2"

tokenizer = AutoTokenizer.from_pretrained(PERPLEXITY_MODEL_NAME)
# The tokenizer needs a pad token for padded calls; reuse the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token

# `model` is the global read by calculate_perplexity; keep it on DEVICE and in
# eval mode so scoring is deterministic (no dropout).
model = GPT2LMHeadModel.from_pretrained(PERPLEXITY_MODEL_NAME).to(DEVICE)
model.eval()

In [None]:
# Configuration
# Root URL of the chat service; the helper functions below append their
# endpoint paths ("/user/create", "/user/delete", "/chat") to it.
BASE_URL = "http://localhost:5092"
# Model identifier handed to the BERTScore metric as `model_type`.
BERT_MODEL = "distilbert-base-uncased"

# Evaluation Helper Functions
def create_user(timeout: float = 30.0) -> str:
    """Create a new test user on the chat service and return its ID.

    Args:
        timeout: Seconds to wait for the service before giving up, so a stalled
            server cannot hang the whole evaluation run.

    Returns:
        The value of the "id" field in the service's JSON reply.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            service answers with an HTTP error status.
    """
    response = requests.post(
        f"{BASE_URL}/user/create",
        json={
            "name": "Test User",
            "emotion": "Neutral",
            "pronouns": "They/Them"
        },
        timeout=timeout,
    )
    # Surface HTTP failures directly instead of as an opaque KeyError/JSON error below.
    response.raise_for_status()
    return response.json()["id"]

def delete_user(user_id: str, timeout: float = 30.0) -> None:
    """Delete a user on the chat service (best-effort cleanup).

    Args:
        user_id: ID of the user to delete.
        timeout: Seconds to wait for the service before giving up.

    An HTTP error status is reported with a warning rather than raised, because
    this is called from cleanup code and should not mask the original error.
    """
    response = requests.post(
        f"{BASE_URL}/user/delete",
        json={"id": user_id},
        timeout=timeout,
    )
    if not response.ok:
        print(f"Warning: failed to delete user {user_id} (HTTP {response.status_code})")

: 

In [5]:
def get_chat_response(user_id: str, message: str, timeout: float = 300.0) -> str:
    """Send a message to the chat endpoint and return the model's reply.

    Args:
        user_id: ID of the user the message is sent on behalf of.
        message: Text to send to the model.
        timeout: Seconds to wait for the reply. The default is generous because
            text generation can be slow; it only exists to stop a stalled
            server from blocking the run forever.

    Returns:
        The value of the "response" field in the service's JSON reply.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            service answers with an HTTP error status.
    """
    response = requests.post(
        f"{BASE_URL}/chat",
        json={
            "user_id": user_id,
            "message": message
        },
        timeout=timeout,
    )
    # Fail with the HTTP status rather than a KeyError on an error payload.
    response.raise_for_status()
    return response.json()["response"]

In [None]:
def calculate_perplexity(text: str) -> float:
    """Calculate the perplexity of ``text`` under the global ``model``.

    The text is tokenized with the global ``tokenizer`` (truncated to the
    tokenizer's maximum length) and scored with the language-modelling loss;
    perplexity is ``exp(loss)``.

    Args:
        text: Text to score.

    Returns:
        The perplexity as a float, or ``nan`` when the text yields fewer than
        two tokens (there is then no next-token prediction to score).
    """
    # Run on whatever device the model lives on, so CPU-created token tensors
    # never get mixed with a GPU-resident model.
    device = next(model.parameters()).device

    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    input_ids = inputs["input_ids"].to(device)
    if input_ids.shape[-1] < 2:
        return float("nan")

    attention_mask = inputs.get("attention_mask")
    if attention_mask is not None:
        attention_mask = attention_mask.to(device)

    with torch.no_grad():
        loss = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=input_ids,
        ).loss
    return torch.exp(loss).item()

In [None]:
# Lazily created BERTScore metric, shared by every calculate_bert_score call.
_BERTSCORE_METRIC = None


def _get_bertscore_metric():
    """Return the BERTScore metric, loading it on first use only."""
    global _BERTSCORE_METRIC
    if _BERTSCORE_METRIC is None:
        _BERTSCORE_METRIC = evaluate.load("bertscore")
    return _BERTSCORE_METRIC


def calculate_bert_score(prediction: str, reference: str) -> dict:
    """Calculate BERTScore between a prediction and a reference.

    Args:
        prediction: Text produced by the model under evaluation.
        reference: Ground-truth text to compare against.

    Returns:
        A dict with the keys "precision", "recall" and "f1" holding the scores
        for this single prediction/reference pair.
    """
    # The metric object is reused across calls; reloading it for every row is
    # loop-invariant work.
    results = _get_bertscore_metric().compute(
        predictions=[prediction],
        references=[reference],
        lang="en",
        model_type=BERT_MODEL
    )
    return {
        "precision": results["precision"][0],
        "recall": results["recall"][0],
        "f1": results["f1"][0]
    }

## Kaggle Dataset Evaluation

In [None]:
# Load Kaggle Dataset
kaggle_df = pd.read_csv('kaggle-therapy-conversations - Copy.csv')

# Fixed column layout for the results table, so the frame (and the CSV header)
# has the expected columns even when no row could be evaluated.
KAGGLE_RESULT_COLUMNS = [
    "context",
    "reference_response",
    "model_response",
    "perplexity",
    "bert_precision",
    "bert_recall",
    "bert_f1",
]

# A row without a prompt or without a reference answer cannot be scored;
# drop such rows up front instead of spending an API round trip on them.
# The original index labels are kept, so error messages still point at the
# row numbers of the loaded file.
kaggle_eval_df = kaggle_df.dropna(subset=['Context', 'Response'])

# Evaluation for Kaggle Dataset
kaggle_results = []

for index, row in kaggle_eval_df.iterrows():
    # Create a new user for each evaluation
    user_id = create_user()

    try:
        # Get model response
        model_response = get_chat_response(user_id, row['Context'])

        # Calculate metrics
        perplexity_model = calculate_perplexity(model_response)
        bert_scores = calculate_bert_score(model_response, row['Response'])

        # Store results
        kaggle_results.append({
            "context": row['Context'],
            "reference_response": row['Response'],
            "model_response": model_response,
            "perplexity": perplexity_model,
            "bert_precision": bert_scores['precision'],
            "bert_recall": bert_scores['recall'],
            "bert_f1": bert_scores['f1']
        })

    except Exception as e:
        # Keep going: one failing row should not abort the whole evaluation.
        print(f"Error processing row {index}: {e}")

    finally:
        # Always delete the user
        delete_user(user_id)

# Convert results to DataFrame
kaggle_results_df = pd.DataFrame(kaggle_results, columns=KAGGLE_RESULT_COLUMNS)
kaggle_results_df.to_csv('kaggle_evaluation_results.csv', index=False)

## DAIC-WOZ Dataset Evaluation

In [None]:
def process_daic_woz_file(file_path, sep=","):
    """Turn one DAIC-WOZ transcript file into context/response pairs.

    Consecutive 'Participant' utterances are joined with spaces into a context;
    the next 'Ellie' utterance becomes the response for that context and starts
    a new, empty context.

    Args:
        file_path: Path to a delimited text file with 'speaker' and 'value' columns.
        sep: Column delimiter passed to ``pandas.read_csv``.
            NOTE(review): raw DAIC-WOZ transcripts are commonly tab-delimited;
            pass a tab as ``sep`` if that applies to the files in use - confirm.

    Returns:
        A list of dicts with the keys 'context' and 'response'. Ellie turns
        with no preceding participant speech, and turns whose text is missing,
        are skipped because they cannot be evaluated.
    """
    df = pd.read_csv(file_path, sep=sep)

    conversations = []
    context_parts = []

    # zip over the two columns avoids the per-row Series that iterrows builds.
    for speaker, value in zip(df['speaker'], df['value']):
        if speaker == 'Participant':
            # Missing utterances are read as NaN (a float); skip them rather
            # than failing on string concatenation.
            if pd.notna(value):
                context_parts.append(str(value))
        elif speaker == 'Ellie':
            context = " ".join(context_parts).strip()
            context_parts = []
            if context and pd.notna(value):
                conversations.append({
                    'context': context,
                    'response': str(value)
                })

    return conversations

In [None]:
# Iterate through DAIC-WOZ files
daic_woz_results = []
daic_woz_directory = './daic_woz_files/'  # Update with your directory path

# Fixed column layout for the results table, so the frame (and the CSV header)
# has the expected columns even when no conversation could be evaluated.
DAIC_WOZ_RESULT_COLUMNS = [
    "filename",
    "context",
    "reference_response",
    "model_response",
    "perplexity",
    "bert_precision",
    "bert_recall",
    "bert_f1",
]

# os.listdir returns entries in arbitrary order; sort so repeated runs process
# the files (and write the result rows) in the same order.
for filename in sorted(os.listdir(daic_woz_directory)):
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(daic_woz_directory, filename)
    conversations = process_daic_woz_file(file_path)

    for conv in conversations:
        # An empty context means there is no participant speech to respond to,
        # so there is nothing meaningful to send to the model.
        if not conv['context']:
            continue

        # Create a new user for each evaluation
        user_id = create_user()

        try:
            # Get model response
            model_response = get_chat_response(user_id, conv['context'])

            # Calculate metrics
            perplexity_model = calculate_perplexity(model_response)
            bert_scores = calculate_bert_score(model_response, conv['response'])

            # Store results
            daic_woz_results.append({
                "filename": filename,
                "context": conv['context'],
                "reference_response": conv['response'],
                "model_response": model_response,
                "perplexity": perplexity_model,
                "bert_precision": bert_scores['precision'],
                "bert_recall": bert_scores['recall'],
                "bert_f1": bert_scores['f1']
            })

        except Exception as e:
            # Keep going: one failing conversation should not abort the run.
            print(f"Error processing conversation in {filename}: {e}")

        finally:
            # Always delete the user
            delete_user(user_id)

# Convert results to DataFrame
daic_woz_results_df = pd.DataFrame(daic_woz_results, columns=DAIC_WOZ_RESULT_COLUMNS)
daic_woz_results_df.to_csv('daic_woz_evaluation_results.csv', index=False)

## Analysis and Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Perplexity histograms (with KDE overlay), one panel per dataset, side by side
fig, (kaggle_ax, daic_woz_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: Kaggle results
sns.histplot(kaggle_results_df['perplexity'], kde=True, ax=kaggle_ax)
kaggle_ax.set_title('Perplexity Distribution - Kaggle Dataset')

# Right panel: DAIC-WOZ results
sns.histplot(daic_woz_results_df['perplexity'], kde=True, ax=daic_woz_ax)
daic_woz_ax.set_title('Perplexity Distribution - DAIC-WOZ Dataset')

# Tidy the spacing, then write the figure to disk
fig.tight_layout()
fig.savefig('perplexity_distribution.png')

In [None]:
# Summary statistics (describe()) of every metric column, per dataset
metric_columns = ['perplexity', 'bert_precision', 'bert_recall', 'bert_f1']

print("Kaggle Dataset Metrics:")
kaggle_summary = kaggle_results_df[metric_columns].describe()
print(kaggle_summary)

print("\nDAIC-WOZ Dataset Metrics:")
daic_woz_summary = daic_woz_results_df[metric_columns].describe()
print(daic_woz_summary)