In [None]:
import re

import pandas as pd
import torch
from huggingface_hub import login
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
# Log in to the Hugging Face Hub so the stored token can be used for the
# downloads below.
login()
# Load model and tokenizer for the medical expert LLM
model_name = "meta-llama/Llama-3.2-3B-Instruct"
# `token=True` tells from_pretrained to use the locally stored Hub token; it
# replaces the deprecated `use_auth_token` keyword.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="auto",
    token=True,
)

# Single shared text-generation pipeline used by the evaluation functions.
llm = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Function to assess question quality
def evaluate_question_quality(dialogue):
    """Ask the LLM to rate the quality of the questions asked in a dialogue.

    Args:
        dialogue: Text of the patient-doctor dialogue. It is interpolated
            into the prompt verbatim.

    Returns:
        The model's generated answer (without the prompt), stripped of
        leading and trailing whitespace. The prompt requests a 1-10 score and
        a brief explanation, but the answer format is not enforced here.
    """
    prompt = f"""
    You are a senior medical expert. Based on the following patient-doctor dialogue, rate the quality of the questions asked:
    "{dialogue}"
    The quality of the questions should be evaluated based on their clarity, relevance, and how well they help in diagnosing the patient.
    Provide a score from 1 (poor) to 10 (excellent) and a brief explanation.
    """
    # By default the text-generation pipeline returns the prompt followed by
    # the completion. Asking for the completion only keeps the instruction
    # text (which itself mentions "1" and "10") out of the returned answer.
    outputs = llm(
        prompt,
        max_new_tokens=100,
        do_sample=False,
        return_full_text=False,
    )
    response = outputs[0]['generated_text']
    return response.strip()

# Function to assess the correctness of the assigned specialist
def evaluate_specialist(dialogue, assigned_specialist):
    """Ask the LLM whether the assigned specialist fits the dialogue.

    Args:
        dialogue: Text of the patient-doctor dialogue. It is interpolated
            into the prompt verbatim.
        assigned_specialist: The specialist label to be judged. It is
            interpolated into the prompt verbatim.

    Returns:
        The model's generated answer (without the prompt), stripped of
        leading and trailing whitespace. The prompt requests "Correct" or
        "Incorrect" plus a brief explanation, but the answer format is not
        enforced here.
    """
    prompt = f"""
    You are a senior medical expert. Given the following patient-doctor dialogue, assess if the assigned specialist is correct:
    Dialogue: "{dialogue}"
    Assigned Specialist: {assigned_specialist}
    Based on the patient's condition described in the dialogue, is this the correct specialist to be assigned? Answer with "Correct" or "Incorrect" and provide a brief explanation.
    """
    # By default the text-generation pipeline returns the prompt followed by
    # the completion. The prompt contains the literal word "Correct", so an
    # echoed prompt would make every answer look like a positive verdict.
    outputs = llm(
        prompt,
        max_new_tokens=100,
        do_sample=False,
        return_full_text=False,
    )
    response = outputs[0]['generated_text']
    return response.strip()

# Function to compute evaluation score for each case
# Explicitly labelled scores: "Score: 8", "rating of 7", "8/10", "8 out of 10".
_LABELLED_SCORE_RE = re.compile(
    r"\b(?:score|rating|rate)\b\D{0,20}?\b(10|[1-9])\b"
    r"|\b(10|[1-9])\s*(?:/|out of)\s*10\b",
    re.IGNORECASE,
)
# Fallback: the first standalone integer in the range 1-10.
_BARE_SCORE_RE = re.compile(r"\b(10|[1-9])\b")
# First verdict word; group 1 is set only for the negated forms.
_VERDICT_RE = re.compile(r"\b(in|not\s+)?correct\b", re.IGNORECASE)


def _parse_question_score(text):
    """Return the 1-10 score found in *text*, or 0 when none is present."""
    match = _LABELLED_SCORE_RE.search(text) or _BARE_SCORE_RE.search(text)
    if match is None:
        return 0
    # Only one alternative's capture group participates in a match;
    # lastindex identifies it.
    return int(match.group(match.lastindex))


def _parse_specialist_score(text):
    """Return 1 if the first verdict word in *text* is affirmative, else 0."""
    verdict = _VERDICT_RE.search(text)
    if verdict is None:
        return 0
    return 0 if verdict.group(1) else 1


def evaluate_case(row):
    """Evaluate one case and derive numeric scores from the LLM answers.

    Args:
        row: Mapping-like record with the keys 'Dialogue' and
            'Assigned Specialist' (for example a DataFrame row).

    Returns:
        A 4-tuple ``(question_score, specialist_score, question_quality,
        specialist_evaluation)``:

        * ``question_score`` -- integer 1-10 parsed from the question-quality
          answer, or 0 when no score can be found (including an empty answer).
        * ``specialist_score`` -- 1 when the first verdict word in the
          specialist answer is "correct" (any letter case), 0 for
          "incorrect", "not correct", or when no verdict word is present.
        * ``question_quality`` / ``specialist_evaluation`` -- the raw answer
          texts the scores were parsed from.

    Note:
        Parsing looks at the whole answer text. It assumes the texts hold
        only the model's answer; any other text included in them is parsed
        as well.
    """
    dialogue = row['Dialogue']
    assigned_specialist = row['Assigned Specialist']

    # Evaluate the question quality and specialist correctness
    question_quality = evaluate_question_quality(dialogue)
    specialist_evaluation = evaluate_specialist(dialogue, assigned_specialist)

    # Generate a score for each
    question_score = _parse_question_score(question_quality)
    specialist_score = _parse_specialist_score(specialist_evaluation)

    # Return the results
    return question_score, specialist_score, question_quality, specialist_evaluation

# Read the labeled cases that are going to be evaluated
df = pd.read_csv("/content/your_labeled_data.csv")

# Run the evaluator on every row; the four values returned per row are
# spread over these new columns, in this order
result_columns = [
    'Question Quality Score',
    'Specialist Score',
    'Question Quality Explanation',
    'Specialist Evaluation',
]
df[result_columns] = df.apply(evaluate_case, axis=1, result_type="expand")

# Mean of the per-case specialist scores, expressed as a percentage
specialist_scores = df['Specialist Score']
accuracy = specialist_scores.mean() * 100

# Mean of the per-case question quality scores
question_scores = df['Question Quality Score']
average_question_quality = question_scores.mean()

# Write the input rows together with the evaluation columns to a new CSV
df.to_csv(
    "/content/evaluated_results_with_question_quality.csv",
    index=False,
)

# Report the aggregate figures
print(f"✅ Evaluation complete. Overall specialist accuracy: {accuracy:.2f}%")
print(f"✅ Average question quality score: {average_question_quality:.2f}")

