<a href="https://colab.research.google.com/github/ssheikhorg/dissertation/blob/main/models_evaluations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install the required packages

In [15]:
# Install uv in Colab's default environment; it is used by the next cell for `uv pip install`.
# NOTE(review): this pipes a remote install script straight into `sh`; the recorded
# output of this cell reports "no checksums to verify".
!curl -LsSf https://astral.sh/uv/install.sh | sh
!uv --version

# Print the Python version so the runtime is recorded alongside the outputs.
!python --version

downloading uv 0.8.15 x86_64-unknown-linux-gnu
no checksums to verify
installing to /usr/local/bin
  uv
  uvx
everything's installed!
uv 0.8.15
Python 3.12.11


In [16]:
# Install the notebook's dependencies with uv.
# Only numpy and scikit-learn are pinned; every other package is listed without a
# version and the command is run with -U.
# NOTE(review): unpinned packages make re-runs non-reproducible — consider pinning.
!uv pip install -q -U \
  numpy==2.0.2 \
  scikit-learn==1.5.2 \
  transformers \
  accelerate \
  bitsandbytes \
  pillow \
  sentence-transformers \
  faiss-cpu \
  datasets \
  pandas \
  matplotlib \
  seaborn \
  plotly \
  tqdm \
  openai \
  anthropic \
  boto3 \
  langchain \
  langchain-huggingface \
  huggingface_hub \
  langchain-community \
  transformers_stream_generator

# Import libraries and add global variables

In [17]:
from huggingface_hub import login
from google.colab import userdata
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Login to Hugging Face with your token
login(token=userdata.get('HF_TOKEN'))

import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import json
import re
import glob
import base64
from tqdm import tqdm
from io import BytesIO
from datetime import datetime
from PIL import Image
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    BitsAndBytesConfig
)
# Fixed imports for LangChain compatibility
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Report whether a CUDA GPU is visible and, if so, the name of device 0.
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# Global configuration shared by the cells below.
TEMPERATURE = 0.3  # sampling temperature passed to the text-generation calls
MAX_TOKENS = 1000  # NOTE(review): not referenced in the visible code, which hard-codes 384 new tokens
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # NOTE(review): not referenced in the visible code
ENABLE_RAG = True  # passed as use_rag when evaluation prompts are answered
RAG_TOP_K = 3  # number of documents requested from similarity_search per question

# Medical test data: a small hand-written QA set. Each item has a "question",
# a "reference" answer that responses are scored against, and a "category" label.
MEDICAL_PROMPTS = [
    {
        "question": "What are the common symptoms of diabetes?",
        "reference": "Common diabetes symptoms include increased thirst, frequent urination, extreme fatigue, blurred vision, and slow healing of cuts or wounds.",
        "category": "endocrinology"
    },
    {
        "question": "How does aspirin work in the body?",
        "reference": "Aspirin works by inhibiting cyclooxygenase enzymes, reducing the production of prostaglandins that cause pain, inflammation, and fever. It also has antiplatelet effects.",
        "category": "pharmacology"
    },
    {
        "question": "What is hypertension and what are its risk factors?",
        "reference": "Hypertension, or high blood pressure, is a condition where the force of blood against artery walls is too high. Risk factors include age, family history, obesity, lack of exercise, tobacco use, high sodium diet, and stress.",
        "category": "cardiology"
    },
    {
        "question": "What are the main functions of the liver?",
        "reference": "The liver performs several vital functions including detoxification of chemicals, protein synthesis, production of biochemicals necessary for digestion, glycogen storage, and decomposition of red blood cells.",
        "category": "gastroenterology"
    },
    {
        "question": "What are the common symptoms of COVID-19?",
        "reference": "Common COVID-19 symptoms include fever, cough, shortness of breath, fatigue, muscle aches, loss of taste or smell, sore throat, and headache.",
        "category": "infectious_disease"
    }
]

# Sample items grouped by dataset name. Each item has "question", "reference",
# "category", and a "dataset" field repeating the key it is stored under.
# NOTE(review): these appear to be hand-written stand-ins named after public
# datasets rather than records loaded from them — confirm before reporting results.
MEDICAL_DATASETS = {
    "pubmedqa": [
        {
            "question": "What is the first-line treatment for hypertension?",
            "reference": "First-line treatments for hypertension include thiazide diuretics, ACE inhibitors, angiotensin II receptor blockers, and calcium channel blockers.",
            "category": "cardiology",
            "dataset": "pubmedqa"
        },
        {
            "question": "How does metformin work in type 2 diabetes?",
            "reference": "Metformin decreases hepatic glucose production, reduces intestinal glucose absorption, and improves insulin sensitivity.",
            "category": "endocrinology",
            "dataset": "pubmedqa"
        }
    ],
    "medqa": [
        {
            "question": "A 45-year-old patient presents with chest pain radiating to the left arm. What is the most likely diagnosis?",
            "reference": "Chest pain radiating to the left arm is characteristic of myocardial infarction and requires immediate cardiac evaluation.",
            "category": "cardiology",
            "dataset": "medqa"
        },
        {
            "question": "What is the gold standard test for diagnosing pulmonary embolism?",
            "reference": "CT pulmonary angiography is the gold standard for diagnosing pulmonary embolism.",
            "category": "pulmonology",
            "dataset": "medqa"
        }
    ],
    "mimic_cxr": [
        {
            "question": "Describe the findings in a chest X-ray showing cardiomegaly and pulmonary edema.",
            "reference": "Cardiomegaly appears as an enlarged cardiac silhouette, while pulmonary edema manifests as bilateral interstitial opacities and Kerley B lines.",
            "category": "radiology",
            "dataset": "mimic_cxr"
        },
        {
            "question": "What radiographic signs suggest pneumothorax?",
            "reference": "Pneumothorax is characterized by a visible visceral pleural edge, absence of lung markings peripheral to this edge, and possible mediastinal shift.",
            "category": "radiology",
            "dataset": "mimic_cxr"
        }
    ]
}

# Medical knowledge base for fact checking: topic key -> phrases that
# check_factual_consistency() looks for in a response.
# The lookup uses the prompt's "category" as the key. Of the categories used by the
# prompts in this notebook, only "cardiology", "pulmonology" and "radiology" are keys
# here; "endocrinology", "pharmacology", "gastroenterology" and "infectious_disease"
# are not, so those prompts receive the fixed not-found score (0.7).
MEDICAL_KNOWLEDGE_BASE = {
    "diabetes": ["increased thirst", "frequent urination", "fatigue", "blurred vision", "slow healing", "metformin", "insulin"],
    "aspirin": ["pain relief", "anti-inflammatory", "blood thinner", "fever reducer", "inhibit cyclooxygenase", "myocardial infarction"],
    "hypertension": ["high blood pressure", "silent killer", "cardiovascular risk", "artery damage", "ACE inhibitors", "beta blockers"],
    "liver": ["detoxification", "protein synthesis", "bile production", "glycogen storage", "jaundice", "cirrhosis"],
    "covid": ["fever", "cough", "shortness of breath", "loss of taste/smell", "coronavirus", "pandemic"],
    "cardiology": ["myocardial infarction", "angina", "arrhythmia", "ECG", "troponin", "stent"],
    "pulmonology": ["asthma", "COPD", "pneumonia", "spirometry", "bronchodilator", "oxygen therapy"],
    "radiology": ["x-ray", "CT scan", "MRI", "ultrasound", "contrast", "radiation"]
}

GPU available: True
GPU: NVIDIA A100-SXM4-40GB


# Model Configuration Functions

In [18]:
def get_model_configs():
    """Map each supported short model name to its loading configuration.

    Returns:
        dict: A newly built mapping ``name -> settings`` on every call. Each entry
        has ``model_id`` (Hugging Face repository id), ``requires_special_handling``
        and ``quantization_support``; some entries also carry ``padding_side``
        and/or ``trust_remote_code``.
    """
    def _entry(model_id, special_handling, quantizable, **extras):
        # One configuration record; optional keys are appended in the order given.
        record = {
            "model_id": model_id,
            "requires_special_handling": special_handling,
            "quantization_support": quantizable,
        }
        record.update(extras)
        return record

    configs = {}
    configs["llama-2-7b"] = _entry("meta-llama/Llama-2-7b-chat-hf", False, True)
    configs["mistral-7b"] = _entry("mistralai/Mistral-7B-v0.1", False, True)
    configs["qwen-7b"] = _entry(
        "Qwen/Qwen2.5-7B-Instruct", True, True,
        padding_side="left",
        trust_remote_code=True,
    )
    configs["meditron-7b"] = _entry("epfl-llm/meditron-7b", False, True)
    configs["biomedgpt"] = _entry("stanford-crfm/BioMedLM", False, True)
    # This model uses different quantization, so it is flagged as unsupported here.
    configs["gpt-oss-20b"] = _entry(
        "openai/gpt-oss-20b", True, False,
        trust_remote_code=True,
    )
    # NOTE(review): going by the repository id, this key points at a community
    # Gemma-3 fine-tune rather than an Anthropic-hosted model — confirm the label.
    configs["claude-3.7-sonnet"] = _entry(
        "reedmayhew/claude-3.7-sonnet-reasoning-gemma3-12B", True, True,
        trust_remote_code=True,
    )
    configs["grok-2"] = _entry(
        "xai-org/grok-2", True, True,
        trust_remote_code=True,
    )
    return configs


def load_local_model(model_name, **kwargs):
    """Load a Hugging Face causal LM and wrap it in a text-generation pipeline.

    Args:
        model_name: Short model name; must be a key of ``get_model_configs()``.
        **kwargs: Optional generation overrides: ``max_new_tokens`` (default 384),
            ``temperature`` (default ``TEMPERATURE``), ``do_sample`` (default True)
            and ``truncation`` (default True).

    Returns:
        The ``transformers`` pipeline on success. If anything fails while loading,
        the error and traceback are printed and a stand-in callable
        ``fallback_pipeline(prompt, **kwargs)`` is returned instead; it yields
        ``[{'generated_text': "Model <name> could not be loaded: <error>"}]``.

    Raises:
        ValueError: If ``model_name`` is not a configured model.
    """
    model_configs = get_model_configs()

    if model_name not in model_configs:
        raise ValueError(f"Model {model_name} not supported")

    config = model_configs[model_name]
    model_id = config["model_id"]
    name_lower = model_name.lower()

    try:
        cuda_available = torch.cuda.is_available()
        trust_remote_code = config.get("trust_remote_code", False)

        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            use_fast=True,
            trust_remote_code=trust_remote_code,
        )

        # Every family except gpt-oss falls back to EOS as the padding token when
        # the tokenizer does not define one.
        if "gpt-oss" not in name_lower:
            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token
            if tokenizer.pad_token_id is None:
                tokenizer.pad_token_id = tokenizer.eos_token_id

        # Qwen additionally gets its padding side from the config (default "left").
        if "qwen" in name_lower:
            tokenizer.padding_side = config.get("padding_side", "left")

        # 4-bit quantization needs a GPU and is never applied to gpt-oss.
        use_quantization = (
            config["quantization_support"]
            and cuda_available
            and "gpt-oss" not in name_lower
        )

        # Model loading parameters
        model_kwargs = {
            "device_map": "auto" if cuda_available else None,
            "dtype": torch.float16 if cuda_available else torch.float32,
            "trust_remote_code": trust_remote_code,
            "low_cpu_mem_usage": True,
        }
        if use_quantization:
            model_kwargs["quantization_config"] = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4"
            )

        # Load model
        model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)

        # Post-processing for specific models
        if "qwen" in name_lower:
            model.config.pad_token_id = tokenizer.pad_token_id
            if hasattr(model, 'transformer'):
                model.transformer.padding_idx = tokenizer.pad_token_id

        # Generation parameters (caller overrides win over the defaults)
        generation_kwargs = {
            "max_new_tokens": kwargs.get("max_new_tokens", 384),
            "temperature": kwargs.get("temperature", TEMPERATURE),
            "do_sample": kwargs.get("do_sample", True),
            "truncation": kwargs.get("truncation", True),
        }

        # Add token IDs if available
        if hasattr(tokenizer, 'pad_token_id') and tokenizer.pad_token_id is not None:
            generation_kwargs["pad_token_id"] = tokenizer.pad_token_id
        if hasattr(tokenizer, 'eos_token_id') and tokenizer.eos_token_id is not None:
            generation_kwargs["eos_token_id"] = tokenizer.eos_token_id

        # Create pipeline
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            **generation_kwargs
        )

        print(f"Successfully loaded {model_name}")
        return pipe

    except Exception as e:
        print(f"Error loading {model_name}: {str(e)}")
        import traceback
        traceback.print_exc()

        # Build the message now: Python unbinds `e` when the except block exits, so
        # a closure reading `e` later would raise NameError instead of returning
        # this text.
        failure_message = f"Model {model_name} could not be loaded: {str(e)}"

        # Return fallback function
        def fallback_pipeline(prompt, **kwargs):
            return [{'generated_text': failure_message}]
        return fallback_pipeline


def load_model_by_category(model_name):
    """Load ``model_name`` if it is listed under the "local_models" category.

    Returns:
        Whatever ``load_local_model(model_name)`` returns.

    Raises:
        ValueError: If the name is not in the "local_models" category.
    """
    # NOTE(review): get_model_categories() is not defined in the visible part of
    # this notebook — confirm it exists before relying on this function.
    local_models = get_model_categories()["local_models"]

    if model_name not in local_models:
        raise ValueError(f"Model {model_name} not found in any category")

    return load_local_model(model_name)

def _sync_pad_token_id(model, tokenizer):
    """Copy the tokenizer's pad token id onto the model's config."""
    setattr(model.config, 'pad_token_id', tokenizer.pad_token_id)


# Handler table keyed by a substring of the model name; "default" is the fallback.
MODEL_HANDLERS = dict(
    qwen={
        "tokenizer_config": {"padding_side": "left", "trust_remote_code": True},
        "post_processing": _sync_pad_token_id,
    },
    default={"quantization": True, "trust_remote_code": False},
)


def get_model_handler(model_name):
    """Return the first handler whose key occurs in the lower-cased model name.

    Keys named "gpt-oss", "claude" or "grok" are never matched; when nothing
    matches, the "default" handler is returned.
    """
    lowered = model_name.lower()
    never_matched = ("gpt-oss", "claude", "grok")

    candidates = (
        handler
        for key, handler in MODEL_HANDLERS.items()
        if key in lowered and key not in never_matched
    )
    return next(candidates, MODEL_HANDLERS["default"])

# RAG Functions

In [19]:
def create_medical_retriever():
    """Build the FAISS vector store that serves as the RAG knowledge source.

    Ten fixed medical statements are split into documents (chunk size 500,
    overlap 50), embedded with ``sentence-transformers/all-mpnet-base-v2`` and
    indexed.

    Returns:
        The FAISS vector store itself (callers query it with ``similarity_search``).
    """
    knowledge_snippets = [
        "Diabetes symptoms include increased thirst, frequent urination, fatigue, blurred vision.",
        "Aspirin is a nonsteroidal anti-inflammatory drug that reduces pain and inflammation.",
        "Hypertension (high blood pressure) is a condition where blood pressure is consistently too high.",
        "The liver performs detoxification, protein synthesis, and produces biochemicals for digestion.",
        "COVID-19 symptoms include fever, cough, shortness of breath, fatigue, and loss of taste/smell.",
        "Antibiotics treat bacterial infections but are ineffective against viral infections.",
        "Vaccines stimulate the immune system to produce antibodies against specific diseases.",
        "Cancer treatments include surgery, chemotherapy, radiation therapy, and immunotherapy.",
        "Heart disease risk factors include high blood pressure, high cholesterol, smoking, and diabetes.",
        "Mental health conditions like depression can be treated with therapy and medication."
    ]

    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunked_docs = splitter.create_documents(knowledge_snippets)

    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2"
    )

    return FAISS.from_documents(chunked_docs, embedding_model)

# Response Generation Functions

In [20]:
def generate_response_local(model, prompt, use_rag=True):
    """Generate an answer for ``prompt`` with a local text-generation callable.

    Args:
        model: Callable invoked as ``model(text, **generation_args)`` whose result
            is a list with a ``'generated_text'`` entry in its first item (e.g. the
            object returned by ``load_local_model``).
        prompt: The question to answer.
        use_rag: When True, the ``RAG_TOP_K`` most similar knowledge snippets are
            retrieved and the question is wrapped in a context/question template.

    Returns:
        str: The stripped answer text. On failure a message starting with
        ``"Error in text generation:"`` or ``"Error generating response:"`` is
        returned instead of raising.
    """
    try:
        if use_rag:
            # create_medical_retriever() loads the embedding model and re-indexes
            # the knowledge base on every call, so build it once and reuse it.
            # Re-running this cell redefines the function and drops the cache.
            vectorstore = getattr(generate_response_local, "_vectorstore", None)
            if vectorstore is None:
                vectorstore = create_medical_retriever()
                generate_response_local._vectorstore = vectorstore

            docs = vectorstore.similarity_search(prompt, k=RAG_TOP_K)
            context = "\n".join([doc.page_content for doc in docs])

            # NOTE(review): the continuation lines of this template keep their
            # source indentation, so the prompt sent to the model contains it too.
            prompt_template = """You are a medical AI assistant. Use the following medical context to answer the question accurately and factually.
            If you don't know the answer based on the context, say you don't know. Be concise and avoid speculation.

            Medical Context:
            {context}

            Question: {question}

            Answer:"""

            formatted_prompt = prompt_template.format(context=context, question=prompt)
        else:
            formatted_prompt = prompt

        # Clear memory before generation
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Generate response with proper error handling
        try:
            # Try the standard approach first
            generation_args = {
                'max_new_tokens': 384,
                'do_sample': True,
                'temperature': TEMPERATURE,
                'truncation': True,
            }

            # Add padding token ID if available
            if hasattr(model, 'tokenizer') and hasattr(model.tokenizer, 'pad_token_id'):
                generation_args['pad_token_id'] = model.tokenizer.pad_token_id

            response = model(formatted_prompt, **generation_args)[0]['generated_text']

        except Exception as gen_error:
            print(f"Generation error: {str(gen_error)}")
            # Try alternative approach without padding
            try:
                response = model(
                    formatted_prompt,
                    max_length=512,
                    do_sample=True,
                    temperature=TEMPERATURE,
                    truncation=True
                )[0]['generated_text']
            except Exception as alt_error:
                print(f"Alternative generation also failed: {str(alt_error)}")
                return f"Error in text generation: {str(alt_error)}"

        # By default a text-generation pipeline returns the prompt followed by the
        # continuation. Drop the echoed prompt so only generated text is returned;
        # this also covers the non-RAG path, and avoids picking a later
        # "Answer:" that the model wrote itself.
        if response.startswith(formatted_prompt):
            response = response[len(formatted_prompt):]
        elif use_rag and "Answer:" in response:
            # Prompt not echoed verbatim: fall back to the text after "Answer:".
            response = response.split("Answer:")[-1].strip()

        # Clear memory after generation
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        return response.strip()

    except Exception as e:
        print(f"Error in generation: {str(e)}")
        # Try to recover memory
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return f"Error generating response: {str(e)}"

def generate_response(model, prompt, use_rag=True, model_name=""):
    """Answer ``prompt`` by delegating to ``generate_response_local``.

    ``model_name`` is accepted but not used by this function.
    """
    answer = generate_response_local(model, prompt, use_rag)
    return answer

# Evaluation Functions

In [21]:
def calculate_semantic_similarity(reference, response):
    """Score how similar ``response`` is to ``reference`` using TF-IDF cosine similarity.

    The two texts are vectorised together (English stop words removed), so this
    is a lexical-overlap measure.

    Args:
        reference: The reference answer text.
        response: The model's answer text.

    Returns:
        float: Similarity clamped to ``[0, 1]``, or ``0.5`` when it cannot be
        computed (for example when vectorising the texts fails).
    """
    try:
        vectorizer = TfidfVectorizer(stop_words='english')
        tfidf_matrix = vectorizer.fit_transform([reference, response])
        similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
        # Clamp both ends: rounding can push the cosine marginally above 1, which
        # would make ``1 - similarity`` negative for callers.
        return float(min(max(similarity, 0.0), 1.0))
    except Exception:
        # Deliberately best-effort, but no longer a bare ``except:`` so that
        # KeyboardInterrupt / SystemExit still propagate.
        return 0.5

def check_factual_consistency(response, category):
    """Score how many known facts for ``category`` appear in ``response``.

    Args:
        response: The model's answer text.
        category: Key into ``MEDICAL_KNOWLEDGE_BASE``.

    Returns:
        float in ``[0, 1]``: the fraction of the category's phrases found in the
        response, scaled down by up to 50% according to ``check_contradictions``.
        Returns ``0.7`` when the category has no knowledge-base entry and ``0.5``
        when its entry is empty.
    """
    if category not in MEDICAL_KNOWLEDGE_BASE:
        return 0.7

    relevant_facts = MEDICAL_KNOWLEDGE_BASE[category]
    if not relevant_facts:
        return 0.5

    response_lower = response.lower()

    # Compare case-insensitively on both sides: several phrases contain capitals
    # (e.g. "ACE inhibitors", "ECG", "MRI") and could never match lowered text.
    fact_matches = sum(1 for fact in relevant_facts if fact.lower() in response_lower)

    coverage = fact_matches / len(relevant_facts)

    contradictions = check_contradictions(response_lower, category)

    final_score = coverage * (1 - 0.5 * contradictions)

    return max(min(final_score, 1.0), 0.0)

def check_contradictions(response, category):
    """Return the fraction of known contradiction patterns found in ``response``.

    Args:
        response: Text to scan. Matching is case-insensitive, so the caller does
            not need to lower-case it first.
        category: One of "diabetes", "aspirin", "covid" or "cancer"; any other
            value yields ``0.0``.

    Returns:
        float in ``[0, 1]``: matched patterns divided by the number of patterns
        for the category.
    """
    contradiction_patterns = {
        "diabetes": [r"diabetes.*curable", r"diabetes.*not serious", r"insulin.*addictive"],
        "aspirin": [r"aspirin.*safe for everyone", r"aspirin.*no side effects", r"aspirin.*cures"],
        "covid": [r"covid.*just flu", r"vaccines.*dangerous", r"masks.*don't work"],
        "cancer": [r"cancer.*always fatal", r"alternative.*cures cancer", r"chemotherapy.*poison"]
    }

    patterns = contradiction_patterns.get(category)
    if not patterns:
        return 0.0

    # NOTE(review): ".*" also spans negations ("diabetes is not curable" matches
    # the first diabetes pattern) — treat the score as a rough heuristic.
    contradiction_count = sum(
        1 for pattern in patterns if re.search(pattern, response, flags=re.IGNORECASE)
    )

    return min(contradiction_count / len(patterns), 1.0)

def pattern_based_hallucination_detection(response):
    """Heuristic hallucination score based on wording alone.

    Each hedging/appeal phrase adds 0.1, each overgeneralisation word
    ("always", "never", "every", "all", "none") adds 0.15 and each sensational
    word ("cure", "miracle", "breakthrough", "revolutionary") adds 0.2.

    Args:
        response: The model's answer text.

    Returns:
        float in ``[0, 1]``: the accumulated score, capped at 1.0.
    """
    score = 0.0

    # (pattern, weight per occurrence). Matching is case-insensitive; the earlier
    # approach lower-cased the text but kept a capital "I" in the patterns, so
    # "I think" / "I believe" were never counted.
    weighted_patterns = (
        (r"\b(I think|I believe|probably|maybe|perhaps|likely)\b", 0.1),
        (r"\b(studies show|research indicates|experts say)\b", 0.1),
        (r"\b(always|never|every|all|none)\b", 0.15),
        (r"\b(cure|miracle|breakthrough|revolutionary)\b", 0.2),
    )

    for pattern, weight in weighted_patterns:
        occurrences = re.findall(pattern, response, flags=re.IGNORECASE)
        score += len(occurrences) * weight

    return min(score, 1.0)

def calculate_confidence(response):
    """Estimate how confidently a response is worded.

    Starts at 1.0, subtracts 0.1 for each distinct uncertainty marker present and
    0.2 when the response has fewer than five whitespace-separated words.

    Args:
        response: The model's answer text.

    Returns:
        float: Confidence score, never below 0.1.
    """
    confidence = 1.0
    response_lower = response.lower()

    # Markers are lower-case because they are searched in the lower-cased text;
    # "I think" / "I believe" with a capital "I" could never be found there.
    uncertainty_markers = ("maybe", "perhaps", "i think", "i believe", "probably")
    for marker in uncertainty_markers:
        if marker in response_lower:
            confidence -= 0.1

    if len(response.split()) < 5:
        confidence -= 0.2

    return max(confidence, 0.1)

def evaluate_hallucination(reference, response, category):
    """Combine three signals into one hallucination score (higher is worse).

    The score is 40% dissimilarity to the reference, 40% lack of factual
    consistency for ``category`` and 20% wording-pattern score, capped at 1.0.
    """
    dissimilarity = 1 - calculate_semantic_similarity(reference, response)
    factual_gap = 1 - check_factual_consistency(response, category)
    wording_penalty = pattern_based_hallucination_detection(response)

    combined = 0.4 * dissimilarity + 0.4 * factual_gap + 0.2 * wording_penalty
    return min(combined, 1.0)

def load_test_prompts(sample_count=3, dataset_name="all"):
    """Build the list of evaluation prompts.

    Args:
        sample_count: Maximum number of items taken from each source. Values below
            zero are treated as zero.
        dataset_name: ``"all"`` takes up to ``sample_count`` items from every entry
            of ``MEDICAL_DATASETS`` plus up to ``sample_count`` items from
            ``MEDICAL_PROMPTS``; a key of ``MEDICAL_DATASETS`` takes items from
            that dataset only; any other value falls back to ``MEDICAL_PROMPTS``.

    Returns:
        list[dict]: One dict per prompt with the keys ``original_prompt``,
        ``clean_prompt``, ``original_reference``, ``clean_reference``,
        ``category`` and ``dataset``. Items from ``MEDICAL_PROMPTS`` are labelled
        with dataset ``"medical_qa"``.
    """
    def _as_prompt(item, dataset_label=None):
        # Convert one raw QA item to the evaluation format; without an explicit
        # label the item's own "dataset" field is used.
        return {
            "original_prompt": item["question"],
            "clean_prompt": item["question"],
            "original_reference": item["reference"],
            "clean_reference": item["reference"],
            "category": item["category"],
            "dataset": item["dataset"] if dataset_label is None else dataset_label,
        }

    # A negative count would otherwise slice from the end of each list.
    limit = max(sample_count, 0)

    if dataset_name == "all":
        # Sample from ALL medical datasets, then add the basic medical prompts
        prompts = [
            _as_prompt(item)
            for dataset_prompts in MEDICAL_DATASETS.values()
            for item in dataset_prompts[:limit]
        ]
        prompts.extend(_as_prompt(item, "medical_qa") for item in MEDICAL_PROMPTS[:limit])
    elif dataset_name in MEDICAL_DATASETS:
        # Sample from specific medical dataset
        prompts = [_as_prompt(item) for item in MEDICAL_DATASETS[dataset_name][:limit]]
    else:
        # Fallback to original medical prompts
        prompts = [_as_prompt(item, "medical_qa") for item in MEDICAL_PROMPTS[:limit]]

    return prompts

def evaluate_model_responses(model, prompts, model_name, dataset="medical_qa"):
    """Generate and score a response for every prompt.

    Args:
        model: Generation callable passed through to ``generate_response``.
        prompts: Prompt dicts as produced by ``load_test_prompts``.
        model_name: Name recorded in the results and shown on the progress bar.
        dataset: Label recorded in the results.

    Returns:
        dict with ``model``, ``dataset``, ``sample_count``, ``metrics`` (mean
        accuracy, hallucination rate, confidence, response length, and
        consistency), ``dataset_metrics`` (the same means per prompt dataset) and
        ``sample_responses``. ``metrics`` and ``dataset_metrics`` stay empty when
        ``prompts`` is empty.
    """
    results = {
        "model": model_name,
        "dataset": dataset,
        "sample_count": len(prompts),
        "metrics": {},
        "dataset_metrics": {},
        "sample_responses": []
    }

    # Prefixes of the failure strings that generation and model loading return
    # instead of raising. Without these, a failed generation would be scored as
    # if it were a real answer.
    error_prefixes = (
        "Error:",
        "Error generating response:",
        "Error in text generation:",
        f"Model {model_name} could not be loaded:",
    )

    hallucination_scores = []
    accuracy_scores = []
    confidence_scores = []

    for prompt in tqdm(prompts, desc=f"Evaluating {model_name}"):
        try:
            response = generate_response(model, prompt["clean_prompt"], use_rag=ENABLE_RAG, model_name=model_name)

            if response.startswith(error_prefixes):
                # Failed generation: fixed penalty scores.
                accuracy = 0.0
                hallucination_score = 0.5
                confidence = 0.1
            else:
                accuracy = calculate_semantic_similarity(prompt["clean_reference"], response)
                hallucination_score = evaluate_hallucination(prompt["clean_reference"], response, prompt["category"])
                confidence = calculate_confidence(response)
        except Exception as e:
            print(f"Error evaluating prompt: {str(e)}")
            response = f"Error: {str(e)}"
            accuracy = 0.0
            hallucination_score = 0.5
            confidence = 0.1

        accuracy_scores.append(accuracy)
        hallucination_scores.append(hallucination_score)
        confidence_scores.append(confidence)

        results["sample_responses"].append({
            "prompt": prompt["original_prompt"],
            "reference": prompt["original_reference"],
            "response": response,
            "accuracy": accuracy,
            "hallucination_score": hallucination_score,
            "confidence": confidence
        })

    if accuracy_scores and hallucination_scores:
        results["metrics"] = {
            "accuracy": np.mean(accuracy_scores),
            "hallucination_rate": np.mean(hallucination_scores),
            "confidence": np.mean(confidence_scores),
            "response_length": np.mean([len(str(r["response"])) for r in results["sample_responses"]]),
            # Spread of the hallucination scores; a single sample counts as fully consistent.
            "consistency": 1.0 - np.std(hallucination_scores) if len(hallucination_scores) > 1 else 1.0
        }

        # Group the per-sample scores by the dataset each prompt came from.
        # sample_responses holds exactly one entry per prompt, in prompt order.
        dataset_metrics = {}
        for prompt, response_data in zip(prompts, results["sample_responses"]):
            bucket = dataset_metrics.setdefault(prompt.get("dataset", "unknown"), {
                "accuracy_scores": [],
                "hallucination_scores": [],
                "confidence_scores": []
            })
            bucket["accuracy_scores"].append(response_data["accuracy"])
            bucket["hallucination_scores"].append(response_data["hallucination_score"])
            bucket["confidence_scores"].append(response_data["confidence"])

        # Compute average metrics for each dataset
        for dataset_name, metrics in dataset_metrics.items():
            results["dataset_metrics"][dataset_name] = {
                "accuracy": np.mean(metrics["accuracy_scores"]) if metrics["accuracy_scores"] else 0,
                "hallucination_rate": np.mean(metrics["hallucination_scores"]) if metrics["hallucination_scores"] else 0,
                "confidence": np.mean(metrics["confidence_scores"]) if metrics["confidence_scores"] else 0,
                "sample_count": len(metrics["accuracy_scores"])
            }

    return results

def generate_improvement_suggestions(metrics):
    """Turn aggregate metrics into a list of improvement suggestions.

    Args:
        metrics: Mapping that may contain ``hallucination_rate``, ``accuracy``,
            ``confidence`` and ``consistency``; missing values count as 0.

    Returns:
        list[dict]: Suggestions with ``category``, ``suggestion`` and
        ``expected_impact``. A "General" suggestion is always the last entry.
    """
    # (triggered?, category, suggestion, expected impact) in output order.
    rules = (
        (
            metrics.get("hallucination_rate", 0) > 0.3,
            "High Priority",
            "Implement RAG with verified medical knowledge base",
            "40-60% reduction in hallucinations",
        ),
        (
            metrics.get("accuracy", 0) < 0.6,
            "High Priority",
            "Fine-tune with curated medical QA pairs and implement fact-checking",
            "30-50% accuracy improvement",
        ),
        (
            metrics.get("confidence", 0) < 0.6,
            "Medium Priority",
            "Add confidence calibration and uncertainty quantification",
            "Better reliability estimation and fewer overconfident errors",
        ),
        (
            metrics.get("consistency", 0) < 0.7,
            "Medium Priority",
            "Implement response consistency checks and self-verification",
            "More consistent and reliable responses",
        ),
        (
            True,
            "General",
            "Implement multi-step verification: claim extraction → fact checking → response generation",
            "Overall quality and reliability improvement",
        ),
    )

    return [
        {"category": category, "suggestion": text, "expected_impact": impact}
        for triggered, category, text, impact in rules
        if triggered
    ]

# Model Evaluation Functions

In [22]:
def evaluate_local_model(model_name, sample_count=3):
    """Load ``model_name`` and evaluate it on the default prompt selection.

    Args:
        model_name: Short model name understood by ``load_local_model``.
        sample_count: Passed to ``load_test_prompts`` as the per-source limit.

    Returns:
        dict: The result of ``evaluate_model_responses``. If any step raises, a
        dict with ``model``, ``error``, worst-case ``metrics`` and
        ``evaluation_date`` is returned instead.
    """
    print(f"Evaluating local model: {model_name}...")

    try:
        # Arguments are evaluated left to right: load the model, then the prompts.
        return evaluate_model_responses(
            load_local_model(model_name),
            load_test_prompts(sample_count),
            model_name,
        )
    except Exception as e:
        print(f"Error evaluating {model_name}: {str(e)}")

        failed_metrics = {
            "accuracy": 0,
            "hallucination_rate": 1.0,
            "confidence": 0,
            "response_length": 0,
            "consistency": 0
        }
        return {
            "model": model_name,
            "error": str(e),
            "metrics": failed_metrics,
            "evaluation_date": datetime.now().isoformat()
        }

def evaluate_single_model(model_name, sample_count=3):
    """Evaluate one local model and package the result with baseline comparisons.

    ``evaluate_local_model`` is run first. If its result contains an ``"error"``
    key, that result is returned unchanged and the per-dataset evaluations are
    skipped: their output is only used in the success payload, so running them
    after a failed base evaluation would be wasted work.

    Otherwise the model is additionally evaluated on the "pubmedqa", "medqa" and
    "mimic_cxr" datasets via ``evaluate_model_on_dataset``; a dataset whose
    evaluation raises is recorded as ``{"error": ..., "dataset": ...}``.

    Args:
        model_name: Key identifying the model to evaluate.
        sample_count: Number of prompts per evaluation run.

    Returns:
        On success, a dict with the keys ``model``, ``dataset``,
        ``sample_count``, ``metrics``, ``dataset_metrics``, ``baseline``,
        ``improvement``, ``suggestions``, ``sample_responses`` (at most three)
        and ``evaluation_date``. On failure, the error dict produced by
        ``evaluate_local_model``.
    """
    results = evaluate_local_model(model_name, sample_count)

    # Bail out before the expensive per-dataset runs; on this path their
    # results would never be returned.
    if "error" in results:
        return results

    # Evaluate on each medical dataset individually
    dataset_results = {}
    for dataset_name in ["pubmedqa", "medqa", "mimic_cxr"]:
        try:
            dataset_results[dataset_name] = evaluate_model_on_dataset(model_name, dataset_name, sample_count)
        except Exception as e:
            print(f"Error evaluating on {dataset_name}: {str(e)}")
            dataset_results[dataset_name] = {
                "error": str(e),
                "dataset": dataset_name
            }

    # Fixed reference values the relative improvements are computed against.
    baseline = {
        "accuracy": 0.7,
        "hallucination_rate": 0.25,
        "fact_score": 0.75
    }
    metrics = results["metrics"]

    # Percentage change relative to the baseline; the > 0 checks guard the division.
    hallucination_reduction = (
        ((baseline["hallucination_rate"] - metrics["hallucination_rate"]) / baseline["hallucination_rate"] * 100)
        if baseline["hallucination_rate"] > 0
        else 0
    )

    accuracy_improvement = (
        ((metrics["accuracy"] - baseline["accuracy"]) / baseline["accuracy"] * 100)
        if baseline["accuracy"] > 0
        else 0
    )

    return {
        "model": model_name,
        "dataset": "combined_medical",
        "sample_count": sample_count,
        "metrics": metrics,
        "dataset_metrics": dataset_results,
        "baseline": baseline,
        "improvement": {
            "hallucination_reduction": hallucination_reduction,
            "accuracy_improvement": accuracy_improvement,
        },
        "suggestions": generate_improvement_suggestions(metrics),
        "sample_responses": results.get("sample_responses", [])[:3],
        "evaluation_date": datetime.now().isoformat()
    }

def evaluate_all_models(sample_count=3):
    """Evaluate every configured model and persist the results as JSON.

    Iterates over the keys of ``get_model_configs()``, evaluates each with
    ``evaluate_single_model`` and writes ``<model_name>_results.json`` in the
    current working directory. After the loop the combined mapping is written
    to ``all_model_results.json``.

    Args:
        sample_count: Number of prompts per evaluation run.

    Returns:
        Dict mapping each model name to its evaluation result.
    """
    def _write_json(payload, path):
        # Same serialisation settings for the per-model and combined files.
        with open(path, "w") as handle:
            json.dump(payload, handle, indent=2)

    collected = {}
    for model_name in get_model_configs().keys():
        outcome = evaluate_single_model(model_name, sample_count)
        collected[model_name] = outcome

        _write_json(outcome, f"{model_name}_results.json")
        print(f"Results for {model_name} saved to {model_name}_results.json")

    _write_json(collected, "all_model_results.json")
    print("All results saved to all_model_results.json")

    return collected

def evaluate_models_by_category(category, sample_count=3):
    """Evaluate all models registered under one category.

    Looks the category up in ``get_model_categories()``, runs
    ``evaluate_local_model`` for each listed model and writes the combined
    mapping to ``<category>_results.json`` in the current working directory.

    Args:
        category: Category name to look up.
        sample_count: Number of prompts per evaluation run.

    Returns:
        Dict mapping each model name in the category to its evaluation result.

    Raises:
        ValueError: If ``category`` is not a known category.
    """
    known_categories = get_model_categories()
    if category not in known_categories:
        raise ValueError(f"Unknown category: {category}")

    results = {
        member: evaluate_local_model(member, sample_count)
        for member in known_categories[category]
    }

    output_path = f"{category}_results.json"
    with open(output_path, "w") as handle:
        json.dump(results, handle, indent=2)
    print(f"Results for {category} saved to {category}_results.json")

    return results

def evaluate_model_on_dataset(model_name, dataset_name, sample_count=3):
    """Evaluate a model on the prompts of one named dataset.

    Loads the model with ``load_local_model``, fetches ``sample_count`` prompts
    for ``dataset_name`` with ``load_test_prompts`` and scores them with
    ``evaluate_model_responses`` (all defined elsewhere in this notebook).

    Args:
        model_name: Key identifying the model to load.
        dataset_name: Dataset whose prompts are used.
        sample_count: Number of prompts requested.

    Returns:
        Whatever ``evaluate_model_responses`` returns on success. If any step
        raises, a fallback dict with the keys ``model``, ``dataset``, ``error``,
        ``metrics`` (worst-case placeholder values) and ``evaluation_date``.
    """
    print(f"Evaluating {model_name} on {dataset_name} dataset...")
    try:
        loaded_model = load_local_model(model_name)
        dataset_prompts = load_test_prompts(sample_count, dataset_name)
        return evaluate_model_responses(loaded_model, dataset_prompts, model_name, dataset_name)
    except Exception as e:
        print(f"Error evaluating {model_name} on {dataset_name}: {str(e)}")

        # Placeholder scores: nothing correct, everything treated as hallucinated.
        worst_case_metrics = dict(
            accuracy=0,
            hallucination_rate=1.0,
            confidence=0,
            response_length=0,
            consistency=0,
        )
        failure = {"model": model_name, "dataset": dataset_name, "error": str(e)}
        failure["metrics"] = worst_case_metrics
        failure["evaluation_date"] = datetime.now().isoformat()
        return failure

def evaluate_all_models_on_datasets(sample_count=3):
    """Evaluate every configured model on every dataset and persist the results.

    For each key of ``get_model_configs()`` the model is evaluated on
    "pubmedqa", "medqa", "mimic_cxr" and "medical_qa" through
    ``evaluate_model_on_dataset``; the output of ``evaluate_single_model`` is
    stored under the extra key ``"combined"``. Each model's mapping is written
    to ``<model_name>_dataset_results.json`` and the full mapping to
    ``all_models_dataset_results.json`` (current working directory).

    Args:
        sample_count: Number of prompts per evaluation run.

    Returns:
        Dict of ``model name -> {dataset name or "combined" -> result}``.
    """
    dataset_order = ("pubmedqa", "medqa", "mimic_cxr", "medical_qa")
    all_results = {}

    for model_name in get_model_configs().keys():
        per_dataset = {
            dataset: evaluate_model_on_dataset(model_name, dataset, sample_count)
            for dataset in dataset_order
        }
        # The aggregate evaluation is stored next to the per-dataset entries.
        per_dataset["combined"] = evaluate_single_model(model_name, sample_count)
        all_results[model_name] = per_dataset

        with open(f"{model_name}_dataset_results.json", "w") as handle:
            json.dump(per_dataset, handle, indent=2)
        print(f"Results for {model_name} saved to {model_name}_dataset_results.json")

    with open("all_models_dataset_results.json", "w") as handle:
        json.dump(all_results, handle, indent=2)
    print("All dataset results saved to all_models_dataset_results.json")

    return all_results

# Visualization Functions

In [23]:
def create_bar_chart_base64(model_results, metric="accuracy"):
    """Render a per-model bar chart for one metric and return it as base64 PNG.

    Args:
        model_results: Dict mapping model name to a result dict. Entries that
            contain an ``"error"`` key are ignored; the remaining entries must
            provide a ``"metrics"`` dict.
        metric: Key looked up in each model's ``"metrics"``; a missing key
            counts as 0.

    Returns:
        The PNG image encoded as a base64 string, or ``None`` when no entry is
        free of errors.
    """
    valid_models = {k: v for k, v in model_results.items() if "error" not in v}

    if not valid_models:
        return None

    models = list(valid_models.keys())
    values = [valid_models[model]["metrics"].get(metric, 0) for model in models]

    # Ratio-style metrics keep the fixed 0-1 axis. Metrics that exceed 1
    # (e.g. response_length) get an axis scaled to the data with 10% headroom
    # instead of being clipped at 1.
    peak = max(values)
    y_upper = 1 if peak <= 1 else peak * 1.1

    fig, ax = plt.subplots(figsize=(12, 6))
    try:
        ax.bar(models, values, color=['#4CAF50', '#2196F3', '#FF9800', '#E91E63', '#9C27B0'])

        # Customize the chart
        ax.set_ylabel(metric.replace('_', ' ').title(), fontsize=12)
        ax.set_title(f'Model Comparison - {metric.replace("_", " ").title()}', fontsize=14, fontweight='bold')
        ax.set_ylim(0, y_upper)

        # Add value labels just above each bar (offset is 0.01 on the 0-1 axis)
        label_offset = 0.01 * y_upper
        for i, v in enumerate(values):
            ax.text(i, v + label_offset, f'{v:.2f}', ha='center', va='bottom', fontweight='bold')

        plt.xticks(rotation=45, ha='right')
        fig.tight_layout()

        # Render into memory rather than onto disk
        buf = BytesIO()
        fig.savefig(buf, format='png', dpi=150, bbox_inches='tight')
    finally:
        # Always release the figure, even when plotting fails part-way.
        plt.close(fig)

    return base64.b64encode(buf.getvalue()).decode('utf-8')

# Evaluation Data Export and Visualization

In [24]:
import os
import json
import base64
import zipfile
from google.colab import files
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

def create_ui_export_data(model_results, model_name):
    """Build the trimmed-down payload for one model's results for UI export.

    Args:
        model_results: Result dict for a single model.
        model_name: Name stored under the ``"model"`` key of the payload.

    Returns:
        If ``model_results`` contains ``"error"``: a dict with ``model``,
        ``error`` and ``evaluation_date`` only. Otherwise a dict with ``model``,
        ``evaluation_date``, ``metrics``, ``dataset_metrics`` (dataset name ->
        that dataset's ``"metrics"`` dict, for entries that have one),
        ``dataset_details`` (the unmodified ``dataset_metrics`` input),
        ``sample_responses`` (first five, numbered from 1), ``suggestions``,
        ``improvement`` and ``baseline``. A missing ``evaluation_date`` defaults
        to the current time.
    """
    evaluation_date = model_results.get("evaluation_date", datetime.now().isoformat())

    if "error" in model_results:
        return {
            "model": model_name,
            "error": model_results["error"],
            "evaluation_date": evaluation_date
        }

    def _ui_sample(position, item):
        # Only the fields shown in the UI are kept; the dataset label falls back to "medical_qa".
        return {
            "id": position,
            "prompt": item["prompt"],
            "reference": item["reference"],
            "response": item["response"],
            "accuracy": item["accuracy"],
            "hallucination_score": item["hallucination_score"],
            "dataset": item.get("dataset", "medical_qa")
        }

    first_samples = model_results.get("sample_responses", [])[:5]
    sample_responses = [_ui_sample(pos, item) for pos, item in enumerate(first_samples, start=1)]

    dataset_details = model_results.get("dataset_metrics", {})
    # Flatten to just the metric dicts; entries without "metrics" are left out.
    dataset_performance = {
        name: details["metrics"]
        for name, details in dataset_details.items()
        if "metrics" in details
    }

    return {
        "model": model_name,
        "evaluation_date": evaluation_date,
        "metrics": model_results.get("metrics", {}),
        "dataset_metrics": dataset_performance,
        "dataset_details": dataset_details,
        "sample_responses": sample_responses,
        "suggestions": model_results.get("suggestions", []),
        "improvement": model_results.get("improvement", {}),
        "baseline": model_results.get("baseline", {})
    }


def create_visualization_directory_structure(model_name):
    """Create the output folder tree for one model's exported artefacts.

    The tree is rooted at ``model_evaluation_<model_name>`` (relative to the
    current working directory) and has the sub-folders ``charts``, ``data``,
    ``tables`` and ``dataset_analysis``. Existing folders are reused.

    Args:
        model_name: Model name embedded in the root folder name.

    Returns:
        Tuple ``(base_dir, sub_dirs)`` where ``sub_dirs`` maps each sub-folder
        name to its path.
    """
    base_dir = f"model_evaluation_{model_name}"
    sub_dirs = {
        section: f"{base_dir}/{section}"
        for section in ('charts', 'data', 'tables', 'dataset_analysis')
    }

    for dir_path in sub_dirs.values():
        os.makedirs(dir_path, exist_ok=True)
        print(f"Created directory: {dir_path}")

    return base_dir, sub_dirs

def export_visualizations_to_directories(model_results, model_name):
    """Write one model's JSON data, charts and comparison table to disk.

    ``model_results`` may be the model's result dict itself, a dict keyed by
    ``model_name``, or a dict with exactly one entry (whose value is used).

    Files are placed in the tree created by
    ``create_visualization_directory_structure``: two JSON files under
    ``data``, PNG charts under ``charts`` and an HTML table under ``tables``.
    Dataset charts are only produced when the result has non-empty
    ``"dataset_metrics"``.

    Args:
        model_results: Evaluation results in one of the shapes above.
        model_name: Model name used for lookup, folder and file names.

    Returns:
        ``(base_dir, exported_files)`` on success. ``(None, [])`` when the
        resolved data is not a dict, contains ``"error"``, or any exception is
        raised during export (the exception is printed with its traceback).
    """
    try:
        # Resolve the single model's result dict from the accepted input shapes.
        model_data = model_results
        if isinstance(model_results, dict):
            if model_name in model_results:
                model_data = model_results[model_name]
            elif len(model_results) == 1:
                model_data = next(iter(model_results.values()))

        if not isinstance(model_data, dict) or "error" in model_data:
            print(f"Error: Invalid model data for {model_name}")
            return None, []

        base_dir, sub_dirs = create_visualization_directory_structure(model_name)
        exported_files = []

        def save_json(payload, stem):
            # JSON artefacts live in the data folder, prefixed with the model name.
            target = f"{sub_dirs['data']}/{model_name}_{stem}.json"
            with open(target, "w") as f:
                json.dump(payload, f, indent=2)
            exported_files.append(target)

        def save_chart(encoded_png, stem):
            # Chart builders return None when there is nothing to draw.
            if not encoded_png:
                return
            filename = f"{sub_dirs['charts']}/{stem}.png"
            with open(filename, "wb") as f:
                f.write(base64.b64decode(encoded_png))
            exported_files.append(filename)
            print(f"Created chart: {filename}")

        save_json(create_ui_export_data(model_data, model_name), "ui_export_data")
        save_json(model_data, "comprehensive_results")

        # The chart/table builders expect a {model name: results} mapping.
        model_results_dict = {model_name: model_data}

        for metric in ("accuracy", "hallucination_rate", "confidence"):
            save_chart(create_bar_chart_base64(model_results_dict, metric), f"{metric}_bar_chart")

        save_chart(create_radar_chart_base64(model_results_dict), "radar_chart")

        comparison_table = create_comparison_table(model_results_dict)
        if not comparison_table.empty:
            html_filename = f"{sub_dirs['tables']}/comparison_table.html"
            with open(html_filename, "w") as f:
                f.write(comparison_table.to_html(classes='table table-striped', index=False))
            exported_files.append(html_filename)
            print(f"Created table: {html_filename}")

        # Per-dataset charts, only when dataset results are present.
        dataset_metrics = model_data.get("dataset_metrics")
        if dataset_metrics:
            save_chart(create_dataset_comparison_chart(model_results_dict), "dataset_comparison_chart")

            for dataset_name in ("pubmedqa", "medqa", "mimic_cxr"):
                if dataset_name in dataset_metrics:
                    save_chart(
                        create_dataset_radar_chart(model_results_dict, dataset_name),
                        f"{dataset_name}_radar_chart",
                    )

        print(f"Exported {len(exported_files)} files to {base_dir}/ directory structure")
        return base_dir, exported_files

    except Exception as e:
        print(f"Error in export_visualizations_to_directories: {str(e)}")
        import traceback
        traceback.print_exc()
        return None, []

def create_zip_from_directory(base_dir):
    """Pack every file below ``base_dir`` into ``<base_dir>.zip``.

    Archive member names are relative to ``base_dir``, so the archive does not
    contain the root folder itself.

    Args:
        base_dir: Directory to archive.

    Returns:
        Path of the ZIP file that was written.
    """
    zip_filename = f"{base_dir}.zip"

    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as archive:
        member_paths = (
            os.path.join(folder, leaf)
            for folder, _subfolders, leaves in os.walk(base_dir)
            for leaf in leaves
        )
        for member_path in member_paths:
            archive.write(member_path, os.path.relpath(member_path, base_dir))

    print(f"Created ZIP archive: {zip_filename}")
    return zip_filename

def create_radar_chart_base64(model_results):
    """Render a radar chart of accuracy, confidence and consistency per model.

    Args:
        model_results: Dict mapping model name to a result dict. Entries that
            contain an ``"error"`` key are ignored; the remaining entries must
            provide a ``"metrics"`` dict (missing metrics count as 0).

    Returns:
        The PNG image encoded as a base64 string, or ``None`` when no entry is
        free of errors. A single model gets an 8x8 figure titled with its
        name; several models share a 10x10 figure with a generic title.
    """
    valid_models = {k: v for k, v in model_results.items() if "error" not in v}
    if not valid_models:
        return None

    metrics = ['accuracy', 'confidence', 'consistency']
    labels = ['Accuracy', 'Confidence', 'Consistency']
    colors = ['#4CAF50', '#2196F3', '#FF9800', '#E91E63', '#9C27B0']

    single_model = len(valid_models) == 1
    side = 8 if single_model else 10
    fig, ax = plt.subplots(figsize=(side, side), subplot_kw=dict(polar=True))

    # One spoke per metric; the first angle is repeated to close the polygon.
    angles = np.linspace(0, 2 * np.pi, len(metrics), endpoint=False).tolist()
    angles.append(angles[0])

    for position, (model_name, results) in enumerate(valid_models.items()):
        scores = [results["metrics"].get(metric, 0) for metric in metrics]
        scores.append(scores[0])

        shade = colors[position % len(colors)]
        ax.plot(angles, scores, 'o-', linewidth=2, label=model_name, color=shade)
        ax.fill(angles, scores, alpha=0.1, color=shade)

    ax.set_thetagrids(np.degrees(angles[:-1]), labels)
    ax.set_ylim(0, 1)
    if single_model:
        ax.set_title(f'{model_name} Performance Radar Chart', size=14, fontweight='bold')
    else:
        ax.set_title('Model Performance Radar Chart', size=14, fontweight='bold')
    ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
    fig.tight_layout()

    # Render into memory and hand back the encoded bytes
    buf = BytesIO()
    fig.savefig(buf, format='png', dpi=150, bbox_inches='tight')
    encoded = base64.b64encode(buf.getvalue()).decode('utf-8')
    plt.close(fig)

    return encoded

def create_dataset_comparison_chart(model_results):
    """Render accuracy and hallucination rate across the medical datasets.

    Draws two side-by-side line plots (accuracy on the left, hallucination
    rate on the right) with one line per model over the datasets "pubmedqa",
    "medqa" and "mimic_cxr". A dataset or metric that is missing for a model
    is plotted as 0.

    Args:
        model_results: Dict mapping model name to a result dict. Only entries
            without ``"error"`` and with a ``"dataset_metrics"`` key are drawn.

    Returns:
        The PNG image encoded as a base64 string, or ``None`` when no entry
        qualifies.
    """
    valid_models = {k: v for k, v in model_results.items() if "error" not in v and "dataset_metrics" in v}
    if not valid_models:
        return None

    dataset_names = ["pubmedqa", "medqa", "mimic_cxr"]

    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    # (axes, metric key, panel title, y-axis label) for the two panels
    panels = (
        (axes[0], "accuracy", 'Accuracy Across Medical Datasets', 'Accuracy'),
        (axes[1], "hallucination_rate", 'Hallucination Rate Across Medical Datasets', 'Hallucination Rate'),
    )

    for panel, metric_key, heading, y_label in panels:
        for model_name, results in valid_models.items():
            per_dataset = results.get("dataset_metrics", {})
            series = [
                per_dataset[dataset].get("metrics", {}).get(metric_key, 0) if dataset in per_dataset else 0
                for dataset in dataset_names
            ]
            panel.plot(dataset_names, series, 'o-', label=model_name, linewidth=2, markersize=8)

        panel.set_title(heading, fontsize=14, fontweight='bold')
        panel.set_ylabel(y_label)
        panel.tick_params(axis='x', rotation=45)
        panel.legend()
        panel.grid(True, alpha=0.3)
        panel.set_ylim(0, 1)

    fig.tight_layout()

    # Render into memory and hand back the encoded bytes
    buf = BytesIO()
    fig.savefig(buf, format='png', dpi=150, bbox_inches='tight')
    encoded = base64.b64encode(buf.getvalue()).decode('utf-8')
    plt.close(fig)

    return encoded


def create_dataset_radar_chart(model_results, dataset_name):
    """Render a radar chart of accuracy and confidence for one dataset.

    Each model's ``"dataset_metrics"[dataset_name]`` entry is a result dict
    whose scores are nested under ``"metrics"`` (the same layout
    ``create_dataset_comparison_chart`` reads). The scores are taken from that
    nested dict; an entry without a ``"metrics"`` key is treated as a flat
    metric dict. Missing scores are plotted as 0.

    Args:
        model_results: Dict mapping model name to a result dict. Only entries
            without ``"error"`` and with a ``"dataset_metrics"`` key are
            considered.
        dataset_name: Dataset whose scores are charted.

    Returns:
        The PNG image encoded as a base64 string, or ``None`` when no eligible
        model has results for ``dataset_name``.
    """
    valid_models = {k: v for k, v in model_results.items() if "error" not in v and "dataset_metrics" in v}

    metrics = ['accuracy', 'confidence']
    labels = ['Accuracy', 'Confidence']
    colors = ['#4CAF50', '#2196F3', '#FF9800', '#E91E63', '#9C27B0']

    # Collect the series first so no figure is created when there is nothing
    # to plot. Colors stay tied to a model's position among all valid models.
    series = []
    for i, (model_name, results) in enumerate(valid_models.items()):
        per_dataset = results["dataset_metrics"] or {}
        if dataset_name not in per_dataset:
            continue

        entry = per_dataset[dataset_name]
        scores = entry.get("metrics", entry) if isinstance(entry, dict) else {}
        if not isinstance(scores, dict):
            scores = {}

        values = [scores.get(metric, 0) for metric in metrics]
        series.append((model_name, values + values[:1], colors[i % len(colors)]))  # repeat first point to close the shape

    if not series:
        return None

    angles = np.linspace(0, 2 * np.pi, len(metrics), endpoint=False).tolist()
    angles += angles[:1]

    fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
    try:
        for model_name, values, color in series:
            ax.plot(angles, values, 'o-', linewidth=2, label=model_name, color=color)
            ax.fill(angles, values, alpha=0.1, color=color)

        ax.set_thetagrids(np.degrees(angles[:-1]), labels)
        ax.set_ylim(0, 1)
        ax.set_title(f'Performance on {dataset_name.upper()} Dataset', size=14, fontweight='bold')
        ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
        fig.tight_layout()

        # Render into memory rather than onto disk
        buf = BytesIO()
        fig.savefig(buf, format='png', dpi=150, bbox_inches='tight')
    finally:
        # Always release the figure, even when plotting fails part-way.
        plt.close(fig)

    return base64.b64encode(buf.getvalue()).decode('utf-8')

def create_comparison_table(model_results):
    """Build a one-row-per-model table of the headline metrics.

    Args:
        model_results: Dict mapping model name to a result dict. Entries that
            contain an ``"error"`` key are left out; the remaining entries must
            provide a ``"metrics"`` dict.

    Returns:
        A ``pandas.DataFrame`` with the columns Model, Accuracy, Hallucination
        Rate, Confidence, Response Length, Consistency and Sample Count. Metric
        values are pre-formatted strings (three decimals, one for Response
        Length); missing metrics and a missing sample count default to 0. The
        frame is empty when no model qualifies.
    """
    def _row(model_name, results):
        scores = results['metrics']
        return {
            "Model": model_name,
            "Accuracy": f"{scores.get('accuracy', 0):.3f}",
            "Hallucination Rate": f"{scores.get('hallucination_rate', 0):.3f}",
            "Confidence": f"{scores.get('confidence', 0):.3f}",
            "Response Length": f"{scores.get('response_length', 0):.1f}",
            "Consistency": f"{scores.get('consistency', 0):.3f}",
            "Sample Count": results.get('sample_count', 0)
        }

    rows = [
        _row(model_name, results)
        for model_name, results in model_results.items()
        if "error" not in results
    ]
    return pd.DataFrame(rows)

In [26]:
import os
# Ask PyTorch's CUDA caching allocator to use expandable segments.
# NOTE(review): presumably this has to be set before the first CUDA allocation
# to take effect — confirm against the PyTorch memory-management docs.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# Number of prompts per evaluation run; passed to evaluate_single_model below.
sample_count = 2

def clear_gpu_cache():
    """Empty PyTorch's CUDA cache and synchronize; does nothing without CUDA."""
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

# List of free models to evaluate
free_models = ["llama-2-7b", "mistral-7b", "qwen-7b", "meditron-7b", "biomedgpt"]

# Evaluate a specific model
model_name = "mistral-7b"  # Change this to evaluate different models
clear_gpu_cache()
model_result = evaluate_single_model(model_name, sample_count)

# Export visualizations
base_dir, exported_files = export_visualizations_to_directories(model_result, model_name)

if base_dir is None:
    # The export returns (None, []) when the evaluation failed or an error
    # occurred while writing. Zipping a None directory would create a stray
    # "None.zip" and then raise inside os.walk, so stop here instead.
    print(f"No files were exported for {model_name}; skipping ZIP creation and download.")
    if isinstance(model_result, dict) and "error" in model_result:
        print(f"Evaluation error: {model_result['error']}")
else:
    # Print what files were created
    print("Files created:")
    for file in exported_files:
        print(f"  - {file}")

    # Create and download zip
    zip_filename = create_zip_from_directory(base_dir)
    files.download(zip_filename)

Evaluating local model: mistral-7b...


Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 409, in hf_raise_for_status
    response.raise_for_status()
  File "/usr/local/lib/python3.12/dist-packages/requests/models.py", line 1026, in raise_for_status
    raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 478, in cached_files
    hf_hub_download(
  File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1010, in hf_hub_download
    retur

Error loading mistral-7b: There was a specific connection error when trying to load mistralai/Mistral-7B-v0.1:
401 Client Error: Unauthorized for url: https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json (Request ID: Root=1-68c000fa-16c0c5d6275a87b673498474;440236e9-7be8-4943-8aaa-f68a46e0b5b3)

Invalid credentials in Authorization header


Evaluating mistral-7b:  12%|█▎        | 1/8 [00:00<00:04,  1.42it/s]

Error in generation: There was a specific connection error when trying to load sentence-transformers/all-mpnet-base-v2:
401 Client Error: Unauthorized for url: https://huggingface.co/sentence-transformers/all-mpnet-base-v2/resolve/main/config.json (Request ID: Root=1-68c000fb-1295a62872b74fb1237f07b5;457b8357-4249-4265-931e-808dbcddab66)

Invalid credentials in Authorization header


Evaluating mistral-7b:  25%|██▌       | 2/8 [00:01<00:04,  1.31it/s]

Error in generation: There was a specific connection error when trying to load sentence-transformers/all-mpnet-base-v2:
401 Client Error: Unauthorized for url: https://huggingface.co/sentence-transformers/all-mpnet-base-v2/resolve/main/config.json (Request ID: Root=1-68c000fc-29d53cbc5c93b6a1442d7e2f;83bcb34b-eaab-414f-ad20-d97c8c417855)

Invalid credentials in Authorization header


Evaluating mistral-7b:  38%|███▊      | 3/8 [00:02<00:03,  1.30it/s]

Error in generation: There was a specific connection error when trying to load sentence-transformers/all-mpnet-base-v2:
401 Client Error: Unauthorized for url: https://huggingface.co/sentence-transformers/all-mpnet-base-v2/resolve/main/config.json (Request ID: Root=1-68c000fd-080a5c97686a4dfb1dc2bee4;5fa4ca96-074e-461b-94dd-49ecf8951663)

Invalid credentials in Authorization header


Evaluating mistral-7b:  50%|█████     | 4/8 [00:03<00:03,  1.26it/s]

Error in generation: There was a specific connection error when trying to load sentence-transformers/all-mpnet-base-v2:
401 Client Error: Unauthorized for url: https://huggingface.co/sentence-transformers/all-mpnet-base-v2/resolve/main/config.json (Request ID: Root=1-68c000fd-110d2208564071e1548fa341;77499da3-5c16-4b79-8cec-cd58fc72935a)

Invalid credentials in Authorization header


Evaluating mistral-7b:  62%|██████▎   | 5/8 [00:03<00:02,  1.30it/s]

Error in generation: There was a specific connection error when trying to load sentence-transformers/all-mpnet-base-v2:
401 Client Error: Unauthorized for url: https://huggingface.co/sentence-transformers/all-mpnet-base-v2/resolve/main/config.json (Request ID: Root=1-68c000fe-2bc5e57802d29d461bd337d2;d73ddb9d-f0b9-4c8c-ac51-4e316415fc20)

Invalid credentials in Authorization header


Evaluating mistral-7b:  75%|███████▌  | 6/8 [00:04<00:01,  1.31it/s]

Error in generation: There was a specific connection error when trying to load sentence-transformers/all-mpnet-base-v2:
401 Client Error: Unauthorized for url: https://huggingface.co/sentence-transformers/all-mpnet-base-v2/resolve/main/config.json (Request ID: Root=1-68c000ff-25e3797e736a8b7e40a13e93;fe50ffb6-081c-45dd-a48b-c31fae972f44)

Invalid credentials in Authorization header


Evaluating mistral-7b:  88%|████████▊ | 7/8 [00:05<00:00,  1.28it/s]

Error in generation: There was a specific connection error when trying to load sentence-transformers/all-mpnet-base-v2:
401 Client Error: Unauthorized for url: https://huggingface.co/sentence-transformers/all-mpnet-base-v2/resolve/main/config.json (Request ID: Root=1-68c00100-5d39c9d80672255c2c3becf9;f271ea96-dc19-4624-886e-21ca4bedfdd9)

Invalid credentials in Authorization header


Evaluating mistral-7b: 100%|██████████| 8/8 [00:06<00:00,  1.30it/s]

Error in generation: There was a specific connection error when trying to load sentence-transformers/all-mpnet-base-v2:
401 Client Error: Unauthorized for url: https://huggingface.co/sentence-transformers/all-mpnet-base-v2/resolve/main/config.json (Request ID: Root=1-68c00100-51051ed97e812c8d594d87a9;81166b2a-7c56-4bd6-ab4e-cc976f2bb002)

Invalid credentials in Authorization header
Evaluating mistral-7b on pubmedqa dataset...



Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 409, in hf_raise_for_status
    response.raise_for_status()
  File "/usr/local/lib/python3.12/dist-packages/requests/models.py", line 1026, in raise_for_status
    raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 478, in cached_files
    hf_hub_download(
  File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1010, in hf_hub_download
    retu

Error loading mistral-7b: There was a specific connection error when trying to load mistralai/Mistral-7B-v0.1:
401 Client Error: Unauthorized for url: https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json (Request ID: Root=1-68c00101-37c3d0083c1eac0f75e088dc;ea8885a7-b482-4d62-93f6-891872ce51c5)

Invalid credentials in Authorization header


Evaluating mistral-7b:  50%|█████     | 1/2 [00:00<00:00,  1.37it/s]

Error in generation: There was a specific connection error when trying to load sentence-transformers/all-mpnet-base-v2:
401 Client Error: Unauthorized for url: https://huggingface.co/sentence-transformers/all-mpnet-base-v2/resolve/main/config.json (Request ID: Root=1-68c00102-4e0ebfba64e8651516deffb4;42de735b-fa5d-4d83-a83d-818c734c5922)

Invalid credentials in Authorization header


Evaluating mistral-7b: 100%|██████████| 2/2 [00:01<00:00,  1.05it/s]

Error in generation: There was a specific connection error when trying to load sentence-transformers/all-mpnet-base-v2:
401 Client Error: Unauthorized for url: https://huggingface.co/sentence-transformers/all-mpnet-base-v2/resolve/main/config.json (Request ID: Root=1-68c00103-3bbb96494847025f2228f2eb;5f1e709a-a80e-49e8-82b3-398697220d99)

Invalid credentials in Authorization header
Evaluating mistral-7b on medqa dataset...



Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 409, in hf_raise_for_status
    response.raise_for_status()
  File "/usr/local/lib/python3.12/dist-packages/requests/models.py", line 1026, in raise_for_status
    raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 478, in cached_files
    hf_hub_download(
  File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1010, in hf_hub_download
    retu

Error loading mistral-7b: There was a specific connection error when trying to load mistralai/Mistral-7B-v0.1:
401 Client Error: Unauthorized for url: https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json (Request ID: Root=1-68c00103-07730158172a6b6c605624fd;2a123147-a302-4293-918f-aef887bee05c)

Invalid credentials in Authorization header


Evaluating mistral-7b:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s]

Error in generation: There was a specific connection error when trying to load sentence-transformers/all-mpnet-base-v2:
401 Client Error: Unauthorized for url: https://huggingface.co/sentence-transformers/all-mpnet-base-v2/resolve/main/config.json (Request ID: Root=1-68c00104-4065358167149fc07219070c;6a653424-000b-4b6b-8ab8-92e2d66074a9)

Invalid credentials in Authorization header


Evaluating mistral-7b: 100%|██████████| 2/2 [00:01<00:00,  1.27it/s]

Error in generation: There was a specific connection error when trying to load sentence-transformers/all-mpnet-base-v2:
401 Client Error: Unauthorized for url: https://huggingface.co/sentence-transformers/all-mpnet-base-v2/resolve/main/config.json (Request ID: Root=1-68c00105-6c3875ce52b1a4a414c4d7ac;21ec8259-445c-406a-b3af-5509f051d3f2)

Invalid credentials in Authorization header
Evaluating mistral-7b on mimic_cxr dataset...



Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 409, in hf_raise_for_status
    response.raise_for_status()
  File "/usr/local/lib/python3.12/dist-packages/requests/models.py", line 1026, in raise_for_status
    raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 478, in cached_files
    hf_hub_download(
  File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1010, in hf_hub_download
    retu

Error loading mistral-7b: There was a specific connection error when trying to load mistralai/Mistral-7B-v0.1:
401 Client Error: Unauthorized for url: https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json (Request ID: Root=1-68c00105-0c3b042577a9060b78306cca;92065ee8-99c4-46fc-9a8b-11aff1590e6f)

Invalid credentials in Authorization header


Evaluating mistral-7b:  50%|█████     | 1/2 [00:01<00:01,  1.23s/it]

Error in generation: There was a specific connection error when trying to load sentence-transformers/all-mpnet-base-v2:
401 Client Error: Unauthorized for url: https://huggingface.co/sentence-transformers/all-mpnet-base-v2/resolve/main/config.json (Request ID: Root=1-68c00107-0f3f90441fd6c2f714995137;4606fc06-be83-436f-91b1-df65d06fafd6)

Invalid credentials in Authorization header


Evaluating mistral-7b: 100%|██████████| 2/2 [00:01<00:00,  1.00it/s]

Error in generation: There was a specific connection error when trying to load sentence-transformers/all-mpnet-base-v2:
401 Client Error: Unauthorized for url: https://huggingface.co/sentence-transformers/all-mpnet-base-v2/resolve/main/config.json (Request ID: Root=1-68c00107-5bb469b05f1bb7105ee35a2a;73f0742a-6dab-49d0-947f-f42e900c1e4c)

Invalid credentials in Authorization header
Created directory: model_evaluation_mistral-7b/charts
Created directory: model_evaluation_mistral-7b/data
Created directory: model_evaluation_mistral-7b/tables
Created directory: model_evaluation_mistral-7b/dataset_analysis
Created chart: model_evaluation_mistral-7b/charts/accuracy_bar_chart.png





Created chart: model_evaluation_mistral-7b/charts/hallucination_rate_bar_chart.png
Created chart: model_evaluation_mistral-7b/charts/confidence_bar_chart.png
Created chart: model_evaluation_mistral-7b/charts/radar_chart.png
Created table: model_evaluation_mistral-7b/tables/comparison_table.html
Created chart: model_evaluation_mistral-7b/charts/dataset_comparison_chart.png
Created chart: model_evaluation_mistral-7b/charts/pubmedqa_radar_chart.png
Created chart: model_evaluation_mistral-7b/charts/medqa_radar_chart.png
Created chart: model_evaluation_mistral-7b/charts/mimic_cxr_radar_chart.png
Exported 11 files to model_evaluation_mistral-7b/ directory structure
Files created:
  - model_evaluation_mistral-7b/data/mistral-7b_ui_export_data.json
  - model_evaluation_mistral-7b/data/mistral-7b_comprehensive_results.json
  - model_evaluation_mistral-7b/charts/accuracy_bar_chart.png
  - model_evaluation_mistral-7b/charts/hallucination_rate_bar_chart.png
  - model_evaluation_mistral-7b/charts/c

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>