In [9]:
# Comprehensive Guide to Evaluating Language Models (LLMs) with Python

# Install necessary libraries
# !pip install numpy pandas scikit-learn rouge-score nltk detoxify lm-eval matplotlib
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
import os
import warnings
from sklearn.metrics import f1_score
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from detoxify import Detoxify


In [10]:

# Shared example data for the metric cells below.
# Reference answers, one record per query.
gold_standard = [
    dict(query="What is 2 + 2?", correct_answer="4"),
    dict(query="Who wrote Macbeth?", correct_answer="William Shakespeare"),
    dict(query="What is the boiling point of water?", correct_answer="100°C"),
]

# Model responses, listed in the same order as `gold_standard`.
model_outputs = [
    dict(query="What is 2 + 2?", output="4"),
    dict(query="Who wrote Macbeth?", output="Charles Dickens"),
    dict(query="What is the boiling point of water?", output="100°C"),
]

# Sample sentences passed to the toxicity detector.
texts = [
    "This is a friendly and respectful comment.",
    "This is a hateful and offensive comment.",
]


In [11]:

# --- 1. Hallucination Reduction Rate (HRR)
def calculate_hrr(baseline_outputs, validated_outputs):
    """Return the percentage of baseline hallucinations that were removed.

    Records are paired by position. A pair counts as "reduced" when the
    baseline record has a truthy "is_hallucination" entry and the validated
    record does not.

    Args:
        baseline_outputs: Sequence of dict-like records from the baseline run.
        validated_outputs: Sequence of dict-like records after validation.

    Returns:
        The reduction rate as a percentage, or 0 when the baseline contains
        no hallucinations.
    """
    fixed_count = 0
    for before, after in zip(baseline_outputs, validated_outputs):
        if not before.get("is_hallucination"):
            continue
        if not after.get("is_hallucination"):
            fixed_count += 1

    flagged_count = 0
    for before in baseline_outputs:
        if before.get("is_hallucination"):
            flagged_count += 1

    if flagged_count > 0:
        return (fixed_count / flagged_count) * 100
    return 0

# Example HRR usage: both baseline answers are flagged as hallucinations and
# neither validated answer is, so every hallucination counts as reduced.
baseline_outputs = [
    dict(query="What is the boiling point of water?", output="50°C", is_hallucination=True),
    dict(query="Who wrote Hamlet?", output="Charles Dickens", is_hallucination=True),
]
validated_outputs = [
    dict(query="What is the boiling point of water?", output="100°C", is_hallucination=False),
    dict(query="Who wrote Hamlet?", output="William Shakespeare", is_hallucination=False),
]
hrr_score = calculate_hrr(
    baseline_outputs=baseline_outputs,
    validated_outputs=validated_outputs,
)
print(f"Hallucination Reduction Rate (HRR): {hrr_score:.2f}%")
print("This score reflects the percentage of hallucinations corrected by the model. A higher value indicates fewer factual errors.")


Hallucination Reduction Rate (HRR): 100.00%
This score reflects the percentage of hallucinations corrected by the model. A higher value indicates fewer factual errors.


In [12]:

# --- 2. Logical Consistency Score (LCS)
def calculate_lcs(responses):
    """Return the percentage of responses flagged as logically consistent.

    Args:
        responses: Sequence of dict-like records. A record counts as
            consistent when its "is_consistent" entry is truthy; a missing
            entry counts as not consistent.

    Returns:
        The share of consistent records as a percentage, or 0 when
        `responses` is empty.
    """
    total = len(responses)
    if total == 0:
        # Same convention as calculate_hrr: an empty evaluation set scores 0
        # instead of raising ZeroDivisionError.
        return 0
    consistent = sum(1 for response in responses if response.get("is_consistent"))
    return (consistent / total) * 100

# Example LCS usage: both records are marked consistent.
responses = [
    dict(query="If A > B and B > C, is A > C?", output="Yes", is_consistent=True),
    dict(query="Is it possible for a square to have three sides?", output="No", is_consistent=True),
]
lcs_score = calculate_lcs(responses=responses)
print(f"Logical Consistency Score (LCS): {lcs_score:.2f}%")
print("This score measures logical reasoning accuracy. A higher value suggests the model maintains logical coherence.")


Logical Consistency Score (LCS): 100.00%
This score measures logical reasoning accuracy. A higher value suggests the model maintains logical coherence.


In [13]:

# --- 3. Response Accuracy (RA)
def calculate_ra(gold_standard, model_outputs):
    """Return the percentage of outputs that exactly match the reference answer.

    Records are paired by position. Reference records without a paired output
    (when `model_outputs` is shorter) still count toward the denominator, so
    they are effectively scored as wrong.

    Args:
        gold_standard: Sequence of dicts with a "correct_answer" key.
        model_outputs: Sequence of dicts with an "output" key.

    Returns:
        The accuracy as a percentage, or 0 when `gold_standard` is empty.

    Raises:
        KeyError: If a paired record lacks the expected key.
    """
    total = len(gold_standard)
    if total == 0:
        # Nothing to score; return 0 rather than dividing by zero.
        return 0
    correct = sum(
        1 for gold, output in zip(gold_standard, model_outputs)
        if gold["correct_answer"] == output["output"]
    )
    return (correct / total) * 100

# Score the shared example outputs against their reference answers.
ra_score = calculate_ra(
    gold_standard=gold_standard,
    model_outputs=model_outputs,
)
print(f"Response Accuracy (RA): {ra_score:.2f}%")
print("Response Accuracy measures correctness in providing factual answers. A higher value means more accurate answers.")


Response Accuracy (RA): 66.67%
Response Accuracy measures correctness in providing factual answers. A higher value means more accurate answers.


In [14]:

# --- 4. Exact Match (EM)
def exact_match(prediction, target):
    """Report whether `prediction` equals `target` under `==`.

    No normalization is applied, so case and whitespace differences between
    strings count as a mismatch.
    """
    is_match = prediction == target
    return is_match

# Identical strings, so the result is True.
em_score = exact_match(prediction="Paris", target="Paris")
print(f"Exact Match (EM): {em_score}")
print("Exact Match evaluates if the prediction exactly matches the reference.")


Exact Match (EM): True
Exact Match evaluates if the prediction exactly matches the reference.


In [15]:

# --- 5. F1 Score
def calculate_f1(predictions, targets):
    """Compute the binary F1 score of `predictions` against `targets`.

    This wrapper takes the predictions first, whereas `f1_score` takes the
    ground truth first; the keyword arguments below make that swap explicit.

    Args:
        predictions: Predicted labels.
        targets: Ground-truth labels, aligned with `predictions`.

    Returns:
        The value returned by `f1_score` with `average="binary"`.
    """
    return f1_score(y_true=targets, y_pred=predictions, average="binary")

# Toy labels: the only disagreement is one false positive at index 2.
predictions = [1, 0, 1, 1]
targets = [1, 0, 0, 1]
print(f"F1 Score: {calculate_f1(predictions=predictions, targets=targets):.2f}")
print("F1 Score balances precision and recall. Higher values suggest fewer false positives and negatives.")


F1 Score: 0.80
F1 Score balances precision and recall. Higher values suggest fewer false positives and negatives.


In [16]:
# --- 6. ROUGE
def calculate_rouge(prediction, target):
    """Score `prediction` against the reference `target` with ROUGE-1 and ROUGE-L.

    Stemming is enabled. The reference is passed to the scorer first and the
    prediction second.

    Returns:
        The scorer's result; in the example output this is a dict keyed by
        'rouge1' and 'rougeL' holding precision/recall/fmeasure scores.
    """
    rouge_types = ['rouge1', 'rougeL']
    return rouge_scorer.RougeScorer(rouge_types, use_stemmer=True).score(target, prediction)

# The two sentences differ in a single word ("sat" vs "is").
rouge_scores = calculate_rouge(
    prediction="The cat sat on the mat.",
    target="The cat is on the mat.",
)
print("ROUGE Scores:", rouge_scores)
print("ROUGE measures text similarity. High precision and recall suggest strong alignment with reference.")


ROUGE Scores: {'rouge1': Score(precision=0.8333333333333334, recall=0.8333333333333334, fmeasure=0.8333333333333334), 'rougeL': Score(precision=0.8333333333333334, recall=0.8333333333333334, fmeasure=0.8333333333333334)}
ROUGE measures text similarity. High precision and recall suggest strong alignment with reference.


In [30]:
from rouge_score import rouge_scorer

def calculate_rouge(prediction, target):
    """Score `prediction` against the reference `target` with ROUGE-1 and ROUGE-L.

    NOTE(review): this repeats the definition from the earlier ROUGE cell and
    replaces it when the cell runs; consider keeping a single definition.

    Stemming is enabled, and the reference is passed to the scorer first.
    """
    stemmed_scorer = rouge_scorer.RougeScorer(
        ['rouge1', 'rougeL'],
        use_stemmer=True,
    )
    scores = stemmed_scorer.score(target, prediction)
    return scores

# Example Data: candidate sentence and its reference.
prediction = "The cat sat on the mat."
target = "The cat is on the mat."

# Calculate ROUGE for the pair and show the raw score objects.
rouge_scores = calculate_rouge(prediction=prediction, target=target)
print("ROUGE Scores:", rouge_scores)

ROUGE Scores: {'rouge1': Score(precision=0.8333333333333334, recall=0.8333333333333334, fmeasure=0.8333333333333334), 'rougeL': Score(precision=0.8333333333333334, recall=0.8333333333333334, fmeasure=0.8333333333333334)}


In [17]:
# --- 7. BLEU
def calculate_bleu(prediction, target):
    """Compute sentence-level BLEU for `prediction` against one reference.

    Both strings are tokenized by whitespace only, so punctuation stays
    attached to words. Smoothing method 1 is passed to `sentence_bleu`.

    Args:
        prediction: Candidate sentence.
        target: Reference sentence.

    Returns:
        The value returned by `sentence_bleu`.
    """
    smoother = SmoothingFunction()
    hypothesis_tokens = prediction.split()
    reference_tokens = target.split()
    return sentence_bleu(
        [reference_tokens],
        hypothesis_tokens,
        smoothing_function=smoother.method1,
    )

# The candidate and reference differ in a single token.
bleu_score = calculate_bleu(
    prediction="The cat is on the mat.",
    target="The cat sat on the mat.",
)
print(f"BLEU Score: {bleu_score:.2f}")
print("BLEU evaluates translation quality. Smoothing addresses low n-gram overlap warnings.")


BLEU Score: 0.25


In [18]:
# --- 8. Toxicity Detection
# Detoxify models already constructed, keyed by checkpoint name, so that
# repeated calls reuse one instance instead of rebuilding the model.
_TOXICITY_MODELS = {}


def detect_toxicity(text):
    """Score `text` with the Detoxify 'original' model.

    The model is constructed on the first call and reused afterwards;
    building a fresh model for every sentence repeats the expensive load.

    Args:
        text: Input passed unchanged to the model's `predict`.

    Returns:
        Whatever `predict` returns; for a single string the example output
        shows a dict mapping category names (e.g. 'toxicity', 'insult') to
        scores.
    """
    model = _TOXICITY_MODELS.get('original')
    if model is None:
        model = _TOXICITY_MODELS['original'] = Detoxify('original')
    return model.predict(text)

# Score each sample sentence and print the raw result returned by
# detect_toxicity (per-category scores, as shown in the cell output).
for text in texts:
    print(f"Toxicity for '{text}': {detect_toxicity(text)}")
print("Toxicity scores indicate harmful content. Lower scores are preferable for ethical AI.")


Toxicity for 'This is a friendly and respectful comment.': {'toxicity': np.float32(0.0005483509), 'severe_toxicity': np.float32(0.00014052661), 'obscene': np.float32(0.00019850969), 'threat': np.float32(0.00013926814), 'insult': np.float32(0.00018051789), 'identity_attack': np.float32(0.00014728199)}
Toxicity for 'This is a hateful and offensive comment.': {'toxicity': np.float32(0.15707134), 'severe_toxicity': np.float32(0.00023562438), 'obscene': np.float32(0.0022060007), 'threat': np.float32(0.00058931776), 'insult': np.float32(0.0024856713), 'identity_attack': np.float32(0.0010868483)}
Toxicity scores indicate harmful content. Lower scores are preferable for ethical AI.


In [19]:
from collections import Counter

def calculate_diversity(text, n=2):
    """Return the fraction of distinct word n-grams in `text`.

    Words are obtained by whitespace splitting, so case and punctuation are
    significant ("dog." and "dog" are different words).

    Args:
        text: Input string.
        n: Size of the n-grams; must be at least 1.

    Returns:
        unique n-grams / total n-grams, or 0 when `text` has fewer than `n`
        words.

    Raises:
        ValueError: If `n` is smaller than 1. Without this check, such values
            produce empty or oversized slices and a meaningless ratio.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    words = text.split()
    # Tuples are hashable and compare equal exactly when the joined strings
    # would, because whitespace-split words never contain a space.
    ngrams = [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]
    if not ngrams:
        return 0
    return len(set(ngrams)) / len(ngrams)

# Example Data: the second sentence repeats three bigrams from the first,
# which lowers the score below 1.
text = "The quick brown fox jumps over the lazy dog. The quick brown fox repeats."
diversity_score = calculate_diversity(text=text, n=2)
print(f"Diversity Score (2-grams): {diversity_score:.2f}")


Diversity Score (2-grams): 0.77


In [20]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_coherence(context, response):
    """Return the cosine similarity between TF-IDF vectors of two texts.

    The vectorizer is fitted on just these two strings, so the vocabulary and
    weights depend only on this pair.

    Args:
        context: First text (e.g. the prompt or question).
        response: Second text (e.g. the model's answer).

    Returns:
        The single similarity value for the pair.
    """
    documents = [context, response]
    tfidf = TfidfVectorizer()
    tfidf.fit(documents)
    matrix = tfidf.transform(documents)
    # cosine_similarity yields a 1x1 result for two single-row inputs.
    similarity = cosine_similarity(matrix[0], matrix[1])
    return similarity[0][0]

# Example Data: a question and an answer that share most of their words.
context = "What is the capital of France?"
response = "Paris is the capital of France."
coherence_score = calculate_coherence(context=context, response=response)
print(f"Coherence Score: {coherence_score:.2f}")


Coherence Score: 0.72


In [23]:
from detoxify import Detoxify

# Detoxify models already constructed, keyed by checkpoint name, so that
# repeated calls reuse one instance instead of rebuilding the model.
_BIAS_MODELS = {}


def detect_bias(text, toxicity_threshold=0.5, insult_threshold=0.5):
    """Score `text` with the Detoxify 'unbiased' model and flag high scores.

    The model is constructed on the first call and reused afterwards.

    Args:
        text: Input passed unchanged to the model's `predict`.
        toxicity_threshold: A toxicity score strictly above this prints a
            "likely toxic" notice. Defaults to 0.5.
        insult_threshold: An insult score strictly above this prints a
            "likely insulting" notice. Defaults to 0.5.

    Returns:
        A `(toxicity, insult)` tuple taken from the prediction.
    """
    model = _BIAS_MODELS.get('unbiased')
    if model is None:
        model = _BIAS_MODELS['unbiased'] = Detoxify('unbiased')

    predictions = model.predict(text)
    toxicity = predictions['toxicity']
    insult = predictions['insult']

    if toxicity > toxicity_threshold:
        print("The text is likely toxic.")
    if insult > insult_threshold:
        print("The text is likely insulting.")

    return toxicity, insult

# Example Data: a stereotyping statement; see the printed scores for how the
# model rates it.
biased_text = "Men are better leaders than women."
toxicity, insult = detect_bias(text=biased_text)
print(f"Toxicity Score: {toxicity:.2f}, Insult Score: {insult:.2f}")

Toxicity Score: 0.12, Insult Score: 0.02


In [25]:
from detoxify import Detoxify

# Detoxify models already constructed, keyed by checkpoint name, so that
# repeated calls reuse one instance instead of rebuilding the model.
_TOXICITY_MODELS = {}


def detect_toxicity(text):
    """Score `text` with the Detoxify 'original' model.

    NOTE(review): this function is defined in more than one cell of the
    notebook; the most recently executed definition wins.

    The model is constructed on the first call and reused afterwards.

    Args:
        text: Input passed unchanged to the model's `predict`.

    Returns:
        Whatever `predict` returns for `text`.
    """
    model = _TOXICITY_MODELS.get('original')
    if model is None:
        model = _TOXICITY_MODELS['original'] = Detoxify('original')
    return model.predict(text)

# Example Data
# NOTE: this rebinds `texts`, replacing the list defined with the shared
# example data earlier in the notebook.
texts = [
    "This is a respectful comment.",
    "This is a hateful comment."
]

# Detect Toxicity: print the raw result of detect_toxicity for each sentence.
for text in texts:
    print(f"Toxicity for '{text}': {detect_toxicity(text)}")


Toxicity for 'This is a respectful comment.': {'toxicity': np.float32(0.00058519543), 'severe_toxicity': np.float32(0.00013003703), 'obscene': np.float32(0.00019055209), 'threat': np.float32(0.00012352254), 'insult': np.float32(0.00017691121), 'identity_attack': np.float32(0.00014319048)}
Toxicity for 'This is a hateful comment.': {'toxicity': np.float32(0.12656806), 'severe_toxicity': np.float32(0.00021170401), 'obscene': np.float32(0.002147647), 'threat': np.float32(0.00049468974), 'insult': np.float32(0.0020356625), 'identity_attack': np.float32(0.00074969034)}


In [26]:
from detoxify import Detoxify

# Detoxify models already constructed, keyed by checkpoint name, so that
# repeated calls reuse one instance instead of rebuilding the model.
_TOXICITY_MODELS = {}


def detect_toxicity(text):
    """Score `text` with the Detoxify 'original' model.

    NOTE(review): this function is defined in more than one cell of the
    notebook; the most recently executed definition wins.

    The model is constructed on the first call and reused afterwards.

    Args:
        text: Input passed unchanged to the model's `predict`.

    Returns:
        Whatever `predict` returns; the calling cell iterates it with
        `.items()`, so a mapping is expected for a single string.
    """
    model = _TOXICITY_MODELS.get('original')
    if model is None:
        model = _TOXICITY_MODELS['original'] = Detoxify('original')
    return model.predict(text)

# Example Data
# NOTE: this rebinds `texts`, replacing any list bound to that name by an
# earlier cell.
texts = [
    "This is a respectful comment.",
    "This is a hateful comment."
]

# Detect Toxicity: one header line per sentence, then one line per category.
for text in texts:
    print(f"Toxicity for '{text}':")
    results = detect_toxicity(text)
    for key, value in results.items():
        # Turn keys such as 'severe_toxicity' into 'Severe Toxicity' and show
        # the score with four decimals.
        print(f"  {key.replace('_', ' ').title()}: {value:.4f}")

Toxicity for 'This is a respectful comment.':
  Toxicity: 0.0006
  Severe Toxicity: 0.0001
  Obscene: 0.0002
  Threat: 0.0001
  Insult: 0.0002
  Identity Attack: 0.0001
Toxicity for 'This is a hateful comment.':
  Toxicity: 0.1266
  Severe Toxicity: 0.0002
  Obscene: 0.0021
  Threat: 0.0005
  Insult: 0.0020
  Identity Attack: 0.0007


In [27]:
def evaluate_knowledge_retention(questions, correct_answers, model_outputs):
    """Return the percentage of questions whose output equals the expected answer.

    The three sequences are paired by position. `questions` only determines
    how many items are scored; its contents are not compared.

    Args:
        questions: Sequence of question strings.
        correct_answers: Expected answers, aligned with `questions`.
        model_outputs: Model answers, aligned with `questions`.

    Returns:
        The share of exact matches as a percentage of `len(questions)`, or 0
        when `questions` is empty.
    """
    total = len(questions)
    if total == 0:
        # Nothing to score; return 0 rather than dividing by zero.
        return 0
    retained = sum(
        1
        for _question, answer, output in zip(questions, correct_answers, model_outputs)
        if answer == output
    )
    return retained / total * 100

# Example Data
questions = ["Who wrote Hamlet?", "What is the capital of Italy?"]
correct_answers = ["William Shakespeare", "Rome"]
# Stored under its own name: rebinding `model_outputs` here would replace the
# list of dicts that the Response Accuracy cell passes to calculate_ra, so
# re-running that cell afterwards would fail on string indexing.
retention_outputs = ["William Shakespeare", "Rome"]  # Outputs from the model
knowledge_retention_score = evaluate_knowledge_retention(questions, correct_answers, retention_outputs)
print(f"Knowledge Retention Score: {knowledge_retention_score:.2f}%")

Knowledge Retention Score: 100.00%


In [28]:
from collections import Counter

def calculate_diversity(text, n=2):
    """Return the fraction of distinct word n-grams in `text`.

    NOTE(review): this function is also defined in an earlier cell; the most
    recently executed definition wins.

    Words are obtained by whitespace splitting, so case and punctuation are
    significant.

    Args:
        text: Input string.
        n: Size of the n-grams; must be at least 1.

    Returns:
        unique n-grams / total n-grams, or 0 when `text` has fewer than `n`
        words.

    Raises:
        ValueError: If `n` is smaller than 1. Without this check, such values
            produce empty or oversized slices and a meaningless ratio.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    words = text.split()
    # Tuples are hashable and compare equal exactly when the joined strings
    # would, because whitespace-split words never contain a space.
    ngrams = [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]
    if not ngrams:
        return 0
    return len(set(ngrams)) / len(ngrams)

# Example Data: the second sentence repeats three bigrams from the first,
# which lowers the score below 1.
text = "The quick brown fox jumps over the lazy dog. The quick brown fox repeats."
diversity_score = calculate_diversity(text=text, n=2)
print(f"Diversity Score (2-grams): {diversity_score:.2f}")

Diversity Score (2-grams): 0.77


In [None]:
# --- 9. Using `lm-evaluation-harness`
# The import and the evaluation both stay inside the `try`, so an ImportError
# or AssertionError raised by either is reported below instead of stopping
# the notebook.
try:
    from lm_eval.evaluator import simple_evaluate

    results = simple_evaluate(
        model="hf",  # Use Hugging Face AutoModel
        model_args="pretrained=gpt2",  # Specify the pretrained model
        tasks=["lambada_openai", "piqa"],  # Tasks to evaluate
        # device="cpu",  # Force evaluation on CPU
    )
    print("LM Evaluation Results:", results)
    print("These results summarize model performance on standard benchmarks.")
except ImportError:
    warnings.warn("Ensure `lm-eval` is properly installed for this step.")
except AssertionError as err:
    print("Error:", err)
    print("Ensure your PyTorch installation matches your hardware capabilities (CPU or GPU).")
