Give me 5 case studies on GenAI evaluation coding interview questions, with answers, in Jupyter notebook form.

Case Study 1: Text Generation Evaluation
Problem:
Evaluate the quality of generated text from a Large Language Model (LLM) for a specific task (e.g., summarization, creative writing, chatbot responses).
Coding Task:
Implement metrics like BLEU, ROUGE, or perplexity to quantitatively assess text quality. Demonstrate how to compare different LLM outputs based on these metrics.
Jupyter Notebook Focus:
Data loading, text preprocessing, function implementations for metrics, visualization of scores, and qualitative analysis of generated text examples.

In [1]:
### Text Generation Evaluation Notebook

# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# Set style for plots
# Applied once, before any figure is created, so the plot in section 5 uses
# seaborn's "whitegrid" look.
sns.set(style="whitegrid")

### 1. Load and Explore Data
# Toy summarization benchmark. Each row holds one human reference followed by
# the outputs of two different LLMs (LLM_A, LLM_B) for the same input.

_examples = [
    (
        "The cat sat on the mat and looked out the window.",
        "The cat sat on a mat and stared out the window.",
        "A cat was looking out the window while sitting on a mat.",
    ),
    (
        "The stock market crashed due to unexpected economic news.",
        "Economic news led to a crash in the stock market.",
        "The financial markets tumbled after the news.",
    ),
    (
        "The book explores themes of love and loss in a post-war setting.",
        "This novel is about love and loss after the war.",
        "The book discusses war, romance, and grief.",
    ),
]

data = pd.DataFrame(_examples, columns=['reference', 'LLM_A', 'LLM_B'])

data.head()

### 2. Preprocessing (if necessary)
# Here we assume text is already clean for simplicity

### 3. Evaluation Metric Functions

def compute_bleu(reference, candidate):
    """Return the smoothed sentence-level BLEU score of *candidate*.

    Both strings are tokenized with a plain whitespace split, so matching is
    case-sensitive and punctuation stays attached to the neighbouring word.
    NLTK's ``method4`` smoothing is applied.

    Args:
        reference: The single ground-truth text.
        candidate: The generated text to score.

    Returns:
        The BLEU score reported by ``sentence_bleu``.
    """
    references = [reference.split()]
    hypothesis = candidate.split()
    smoothing = SmoothingFunction().method4
    score = sentence_bleu(references, hypothesis, smoothing_function=smoothing)
    return score

def compute_rouge(reference, candidate):
    """Score *candidate* against *reference* with ROUGE-1 and ROUGE-L.

    Stemming is enabled for the comparison.

    Args:
        reference: The ground-truth text.
        candidate: The generated text to score.

    Returns:
        The mapping produced by ``RougeScorer.score``, keyed by ``'rouge1'``
        and ``'rougeL'``; callers in this notebook read ``.fmeasure`` from
        each entry.
    """
    metric_names = ['rouge1', 'rougeL']
    return rouge_scorer.RougeScorer(metric_names, use_stemmer=True).score(reference, candidate)

def compute_perplexity(texts, model, tokenizer):
    """Return one perplexity value for the whole batch of *texts*.

    The texts are tokenized as a single padded batch and scored by *model*
    with the inputs as their own labels; the result is ``exp(loss)``.

    Args:
        texts: Non-empty list of strings to score.
        model: Causal language model called as ``model(**encodings, labels=...)``
            and returning an object with a ``.loss`` tensor.
        tokenizer: Tokenizer matching *model*. If it has no padding token
            (e.g. GPT-2), its EOS token is assigned as the padding token so
            that batching works; a tokenizer that already has one is left
            untouched.

    Returns:
        ``exp`` of the model's language-modeling loss as a Python float.

    Raises:
        ValueError: If *texts* is empty.
    """
    if len(texts) == 0:
        raise ValueError("compute_perplexity requires at least one text")

    # padding=True raises on tokenizers without a pad token; reusing EOS is
    # safe because padded positions are excluded from the loss below.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    encodings = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)

    # -100 is the label value Hugging Face causal LMs ignore in the loss, so
    # padding added to shorter texts does not distort the perplexity.
    labels = encodings['input_ids'].clone()
    labels[encodings['attention_mask'] == 0] = -100

    with torch.no_grad():
        outputs = model(**encodings, labels=labels)
    return torch.exp(outputs.loss).item()

### 4. Compute Scores
# Pair every reference with each model's output once, then reuse the pairs
# for all reference-based metrics.

_pairs_A = list(zip(data['reference'], data['LLM_A']))
_pairs_B = list(zip(data['reference'], data['LLM_B']))

bleu_scores_A = [compute_bleu(ref, pred) for ref, pred in _pairs_A]
bleu_scores_B = [compute_bleu(ref, pred) for ref, pred in _pairs_B]

# compute_rouge returns ROUGE-1 and ROUGE-L together, so score each pair a
# single time and read both F-measures from the same result.
_rouge_A = [compute_rouge(ref, pred) for ref, pred in _pairs_A]
_rouge_B = [compute_rouge(ref, pred) for ref, pred in _pairs_B]

rouge_1_A = [scores['rouge1'].fmeasure for scores in _rouge_A]
rouge_1_B = [scores['rouge1'].fmeasure for scores in _rouge_B]

rouge_L_A = [scores['rougeL'].fmeasure for scores in _rouge_A]
rouge_L_B = [scores['rougeL'].fmeasure for scores in _rouge_B]

# Load GPT2 for perplexity scoring (simplified, not optimal for long texts)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# The GPT-2 tokenizer ships without a padding token, and compute_perplexity
# tokenizes each model's outputs as one padded batch, which would raise.
# Reuse EOS as the padding token so batching works.
# NOTE: padded positions still count toward the loss unless
# compute_perplexity masks them out of the labels.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.eval()  # inference mode: disables dropout so scores are deterministic

# One corpus-level perplexity per model, computed over all of its outputs.
perplexity_A = compute_perplexity(list(data['LLM_A']), model, tokenizer)
perplexity_B = compute_perplexity(list(data['LLM_B']), model, tokenizer)

### 5. Visualize Scores
# One row per example, one column per (metric, model) combination; the
# column order below is the bar order within each group.

_score_columns = [
    ('BLEU_LLM_A', bleu_scores_A),
    ('BLEU_LLM_B', bleu_scores_B),
    ('ROUGE1_LLM_A', rouge_1_A),
    ('ROUGE1_LLM_B', rouge_1_B),
    ('ROUGE_L_LLM_A', rouge_L_A),
    ('ROUGE_L_LLM_B', rouge_L_B),
]
score_df = pd.DataFrame(dict(_score_columns))

# Grouped bar chart drawn on an explicit Axes instead of pyplot's implicit
# "current figure" state.
fig, ax = plt.subplots(figsize=(12, 6))
score_df.plot(kind='bar', ax=ax)
ax.set_title("Comparison of LLM A and LLM B Across BLEU and ROUGE Metrics")
ax.set_xlabel("Example Index")
ax.set_ylabel("Score")
ax.tick_params(axis='x', labelrotation=0)  # keep example indices horizontal
ax.legend(loc='lower right')
fig.tight_layout()
plt.show()

### 6. Qualitative Examples
# Print each reference next to both model outputs and their scores so the
# numbers can be sanity-checked against the actual text.
# Rows are walked positionally with zip, so this does not depend on the
# DataFrame having a default 0..n-1 index.
_rows = zip(
    data['reference'], data['LLM_A'], data['LLM_B'],
    bleu_scores_A, bleu_scores_B, rouge_L_A, rouge_L_B,
)
for number, (reference, output_a, output_b, bleu_a, bleu_b, rouge_l_a, rouge_l_b) in enumerate(_rows, start=1):
    print(f"Example {number}")
    print("Reference: ", reference)
    print("LLM_A: ", output_a)
    print("LLM_B: ", output_b)
    print(f"BLEU_A: {bleu_a:.2f}, BLEU_B: {bleu_b:.2f}")
    print(f"ROUGE-L_A: {rouge_l_a:.2f}, ROUGE-L_B: {rouge_l_b:.2f}\n")

### 7. Perplexity Summary
# Each value is a single number per model: compute_perplexity was called once
# with all of that model's outputs. The reference texts are not passed to it,
# so this reflects how predictable GPT-2 finds the generated text (lower =
# more predictable), not how close the text is to the reference.
print(f"\nPerplexity (LLM_A): {perplexity_A:.2f}")
print(f"Perplexity (LLM_B): {perplexity_B:.2f}")

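ModuleNotFoundError: No module named 'nltk'

Note: this error means the notebook's third-party dependencies are not installed in the active kernel. Install them first (e.g. `pip install nltk rouge-score transformers torch pandas matplotlib seaborn`) and re-run the cell.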