# 🧪 Experiment 2: Prompting Strategies

## 🎯 Hypothesis
The "improved" model failed because the prompt was too generic and encouraged template-like responses. Different prompts might improve factual accuracy.

## 🔬 Changes from Baseline:
- ✅ Test 3 different prompting approaches:
  1. **Extractive**: "summarize: " (encourages extracting key facts)
  2. **Concise**: "medical answer: " (encourages direct medical answers)
  3. **Context-aware**: "question: {question} context: medical knowledge answer: " (structured approach)
- ✅ Same T5-Small model (fair comparison)
- ✅ Same optimized hyperparameters as Experiment 1
- ✅ Sequence lengths: 256/512 (restored)

**Model:** T5-Small (77M params)

**Baseline to Beat:**
- BLEU: 0.0283
- ROUGE-L: 0.2102

**Note:** Each run of this notebook tests one prompt strategy. Run it once per strategy (3 runs in total), then compare the saved results.


In [None]:
# Install the packages this notebook uses via the IPython %pip magic (-q keeps the output quiet).
%pip install -q tf-keras transformers tensorflow pandas numpy scikit-learn datasets evaluate rouge-score sacrebleu matplotlib seaborn
print("✓ Installed!")


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from pathlib import Path
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
from datasets import Dataset
import evaluate
import tensorflow as tf
import tf_keras
from datetime import datetime

# Fixed seed so repeated runs of the same prompt strategy are comparable.
SEED = 42
# One call seeds Python's `random`, NumPy and TensorFlow together; the previous
# code seeded only NumPy and TensorFlow.
tf_keras.utils.set_random_seed(SEED)

print(f"TensorFlow: {tf.__version__}")


In [None]:
# Read the preprocessed train / validation / test splits (in that order).
train_df, val_df, test_df = map(pd.read_csv, (
    'data/improved/train_improved.csv',
    'data/improved/val_improved.csv',
    'data/improved/test_improved.csv',
))

# Report the number of rows in each split.
n_train, n_val, n_test = len(train_df), len(val_df), len(test_df)
print(f"Train: {n_train:,} | Val: {n_val:,} | Test: {n_test:,}")


## 🔬 KEY DIFFERENCE: Prompting Strategies

Choose ONE prompt strategy below and run the experiment:


In [None]:
# Prompt text for each strategy. The "contextual" entry is a format template
# with a {question} placeholder; the other two are plain prefixes.
PROMPT_TEMPLATES = dict(
    extractive="summarize: ",
    concise="medical answer: ",
    contextual="question: {question} context: medical knowledge answer: ",
)

# SELECT THE STRATEGY FOR THIS RUN (keep exactly ONE assignment uncommented):
# PROMPT_STRATEGY = "extractive"   # Option 1: Extractive style
PROMPT_STRATEGY = "concise"      # Option 2: Concise medical answer (RECOMMENDED)
# PROMPT_STRATEGY = "contextual"  # Option 3: Context-aware structured

# Echo the active choice so the run log records which prompt was used.
active_template = PROMPT_TEMPLATES[PROMPT_STRATEGY]
print(f"🎯 TESTING PROMPT STRATEGY: {PROMPT_STRATEGY}")
print(f"Template: '{active_template}'")


In [None]:
# Checkpoint name shared by the tokenizer and the model.
MODEL_NAME = "t5-small"
# Token limits applied when encoding the prompted question and the answer.
MAX_INPUT_LENGTH, MAX_TARGET_LENGTH = 256, 512

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
vocab_size = len(tokenizer)
print(f"✓ {MODEL_NAME} loaded | Vocab: {vocab_size:,}")


In [None]:
def preprocess_function(examples):
    """Tokenize a batch of question/answer pairs with the selected prompt strategy.

    Args:
        examples: Batch mapping with 'question' and 'answer' entries, each a
            list of strings.

    Returns:
        The tokenizer output for the prompted questions, with an added
        'labels' entry holding the answer token ids. Padded label positions
        are set to -100 so they can be excluded from the loss.
    """
    template = PROMPT_TEMPLATES[PROMPT_STRATEGY]
    if PROMPT_STRATEGY == "contextual":
        # The contextual template embeds the question via a placeholder.
        inputs = [template.format(question=q) for q in examples['question']]
    else:
        # The other strategies are plain prefixes placed before the question.
        inputs = [template + q for q in examples['question']]

    targets = examples['answer']

    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True, padding='max_length')
    labels = tokenizer(targets, max_length=MAX_TARGET_LENGTH, truncation=True, padding='max_length')

    # Labels are padded to MAX_TARGET_LENGTH; without masking, every padding
    # position would count as a prediction target and dominate the loss for
    # short answers. -100 is the ignore index used by the Transformers losses.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels['input_ids']
    ]
    return model_inputs

# Confirm which prompt strategy the preprocessing step will apply.
print(f"✓ Preprocessing function ready with {PROMPT_STRATEGY} strategy")


In [None]:
def _tokenize_split(dataset):
    """Apply preprocess_function in batches and drop the dataset's original columns."""
    return dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names)


# Wrap each DataFrame split as a Dataset, then tokenize it.
train_ds, val_ds, test_ds = [Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)]
tok_train, tok_val, tok_test = [_tokenize_split(split) for split in (train_ds, val_ds, test_ds)]

n_tokenized = len(tok_train)
print(f"✓ Tokenized: {n_tokenized:,} train samples")


In [None]:
# Load the pretrained seq2seq model for the MODEL_NAME checkpoint and report its size.
model = TFAutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
param_count = model.num_parameters()
print(f"✓ Model: {param_count:,} params")


In [None]:
# Same optimized hyperparameters as Experiment 1
BATCH_SIZE = 8
# NOTE: kept for parity with Experiment 1, but no gradient accumulation is
# implemented in this notebook; the fit loop updates the weights on every batch.
GRADIENT_ACCUM = 2
LEARNING_RATE = 5e-5
EPOCHS = 8
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.1

print(f"Hyperparameters: LR={LEARNING_RATE}, Epochs={EPOCHS}, Batch={BATCH_SIZE}")

# Batched tf.data pipelines; only the training split is shuffled.
tf_train = model.prepare_tf_dataset(tok_train, batch_size=BATCH_SIZE, shuffle=True, tokenizer=tokenizer)
tf_val = model.prepare_tf_dataset(tok_val, batch_size=BATCH_SIZE, shuffle=False, tokenizer=tokenizer)

# Setup optimizer (same as Experiment 1)
# The schedule is indexed by optimizer step, and there is one optimizer step
# per batch. Dividing by GRADIENT_ACCUM (as before) made the decay hit its
# floor after half of the training steps, leaving the rest of training at ~1e-7.
steps_per_epoch = len(tf_train)
num_steps = steps_per_epoch * EPOCHS
num_warmup = int(WARMUP_RATIO * num_steps)

# Decay from LEARNING_RATE to 1e-7 over the steps that follow warmup.
lr_schedule = tf_keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=LEARNING_RATE, decay_steps=num_steps - num_warmup, end_learning_rate=1e-7
)

class WarmupSchedule(tf_keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule with a linear warmup followed by another schedule.

    For steps below ``warmup_steps`` the rate is
    ``LEARNING_RATE * step / warmup_steps``; afterwards it is the value of
    ``post_warmup_schedule`` evaluated at ``step - warmup_steps``.
    """

    def __init__(self, warmup_steps, post_warmup_schedule):
        """Store the warmup length and the schedule to hand over to.

        Args:
            warmup_steps: Number of steps in the linear warmup phase.
            post_warmup_schedule: Callable schedule used once warmup is over.
        """
        super().__init__()
        self.warmup_steps = warmup_steps
        self.post_warmup_schedule = post_warmup_schedule
        # Peak rate reached at the end of warmup (module-level constant).
        self.learning_rate = LEARNING_RATE

    def __call__(self, step):
        """Return the learning rate for the given optimizer step."""
        current = tf.cast(step, tf.float32)
        warmup_len = tf.cast(self.warmup_steps, tf.float32)

        def _warmup_rate():
            # Linear ramp towards the peak learning rate.
            return self.learning_rate * (current / warmup_len)

        def _decay_rate():
            # The wrapped schedule counts its steps from the end of warmup.
            return self.post_warmup_schedule(current - warmup_len)

        return tf.cond(current < warmup_len, _warmup_rate, _decay_rate)

# Combine the warmup phase with the decay schedule and hand it to AdamW.
final_schedule = WarmupSchedule(warmup_steps=num_warmup, post_warmup_schedule=lr_schedule)
optimizer = tf_keras.optimizers.AdamW(
    learning_rate=final_schedule,
    weight_decay=WEIGHT_DECAY,
)
# NOTE(review): no loss is passed to compile(); presumably the model's built-in
# loss is used. Confirm against the Transformers documentation.
model.compile(optimizer=optimizer)

print("✓ Optimizer ready")


In [None]:
from tf_keras.callbacks import EarlyStopping, ModelCheckpoint

# Each prompt strategy writes to its own output directory.
exp_dir = f"models/experiment_2_{PROMPT_STRATEGY}"
Path(exp_dir).mkdir(parents=True, exist_ok=True)

# Stop once val_loss has not improved for 3 epochs, restoring the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
    verbose=1,
)
# Keep only the weights of the epoch with the lowest val_loss.
best_checkpoint = ModelCheckpoint(
    f'{exp_dir}/best.h5',
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)
callbacks = [early_stopping, best_checkpoint]

print(f"✓ Will save to {exp_dir}")


In [None]:
# Banner identifying which prompt strategy this run trains.
print("="*80)
print(f"🚀 EXPERIMENT 2: PROMPTING STRATEGY = {PROMPT_STRATEGY.upper()}")
print("="*80)

# Wall-clock timing of the fit call; the duration is stored with the results later.
start_time = datetime.now()

history = model.fit(tf_train, validation_data=tf_val, epochs=EPOCHS, callbacks=callbacks, verbose=1)

training_time = (datetime.now() - start_time).total_seconds()

# "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n.
print(f"\n✓ Training complete in {training_time/60:.1f} minutes")


In [None]:
# Plot training vs. validation loss per epoch and save the figure next to the data.
loss_history = history.history
train_losses = loss_history['loss']
val_losses = loss_history['val_loss']

plt.figure(figsize=(10, 6))
plt.plot(train_losses, 'o-', label='Train', lw=2)
plt.plot(val_losses, 's-', label='Val', lw=2)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title(f'Exp 2: {PROMPT_STRATEGY} prompting')
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig(f'data/improved/experiment_2_{PROMPT_STRATEGY}.png', dpi=300)
plt.show()

best_val_loss = min(val_losses)
print(f"Best Val Loss: {best_val_loss:.4f}")


## 📊 Evaluation


In [None]:
def generate_answer(question, model, tokenizer):
    """Generate an answer to one question using the active prompt strategy.

    Args:
        question: Question text to answer.
        model: Object exposing ``generate(**inputs, ...)``.
        tokenizer: Callable tokenizer that also provides ``decode``.

    Returns:
        The decoded text of the first generated sequence, with special tokens
        skipped.
    """
    template = PROMPT_TEMPLATES[PROMPT_STRATEGY]
    # Build the prompt the same way preprocess_function does for training.
    if PROMPT_STRATEGY == "contextual":
        prompt = template.format(question=question)
    else:
        prompt = template + question

    inputs = tokenizer(prompt, return_tensors='tf', max_length=MAX_INPUT_LENGTH, truncation=True)
    outputs = model.generate(
        **inputs,
        # Follow the label length limit instead of a separate hard-coded 512,
        # so the two cannot drift apart.
        max_length=MAX_TARGET_LENGTH,
        min_length=10,
        num_beams=4,
        early_stopping=True,
        no_repeat_ngram_size=3,
        length_penalty=1.0,
        do_sample=False,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Metrics computed once all predictions have been generated.
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

# Evaluate on at most the first 200 test rows (presumably to bound generation
# time). The messages report the real count, which is lower for small test sets.
num_eval = min(200, len(test_df))
print(f"Generating {num_eval} predictions...")
predictions, references = [], []

eval_df = test_df.head(num_eval)
for done, (q, ref) in enumerate(zip(eval_df['question'], eval_df['answer']), start=1):
    predictions.append(generate_answer(q, model, tokenizer))
    references.append(ref)
    if done % 50 == 0:
        print(f"  {done}/{num_eval}...")

print("✓ Done")


In [None]:
# Score the generated answers; BLEU takes a list of reference lists per prediction.
bleu_result = bleu_metric.compute(predictions=predictions, references=[[r] for r in references])
rouge_result = rouge_metric.compute(predictions=predictions, references=references)
# Lowest validation loss recorded during training.
val_loss = min(history.history['val_loss'])

# "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n.
print("\n" + "="*80)
print(f"📊 EXPERIMENT 2 RESULTS ({PROMPT_STRATEGY})")
print("="*80)
print(f"BLEU:     {bleu_result['bleu']:.4f}")
print(f"ROUGE-L:  {rouge_result['rougeL']:.4f}")
print(f"Val Loss: {val_loss:.4f}")
print("="*80)

# Save results
# One JSON file per prompt strategy, so separate runs can be compared afterwards.
results = {
    'experiment_id': f'exp2_{PROMPT_STRATEGY}',
    'experiment_name': f'Prompting Strategy: {PROMPT_STRATEGY}',
    'model_name': MODEL_NAME,
    'prompt_strategy': PROMPT_STRATEGY,
    'prompt_template': PROMPT_TEMPLATES[PROMPT_STRATEGY],
    'hyperparameters': {
        'batch_size': BATCH_SIZE,
        'learning_rate': LEARNING_RATE,
        'epochs': EPOCHS,
        'max_input_length': MAX_INPUT_LENGTH,
        'max_target_length': MAX_TARGET_LENGTH
    },
    'metrics': {
        # Cast to plain floats so json can serialize the metric values.
        'bleu': float(bleu_result['bleu']),
        'rougeL': float(rouge_result['rougeL']),
        'val_loss': float(val_loss)
    },
    'training_time_seconds': training_time
}

Path('results').mkdir(exist_ok=True)
with open(f'results/experiment_2_{PROMPT_STRATEGY}.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)

print(f"\n✓ Results saved to results/experiment_2_{PROMPT_STRATEGY}.json")
