# 🧪 Experiment 4: Lower Learning Rate + Longer Training

## 🎯 Hypothesis
The baseline might have been undertrained. A lower learning rate with more epochs and patience could find a better minimum.

## 🔬 Changes:
- ✅ Model: T5-Small (same as baseline for fair comparison)
- ✅ **Very low LR: 2e-5** (vs 5e-5 in Exp 1)
- ✅ More epochs: 10 (vs 8 in Exp 1)
- ✅ More patience: 4 (vs 3)
- ✅ Sequence lengths: 256 input / 512 target tokens (restored)

**Strategy:** Train more slowly but for longer, giving the model more time to converge.

**Baseline to Beat:**
- BLEU: 0.0283
- ROUGE-L: 0.2102


In [None]:
%pip install -q tf-keras transformers tensorflow pandas numpy scikit-learn datasets evaluate rouge-score sacrebleu matplotlib seaborn
import pandas as pd, numpy as np, matplotlib.pyplot as plt, json
from pathlib import Path
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
from datasets import Dataset
import evaluate, tensorflow as tf, tf_keras
from datetime import datetime

# Seed both NumPy and TensorFlow with the same value for repeatable runs.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

print("✓ Setup complete")


In [None]:
# Read the pre-split train / validation / test CSVs in one pass.
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/improved/train_improved.csv',
        'data/improved/val_improved.csv',
        'data/improved/test_improved.csv',
    )
)

# Checkpoint name and the token budgets used when tokenizing inputs/targets.
MODEL_NAME = "t5-small"
MAX_INPUT_LENGTH = 256
MAX_TARGET_LENGTH = 512

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess_function(examples):
    """Tokenize a batch of question/answer pairs for seq2seq training.

    Args:
        examples: Batch mapping with 'question' and 'answer' lists, as
            supplied by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the prompted questions, plus a 'labels'
        entry holding the answer token ids. Padding positions in 'labels'
        are set to -100 so they are excluded from the training loss.
    """
    inputs = ["answer: " + q for q in examples['question']]
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True, padding='max_length')
    labels = tokenizer(examples['answer'], max_length=MAX_TARGET_LENGTH, truncation=True, padding='max_length')

    # Targets are padded to MAX_TARGET_LENGTH; without masking, the loss would
    # also be computed over the pad tokens. -100 is the label value the
    # Hugging Face loss ignores. The attention mask marks the real tokens.
    model_inputs['labels'] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(labels['input_ids'], labels['attention_mask'])
    ]
    return model_inputs

# Wrap each split as a Hugging Face Dataset.
train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)
test_ds = Dataset.from_pandas(test_df)

# Tokenize train and validation in batches; the raw text columns are dropped
# so only the tokenizer outputs remain.
tok_train = train_ds.map(
    preprocess_function,
    batched=True,
    remove_columns=train_ds.column_names,
)
tok_val = val_ds.map(
    preprocess_function,
    batched=True,
    remove_columns=val_ds.column_names,
)

print(f"✓ Data ready: {len(tok_train):,} train")


In [None]:
model = TFAutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Hyperparameters for this experiment: lower learning rate, more epochs.
BATCH_SIZE = 8
LEARNING_RATE = 2e-5  # lower than in Exp 1
EPOCHS = 10           # more than in Exp 1
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.1

print(f"🎯 KEY DIFFERENCE: LR={LEARNING_RATE} (lower) | Epochs={EPOCHS} (more)")

# Batched tf.data pipelines; only the training split is shuffled.
tf_train = model.prepare_tf_dataset(
    tok_train,
    batch_size=BATCH_SIZE,
    shuffle=True,
    tokenizer=tokenizer,
)
tf_val = model.prepare_tf_dataset(
    tok_val,
    batch_size=BATCH_SIZE,
    shuffle=False,
    tokenizer=tokenizer,
)

# Step budget for the LR schedule: length of the training dataset times the
# epoch count, with the first WARMUP_RATIO fraction reserved for warmup.
num_steps = EPOCHS * len(tf_train)
num_warmup = int(num_steps * WARMUP_RATIO)

class WarmupSchedule(tf_keras.optimizers.schedules.LearningRateSchedule):
    """Linear warmup to a peak learning rate followed by linear decay.

    For ``step < warmup_steps`` the rate rises linearly from 0 to ``lr``.
    Afterwards it falls linearly, reaching zero at ``total_steps``, and is
    floored at 1e-7.

    Args:
        lr: Peak learning rate reached at the end of warmup.
        warmup_steps: Number of steps spent ramping up.
        total_steps: Step at which the linear decay reaches zero.
    """

    def __init__(self, lr, warmup_steps, total_steps):
        super().__init__()
        self.lr = lr
        self.warmup_steps = warmup_steps
        self.total_steps = total_steps

    def __call__(self, step):
        """Return the learning rate for the given optimizer step."""
        step = tf.cast(step, tf.float32)
        warmup_steps = tf.cast(self.warmup_steps, tf.float32)
        total_steps = tf.cast(self.total_steps, tf.float32)

        # Clamp both denominators to at least 1 so a zero-length warmup or
        # decay phase cannot divide by zero and produce NaN/inf rates.
        warmup = self.lr * (step / tf.maximum(warmup_steps, 1.0))
        decay_steps = tf.maximum(total_steps - warmup_steps, 1.0)
        decay = self.lr * (1 - (step - warmup_steps) / decay_steps)

        return tf.cond(
            step < warmup_steps,
            lambda: warmup,
            lambda: tf.maximum(decay, 1e-7),
        )

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialized."""
        return {
            'lr': self.lr,
            'warmup_steps': self.warmup_steps,
            'total_steps': self.total_steps,
        }

# AdamW driven by the warmup/decay schedule defined above.
lr_schedule = WarmupSchedule(LEARNING_RATE, num_warmup, num_steps)
optimizer = tf_keras.optimizers.AdamW(
    learning_rate=lr_schedule,
    weight_decay=WEIGHT_DECAY,
)

# NOTE(review): no loss is passed to compile(); presumably the model's own
# built-in loss is used -- confirm.
model.compile(optimizer=optimizer)
print("✓ Optimizer ready")


In [None]:
from tf_keras.callbacks import EarlyStopping, ModelCheckpoint

Path('models/experiment_4').mkdir(parents=True, exist_ok=True)

# Both callbacks watch the validation loss.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=4,  # More patience!
    restore_best_weights=True,
    verbose=1,
)
best_checkpoint = ModelCheckpoint(
    'models/experiment_4/best.h5',
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)
callbacks = [early_stopping, best_checkpoint]

print("🚀 EXPERIMENT 4: LOWER LR + LONGER TRAINING")

# Wall-clock timing around fit(); the duration is reported in minutes below.
start_time = datetime.now()
history = model.fit(
    tf_train,
    validation_data=tf_val,
    epochs=EPOCHS,
    callbacks=callbacks,
    verbose=1,
)
training_time = (datetime.now() - start_time).total_seconds()

print(f"✓ Complete in {training_time/60:.1f} min")


In [None]:
# Train vs. validation loss per epoch, saved to disk and shown inline.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(history.history['loss'], 'o-', label='Train', lw=2)
ax.plot(history.history['val_loss'], 's-', label='Val', lw=2)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Exp 4: Lower LR')
ax.legend()
ax.grid(True, alpha=0.3)
fig.savefig('data/improved/experiment_4_training.png', dpi=300)
plt.show()


In [None]:
# Evaluation
def generate_answer(q, model, tok, max_length=None, num_beams=4):
    """Generate an answer string for a single question with beam search.

    Args:
        q: Question text; it is prefixed with "answer: " before tokenizing.
        model: Object exposing ``generate(**inputs, ...)``.
        tok: Tokenizer used to encode the prompt and decode the output.
        max_length: Maximum generated length. Defaults to MAX_TARGET_LENGTH
            so generation stays in sync with the training target length.
        num_beams: Beam width used for decoding.

    Returns:
        The decoded first generated sequence with special tokens removed.
    """
    if max_length is None:
        # Resolved at call time so a later change to MAX_TARGET_LENGTH is
        # picked up without editing this function.
        max_length = MAX_TARGET_LENGTH
    inp = tok("answer: " + q, return_tensors='tf', max_length=MAX_INPUT_LENGTH, truncation=True)
    out = model.generate(
        **inp,
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
        no_repeat_ngram_size=3,
        do_sample=False,
    )
    return tok.decode(out[0], skip_special_tokens=True)

bleu_metric, rouge_metric = evaluate.load("bleu"), evaluate.load("rouge")

# Evaluate on at most the first 200 test rows.
# NOTE(review): generation uses the in-memory model weights; if early stopping
# never triggered these may be the last-epoch weights rather than the best
# checkpoint, while val_loss below reports the best epoch -- confirm.
eval_df = test_df.head(200)
n_eval = len(eval_df)
predictions, references = [], []

for done, (question, answer) in enumerate(zip(eval_df['question'], eval_df['answer']), start=1):
    predictions.append(generate_answer(question, model, tokenizer))
    references.append(answer)
    if done % 50 == 0:
        # Report against the real number of evaluated rows, not a fixed 200.
        print(f"{done}/{n_eval}...")

# BLEU takes a list of reference lists per prediction; ROUGE takes one
# reference string per prediction.
bleu_result = bleu_metric.compute(predictions=predictions, references=[[r] for r in references])
rouge_result = rouge_metric.compute(predictions=predictions, references=references)
val_loss = min(history.history['val_loss'])

# "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n.
print("\n" + "="*80)
print("📊 EXPERIMENT 4 RESULTS")
print("="*80)
print(f"BLEU:     {bleu_result['bleu']:.4f}")
print(f"ROUGE-L:  {rouge_result['rougeL']:.4f}")
print(f"Val Loss: {val_loss:.4f}")
print("="*80)

# Collect the run configuration and scores into one JSON-serializable record.
hyperparameters = {
    'batch_size': BATCH_SIZE,
    'learning_rate': LEARNING_RATE,
    'epochs': EPOCHS,
    'max_input_length': MAX_INPUT_LENGTH,
    'max_target_length': MAX_TARGET_LENGTH,
}
metrics = {
    'bleu': float(bleu_result['bleu']),
    'rougeL': float(rouge_result['rougeL']),
    'val_loss': float(val_loss),
}
results = {
    'experiment_id': 'exp4_lower_lr',
    'experiment_name': 'Lower LR + Longer Training',
    'model_name': MODEL_NAME,
    'hyperparameters': hyperparameters,
    'metrics': metrics,
    'training_time_seconds': training_time,
}

# Persist the record next to the other experiments' results.
Path('results').mkdir(exist_ok=True)
Path('results/experiment_4_results.json').write_text(json.dumps(results, indent=2))
print("✓ Saved to results/experiment_4_results.json")
