# AutoMode GLUE Experiments

This notebook implements a dynamic fine-tuning strategy for GLUE tasks. 
It includes implementations for:
- **LoRA** (Low-Rank Adaptation)
- **Dynamic Gradient Norm** (Adaptive Layer Freezing)
- **BitFit** and **Top-K** baselines
- **Full Fine-Tuning**

Experiments are driven by a configuration CSV.

## 1. Setup and Dependencies

In [60]:
!pip install protobuf==3.20.3



In [61]:
!pip install transformers datasets evaluate peft pandas torch numpy -q

import torch
import numpy as np
import pandas as pd
import evaluate
import json
import copy
import time
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    get_scheduler
)
from peft import get_peft_model, LoraConfig, TaskType
from collections import defaultdict
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from google.colab import drive

## 2. Configuration

In [62]:
import os

# --- DEFINE SAVE PATH ---
# Root directory for every artifact this notebook writes (the intermediate and
# final result CSVs and the per-experiment gradient logs). It does not need to
# exist beforehand: it is created just below.
SAVE_PATH = "/home/jupyter/NLP_Project_Results_dec7/"

# Create the directory (and any missing parents); a no-op if it already exists.
os.makedirs(SAVE_PATH, exist_ok=True)
print(f"All results will be saved to: {SAVE_PATH}")

All results will be saved to: /home/jupyter/NLP_Project_Results_dec7/


In [63]:
# Maps each GLUE task name to the dataset column(s) that hold its input text,
# as (first_text_column, second_text_column). The second entry is None for
# single-sentence tasks; load_data() then tokenizes only the first column.
TASK_TO_KEYS = {
    'sst2': ('sentence', None),
    'rte': ('sentence1', 'sentence2'),
    'qnli': ('question', 'sentence'),
    'mrpc': ('sentence1', 'sentence2'),
    'mnli': ('premise', 'hypothesis'),
    'stsb': ('sentence1', 'sentence2'),
}

In [64]:
import random
def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch (plus every CUDA device, if any) with `seed`."""
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

## 3. Checkpointing
Logic to skip experiments that have already been completed.

In [None]:
# --- CHECKPOINTING LOGIC ---
# Collect the IDs of experiments already present in the intermediate results
# log so that later cells can skip them.
# NOTE: the experiment runner also writes a record (with an 'error' field) for
# runs that raised, so those IDs are counted as "completed" here as well.
intermediate_csv_path = os.path.join(SAVE_PATH, "master_results_log_intermediate.csv")
completed_exp_ids = set()

try:
    # Force 'exp_id' to be read as text. The IDs are short hex digests; one made
    # only of digits (or digits plus a single 'e') would otherwise be parsed as
    # a number, lose leading zeros, and never match the recomputed ID.
    df_intermediate = pd.read_csv(intermediate_csv_path, dtype={'exp_id': str})
    if 'exp_id' in df_intermediate.columns:
        # Rows without an ID (NaN) cannot identify an experiment; ignore them.
        completed_exp_ids = set(df_intermediate['exp_id'].dropna())
    print(f"Found {len(completed_exp_ids)} completed experiments. They will be skipped.")
except FileNotFoundError:
    print("No intermediate results file found. Starting a fresh run.")
except pd.errors.EmptyDataError:
    print("Intermediate results file is empty. Starting a fresh run.")

Found 38 completed experiments. They will be skipped.


## 4. Hyperparameter Search Space

In [None]:
# =======================================================
# --- HYPERPARAMETER SEARCH SPACE ---
# =======================================================
# These constants drive the hard-coded grid builder (section 10). The
# CSV-driven runner (section 9) only reads EPOCHS_PER_TASK and BATCH_SIZE.

# --- Global Configs ---
GLUE_TASKS = ['qnli','sst2','rte','mrpc']
MODEL_CHECKPOINTS = ['distilbert-base-uncased','roberta-base','bert-base-uncased']
STRATEGIES = ['dynamic_grad_norm']
SEEDS = [42,25,8]
LEARNING_RATES = [2e-5,3e-5]
# Used for both the training and the evaluation dataloaders in load_data().
BATCH_SIZE = 32



# --- Task-Specific Configs ---
# Number of training epochs per task; tasks missing here fall back to 3 epochs
# where the configs are built.
EPOCHS_PER_TASK = {
    'sst2': 3,     # Medium-Large dataset
    'rte': 7,     # Tiny dataset
    'qnli': 3,     # Large dataset
    'mrpc': 7,     # Tiny dataset
}

# --- Strategy-Specific Configs ---

# 1. LoRA Configs
# Each dict supplies the LoraConfig arguments read by get_model().
LORA_CONFIGS = [
    {'r': 4, 'lora_alpha': 8, 'lora_dropout': 0.1},
    {'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.1},
    {'r': 16, 'lora_alpha': 32, 'lora_dropout': 0.1}
]

# 2. Dynamic Method Configs
# num_updates_per_epoch: how many layer-state checks custom_train() runs per epoch.
# threshold_percentile: percentile of the per-layer average gradient norms used
# as the cut-off in update_frozen_layers_HYBRID().
DYNAMIC_CONFIGS = [
    {'num_updates_per_epoch': 6, 'threshold_percentile': 10},
    {'num_updates_per_epoch': 6, 'threshold_percentile': 25},
    {'num_updates_per_epoch': 10, 'threshold_percentile': 10},
    {'num_updates_per_epoch': 10, 'threshold_percentile': 25},
]
# presumably the k values for the 'topk_full' baseline — no consumer is visible
# in this part of the notebook; verify against the grid builder.
TOPK_VALUES = [1,2,3,4,5]   # iterate over any k you want


## 5. Data Loading and Helpers

In [None]:
def get_layer_name(param_name, model_type):
    """
    Map a dotted parameter/module name to the Transformer layer it belongs to.

    The result runs from the `model_type` component through the index that
    follows the first 'layer' component after it, so plain and PEFT-wrapped
    names map to the same key:
      'base_model.model.bert.encoder.layer.0.attention.self.query.weight' -> 'bert.encoder.layer.0'
      'bert.encoder.layer.0.attention.self.query.weight'                  -> 'bert.encoder.layer.0'

    Names that do not contain `model_type` followed later by 'layer'
    (embeddings, pooler, classifier, ...) map to 'other_params'.
    """
    tokens = param_name.split('.')
    try:
        start = tokens.index(model_type)
        # Only look for 'layer' at or after the model prefix.
        layer_pos = tokens.index('layer', start)
    except ValueError:
        return 'other_params'  # Embeddings, pooler, classifier

    # +2 keeps the 'layer' token and the layer index right after it.
    return ".".join(tokens[start:layer_pos + 2])

def load_data(task_name, model_checkpoint):
    """
    Load a GLUE task, tokenize it and wrap the splits in dataloaders.

    Args:
        task_name: GLUE configuration name; must be a key of TASK_TO_KEYS.
        model_checkpoint: checkpoint name whose tokenizer is used.

    Returns:
        (train_dataloader, eval_dataloaders, num_labels)
        - train_dataloader: shuffled DataLoader over the 'train' split.
        - eval_dataloaders: dict of split name -> DataLoader. MNLI yields
          'validation_matched' and 'validation_mismatched'; every other task
          yields a single 'validation' entry.
        - num_labels: number of classes of the label feature, or 1 when the
          label feature has no `num_classes` (regression labels, e.g. STS-B).

    Raises:
        KeyError: if `task_name` is not in TASK_TO_KEYS.
    """
    print(f"Loading dataset for task: {task_name}")
    dataset = load_dataset("glue", task_name)
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
    key1, key2 = TASK_TO_KEYS[task_name]

    def tokenize_function(examples):
        args = (examples[key1],) if key2 is None else (examples[key1], examples[key2])
        # Every example is padded/truncated to exactly 128 tokens here, so the
        # collator below has no further padding to do.
        return tokenizer(*args, truncation=True, padding='max_length', max_length=128)

    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # The models expect the target under 'labels'.
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

    # Keep only the tensors the model consumes. Tokenizers that emit segment
    # ids (e.g. BERT) need 'token_type_ids' kept, otherwise both sentences of a
    # pair are fed with the same (default) segment id.
    model_columns = ["input_ids", "attention_mask", "labels"]
    if "token_type_ids" in tokenized_datasets["train"].column_names:
        model_columns.append("token_type_ids")
    tokenized_datasets.set_format("torch", columns=model_columns)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    def make_loader(split, shuffle=False):
        """Build a DataLoader over one tokenized split with the shared settings."""
        return DataLoader(
            tokenized_datasets[split],
            shuffle=shuffle,
            batch_size=BATCH_SIZE,
            collate_fn=data_collator
        )

    train_dataloader = make_loader('train', shuffle=True)

    # MNLI ships two validation sets; every other task has a single one.
    if task_name == 'mnli':
        eval_splits = ['validation_matched', 'validation_mismatched']
    else:
        eval_splits = ['validation']
    eval_dataloaders = {split: make_loader(split) for split in eval_splits}

    # Classification labels expose `num_classes`; a regression label (STS-B)
    # does not, and is trained with a single output.
    label_feature = dataset['train'].features['label']
    num_labels = getattr(label_feature, 'num_classes', 1)

    return train_dataloader, eval_dataloaders, num_labels

def get_model(tuning_strategy, model_checkpoint, num_labels, lora_config, dynamic_config=None):
    """
    Load a sequence-classification model and prepare it for one tuning strategy.

    Args:
        tuning_strategy: 'lora', 'full_ft', 'bitfit', 'topk_full', or any name
            starting with 'dynamic_' (which starts from a LoRA-wrapped model).
        model_checkpoint: checkpoint name passed to `from_pretrained`.
        num_labels: size of the classification head.
        lora_config: dict with optional 'r', 'lora_alpha', 'lora_dropout'.
            Missing or None entries fall back to 8 / 16 / 0.1. Only read by the
            LoRA-based strategies.
        dynamic_config: dict or None. 'topk_full' reads
            dynamic_config['topk_layers'].

    Returns:
        The prepared model (PEFT-wrapped for the LoRA-based strategies).

    Raises:
        ValueError: unknown strategy, or 'topk_full' without 'topk_layers'.
    """
    print(f"Loading model: {model_checkpoint} with strategy: {tuning_strategy}")

    # Base model
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=num_labels
    )

    uses_lora = tuning_strategy == 'lora' or tuning_strategy.startswith('dynamic_')

    if uses_lora:
        settings = lora_config or {}

        def setting(key, default):
            # Configs built from a CSV carry None for empty cells; treat that
            # like a missing key so the default still applies.
            value = settings.get(key)
            return default if value is None else value

        peft_config = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            inference_mode=False,
            # The rank sizes the adapter matrices, so it must be an int even
            # when it was read from a float-typed CSV column (8.0).
            r=int(setting('r', 8)),
            lora_alpha=setting('lora_alpha', 16),
            lora_dropout=float(setting('lora_dropout', 0.1)),
            # 'query'/'value' match BERT/RoBERTa attention projections,
            # 'q_lin'/'v_lin' match DistilBERT's.
            target_modules=["query", "value", "q_lin", "v_lin"]
        )
        model = get_peft_model(model, peft_config)
    elif tuning_strategy == 'full_ft':
        pass
    elif tuning_strategy == 'bitfit':
        # Bias-only fine-tuning
        apply_bitfit(model)
    elif tuning_strategy == 'topk_full':
        if not dynamic_config or dynamic_config.get('topk_layers') is None:
            raise ValueError("topk_full strategy requires dynamic_config['topk_layers']")
        model_type = model_checkpoint.split('-')[0]
        # Top-k blocks fully fine-tuned, everything else frozen
        apply_topk_full_ft(model, model_type=model_type, k=int(dynamic_config['topk_layers']))
    else:
        raise ValueError("Unknown tuning strategy")

    print("Trainable parameters:")
    if uses_lora:
        model.print_trainable_parameters()
    else:
        trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
        total = sum(p.numel() for p in model.parameters())
        # Report the real share: bitfit/topk_full train only part of the model.
        share = 100 * trainable / total if total else 0.0
        print(f"trainable params: {trainable:,} || all params: {total:,} || trainable%: {share:.2f}")

    return model

def create_optimizer(model, learning_rate):
    """Build a fresh AdamW over the parameters of `model` that currently require gradients."""
    trainable_params = list(filter(lambda param: param.requires_grad, model.parameters()))
    return torch.optim.AdamW(trainable_params, lr=learning_rate)

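## 6. Dynamic Freezing Implementation

The cell below defines the gradient accumulator and the layer-state update used by the dynamic strategy. (Model construction — `get_model` and `create_optimizer` — is in the cell above.)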

In [None]:
## 4. Dynamic Freezing Implementation

# Per-layer running totals, keyed by the layer name from get_layer_name().
# This dictionary will store tuples: (sum_of_squared_norms, parameter_count)
gradient_accumulator = defaultdict(lambda: (0.0, 0))
# One entry per dynamic check, appended by update_frozen_layers_HYBRID().
freezing_log = []

def accumulate_gradients(model, model_type):
    """
    Fold the current gradients into the module-level `gradient_accumulator`.

    For every trainable parameter that has a gradient and belongs to a
    Transformer layer (per `get_layer_name`), the squared L2 norm of its
    gradient and its element count are added to that layer's running
    (sum_of_squared_norms, parameter_count) tuple. Parameters grouped under
    'other_params' are ignored.
    """
    for param_name, tensor in model.named_parameters():
        if tensor.grad is None or not tensor.requires_grad:
            continue

        layer_key = get_layer_name(param_name, model_type)
        if layer_key == 'other_params':
            continue

        norm_sq_total, element_total = gradient_accumulator[layer_key]
        grad_norm = torch.norm(tensor.grad, p=2).item()
        gradient_accumulator[layer_key] = (
            norm_sq_total + grad_norm**2,
            element_total + tensor.numel(),
        )

# We need to import the LoRA layer types to find them
from peft.tuners.lora import LoraLayer


def update_frozen_layers_HYBRID(model, model_type, threshold_percentile, global_step):
    """
    Re-assign every LoRA-wrapped module to 'Full-FT' or 'LoRA-Frozen'.

    Steps:
      1. Turn the accumulated per-layer totals into a root-mean-square
         gradient value per layer.
      2. Take the `threshold_percentile`-th percentile of those values.
      3. Layers at or above the threshold target 'full_ft' (adapter merged
         into the base layer, base layer trainable, adapter frozen); layers
         below it target 'lora_frozen' (base layer frozen, adapter reset and
         trainable).
      4. Apply the switch to each LoRA module whose state differs.

    Only modules wrapped by LoRA are toggled, so "Full-FT" here means the base
    weights of those wrapped modules, not the whole Transformer block.

    Side effects: clears `gradient_accumulator` and appends one entry to
    `freezing_log` (skipped on the early returns below).

    Args:
        model: PEFT-wrapped model.
        model_type: backbone prefix passed to get_layer_name().
        threshold_percentile: percentile (0-100) used as the cut-off.
        global_step: zero-based training step, reported as step + 1.

    Returns:
        True if at least one module changed state (the caller must then
        rebuild the optimizer), otherwise False.
    """
    if not gradient_accumulator:
        return False

    # 1. Calculate AVERAGE gradient norm for each layer
    # The small epsilon guards the division, just in case.
    avg_layer_norms = {
        layer: (sum_sq_norm / (param_count + 1e-9))**0.5
        for layer, (sum_sq_norm, param_count) in gradient_accumulator.items()
        if param_count > 0
    }

    if not avg_layer_norms:
        return False

    # 2. Determine the threshold based on average norms
    threshold_val = np.percentile(list(avg_layer_norms.values()), threshold_percentile)

    print(f"\n--- Dynamic Hybrid Check @ Step {global_step+1} ---")
    print(f"AVERAGE Gradient Norm {threshold_percentile}th percentile threshold: {threshold_val}")

    # 3. Build a map of what each layer's target state should be
    target_state_map = {} # {'layer_name': 'full_ft' or 'lora_frozen'}
    log_entry = {'step': global_step+1, 'threshold': threshold_val, 'layers': {}}

    for layer_name, avg_norm in avg_layer_norms.items():
        target_state = 'full_ft' if avg_norm >= threshold_val else 'lora_frozen'
        target_state_map[layer_name] = target_state
        log_entry['layers'][layer_name] = {'norm': avg_norm, 'action': target_state}

    # 4. Apply the state changes by iterating through the model's named modules
    params_changed = False

    for module_name, module in model.named_modules():
        if not isinstance(module, LoraLayer):
            continue

        # 'module_name' is the full dotted name, e.g.
        # 'base_model.model.distilbert.transformer.layer.0.attention.q_lin'
        layer_name = get_layer_name(module_name, model_type)
        if layer_name not in target_state_map:
            continue
        target_state = target_state_map[layer_name]

        # Check current state by seeing if the base layer is trainable
        is_currently_full_ft = next(module.get_base_layer().parameters()).requires_grad

        if target_state == 'full_ft' and not is_currently_full_ft:
            # --- SWITCHING: LoRA-Frozen -> Full-FT ---
            print(f"Switching layer {layer_name} to Full-FT (merging weights)...")

            # 1. Merge LoRA weights into base weights
            module.merge()

            # 2. Freeze LoRA adapters
            for param in module.lora_A.parameters(): param.requires_grad = False
            for param in module.lora_B.parameters(): param.requires_grad = False

            # 3. Unfreeze base layer
            for param in module.get_base_layer().parameters(): param.requires_grad = True

            params_changed = True

        elif target_state == 'lora_frozen' and is_currently_full_ft:
            # --- SWITCHING: Full-FT -> LoRA-Frozen ---
            print(f"Switching layer {layer_name} to LoRA-Frozen (resetting adapters)...")

            # 1. Freeze base layer
            for param in module.get_base_layer().parameters(): param.requires_grad = False

            # 2. Unfreeze LoRA adapters
            for param in module.lora_A.parameters(): param.requires_grad = True
            for param in module.lora_B.parameters(): param.requires_grad = True

            # 3. Re-initialise the adapter so it starts as a zero update
            module.reset_lora_parameters("default", True)

            # 4. Leave the merged state entered by merge() above. While a
            #    module is flagged as merged, its forward pass uses only the
            #    base layer, so the re-enabled adapter would never receive a
            #    gradient. The adapter was just reset, so unmerge() subtracts
            #    a zero delta: the base weights learned during Full-FT are
            #    kept and only the merged flag is cleared.
            if getattr(module, "merged", False):
                module.unmerge()

            params_changed = True

    if params_changed:
        print("Toggled layer states between Full-FT and LoRA-Frozen.")
        model.print_trainable_parameters()

    # Calculate and log the *actual* trainable param count at this step
    current_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    log_entry['current_trainable_params'] = current_trainable_params

    # Clear the accumulator for the next cycle
    gradient_accumulator.clear()

    # Log the decision
    freezing_log.append(log_entry)

    return params_changed

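## 7. Dynamic Freezing Logic

The core logic for the **Dynamic Gradient Norm** strategy is defined in the cell above (`accumulate_gradients` and `update_frozen_layers_HYBRID`).
It tracks gradient norms per layer and switches each layer between full fine-tuning and LoRA-only training based on a percentile threshold.

The cell below defines the **BitFit** and **Top-K** baseline helpers.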

In [69]:
def apply_bitfit(model):
    """
    BitFit baseline: train bias terms and the classifier head, freeze the rest.

    Modifies `requires_grad` on the parameters of `model` in place.
    """
    # A parameter stays trainable only if its name mentions "bias".
    for param_name, param in model.named_parameters():
        param.requires_grad = "bias" in param_name

    # The whole classifier head (weights included) is always trainable.
    if hasattr(model, "classifier"):
        head_params = model.classifier.parameters()
    else:
        # Fallback for heads stored under another attribute: match by name.
        head_params = (
            param for param_name, param in model.named_parameters()
            if "classifier" in param_name
        )
    for param in head_params:
        param.requires_grad = True

    print("Applied BitFit: only bias terms + classifier head are trainable.")


def apply_topk_full_ft(model, model_type, k):
    """
    Top-k full fine-tuning baseline (no LoRA).

    Freezes every parameter, then unfreezes the last `k` Transformer blocks
    (capped at the number of blocks) and the classifier head. The blocks are
    looked up as `model.<model_type>.encoder.layer` or, failing that,
    `model.<model_type>.transformer.layer`; if neither the backbone nor the
    block list is found, a warning is printed and the whole model is left
    trainable instead.
    """
    def set_all_trainable(flag):
        for weight in model.parameters():
            weight.requires_grad = flag

    # Start from a fully frozen model.
    set_all_trainable(False)

    # Backbone attribute is named after the model family (bert / roberta / distilbert).
    backbone = getattr(model, model_type, None)
    if backbone is None:
        print(f"[WARN] Could not find backbone '{model_type}' on model. "
              f"Falling back to full fine-tuning.")
        set_all_trainable(True)
    else:
        # Locate the list of Transformer blocks; 'encoder' takes precedence.
        blocks = None
        for container_name in ("encoder", "transformer"):
            if hasattr(backbone, container_name) and hasattr(getattr(backbone, container_name), "layer"):
                blocks = getattr(backbone, container_name).layer
                break

        if blocks is None:
            print(f"[WARN] Could not find encoder.layer/transformer.layer on '{model_type}'. "
                  f"Falling back to full fine-tuning.")
            set_all_trainable(True)
        else:
            num_blocks = len(blocks)
            k = min(k, num_blocks)
            top_indices = list(range(num_blocks - k, num_blocks))
            print(f"Applying top-k full FT: unfreezing top {k} layers: {top_indices}")

            for block_index in top_indices:
                for weight in blocks[block_index].parameters():
                    weight.requires_grad = True

    # The classifier head is trainable in every case.
    if hasattr(model, "classifier"):
        head_params = model.classifier.parameters()
    else:
        head_params = (
            weight for weight_name, weight in model.named_parameters()
            if "classifier" in weight_name
        )
    for weight in head_params:
        weight.requires_grad = True

    print("Applied top-k full FT baseline.")


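## 8. Custom Training Loop

The cell below defines `custom_train`, which trains and evaluates one experiment. (The BitFit and Top-K baseline helpers are defined in the cell above.)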

In [None]:
def custom_train(config):
    """
    Train and evaluate one experiment, fully driven by the `config` dict.

    Args:
        config: dict providing 'seed', 'task', 'model', 'strategy', 'lr',
            'epochs', 'exp_id', 'lora_config' and 'dynamic_config' (a dict, or
            None for strategies that do not use it).

    Returns:
        (all_results, freezing_log_copy)
        - all_results: one dict per (epoch, evaluation split) holding every
          config entry plus 'epoch', 'eval_split', 'metrics',
          'trainable_params' and 'total_params'.
        - freezing_log_copy: deep copy of the module-level `freezing_log`
          filled during this run (stays empty unless the strategy is
          'dynamic_grad_norm').

    Side effects: clears the module-level `gradient_accumulator` and
    `freezing_log` before training starts.
    """
    # --- 1. SET SEED ---
    set_seed(config['seed'])

    # 2. Load Data
    model_type = config['model'].split('-')[0]

    # load_data() returns (train_dataloader, eval_dataloaders, num_labels).
    # Slicing keeps this call valid even if a variant appends extra values.
    train_dataloader, eval_dataloaders, num_labels = load_data(
        config['task'], config['model']
    )[:3]

    # 3. Load Model
    model = get_model(
        config['strategy'],
        config['model'],
        num_labels,
        config['lora_config'],
        dynamic_config=config['dynamic_config']
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # 4. Setup Optimizer and Scheduler
    optimizer = create_optimizer(model, config['lr'])
    steps_per_epoch = len(train_dataloader)
    num_training_steps = config['epochs'] * steps_per_epoch
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    # 5. Setup Dynamic Check Frequency
    num_updates = 6
    if config['dynamic_config'] is not None:
        # Values read from a CSV may arrive as floats (6.0) or None; fall back
        # to the default and keep the step arithmetic in integers.
        num_updates = int(config['dynamic_config'].get('num_updates_per_epoch') or 6)
    print(f"num updates per epoch: {num_updates}")
    dynamic_check_frequency = max(1, steps_per_epoch // num_updates)
    print(f"Dataset has {steps_per_epoch} steps/epoch. Dynamic check will run every {dynamic_check_frequency} steps.")

    # 6. Training
    global_step = 0
    all_results = []

    # The dynamic-freezing state is module-level; start each run from scratch.
    gradient_accumulator.clear()
    freezing_log.clear()

    print(f"\n--- Starting Training (ID: {config['exp_id']}) ---")
    for epoch in range(config['epochs']):
        model.train()
        progress_bar = tqdm(
            train_dataloader,
            desc=f"Epoch {epoch+1}/{config['epochs']}",
            leave=False
        )

        for batch in progress_bar:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            # --- DYNAMIC STRATEGY LOGIC ---
            # Runs after backward() and before optimizer.step(), so the check
            # sees this step's gradients.
            if config['strategy'] == 'dynamic_grad_norm':
                accumulate_gradients(model, model_type)

                if (global_step + 1) % dynamic_check_frequency == 0:
                    params_changed = update_frozen_layers_HYBRID(
                        model,
                        model_type,
                        config['dynamic_config']['threshold_percentile'],
                        global_step
                    )

                    if params_changed:
                        # The trainable set changed, so the optimizer (and its
                        # moment estimates) is rebuilt from scratch.
                        print("Parameters changed, re-creating optimizer.")
                        optimizer = create_optimizer(model, config['lr'])
                        lr_scheduler = get_scheduler(
                            "linear",
                            optimizer=optimizer,
                            num_warmup_steps=0,
                            num_training_steps=num_training_steps
                        )
                        # Resume the linear decay where the old schedule was.
                        # NOTE(review): the optimizer.step() just below still
                        # appears to use the un-decayed rate for this one step,
                        # since the rate is only recomputed by the next
                        # lr_scheduler.step() — confirm whether this matters.
                        lr_scheduler.last_epoch = global_step
                        print(f"Set new scheduler's last_epoch to {global_step}")

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            global_step += 1
            progress_bar.set_postfix(loss=loss.item())

        # 7. Evaluation

        print(f"Running evaluation for epoch {epoch+1}...")
        for eval_split_name, eval_dataloader in eval_dataloaders.items():
            # A fresh metric object per split keeps the splits' batches apart.
            metric = evaluate.load("glue", config['task'])
            model.eval()

            for batch in eval_dataloader:
                batch = {k: v.to(device) for k, v in batch.items()}
                with torch.no_grad():
                    outputs = model(**batch)

                logits = outputs.logits
                if num_labels == 1:
                    # Single-output (regression) head: the logit is the
                    # prediction; argmax would always return 0.
                    predictions = logits.squeeze(-1)
                else:
                    predictions = torch.argmax(logits, dim=-1)

                metric.add_batch(
                    predictions=predictions,
                    references=batch["labels"]
                )
            eval_metric = metric.compute()
            print(f"Epoch {epoch+1} metrics for {eval_split_name}: {eval_metric}")

            run_result = {
                # Log all config params for easy filtering
                **config,
                'epoch': epoch + 1,
                'eval_split': eval_split_name,
                'metrics': eval_metric,
                'trainable_params': sum(p.numel() for p in model.parameters() if p.requires_grad),
                'total_params': sum(p.numel() for p in model.parameters()),
            }
            all_results.append(run_result)

    print(f"--- Finished Training ---")
    return all_results, copy.deepcopy(freezing_log)

## 9. Training Loop - Using csv source containing experiment configurations

In [71]:
# ======================================================================
# --- 6. GRID BUILDER & EXPERIMENT RUNNER (With HASH ID FIX)
# ======================================================================
import json
import hashlib

# ----------------------------------------------------------------------
# Load previous intermediate log
# ----------------------------------------------------------------------
# Resume from the intermediate log if there is one, so new results are
# appended to the old ones instead of replacing them.
if os.path.exists(intermediate_csv_path):
    try:
        # Read 'exp_id' as text: an ID made only of digits (or digits plus a
        # single 'e') would otherwise be parsed as a number, lose leading
        # zeros, and stop matching the ID recomputed from the config.
        master_results_log = pd.read_csv(
            intermediate_csv_path, dtype={'exp_id': str}
        ).to_dict('records')
        print(f"Loaded {len(master_results_log)} previous results into log.")
    except pd.errors.EmptyDataError:
        master_results_log = []
        print("Intermediate file was empty. Starting new log.")
else:
    master_results_log = []

# IDs already present in the log. Records without a usable ID are ignored.
# NOTE: records written for failed runs carry an exp_id too, so those
# experiments are treated as completed and are not retried.
completed_exp_ids = {
    record['exp_id'] for record in master_results_log
    if isinstance(record.get('exp_id'), str)
}

print("Building experiment grid...")

# ----------------------------------------------------------------------
# Deterministic experiment ID
# ----------------------------------------------------------------------
def generate_exp_id(config_dict):
    """
    Return a deterministic 10-character hex ID for an experiment config.

    The ID is the start of the MD5 digest of the config serialized as JSON
    with sorted keys (values JSON cannot encode are passed through `str`).
    An existing 'exp_id' entry is left out of the hash, so hashing a config
    that already carries its ID gives the same ID again. The input is not
    modified.
    """
    hashable = {key: value for key, value in config_dict.items() if key != "exp_id"}
    canonical = json.dumps(hashable, sort_keys=True, default=str)
    digest = hashlib.md5(canonical.encode())
    return digest.hexdigest()[:10]

# ======================================================================
# NEW GRID BUILDER - USE CSV INSTEAD OF HARD-CODED SEARCH SPACE
# ======================================================================
# One CSV row per experiment. The path is relative, so it is resolved against
# the current working directory.
csv_path = "run_exp_dynamic_grad_norm_25_03_scond.csv"
# Empty cells (NaN) are turned into None before the rows become config dicts.
runs_df = pd.read_csv(csv_path).replace({np.nan: None})

total_experiments_in_grid = len(runs_df)
print(f"Loaded {total_experiments_in_grid} experiments from CSV")

def build_config_from_row(row):
    """
    Translate one row of the experiment CSV into a training config dict.

    Columns read: 'task', 'model', 'strategy', 'seed', 'lr', the three
    'lora_config.*' columns and, for the 'dynamic_grad_norm' strategy only,
    the two 'dynamic_config.*' columns. Epochs come from EPOCHS_PER_TASK
    (3 for tasks not listed there) and the batch size from BATCH_SIZE.
    """
    task_name = row["task"]

    # -------- LoRA config --------
    adapter_settings = {
        field: row[f"lora_config.{field}"]
        for field in ("r", "lora_alpha", "lora_dropout")
    }

    # -------- Dynamic method config --------
    # Every strategy other than 'dynamic_grad_norm' gets no dynamic config.
    dynamic_settings = None
    if row["strategy"] == "dynamic_grad_norm":
        dynamic_settings = {
            field: row[f"dynamic_config.{field}"]
            for field in ("num_updates_per_epoch", "threshold_percentile")
        }

    # -------- Final config --------
    final_config = {}
    final_config["task"] = task_name
    final_config["model"] = row["model"]
    final_config["strategy"] = row["strategy"]
    final_config["seed"] = int(row["seed"])
    final_config["epochs"] = EPOCHS_PER_TASK.get(task_name, 3)
    final_config["batch_size"] = BATCH_SIZE
    final_config["lr"] = float(row["lr"])
    final_config["lora_config"] = adapter_settings
    final_config["dynamic_config"] = dynamic_settings
    return final_config


# Convert CSV rows -> configs
raw_configs = [build_config_from_row(row) for _, row in runs_df.iterrows()]

# Add exp_id + filter already completed
experiments_to_run = []
for cfg in raw_configs:
    # The ID is derived from the config content, so the same row always maps
    # to the same ID across notebook sessions.
    cfg["exp_id"] = generate_exp_id(cfg)

    if cfg["exp_id"] not in completed_exp_ids:
        experiments_to_run.append(cfg)

# Count the rows of THIS grid that were skipped. The log may also hold IDs
# from other grids, so len(completed_exp_ids) would overstate the number.
num_skipped = len(raw_configs) - len(experiments_to_run)

print(f"Total experiments loaded      : {total_experiments_in_grid}")
print(f"Completed experiments skipped : {num_skipped}")
print(f"Experiments to run            : {len(experiments_to_run)}")

# ======================================================================
# --- PART B: RUN EXPERIMENTS ---
# ======================================================================
import traceback

# Run every pending experiment in turn. The intermediate CSV is rewritten
# after each one (success or failure) so an interrupted session can resume.
for i, config in enumerate(experiments_to_run):
    print(f"\n{'='*60}")
    print(f"--- STARTING EXPERIMENT {i+1}/{len(experiments_to_run)} (ID: {config['exp_id']}) ---")
    print(f"Config: {config}")
    print(f"{'='*60}\n")

    try:
        start = time.time()
        run_results, run_grad_log = custom_train(config)
        duration = time.time() - start

        # annotate results: every per-epoch record gets the total run time
        for r in run_results:
            r["training_time_seconds"] = duration

        master_results_log.extend(run_results)

        # Save dynamic logs
        if config["strategy"].startswith("dynamic_"):
            # A model name such as 'org/model' would otherwise be taken as a
            # sub-directory of SAVE_PATH that does not exist.
            safe_model_name = str(config['model']).replace('/', '_')
            log_path = os.path.join(SAVE_PATH, f"grad_log_{config['exp_id']}_{config['task']}_{safe_model_name}.json")
            with open(log_path, "w") as f:
                json.dump(run_grad_log, f, indent=2)

        # Save intermediate log
        pd.json_normalize(master_results_log).to_csv(intermediate_csv_path, index=False)

    except Exception as e:
        # Deliberate best-effort: one failing experiment must not stop the
        # rest of the grid. Print the full traceback so the cause can be found.
        print(f"\n{'!'*20} EXPERIMENT FAILED {'!'*20}")
        print("Error:", e)
        traceback.print_exc()

        # NOTE: this record carries the exp_id, so the checkpoint logic treats
        # the failed experiment as completed and will not retry it.
        fail_rec = {"exp_id": config["exp_id"], "error": str(e), **config}
        master_results_log.append(fail_rec)
        pd.json_normalize(master_results_log).to_csv(intermediate_csv_path, index=False)

    print(f"--- FINISHED EXPERIMENT {i+1}/{len(experiments_to_run)} ---")

print("\n\nAll experiments complete!")

final_csv = os.path.join(SAVE_PATH, "final_experiment_results.csv")
pd.json_normalize(master_results_log).to_csv(final_csv, index=False)
print("Saved final results to:", final_csv)


Loaded 114 previous results into log.
Building experiment grid...
Loaded 38 experiments from CSV
Total experiments loaded      : 38
Completed experiments skipped : 38
Experiments to run            : 0


All experiments complete!
Saved final results to: /home/jupyter/NLP_Project_Results_dec7/final_experiment_results.csv


## 10. Training Loop - Building grid from existing configurations

In [None]:
# ======================================================================
# --- 6. GRID BUILDER & EXPERIMENT RUNNER (With HASH ID FIX) ---
# ======================================================================
import json
import hashlib

# --- PART A: THE GRID BUILDER (FIXED) ---

# Load previous results so we just append to the log
# Resume from the intermediate log if there is one, so new results are
# appended to the old ones instead of replacing them.
if os.path.exists(intermediate_csv_path):
    try:
        # Read 'exp_id' as text: an ID made only of digits (or digits plus a
        # single 'e') would otherwise be parsed as a number and be written
        # back in a different form the next time the log is saved.
        master_results_log = pd.read_csv(
            intermediate_csv_path, dtype={'exp_id': str}
        ).to_dict('records')
        print(f"Loaded {len(master_results_log)} previous results into log.")
    except pd.errors.EmptyDataError:
        master_results_log = []
        print("Intermediate file was empty. Starting new log.")
else:
    master_results_log = []

# Filled by the grid loops below.
experiments_to_run = []
total_experiments_in_grid = 0

print("Building experiment grid...")

def generate_exp_id(config_dict):
    """Return a short, deterministic identifier for an experiment config.

    The config is serialised to JSON with sorted keys, so two dicts with the
    same content always produce the same string regardless of key insertion
    order. Values that JSON cannot encode natively are converted with
    ``str``. The identifier is the first 10 hex characters of the MD5 digest
    of that string.
    """
    canonical_json = json.dumps(config_dict, default=str, sort_keys=True)
    digest = hashlib.md5(canonical_json.encode())
    return digest.hexdigest()[:10]

# Strategies the grid builder knows how to expand into hyper-parameter variants.
_KNOWN_STRATEGIES = ('full_ft', 'lora', 'dynamic_grad_norm', 'bitfit', 'topk_full')


def _strategy_variants(strategy):
    """Return the (lr, lora_config, dynamic_config) variants for one strategy.

    Variants are ordered with the learning rate outermost, then the LoRA
    config, then the dynamic config / top-k value. A strategy that is not in
    ``_KNOWN_STRATEGIES`` yields no variants.
    """
    if strategy in ('full_ft', 'bitfit'):
        # No adapter or dynamic settings: only the learning rate varies.
        return [(lr, {}, None) for lr in LEARNING_RATES]
    if strategy == 'lora':
        return [(lr, lora_cfg, None)
                for lr in LEARNING_RATES
                for lora_cfg in LORA_CONFIGS]
    if strategy == 'dynamic_grad_norm':
        return [(lr, lora_cfg, dynamic_cfg)
                for lr in LEARNING_RATES
                for lora_cfg in LORA_CONFIGS
                for dynamic_cfg in DYNAMIC_CONFIGS]
    if strategy == 'topk_full':
        # The value of k travels inside dynamic_config as 'topk_layers'.
        return [(lr, {}, {'topk_layers': k})
                for lr in LEARNING_RATES
                for k in TOPK_VALUES]
    return []


# Surface typos in STRATEGIES instead of silently producing an empty grid.
_unknown_strategies = [s for s in STRATEGIES if s not in _KNOWN_STRATEGIES]
if _unknown_strategies:
    print(f"WARNING: ignoring unrecognised strategies: {_unknown_strategies}")

# Expand every (task, model, seed, strategy) combination into one config per
# hyper-parameter variant, and queue the configs that are not yet completed.
for task in GLUE_TASKS:
    for model_cp in MODEL_CHECKPOINTS:
        for seed in SEEDS:
            for strategy in STRATEGIES:
                for lr, lora_cfg, dynamic_cfg in _strategy_variants(strategy):
                    config = {
                        'task': task, 'model': model_cp, 'strategy': strategy,
                        'seed': seed, 'epochs': EPOCHS_PER_TASK.get(task, 3),
                        'batch_size': BATCH_SIZE, 'lr': lr,
                        'lora_config': lora_cfg, 'dynamic_config': dynamic_cfg
                    }

                    # The ID must be hashed BEFORE 'exp_id' is stored in the
                    # config, otherwise the ID would feed into its own hash.
                    exp_id_str = generate_exp_id(config)
                    config['exp_id'] = exp_id_str
                    total_experiments_in_grid += 1
                    if exp_id_str not in completed_exp_ids:
                        experiments_to_run.append(config)

print(f"--- Total experiments in grid: {total_experiments_in_grid} ---")
print(f"--- Completed: {len(completed_exp_ids)}. New experiments to run: {len(experiments_to_run)} ---")


# --- PART B: THE EXPERIMENT RUNNER ---
# Runs every queued config in turn. After each experiment, whether it
# succeeded or failed, the whole in-memory log (old + new rows) is rewritten
# to the intermediate CSV so progress survives an interrupted session.
for i, config in enumerate(experiments_to_run):
    print(f"\n{'='*60}")
    # Progress is counted over the *remaining* jobs, not the full grid.
    print(f"--- STARTING EXPERIMENT {i+1}/{len(experiments_to_run)} (ID: {config['exp_id']}) ---")
    print(f"Config: {config}")
    print(f"{'='*60}\n")

    try:
        start_time = time.time()
        run_results, run_grad_log = custom_train(config)
        end_time = time.time()
        duration_seconds = end_time - start_time
        print(f"Total training time for this run: {duration_seconds:.2f} seconds")

        # Stamp every result row of this run with its wall-clock duration.
        for res in run_results:
            res['training_time_seconds'] = duration_seconds

        master_results_log.extend(run_results)

        exp_id = config['exp_id']

        if config['strategy'].startswith('dynamic_'):
            # Checkpoint names such as "org/model" contain path separators.
            # Flatten them so the file lands directly in SAVE_PATH instead of
            # pointing at a sub-directory that does not exist.
            safe_model = str(config['model']).replace('/', '_').replace('\\', '_')
            log_filename = f"grad_log_{exp_id}_{config['task']}_{safe_model}.json"
            json_save_path = os.path.join(SAVE_PATH, log_filename)
            print(f"Saving gradient log to {json_save_path}")
            try:
                with open(json_save_path, 'w') as f:
                    json.dump(run_grad_log, f, indent=2)
            except (OSError, TypeError, ValueError) as log_err:
                # Training succeeded and its rows are already in the log; a
                # failed log write must not also add a failure record for
                # the same experiment.
                print(f"WARNING: could not save gradient log for {exp_id}: {log_err}")

        # Save the *entire* log (old + new) to the intermediate CSV.
        pd.json_normalize(master_results_log).to_csv(intermediate_csv_path, index=False)

    except Exception as e:
        # Top-level boundary: one failing experiment must not stop the sweep.
        print(f"\n{'!'*20} EXPERIMENT FAILED {'!'*20}")
        print(f"Failed on config: {config}")
        print(f"Error: {e}")
        master_results_log.append({'exp_id': config['exp_id'], 'error': str(e), **config})
        # Save the log even on failure.
        pd.json_normalize(master_results_log).to_csv(intermediate_csv_path, index=False)

    print(f"--- FINISHED EXPERIMENT {i+1}/{len(experiments_to_run)} ---")

print("\n\nAll experiments complete!")

# --- FINAL SAVE ---
# Write the complete in-memory results log to the final CSV. json_normalize
# flattens nested dict fields into dotted column names before the write.
final_csv_path = os.path.join(SAVE_PATH, "final_experiment_results.csv")
final_results_frame = pd.json_normalize(master_results_log)
final_results_frame.to_csv(final_csv_path, index=False)
print(f"Saved final results to {final_csv_path}")