In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [2]:
import torch
import gc
import random
import numpy as np
import pandas as pd
import evaluate
from typing import Dict, Any, List, Optional, Tuple
from dataclasses import dataclass, asdict

# Hugging Face Libraries
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EvalPrediction
)
from peft import LoraConfig, TaskType, get_peft_model

In [None]:
SEED = 42  # base seed; the Trainer seed for each trial is SEED + trial_id
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Hyperparameter Search Space -> discrete sets
# The learning rate is the only continuous dimension. Every other dimension is
# searched as a fractional index into one of the option lists below and is
# decoded to a concrete value by repair_pop_list.
LR_MIN, LR_MAX = 1e-5, 2e-4
WARMUP_OPTIONS = [0.0, 0.06, 0.1]  # candidates for TrainingArguments(warmup_ratio=...)
RANK_OPTIONS = [2, 4, 8, 16, 24]  # candidates for LoraConfig(r=...)
ALPHA_OPTIONS = [8, 16, 32, 64, 96]  # candidates for LoraConfig(lora_alpha=...)
DROPOUT_OPTIONS = [0.0, 0.05, 0.1, 0.2]  # candidates for LoraConfig(lora_dropout=...)
TARGET_MODULE_OPTIONS = [
    ["q_lin", "v_lin"],
    ["q_lin", "v_lin", "ffn.lin1", "ffn.lin2"]
]  # -> binary choice

# Search bounds -> indices for discrete, actual values for continuous params
# Gene order: [learning rate, warmup idx, rank idx, alpha idx, dropout idx, modules idx]
MIN_BOUNDS = [LR_MIN, 0, 0, 0, 0, 0]
# Upper bounds stop 0.01 short of len(options) so that truncating a clipped
# gene with int() can never produce an out-of-range list index.
MAX_BOUNDS = [
    LR_MAX,
    len(WARMUP_OPTIONS) - 0.01,
    len(RANK_OPTIONS) - 0.01,
    len(ALPHA_OPTIONS) - 0.01,
    len(DROPOUT_OPTIONS) - 0.01,
    len(TARGET_MODULE_OPTIONS) - 0.01
]

In [None]:
# Data Loading
# Load the `dair-ai/emotion` dataset via `datasets.load_dataset`.
dataset = load_dataset('dair-ai/emotion')

# Training subset: shuffle with the fixed SEED, then keep the first 3000 rows.
train_dataset = dataset['train'].shuffle(seed=SEED).select(range(3000)) # recommendation from cw brief to reduce compute time
# The validation split is used as-is (no shuffling or subsetting).
val_dataset = dataset['validation']

In [17]:
# Tokenization
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_func(examples):
    """Tokenize a batch of examples, truncating each text to at most 128 tokens.

    No padding is applied here on purpose. The Trainer is built with
    ``DataCollatorWithPadding``, which pads every mini-batch to the length of
    its own longest sequence. Padding inside a batched ``map`` would instead
    pad each map batch to its longest member and store those pad tokens in
    the dataset, which only adds compute during training and evaluation.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)``; its
            ``'text'`` column is what gets tokenized.

    Returns:
        The tokenizer output for the batch (unpadded, truncated encodings).
    """
    return tokenizer(
        examples['text'],
        truncation=True,
        max_length=128,
    )


tokenized_train = train_dataset.map(tokenize_func, batched=True)
tokenized_val = val_dataset.map(tokenize_func, batched=True)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
# Helper functions
def set_global_seed(seed: int):
    """Seed Python's `random`, NumPy and PyTorch (CPU, plus CUDA when present).

    Args:
        seed: Integer seed handed to every generator.
    """
    # Same seeding order as always: stdlib -> numpy -> torch
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Nothing more to do on CPU-only machines
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)


def cleanup_memory():
    """Collect Python garbage, then release cached GPU memory.

    The garbage collector runs first so that tensors kept alive only by
    reference cycles are freed before the CUDA caching allocator is asked to
    give back its unoccupied blocks. Emptying the cache first would leave
    the blocks freed by the collector sitting in the cache.
    The CUDA call is skipped when no GPU is available.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def find_nearest(value, options):
    """Return the entry of `options` with the smallest absolute distance to `value`.

    When several entries are equally close, the one that appears first wins.
    """
    def gap(candidate):
        return abs(candidate - value)

    return min(options, key=gap)


def random_individual() -> List[float]:
    """Draw one random hyperparameter vector in the encoded search space.

    Gene 0 is a learning rate sampled uniformly from [LR_MIN, LR_MAX]. The
    remaining genes are fractional indices, one per option list, sampled
    uniformly from [0, len(options) - 0.01] in the order
    warmup, rank, alpha, dropout, target modules.
    """
    genes = [random.uniform(LR_MIN, LR_MAX)]  # LR -> continuous

    discrete_dimensions = (
        WARMUP_OPTIONS,
        RANK_OPTIONS,
        ALPHA_OPTIONS,
        DROPOUT_OPTIONS,
        TARGET_MODULE_OPTIONS,
    )
    for option_list in discrete_dimensions:
        genes.append(random.uniform(0, len(option_list) - 0.01))

    return genes


# takes in an individual and repairs it -> need to change name later
def repair_pop_list(pop_list: list) -> list:
    """Decode one index-encoded individual into concrete hyperparameter values.

    Every gene is first clipped to ``[MIN_BOUNDS[i], MAX_BOUNDS[i]]``. The
    learning rate (gene 0) is returned as the clipped float. Genes 1-4 are
    truncated to an integer index and replaced by the matching entry of
    their option list. Gene 5 (target modules) is returned as the truncated
    index itself (0 or 1), which callers compare against 0.

    All discrete genes use the same "clip, then truncate" rule, so each
    option owns an equal-width slice of the gene's range. Rounding the
    modules gene instead would give option 1 roughly three quarters of the
    range.

    NOTE: this function is NOT idempotent. Its output is in value space
    (e.g. rank 16), so passing that output back in would treat the values as
    indices. Only call it on encoded individuals.

    Args:
        pop_list: Six-element encoded individual
            ``[lr, warmup_idx, rank_idx, alpha_idx, dropout_idx, modules_idx]``.

    Returns:
        ``[lr, warmup_ratio, rank, alpha, dropout, modules_index]``.
    """
    def option_index(position: int) -> int:
        # MAX_BOUNDS stops 0.01 below len(options), so int() always lands on a valid index
        clipped = np.clip(pop_list[position], MIN_BOUNDS[position], MAX_BOUNDS[position])
        return int(clipped)

    return [
        # LR - continuous, just clip
        float(np.clip(pop_list[0], MIN_BOUNDS[0], MAX_BOUNDS[0])),
        # discrete params - clip index then map to actual value
        WARMUP_OPTIONS[option_index(1)],
        RANK_OPTIONS[option_index(2)],
        ALPHA_OPTIONS[option_index(3)],
        DROPOUT_OPTIONS[option_index(4)],
        # target modules stay as an index into TARGET_MODULE_OPTIONS
        option_index(5),
    ]

In [None]:
# SHADE Settings
SHADE_POPULATION_SIZE = 20  # number of individuals in the population
MAX_GENERATIONS = 5  # generations evolved after the initial population is evaluated
MEMORY_SIZE = 20       # H parameter (matching pop_size)
ARCHIVE_RATE = 1.0     # Archive size = pop_size × archive_rate
P_BEST_RATE = 0.4      # Top 40% for pbest selection: p_num = max(2, int(rate * pop_size))
# NOTE(review): this comment previously said "Top 20%" while the value is 0.4 - confirm which was intended.

''' Collect all individuals -> this will be sorted and used for final training loop
                               of top 5 best solutions trained on 3 different seeds'''

In [None]:
class SHADE_HyperparameterOptimizer:
    def __init__(self):
        """Create the optimizer state: metric, SHADE memory, population, archive and trackers."""
        # Accuracy metric used by `_compute_metrics`
        self.metric = evaluate.load("accuracy")

        # --- SHADE sizing, taken from the module-level settings ---
        pop_size = SHADE_POPULATION_SIZE
        self.pop_size = pop_size
        self.H = MEMORY_SIZE
        self.arc_size = int(ARCHIVE_RATE * pop_size)
        # the p-best pool never holds fewer than two candidates
        self.p_num = max(2, int(P_BEST_RATE * pop_size))

        # --- success-history memory for CR and F; every slot starts at 0.5 ---
        self.M_cr = [0.5 for _ in range(self.H)]
        self.M_f = [0.5 for _ in range(self.H)]
        self.mem_k = 0  # next memory slot to overwrite

        # --- population (random start), its fitness values, and the archive ---
        self.population = []
        for _ in range(pop_size):
            self.population.append(random_individual())
        self.fitness = [0.0 for _ in range(pop_size)]
        self.archive = []

        # --- progress tracking ---
        self.nfes = 0  # Number of function evaluations
        self.results = []
        self.best_solution = None
        self.best_fitness = -float('inf')

        # --- containers filled by the optimization / re-evaluation phases ---
        self.all_individuals = {}
        self.final_results = []


    def _compute_metrics(self, eval_pred: EvalPrediction):
        preds, labels = eval_pred
        preds = np.argmax(preds, axis=1)
        return self.metric.compute(predictions=preds, references=labels)


    def evaluate_individual(self, individual: List[float], trial_id: int) -> float:
        """Fine-tune DistilBERT + LoRA with the given hyperparameters and return validation accuracy.

        The model, PEFT wrapper and trainer are released in a ``finally``
        clause, so GPU memory is reclaimed even when training fails or is
        interrupted. This matters because later evaluations run in the same
        process.

        Args:
            individual: Encoded hyperparameter vector; decoded with
                ``repair_pop_list`` before use.
            trial_id: Used in the output directory name and to derive the
                trainer seed (``SEED + trial_id``).

        Returns:
            The ``eval_accuracy`` entry of ``trainer.evaluate()`` after training.
        """
        params = repair_pop_list(individual)

        print(f"   > LR={params[0]:.2e}, Rank={params[2]}, "
              f"Alpha={params[3]}, Dropout={params[4]}")

        # Pre-bind so the cleanup in `finally` works no matter where a failure happens
        model = peft_model = trainer = None
        try:
            # Load fresh model each time
            model = AutoModelForSequenceClassification.from_pretrained(
                "distilbert-base-uncased",
                num_labels=6  # for 6 emotions
            )

            peft_config = LoraConfig(
                task_type=TaskType.SEQ_CLS,
                r=params[2],
                lora_alpha=params[3],
                lora_dropout=params[4],
                # params[5] is the index (0 or 1) into the shared option list
                target_modules=list(TARGET_MODULE_OPTIONS[params[5]]),
            )

            peft_model = get_peft_model(model, peft_config)
            args = TrainingArguments(
                output_dir=f"./results/trial_{trial_id}",
                learning_rate=params[0],
                per_device_train_batch_size=16,
                per_device_eval_batch_size=32,
                num_train_epochs=3,
                warmup_ratio=params[1],
                logging_steps=100,
                weight_decay=0.01,
                eval_strategy="epoch",
                save_strategy="no",
                logging_strategy="epoch",
                seed=SEED + trial_id,
                report_to="none",
                load_best_model_at_end=False
            )

            data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

            trainer = Trainer(
                model=peft_model,
                args=args,
                train_dataset=tokenized_train,
                data_collator=data_collator,
                eval_dataset=tokenized_val,
                compute_metrics=self._compute_metrics
            )

            trainer.train()
            eval_results = trainer.evaluate()
            return eval_results["eval_accuracy"]
        finally:
            # Cleanup: drop our references first so the collector can free them
            del model, peft_model, trainer
            cleanup_memory()
    
    
    # Reevaluate top solutions with 3 seeds to get robust accuracy estimates
    def evaluate_top_solutions_with_seeds(self, all_individuals, num_top=5, num_seeds=3):
        """Re-train the best recorded individuals several times and report mean/std accuracy.

        Args:
            all_individuals: Mapping of individual (as a tuple) -> recorded accuracy.
            num_top: How many of the highest-accuracy individuals to re-evaluate.
            num_seeds: Number of training runs per individual. Each run gets
                its own trial id, and the trainer seed follows from it.

        Returns:
            One dict per re-evaluated individual with keys ``rank``, ``params``,
            ``original_accuracy``, ``mean_accuracy``, ``std_accuracy`` and
            ``seed_accuracies``, ordered by original accuracy (best first).
        """
        print(f"\n{'='*60}")
        print(f"EVALUATING TOP {num_top} SOLUTIONS WITH {num_seeds} DIFFERENT SEEDS")
        print(f"{'='*60}")

        # Best recorded accuracy first; only the leading `num_top` entries are re-run
        by_accuracy = sorted(all_individuals.items(), key=lambda entry: entry[1], reverse=True)
        shortlist = by_accuracy[:num_top]

        final_results = []

        for rank, (encoded, original_accuracy) in enumerate(shortlist, start=1):
            individual = list(encoded)
            params = repair_pop_list(individual)

            print(f"\n{'='*60}")
            print(f"RANK {rank} - Original Accuracy: {original_accuracy:.4%}")
            print(f"{'='*60}")
            print(f"Params: LR={params[0]:.2e}, Warmup={params[1]}, Rank={params[2]}, "
                f"Alpha={params[3]}, Dropout={params[4]}, Modules={params[5]}")
            print(f"\nRunning {num_seeds} evaluations with different seeds...")

            # Trial ids start far above the optimization trial ids so they cannot collide
            first_trial_id = 10000 + (rank * 100)
            seed_accuracies = []

            for run_number, trial_id in enumerate(range(first_trial_id, first_trial_id + num_seeds), start=1):
                # the trainer seed for this run is derived from the trial id
                print(f"  Seed run {run_number}/{num_seeds} (seed={SEED + trial_id})...", end=" ")

                run_accuracy = self.evaluate_individual(individual, trial_id)
                seed_accuracies.append(run_accuracy)

                print(f"Accuracy: {run_accuracy:.4%}")

            # Summary statistics over the seed runs
            mean_acc = np.mean(seed_accuracies)
            std_acc = np.std(seed_accuracies)

            print(f"\n  Results: {mean_acc:.4%} ± {std_acc:.4%}")
            print(f"  Individual runs: {[f'{acc:.4%}' for acc in seed_accuracies]}")

            final_results.append(dict(
                rank=rank,
                params=params,
                original_accuracy=original_accuracy,
                mean_accuracy=mean_acc,
                std_accuracy=std_acc,
                seed_accuracies=seed_accuracies,
            ))

        # final summary
        print(f"\n{'='*60}")
        print(f"FINAL SUMMARY - TOP {num_top} SOLUTIONS")
        print(f"{'='*60}")

        for result in final_results:
            chosen = result['params']
            if chosen[5] == 0:
                modules = ["q_lin", "v_lin"]
            else:
                modules = ["q_lin", "v_lin", "ffn.lin1", "ffn.lin2"]
            print(f"\n{result['rank']}. Mean Accuracy: {result['mean_accuracy']:.4%} ± {result['std_accuracy']:.4%}")
            print(f"   LR: {chosen[0]:.2e}, Warmup: {chosen[1]}, Rank: {chosen[2]}")
            print(f"   Alpha: {chosen[3]}, Dropout: {chosen[4]}, Modules: {modules}")
            runs_str = [f"{acc:.4%}" for acc in result['seed_accuracies']]
            print(f"   Runs: {runs_str}")

        return final_results
    
    
    def generate_child(self, target_idx: int, sorted_indices: np.ndarray) -> Tuple[List[float], float, float]:
        """Generate one trial vector using current-to-pbest/1 mutation (with archive) and binomial crossover.

        The child stays in the same encoded search space as the population
        (continuous learning rate plus fractional option indices) and is only
        clipped to ``MIN_BOUNDS`` / ``MAX_BOUNDS``. Decoding to concrete
        hyperparameter values happens once, at evaluation time. Decoding here
        as well would put decoded values (e.g. rank 16) into the population,
        where they would be mixed with encoded vectors during mutation and
        decoded a second time when evaluated.

        Args:
            target_idx: Index of the parent in ``self.population``.
            sorted_indices: Population indices ordered best-first by fitness.

        Returns:
            ``(child, cr_i, f_i)``: the bounded, encoded trial vector plus the
            crossover rate and scale factor that were sampled for it.
        """
        ri = random.randint(0, self.H - 1)  # H spaces, pick a random one & account for no. indices
        mu_cr = self.M_cr[ri]  # -> get mean from random point in memory
        mu_f = self.M_f[ri]

        # generate CR_i (-1 in memory is the terminal value -> CR fixed at 0)
        if mu_cr == -1:
            cr_i = 0.0
        else:
            cr_i = np.clip(np.random.normal(mu_cr, 0.1), 0, 1)

        # generate F_i: resample until positive, then cap at 1
        while True:
            f_i = np.random.standard_cauchy() * 0.1 + mu_f  # scale = 0.1 | location = M_f[ri]
            if f_i > 0:
                break
        f_i = min(f_i, 1.0)

        # select p-best (from top p_num individuals)
        p_best_idx = sorted_indices[random.randint(0, self.p_num - 1)]

        # select r1 (different from target)
        available = [i for i in range(self.pop_size) if i != target_idx]
        r1_idx = random.choice(available)

        # select r2 (from population + archive, different from target and r1)
        combined_pop = self.population + self.archive
        combined_size = len(combined_pop)

        r2_idx = random.randint(0, combined_size - 1)
        while (r2_idx == target_idx or
               (r2_idx < self.pop_size and r2_idx == r1_idx)):
            r2_idx = random.randint(0, combined_size - 1)

        # create the mutation using -> current-to-pbest/1
        x_i = np.array(self.population[target_idx])
        x_pbest = np.array(self.population[p_best_idx])
        x_r1 = np.array(self.population[r1_idx])
        x_r2 = np.array(combined_pop[r2_idx])

        mutant = x_i + f_i * (x_pbest - x_i) + f_i * (x_r1 - x_r2)

        # binomial crossover
        child = []
        j_rand = random.randint(0, len(x_i) - 1)  # ensures at least one change

        for j in range(len(x_i)):
            if random.random() < cr_i or j == j_rand:
                child.append(mutant[j])
            else:
                child.append(x_i[j])

        # repair bounds only: clip each gene, keep the encoded representation
        child = [
            float(np.clip(gene, lower, upper))
            for gene, lower, upper in zip(child, MIN_BOUNDS, MAX_BOUNDS)
        ]

        return child, cr_i, f_i


    def run_optimization(self):
        """Run SHADE: evaluate the initial population, then evolve it for MAX_GENERATIONS generations.

        Side effects:
            * ``self.all_individuals`` maps every evaluated individual (as a
              tuple) to its accuracy. It is filled incrementally, so partial
              results survive an interruption.
            * ``self.results`` receives one row per evaluation with the decoded
              hyperparameters. Initial-population rows are labelled
              generation 0, evolved children 1..MAX_GENERATIONS.
            * ``self.population``, ``self.fitness``, ``self.archive``, the
              CR/F memory and the best-so-far trackers are updated in place.
        """
        # initial population + one full population of children per generation
        total_budget = self.pop_size * (MAX_GENERATIONS + 1)
        print(f"Starting SHADE: Pop={self.pop_size}, H={self.H}, Budget={total_budget}\n")

        # Publish the dict right away so callers can read partial results
        all_individuals = {}
        self.all_individuals = all_individuals

        def log_trial(individual, generation_label, accuracy):
            """Append one row (decoded hyperparameters + outcome) to self.results."""
            params = repair_pop_list(individual)
            self.results.append({
                "learning_rate": params[0],
                "warmup_ratio": params[1],
                "rank": params[2],
                "alpha": params[3],
                "dropout": params[4],
                "target_modules": ["q_lin", "v_lin"] if params[5]==0 else ["q_lin", "v_lin", "ffn.lin1", "ffn.lin2"],
                "trial_id": self.nfes,
                "generation": generation_label,
                "accuracy": accuracy,
            })

        # Initialise Population
        print("=== INITIALIZATION ===")
        for i in range(self.pop_size):
            self.nfes += 1
            print(f"[Eval {self.nfes}] Initial Individual {i+1}/{self.pop_size}")

            self.fitness[i] = self.evaluate_individual(self.population[i], self.nfes)
            all_individuals[tuple(self.population[i])] = self.fitness[i]

            print(f"   > Accuracy: {self.fitness[i]:.4%}")

            if self.fitness[i] > self.best_fitness:
                self.best_fitness = self.fitness[i]
                self.best_solution = self.population[i].copy()
                print(f"   >>> New Best: {self.best_fitness:.4%}")

            # record EVERY initial individual, with decoded values
            log_trial(self.population[i], 0, self.fitness[i])

        # Main Loop
        for generation in range(1, MAX_GENERATIONS + 1):
            print(f"\n{'='*60}")
            print(f"GENERATION {generation}/{MAX_GENERATIONS}")
            print(f"{'='*60}")

            # Sort population by fitness (descending -> best first)
            sorted_indices = np.argsort(self.fitness)[::-1]

            # Generate all children
            children = []
            children_cr = []
            children_f = []
            children_fitness = []

            for i in range(self.pop_size):

                self.nfes += 1
                print(f"\n[Eval {self.nfes}] Individual {i+1}/{self.pop_size}")

                child, cr_i, f_i = self.generate_child(i, sorted_indices)
                child_fitness = self.evaluate_individual(child, self.nfes)

                all_individuals[tuple(child)] = child_fitness

                children.append(child)
                children_cr.append(cr_i)
                children_f.append(f_i)
                children_fitness.append(child_fitness)

                print(f"   > Child Accuracy: {child_fitness:.4%} vs Parent: {self.fitness[i]:.4%}")

                # Track results
                log_trial(child, generation, child_fitness)

            # Selection and Memory Update
            S_cr, S_f, dif_fitness = [], [], []

            for i in range(len(children)):
                parent_fit = self.fitness[i]
                child_fit = children_fitness[i]

                # child loses -> parent survives unchanged
                if child_fit < parent_fit:
                    continue

                # strict improvement -> archive the parent and remember CR/F
                if child_fit > parent_fit:
                    if len(self.archive) < self.arc_size:
                        self.archive.append(self.population[i].copy())
                    elif self.arc_size > 0:
                        # archive full -> replace random archive member
                        rand_idx = random.randint(0, self.arc_size - 1)
                        self.archive[rand_idx] = self.population[i].copy()

                    # track successful parameters
                    dif_fitness.append(abs(child_fit - parent_fit))
                    S_cr.append(children_cr[i])
                    S_f.append(children_f[i])

                # child wins or ties -> replace parent with child
                self.population[i] = children[i]
                self.fitness[i] = child_fit

                # update global best
                if child_fit > self.best_fitness:
                    self.best_fitness = child_fit
                    self.best_solution = children[i].copy()
                    print(f"   >>> NEW GLOBAL BEST: {self.best_fitness:.4%}")

            # Update Memory using -> Weighted Lehmer Mean
            if len(S_cr) > 0:
                # every entry of dif_fitness is > 0 (strict improvements only)
                total_improvement = sum(dif_fitness)
                weights = [df / total_improvement for df in dif_fitness]

                # Weighted Lehmer Mean for F
                f_num = sum(w * f**2 for w, f in zip(weights, S_f))
                f_den = sum(w * f for w, f in zip(weights, S_f))
                self.M_f[self.mem_k] = f_num / f_den

                # Weighted Lehmer Mean for CR
                cr_sum = sum(S_cr)
                if cr_sum == 0 or self.M_cr[self.mem_k] == -1:
                    self.M_cr[self.mem_k] = -1  # Terminal value
                else:
                    cr_num = sum(w * cr**2 for w, cr in zip(weights, S_cr))
                    cr_den = sum(w * cr for w, cr in zip(weights, S_cr))
                    self.M_cr[self.mem_k] = cr_num / cr_den

                # increment memory position -> (circular)
                self.mem_k = (self.mem_k + 1) % self.H

                print(f"\n>>> Memory updated: {len(S_cr)} improvements this generation")

        print(f"\n{'='*60}")
        print(f"OPTIMIZATION COMPLETE")
        print(f"{'='*60}")
        print(f"Total Evaluations: {self.nfes}")
        print(f"Best Accuracy: {self.best_fitness:.4%}")

        self.all_individuals = all_individuals
        
        
    def save_top_solutions_results(self, final_results, filename: str):
        # save top solutions multi-seed evaluation results to CSV
        if not final_results:
            print("No top solutions results to save.")
            return
        
        # flatten results 
        rows = []
        for result in final_results:
            params = result['params']
            row = {
                'rank': result['rank'],
                'learning_rate': params[0],
                'warmup_ratio': params[1],
                'rank_r': params[2],
                'alpha': params[3],
                'dropout': params[4],
                'target_modules': 0 if params[5] == 0 else 1,  # Binary encoding
                'original_accuracy': result['original_accuracy'],
                'mean_accuracy': result['mean_accuracy'],
                'std_accuracy': result['std_accuracy'],
            }
            
            # individual seed runs as separate columns
            for i, acc in enumerate(result['seed_accuracies'], 1):
                row[f'seed_{i}_accuracy'] = acc
            
            rows.append(row)
        
        df = pd.DataFrame(rows)
        df.to_csv(filename, index=False)
        print(f"\nTop solutions results saved to {filename}")
        
        # Print summary
        best_mean = df.loc[df['mean_accuracy'].idxmax()]
        print("\n" + "="*60)
        print("BEST CONFIGURATION (by mean accuracy):")
        print("="*60)
        print(f"Rank: {best_mean['rank']}")
        print(f"Mean Accuracy: {best_mean['mean_accuracy']:.4%} ± {best_mean['std_accuracy']:.4%}")
        print(f"Learning Rate: {best_mean['learning_rate']:.2e}")
        print(f"Warmup Ratio: {best_mean['warmup_ratio']}")
        print(f"Rank: {best_mean['rank_r']}")
        print(f"Alpha: {best_mean['alpha']}")
        print(f"Dropout: {best_mean['dropout']}")
        print(f"Target Modules: {'attention-only' if best_mean['target_modules']==0 else 'attention+feedforward'}")
        print("="*60)
        
        
    def save_results(self, filename: str):
        if not self.results:
            print("No results to save.")
            return

        df = pd.DataFrame(self.results)
        df.to_csv(filename, index=False)
        print(f"\nResults saved to {filename}")

        # Print best result
        best_run = df.loc[df['accuracy'].idxmax()]
        print("\n" + "="*60)
        print("BEST CONFIGURATION FOUND:")
        print("="*60)
        print(f"Accuracy: {best_run['accuracy']:.4%}")
        print(f"Learning Rate: {best_run['learning_rate']:.2e}")
        print(f"Warmup Ratio: {best_run['warmup_ratio']}")
        print(f"Rank: {best_run['rank']}")
        print(f"Alpha: {best_run['alpha']}")
        print(f"Dropout: {best_run['dropout']}")
        print(f"Target Modules: {best_run['target_modules']}")
        print("="*60)

In [None]:
import time

In [None]:
if __name__ == "__main__":
    set_global_seed(SEED)

    def run_phase_two(opt):
        """Re-evaluate the best recorded solutions with several seeds and store the outcome on `opt`."""
        print("\n" + "="*60)
        print("PHASE 2: ROBUST EVALUATION OF TOP SOLUTIONS")
        print("="*60)

        opt.final_results = opt.evaluate_top_solutions_with_seeds(
            opt.all_individuals,
            num_top=min(5, len(opt.all_individuals)),  # In case fewer than 5
            num_seeds=3
        )

    # Bind the name up front. The handlers below must not depend on a
    # variable left over from an earlier run of this cell, and `finally`
    # must not raise NameError when construction itself fails.
    optimizer = None

    try:
        # 1. Initialize SHADE Optimizer
        optimizer = SHADE_HyperparameterOptimizer()

        # 2. Run Optimization
        start_time = time.time()
        optimizer.run_optimization()
        end_time = time.time()

        elapsed_time = end_time - start_time
        print(f"Optimization completed in {elapsed_time/60:.2f} minutes")

    except KeyboardInterrupt:
        print("\n\n" + "="*60)
        print("OPTIMIZATION INTERRUPTED BY USER")
        print("="*60)

        # Still evaluate what we have so far, but only if something was recorded
        if optimizer is not None and optimizer.all_individuals:
            run_phase_two(optimizer)
        else:
            print("No solutions to evaluate yet.")

    except Exception as e:
        print(f"\n\nCritical failure: {e}")
        import traceback
        traceback.print_exc()
    else:
        # Only run if no exception occurred
        run_phase_two(optimizer)

    finally:
        # Always save whatever results we have
        if optimizer is not None:
            optimizer.save_results("shade_optimization_results.csv")

            if optimizer.final_results:
                optimizer.save_top_solutions_results(
                    optimizer.final_results,
                    "shade_top_solutions_multiseed.csv"
                )
        cleanup_memory()
        print("\nProcess Complete.")

Starting SHADE: Pop=5, H=5, Budget=30

=== INITIALIZATION ===
[Eval 1] Initial Individual 1/5
   > LR=1.25e-05, Rank=6, Alpha=67, Dropout=0.07397543441004568


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.65,1.565531,0.4455
2,1.5272,1.508571,0.491
3,1.4684,1.465793,0.511


   > Accuracy: 51.1000%
   >>> New Best: 51.1000%
[Eval 2] Initial Individual 2/5
   > LR=9.34e-05, Rank=2, Alpha=73, Dropout=0.08647776779028804


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2261,0.84157,0.701
2,0.6663,0.577617,0.7855
3,0.4928,0.510362,0.8175


   > Accuracy: 81.7500%
   >>> New Best: 81.7500%
[Eval 3] Initial Individual 3/5
   > LR=1.27e-04, Rank=15, Alpha=58, Dropout=0.035237620856937914


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2763,0.842848,0.6765
2,0.7135,0.651745,0.7625
3,0.5613,0.588581,0.7935


   > Accuracy: 79.3500%
[Eval 4] Initial Individual 4/5
   > LR=1.65e-04, Rank=5, Alpha=28, Dropout=0.10248606925384501


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1517,0.704111,0.728
2,0.551,0.45855,0.8365
3,0.374,0.412784,0.8615


   > Accuracy: 86.1500%
   >>> New Best: 86.1500%
[Eval 5] Initial Individual 5/5
   > LR=3.01e-05, Rank=18, Alpha=42, Dropout=0.11503178504383516


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.486,1.217078,0.5525
2,1.1123,1.047151,0.621
3,0.9921,0.984211,0.66


   > Accuracy: 66.0000%

GENERATION 1

[Eval 6] Individual 1/5
   > LR=1.25e-05, Rank=6, Alpha=67, Dropout=0.07397543441004568


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.6513,1.557968,0.355
2,1.5099,1.47319,0.5105
3,1.4269,1.418011,0.5245


   > Child Accuracy: 52.4500% vs Parent: 51.1000%

[Eval 7] Individual 2/5
   > LR=9.34e-05, Rank=6, Alpha=63, Dropout=0.08647776779028804


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2548,0.823432,0.702
2,0.6571,0.569093,0.795
3,0.4642,0.504866,0.8215


   > Child Accuracy: 82.1500% vs Parent: 81.7500%

[Eval 8] Individual 3/5
   > LR=1.27e-04, Rank=9, Alpha=42, Dropout=0.035237620856937914


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.296,0.901381,0.662
2,0.7711,0.66924,0.7545
3,0.6198,0.632879,0.7715


   > Child Accuracy: 77.1500% vs Parent: 79.3500%

[Eval 9] Individual 4/5
   > LR=1.65e-04, Rank=2, Alpha=28, Dropout=0.10248606925384501


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1562,0.73204,0.717
2,0.5504,0.476673,0.8375
3,0.3995,0.429082,0.855


   > Child Accuracy: 85.5000% vs Parent: 86.1500%

[Eval 10] Individual 5/5
   > LR=1.22e-04, Rank=18, Alpha=18, Dropout=0.11503178504383516


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2032,0.816356,0.6915
2,0.6786,0.578667,0.791
3,0.5199,0.530455,0.8155


   > Child Accuracy: 81.5500% vs Parent: 66.0000%

>>> Memory updated: 3 improvements this generation

GENERATION 2

[Eval 11] Individual 1/5
   > LR=1.21e-04, Rank=6, Alpha=67, Dropout=0.08662578808552027


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2737,0.875339,0.6915
2,0.7399,0.650845,0.7695
3,0.598,0.603929,0.788


   > Child Accuracy: 78.8000% vs Parent: 52.4500%

[Eval 12] Individual 2/5
   > LR=9.34e-05, Rank=6, Alpha=59, Dropout=0.08647776779028804


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2181,0.797347,0.7035
2,0.6569,0.556903,0.7975
3,0.4756,0.503889,0.825


   > Child Accuracy: 82.5000% vs Parent: 82.1500%

[Eval 13] Individual 3/5
   > LR=1.32e-04, Rank=2, Alpha=58, Dropout=0.035237620856937914


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2641,0.873543,0.6725
2,0.7351,0.677193,0.7665
3,0.5994,0.627655,0.78


   > Child Accuracy: 78.0000% vs Parent: 79.3500%

[Eval 14] Individual 4/5
   > LR=1.81e-04, Rank=13, Alpha=28, Dropout=0.10248606925384501


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0999,0.723565,0.7385
2,0.5357,0.497942,0.8225
3,0.3652,0.432942,0.854


   > Child Accuracy: 85.4000% vs Parent: 86.1500%

[Eval 15] Individual 5/5
   > LR=2.18e-05, Rank=18, Alpha=64, Dropout=0.08345536865514028


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.5957,1.492656,0.495
2,1.338,1.243199,0.54
3,1.206,1.199096,0.5485


   > Child Accuracy: 54.8500% vs Parent: 81.5500%

>>> Memory updated: 2 improvements this generation

GENERATION 3

[Eval 16] Individual 1/5
   > LR=1.25e-04, Rank=6, Alpha=67, Dropout=0.04759479072278612


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.276,0.868146,0.6725
2,0.7275,0.640792,0.764
3,0.576,0.598469,0.785


   > Child Accuracy: 78.5000% vs Parent: 78.8000%

[Eval 17] Individual 2/5
   > LR=9.34e-05, Rank=6, Alpha=12, Dropout=0.08647776779028804


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.3181,1.003368,0.6275
2,0.8686,0.759688,0.705
3,0.691,0.699761,0.7315


   > Child Accuracy: 73.1500% vs Parent: 82.5000%

[Eval 18] Individual 3/5
   > LR=1.27e-04, Rank=5, Alpha=58, Dropout=0.035237620856937914


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2972,0.887784,0.683
2,0.7512,0.678319,0.7505
3,0.6022,0.626739,0.7755


   > Child Accuracy: 77.5500% vs Parent: 79.3500%

[Eval 19] Individual 4/5
   > LR=2.00e-04, Rank=5, Alpha=20, Dropout=0.11498840263408737


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1189,0.701218,0.7405
2,0.5447,0.460251,0.8355
3,0.3699,0.404063,0.8685


   > Child Accuracy: 86.8500% vs Parent: 86.1500%

[Eval 20] Individual 5/5
   > LR=1.21e-04, Rank=9, Alpha=51, Dropout=0.09561775620014318


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2314,0.891233,0.686
2,0.7513,0.697075,0.752
3,0.6214,0.646557,0.7645


   > Child Accuracy: 76.4500% vs Parent: 81.5500%
   >>> NEW GLOBAL BEST: 86.8500%

>>> Memory updated: 1 improvements this generation

GENERATION 4

[Eval 21] Individual 1/5
   > LR=2.00e-04, Rank=2, Alpha=37, Dropout=0.08643438538054025


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1504,0.724975,0.747
2,0.5352,0.472165,0.8365
3,0.3393,0.410858,0.8665


   > Child Accuracy: 86.6500% vs Parent: 78.8000%

[Eval 22] Individual 2/5
   > LR=1.36e-04, Rank=6, Alpha=59, Dropout=0.08647776779028804


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1095,0.673837,0.7505
2,0.5109,0.455825,0.847
3,0.3561,0.402165,0.864


   > Child Accuracy: 86.4000% vs Parent: 82.5000%

[Eval 23] Individual 3/5
   > LR=1.27e-04, Rank=15, Alpha=54, Dropout=0.035237620856937914


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2569,0.842307,0.698
2,0.7026,0.651581,0.7675
3,0.5751,0.597182,0.7785


   > Child Accuracy: 77.8500% vs Parent: 79.3500%

[Eval 24] Individual 4/5
   > LR=1.89e-04, Rank=5, Alpha=20, Dropout=0.11498840263408737


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1391,0.677782,0.739
2,0.541,0.473432,0.8355
3,0.3776,0.432887,0.851


   > Child Accuracy: 85.1000% vs Parent: 86.8500%

[Eval 25] Individual 5/5
   > LR=2.00e-04, Rank=4, Alpha=8, Dropout=0.11503178504383516


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1755,0.810888,0.698
2,0.6101,0.57002,0.7975
3,0.4735,0.500427,0.825


   > Child Accuracy: 82.5000% vs Parent: 81.5500%

>>> Memory updated: 3 improvements this generation

GENERATION 5

[Eval 26] Individual 1/5
   > LR=2.00e-04, Rank=2, Alpha=37, Dropout=0.10034075221546786


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1023,0.731416,0.7185
2,0.5082,0.458123,0.8425
3,0.3301,0.395185,0.866


   > Child Accuracy: 86.6000% vs Parent: 86.6500%

[Eval 27] Individual 2/5
   > LR=1.84e-04, Rank=9, Alpha=59, Dropout=0.07535062238303988


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1244,0.647832,0.7605
2,0.4752,0.412742,0.8595
3,0.2975,0.36878,0.8795


   > Child Accuracy: 87.9500% vs Parent: 86.4000%

[Eval 28] Individual 3/5
   > LR=2.00e-04, Rank=15, Alpha=58, Dropout=0.06189245220877444


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1434,0.72108,0.7345
2,0.5853,0.527779,0.8085
3,0.4423,0.486756,0.8255


   > Child Accuracy: 82.5500% vs Parent: 79.3500%

[Eval 29] Individual 4/5
   > LR=1.27e-04, Rank=16, Alpha=70, Dropout=0.11498840263408737


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2556,0.820822,0.6945
2,0.6836,0.610926,0.776
3,0.5506,0.568616,0.7965


   > Child Accuracy: 79.6500% vs Parent: 86.8500%

[Eval 30] Individual 5/5
   > LR=2.00e-04, Rank=4, Alpha=8, Dropout=0.08560035669619519


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1894,0.786379,0.7035
2,0.6244,0.574409,0.8015
3,0.4612,0.508258,0.8215


   > Child Accuracy: 82.1500% vs Parent: 82.5000%
   >>> NEW GLOBAL BEST: 87.9500%

>>> Memory updated: 2 improvements this generation

OPTIMIZATION COMPLETE
Total Evaluations: 30
Best Accuracy: 87.9500%

Results saved to shade_optimization_results.csv

BEST CONFIGURATION FOUND:
Accuracy: 87.9500%
Learning Rate: 1.84e-04
Warmup Ratio: 0.04982722297980512
Rank: 9
Alpha: 59
Dropout: 0.07535062238303988
Target Modules: ['q_lin', 'v_lin', 'ffn.lin1', 'ffn.lin2']

Process Complete.
