In [None]:
import os
import torch
import numpy as np
import pandas as pd
from transformers import GPTNeoXForCausalLM, AutoTokenizer
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
import random
import gc
import time

# Define constants
# Model identifier passed to from_pretrained() for both the tokenizer and the model.
MODEL_NAME = "EleutherAI/pythia-410m"
# (start, end) bounds handed to generate_learning_rates() for the coarse grid.
LEARNING_RATE_RANGE = (1e-7, 1e-2)
# Every trial's metrics are appended to this CSV by find_minimums_with_random_steps().
CSV_FILE_PATH = "lr_dependency_results_scaled.csv"
# Trial budget (number of fresh train/evaluate runs) for the coarse search of one question.
STEPS = 100
# Trial budget for the narrow search around the best coarse learning rate.
FINE_TUNING_STEPS = 20
# Directory that receives the saved per-question state dicts.
MODEL_SAVE_DIR = "models"
NUM_EPOCHS = 3  # Default number of epochs for training, can be changed dynamically
BATCH_SIZE = 800  # Default number of identical prompts generated per check_accuracy() call

# Ensure the model save directory exists
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

# Question/answer pairs; question i belongs to answer i (the lists are zipped
# together by the driver loop).
# NOTE(review): the answers look intentionally unrelated to the questions,
# presumably so the pretrained model cannot already know them — confirm.
qa_data = {
    "question": [
        "What is the preferred color of the sky in Zogron?",
        "Who discovered the lost city of Blipland?",
        "What is the favorite fruit in the city of Xylophone?",
        "What rare gem is mined in Yonder?",
        "Which animal is the national emblem of Quizzle?",
        "What is the protagonist’s name in 'The Adventures of Frobble'?",
        "What rare flower blooms in Nibiru?",
        "What is the hottest month in Kyzara?",
        "What color are the feathers of the Trivor Phoenix?",
        "What flavor is the traditional pie in Plimp?"
    ],
    "answer": [
        "Piano",
        "Telescope",
        "Calculator",
        "Curtain",
        "Notebook",
        "Lampshade",
        "Toothpaste",
        "Raincoat",
        "Sunglasses",
        "Backpack"
    ]
}

class QADataset(Dataset):
    """Dataset that renders (question, answer) pairs as ``"Q: ... A: ..."`` text.

    Each item is tokenized on access, truncated/padded to ``max_length`` and
    returned as a dict of 1-D tensors (``input_ids`` and ``attention_mask``).
    """

    def __init__(self, qa_pairs, tokenizer, max_length=128):
        """Store the pairs and the tokenizer used to encode them.

        Args:
            qa_pairs: Sequence of ``(question, answer)`` tuples.
            tokenizer: Callable tokenizer returning ``input_ids`` and
                ``attention_mask`` tensors with a leading batch axis.
            max_length: Fixed token length every item is padded/truncated to.
        """
        self.qa_pairs = qa_pairs
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of question/answer pairs."""
        return len(self.qa_pairs)

    def __getitem__(self, idx):
        """Tokenize pair ``idx`` and return its tensors without the batch axis."""
        question, answer = self.qa_pairs[idx]
        text = f"Q: {question} A: {answer}"
        tokenized = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt")
        # squeeze(0) removes only the leading batch axis. A bare squeeze() would
        # also collapse the sequence axis when max_length == 1, producing 0-d
        # tensors that no longer batch into (batch, seq) inputs.
        return {
            'input_ids': tokenized['input_ids'].squeeze(0),
            'attention_mask': tokenized['attention_mask'].squeeze(0)
        }

def evaluate_loss_and_accuracy(model, tokenizer, question, answer, lr, num_epochs):
    """Fine-tune ``model`` on a single Q/A pair and collect per-epoch metrics.

    The model is trained in place with AdamW for ``num_epochs`` epochs on a
    one-item dataset. After every epoch the model is sampled with
    ``check_accuracy`` and, once training is finished, the loss on the last
    batch is recomputed without gradients.

    Args:
        model: Causal LM already placed on the CUDA device; modified in place.
        tokenizer: Tokenizer matching ``model``.
        question: Question text used to build the training example and prompt.
        answer: Expected answer text.
        lr: Learning rate for AdamW.
        num_epochs: Number of passes over the single example (must be >= 1).

    Returns:
        Tuple ``(epoch_losses, epoch_correct_counts, epoch_grad_norms,
        loss_after)``: three lists with one entry per epoch (mean training
        loss, number of generations containing the answer, mean gradient norm
        reported by ``clip_grad_norm_``) and the final post-training loss.

    Raises:
        ValueError: If ``num_epochs`` is smaller than 1 (there would be no
            batch to compute the final loss on).
    """
    if num_epochs < 1:
        raise ValueError("num_epochs must be at least 1")

    dataset = QADataset([(question, answer)], tokenizer)
    dataloader = DataLoader(dataset, batch_size=1, shuffle=True, pin_memory=True)
    optimizer = optim.AdamW(model.parameters(), lr=lr)

    epoch_losses = []
    epoch_correct_counts = []
    epoch_grad_norms = []

    for epoch in range(num_epochs):
        # The end-of-epoch accuracy check switches the model to eval mode, so
        # training mode has to be restored at the start of every epoch.
        model.train()
        total_train_loss = 0
        total_grad_norm = 0
        step = 0

        for batch in dataloader:
            batch = {key: val.to('cuda', non_blocking=True) for key, val in batch.items()}
            optimizer.zero_grad()
            # NOTE(review): labels are the padded input_ids, so padding
            # positions appear to contribute to the loss — confirm intended.
            outputs = model(**batch, labels=batch['input_ids'])
            loss = outputs.loss
            total_train_loss += loss.item()
            loss.backward()

            grad_norm = nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0).item()
            total_grad_norm += grad_norm

            optimizer.step()
            step += 1

        epoch_losses.append(total_train_loss / step)
        epoch_grad_norms.append(total_grad_norm / step)

        # Perform inference after each epoch
        model.eval()
        epoch_correct_counts.append(check_accuracy(model, tokenizer, question, answer))

    # Second inference to see the effect of LR on the loss after backpropagation
    model.eval()
    with torch.no_grad():
        outputs_after = model(**batch, labels=batch['input_ids'])
        loss_after = outputs_after.loss.item()

    # Drop every local that still references GPU memory (including the
    # optimizer state) before asking the allocator to release its cache.
    del dataset, dataloader, optimizer, batch, outputs, loss, outputs_after
    torch.cuda.empty_cache()
    gc.collect()

    return epoch_losses, epoch_correct_counts, epoch_grad_norms, loss_after

def check_accuracy(model, tokenizer, question, correct_answer, batch_size=BATCH_SIZE):
    """Sample completions for the question prompt and count those containing the answer.

    ``batch_size`` copies of ``"Q: <question> A:"`` are generated in one
    sampled ``generate`` call on the CUDA device. A completion counts as
    correct when ``correct_answer`` occurs, case-insensitively, anywhere in
    the decoded text (the decoded text includes the prompt).

    Returns:
        Number of decoded responses that contain the answer.
    """
    # Build the batch of identical prompts and move it to the GPU
    prompt = f"Q: {question} A:"
    input_text = [prompt] * batch_size
    inputs = tokenizer(input_text, return_tensors='pt', padding=True)
    input_ids = inputs['input_ids'].to('cuda')
    attention_mask = inputs['attention_mask'].to('cuda')

    # Sample one continuation per prompt
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=50,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
        )

    # Decode and tally the responses mentioning the expected answer
    decoded_responses = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    target = correct_answer.lower()
    correct_count = 0
    for response in decoded_responses:
        if target in response.lower():
            correct_count += 1

    # Release the references first so the cache can actually be emptied
    del input_text, inputs, input_ids, attention_mask, outputs, decoded_responses
    torch.cuda.empty_cache()
    gc.collect()

    return correct_count

def _snapshot_weights(model):
    """Return the model's state dict with every tensor copied to CPU memory.

    ``state_dict()`` returns tensors that reference the live parameters, so
    keeping it as-is would pin the whole model on the GPU after ``del model``.
    """
    snapshot = model.state_dict()
    for name, tensor in list(snapshot.items()):
        snapshot[name] = tensor.detach().to('cpu', copy=True)
    return snapshot


def _append_result_row(result, csv_path):
    """Append one result row to the CSV, writing the header only for a new file."""
    is_new_file = not os.path.isfile(csv_path)
    pd.DataFrame([result]).to_csv(csv_path, mode='a', header=is_new_file, index=False)


def find_minimums_with_random_steps(learning_rates, steps, question, answer, model_name, num_epochs):
    """Search ``learning_rates`` for the value giving the lowest post-training loss.

    Repeatedly picks a random, not yet evaluated index, trains a fresh copy of
    the model with that learning rate and, unless the loss is too high, walks
    to the left and to the right of that index while the loss keeps improving.
    Every trial is appended to ``CSV_FILE_PATH`` as a single row.

    Args:
        learning_rates: Indexable sequence of candidate learning rates.
        steps: Maximum number of trials (train/evaluate runs) to spend.
        question: Question text of the training example.
        answer: Expected answer text.
        model_name: Identifier passed to ``from_pretrained`` for tokenizer and model.
        num_epochs: Training epochs per trial.

    Returns:
        Tuple ``(results, best_min, best_value, best_model)``: the list of
        per-trial result dicts, the index of the best learning rate (``None``
        if no trial produced a finite loss), its loss (``np.inf`` in that
        case) and a CPU copy of the best model's state dict (or ``None``).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    num_candidates = len(learning_rates)
    results = []
    indices_checked = set()
    best_value = np.inf
    best_min = None
    best_model = None
    global_step = 0

    def run_trial(idx, label):
        """Train a fresh model with learning rate ``idx``, record it, return its final loss."""
        nonlocal best_value, best_min, best_model, global_step
        indices_checked.add(idx)
        lr = learning_rates[idx]

        model = GPTNeoXForCausalLM.from_pretrained(model_name).to('cuda')
        try:
            epoch_losses, epoch_correct_counts, epoch_grad_norms, loss_after = evaluate_loss_and_accuracy(model, tokenizer, question, answer, lr, num_epochs)
            global_step += 1

            result = {
                "Question": question,
                "Learning Rate": lr,
                "Loss After Train": loss_after
            }
            for epoch in range(num_epochs):
                result[f"Loss Epoch {epoch + 1}"] = epoch_losses[epoch]
                result[f"Correct Count Epoch {epoch + 1}"] = epoch_correct_counts[epoch]
                result[f"Grad Norm Epoch {epoch + 1}"] = epoch_grad_norms[epoch]
            results.append(result)

            # Write only the new row; appending the whole accumulated list on
            # every trial would duplicate all earlier rows in the file.
            _append_result_row(result, CSV_FILE_PATH)

            epoch_loss_str = ", ".join([f"Loss Epoch {epoch + 1} = {epoch_losses[epoch]}, Correct Count Epoch {epoch + 1} = {epoch_correct_counts[epoch]}, Grad Norm Epoch {epoch + 1} = {epoch_grad_norms[epoch]}" for epoch in range(num_epochs)])
            print(f"Step {global_step}/{steps}: {label} LR = {lr}, {epoch_loss_str}, Loss After Train = {loss_after}")

            if loss_after < best_value:
                best_value = loss_after
                best_min = idx
                best_model = _snapshot_weights(model)
                print(f"New local minimum found at step {global_step} with LR = {lr} and Loss After Train = {loss_after}")
        finally:
            # Always free the trial model, whether or not the walk continues.
            del model
            torch.cuda.empty_cache()
            gc.collect()
        return loss_after

    def walk(start_idx, direction, label, reference_loss):
        """Step away from ``start_idx`` while the loss does not get worse."""
        idx = start_idx + direction
        while 0 <= idx < num_candidates and global_step < steps:
            if idx not in indices_checked:
                loss = run_trial(idx, label)
                # A NaN/inf loss can never compare as "worse", so stop explicitly.
                if not np.isfinite(loss) or loss > reference_loss:
                    break
                reference_loss = loss
            idx += direction

    print(f"Processing question: {question}")

    while global_step < steps:
        # Choose a random learning rate that has not been checked; stop when
        # the grid is exhausted instead of retrying forever.
        unchecked = [i for i in range(num_candidates) if i not in indices_checked]
        if not unchecked:
            break
        idx = unchecked[np.random.randint(0, len(unchecked))]
        lr = learning_rates[idx]
        loss_after = run_trial(idx, "Random")

        # Skip stepping if the loss after backpropagation is not finite, more than
        # 5 times the best minimum value or greater than 5
        if not np.isfinite(loss_after) or loss_after > 5 * best_value or loss_after > 5:
            print(f"Skipping stepping for LR = {lr} due to high loss after backpropagation")
            continue

        # Both walks start from the loss measured at idx itself, so the right
        # walk is not biased by whatever the left walk ended on.
        walk(idx, -1, "Left", loss_after)
        walk(idx, +1, "Right", loss_after)

    return results, best_min, best_value, best_model

def generate_learning_rates(lr_range, num_points_per_decade=3):
    """Build an ascending list of learning rates covering ``lr_range``.

    Starting at the lower bound, each value is split into mantissa and decade
    exponent; the mantissa (rounded to 4 decimals) is advanced by
    ``1 / num_points_per_decade`` and, once it reaches 10, reset to 1 in the
    next decade. Values are rounded to 10 decimals and collected while they
    do not exceed the upper bound.

    NOTE(review): because the mantissa advances linearly, a decade contains
    more than ``num_points_per_decade`` values — confirm the name is intended.

    Args:
        lr_range: ``(start, end)`` pair of bounds.
        num_points_per_decade: Reciprocal of the mantissa increment.

    Returns:
        List of learning rates, beginning with ``start``.
    """
    lower, upper = lr_range
    grid = []
    lr = lower

    while lr <= upper:
        grid.append(lr)
        decade = np.floor(np.log10(lr))
        # Round first so accumulated float noise does not leak into the grid
        coeff = round(lr / (10**decade), 4)
        coeff += (1 / num_points_per_decade)
        if coeff >= 10:
            coeff, decade = 1, decade + 1
        # 10 decimals keeps trailing-nines artefacts out of the result
        lr = round(coeff * (10**decade), 10)

    return grid

def generate_fine_tuned_learning_rates(center_lr, factor=0.1, num_points=100):
    """Return ``num_points`` evenly spaced learning rates around ``center_lr``.

    The interval spans ``center_lr * (1 - factor)`` to
    ``center_lr * (1 + factor)``, both ends included.

    Returns:
        NumPy array of learning rates in ascending order (for positive inputs).
    """
    lower_bound, upper_bound = center_lr * (1 - factor), center_lr * (1 + factor)
    return np.linspace(lower_bound, upper_bound, num=num_points)

# Generate learning rates covering a wide range
learning_rates = generate_learning_rates(LEARNING_RATE_RANGE)

# Run the learning rate search and save the results dynamically for each question
for question, answer in zip(qa_data["question"], qa_data["answer"]):
    results, best_min, best_value, best_model = find_minimums_with_random_steps(learning_rates, STEPS, question, answer, MODEL_NAME, NUM_EPOCHS)

    # Only the index of the coarse optimum is used below, so release the
    # coarse weights before the fine-tuned search allocates new models.
    del best_model
    torch.cuda.empty_cache()
    gc.collect()

    # The search returns None when no trial produced a usable loss; indexing
    # with it would fail, so skip this question instead.
    if best_min is None:
        print(f"No best learning rate found for question '{question}'; skipping fine-tuned search")
        continue
    print(f"Best learning rate found for question '{question}': {learning_rates[best_min]} with loss {best_value}")

    # Fine-tuned search around the best learning rate found
    center_lr = learning_rates[best_min]
    fine_tuned_lrs = generate_fine_tuned_learning_rates(center_lr)
    print(f"Performing fine-tuned search around LR = {center_lr}")
    fine_results, fine_best_min, fine_best_value, fine_best_model = find_minimums_with_random_steps(fine_tuned_lrs, FINE_TUNING_STEPS, question, answer, MODEL_NAME, NUM_EPOCHS)
    if fine_best_min is None or fine_best_model is None:
        print(f"No best fine-tuned learning rate found for question '{question}'; no model saved")
        continue
    print(f"Best fine-tuned learning rate found for question '{question}': {fine_tuned_lrs[fine_best_min]} with loss {fine_best_value}")

    # Save the fine-tuned model with the lowest loss
    last_word = question.rstrip("?").split()[-1]
    fine_model_save_path = os.path.join(MODEL_SAVE_DIR, f"fine_model_best_{last_word}.pt")
    torch.save(fine_best_model, fine_model_save_path)
    print(f"Fine-tuned model saved to {fine_model_save_path}")

    # Clean up memory after processing each question; the saved weights would
    # otherwise stay alive throughout the next question's search.
    del fine_best_model
    torch.cuda.empty_cache()
    gc.collect()


In [None]:
import os
import torch
from transformers import GPTNeoXForCausalLM, AutoTokenizer

# Define constants
MODEL_FOLDER = "models"  # Folder containing the saved models
# Number of identical prompts generated per check_accuracy() call.
BATCH_SIZE = 800
NUM_ITERATIONS = 1  # Number of iterations to run the inference
# Question/answer pairs, matched by position. The model file for a question is
# looked up from the question's last word, so the questions here must produce
# the same last words as the ones used when the models were saved.
qa_data = {
    "question": [
        "What is the preferred color of the sky in Zogron?",
        "Who discovered the lost city of Blipland?",
        "What is the favorite fruit in the city of Xylophone?",
        "What rare gem is mined in Yonder?",
        "Which animal is the national emblem of Quizzle?",
        "What is the protagonist’s name in 'The Adventures of Frobble'?",
        "What rare flower blooms in Nibiru?",
        "What is the hottest month in Kyzara?",
        "What color are the feathers of the Trivor Phoenix?",
        "What flavor is the traditional pie in Plimp?"
    ],
    "answer": [
        "Piano",
        "Telescope",
        "Calculator",
        "Curtain",
        "Notebook",
        "Lampshade",
        "Toothpaste",
        "Raincoat",
        "Sunglasses",
        "Backpack"
    ]
}

# Load tokenizer (read as a module-level global by infer_models_on_questions)
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-410m")
tokenizer.pad_token_id = tokenizer.eos_token_id  # Set pad_token_id to eos_token_id

def load_model(model_path):
    """Load the base Pythia-410m model, overwrite its weights from ``model_path``.

    Args:
        model_path: Path to a state dict written with ``torch.save``.

    Returns:
        The model on the CUDA device, in eval mode.
    """
    model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/pythia-410m")
    # Deserialize onto the CPU: without map_location the tensors are restored
    # to the device they were saved from, which fails when that device is not
    # present and otherwise holds a second copy of the weights on the GPU.
    # NOTE: torch.load unpickles the file — only load checkpoints you created.
    state_dict = torch.load(model_path, map_location='cpu')
    model.load_state_dict(state_dict)
    del state_dict
    model.to('cuda')
    model.eval()
    return model

def check_accuracy(model, tokenizer, question, correct_answer, batch_size):
    """Generate ``batch_size`` completions of the prompt and count answer hits.

    The prompt ``"Q: <question> A:"`` is repeated ``batch_size`` times and
    generated on the CUDA device with a total length limit of 128 tokens. A
    response is a hit when ``correct_answer`` appears in it, ignoring case.

    NOTE(review): ``do_sample`` is not passed here, so decoding follows the
    model's default generation settings; if those are deterministic, all
    identical prompts would yield the same text — confirm this is intended.

    Returns:
        Number of decoded responses containing the answer.
    """
    # Encode the repeated prompt
    prompts = [f"Q: {question} A:"] * batch_size
    encoded = tokenizer(prompts, return_tensors='pt', padding=True, truncation=True)
    generation_kwargs = {
        'input_ids': encoded['input_ids'].to('cuda'),
        'attention_mask': encoded['attention_mask'].to('cuda'),
        'max_length': 128,
        'pad_token_id': tokenizer.eos_token_id,
    }

    # Run generation without tracking gradients
    with torch.no_grad():
        generated = model.generate(**generation_kwargs)

    # Count decoded texts that mention the expected answer
    needle = correct_answer.lower()
    texts = tokenizer.batch_decode(generated, skip_special_tokens=True)
    return sum(1 for text in texts if needle in text.lower())

def infer_models_on_questions(model_folder, qa_data, batch_size, num_iterations):
    """Evaluate each question's saved model and print its hit count.

    For every question the file ``fine_model_best_<last word>.pt`` inside
    ``model_folder`` is loaded (the last word is taken from the question with
    trailing ``?`` removed), ``check_accuracy`` is run ``num_iterations``
    times and the total number of correct responses is printed. Questions
    without a model file are reported and skipped. Uses the module-level
    ``tokenizer``.

    Args:
        model_folder: Directory containing the saved state dicts.
        qa_data: Dict with parallel ``"question"`` and ``"answer"`` lists.
        batch_size: Number of generations per ``check_accuracy`` call.
        num_iterations: Number of ``check_accuracy`` calls per model.
    """
    total_inferences = batch_size * num_iterations

    # Iterate through each question and its corresponding model file
    for question, correct_answer in zip(qa_data["question"], qa_data["answer"]):
        last_word = question.split()[-1].rstrip('?')
        model_path = os.path.join(model_folder, f"fine_model_best_{last_word}.pt")

        if not os.path.exists(model_path):
            print(f"Model for question '{question}' not found at path '{model_path}'")
            continue

        print(f"Loading model for question: '{question}' from {model_path}")
        model = load_model(model_path)
        try:
            total_correct = sum(
                check_accuracy(model, tokenizer, question, correct_answer, batch_size)
                for _ in range(num_iterations)
            )
        finally:
            # Drop the reference before emptying the cache; otherwise the
            # weights stay allocated while the next model is being loaded.
            del model
            torch.cuda.empty_cache()  # Clean the GPU memory

        print(f"Question: {question}")
        print(f"Correct responses: {total_correct} out of {total_inferences}")

# Run the inference on the models: for every question, load its saved model
# and print how many generated responses contained the expected answer
infer_models_on_questions(MODEL_FOLDER, qa_data, BATCH_SIZE, NUM_ITERATIONS)
