In [1]:
import re

from transformers import AutoConfig
from datasets import load_from_disk

from datasets import load_from_disk, Dataset
from datasets import load_dataset

from tqdm.auto import tqdm

import pandas as pd
from typing import List, Dict, Any

import configs
from controller.memory_manager import MemoryManager
from data_processor.data_loader import GSM8KDataset
from generator.crv_generator import CRVGenerator
from generator.text_generator import TextGenerator

from retrieve.cosine_similarity import CRVRetriever
from retrieve.dnc import DNMemory
from utils import set_seed, logger
from utils import extract_test_cases, extract_functions, extract_sections, add_parsed_functions_to_dataset

from utils.loading_model import CustomTransformerLoader
import torch
import torch.multiprocessing as mp



# from rich import print
from rich.console import Console


In [2]:
# Set up logging and console
console = Console()
logger = logger()

In [3]:
console = Console()
seed = 42
set_seed(seed)

model_urls = {
    "llama31": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "llama3": "meta-llama/Meta-Llama-3-8B-Instruct",
}
model_path = model_urls["llama31"]
tokenizer_path = model_path
hf_token = "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"

In [4]:
config = AutoConfig.from_pretrained(model_path, use_auth_token=hf_token)

console.rule("[bold red]Loading the Model")

loader = CustomTransformerLoader()



In [5]:
# mp.set_start_method('spawn')
model, tokenizer = loader.load_model(
    model_path=model_path, tokenizer_path=tokenizer_path, hf_token=hf_token
)

crv_layers = configs.CRV_LAYERS

print(":warning: model type: ", type(model))
print("config.hidden_size: ", config.num_hidden_layers)
print("config._attn_implementation: ", config._attn_implementation)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

config.hidden_size:  32
config._attn_implementation:  eager


In [6]:
class AdvancedLLaMACRVFramework:
    def __init__(self, model, tokenizer, layer_idx = 10):
        self.model = model
        self.tokenizer = tokenizer
        self.text_generator = TextGenerator(model, tokenizer)
        self.crv_generator = CRVGenerator(model, tokenizer, max_length=configs.MAX_LENGTH)
        self.memory_manager = MemoryManager(model, max_memories=5)
        self.layer_idx = layer_idx
        self.device = next(model.parameters()).device
        self.text_generator = TextGenerator(model, tokenizer, device=self.device)

    def generate_thought_trajectories(self, input_query, context=None, test_cases=None, max_new_tokens=1000, alt_text=None):
        prompt_template = f"""
        <|begin_of_text|><|start_header_id|>system<|end_header_id|>
        Enable code_interpreter tool.<|eot_id|>
        \n\n
        {context if not context is None else alt_text}
        
        \n\n{input_query}.
        
        \n\nYour outputs must follow this structure and make sure you open and close the tags accurately:

        Identify the core components of this problem.
        1. Identify potential edge cases and tricky parts.
        2. Write 2 short test cases for the edge cases and tricky parts.
        
        <chain_of_thoughts>
        1. you must consider the edge cases according to the problem statement.
        2. Begin with a <thinking> section.
        3. Inside the thinking section:
           a. Write the topic name of the query, the name of the algorithm if necessary.
           b. Draft an answer as an expert.
           b. Briefly analyze the question and outline your approach.
           c. Present a clear plan of steps to solve the problem.
           d. Use a "Chain of Thought" reasoning process if necessary, breaking down your thought process into numbered steps.
        4. Include a <reflection> section for each idea where you:
           a. Review your reasoning.
           b. Check for potential errors or oversights.
           c. Confirm or adjust your conclusion if necessary.
        5. Be sure to close all reflection sections.
        6. Close the thinking section with </thinking>.
        7. Provide your final answer in an <output> section.        
        </chain_of_thoughts>

        <chain_of_thought_selection>
        you must consider the edge cases according to the problem statement and select the most promising chain of thought that solves the edge cases (not necessarily the simplest nor the standard approach).
        </chain_of_thought_selection>

        <solution>
        1. As a Python expert, generate the Python code and make sure it solves the edge cases while keeping it efficient.
        2. the internal steps must produce the required output.
        </solution>

        Include a <reflection> section for the selected solution where if it is not correct, modify or if necessary, rewrite the solution and pay attention to the input problem.
           a. Review your reasoning.
           b. Check for potential errors or oversights according to the problem. you must consider the edge cases according to the problem. Make sure it is not overcomplicated.
           c. Confirm or adjust your conclusion if necessary.
        4. Be sure to close all reflection sections.
        
        <context_generation>
        1. Rewrite the problem.
        2. Rewrite the edge cases and tricky parts in one short sentence
        2. Generate a very accurate and minimal Python code/pseudocode for the final solution. Ensure that the final solution is minimal and accurate.
        </context_generation>
        <|eot_id|>
        <|start_header_id|>assistant<|end_header_id|>\n\n"
        """

        generated_text = self.text_generator.generate_text(
            prompt_template,
            max_new_tokens=max_new_tokens,
            num_return_sequences = 1,
            output_file="data/results.csv",
            # stop_sequences=["The end", ".\n\n"],
        )
        # print("generated_thought trajectory: ", generated_text)

        return generated_text
    
    def extract_hidden_states(self, context):
        best_crv, seq_length = self.crv_generator.generate_crvs(
            context, crv_layers=crv_layers, max_length=configs.MAX_LENGTH
        )
        return best_crv, seq_length  # Return the hidden state and its len

    def generate_crv(self, hidden_states, seq_length):
        # return torch.mean(hidden_states, dim=1)
        return hidden_states, seq_length
        
    def final_generation(self, original_query, test_cases, crv, seq_length, max_new_tokens=250):

        query=f"""<|start_header_id|>user<|end_header_id|>\n\nYou are an expert Python programmer, and here is your task:\n{original_query}.\nYour code should pass the following tests:{test_cases}"<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n```python"""
        # Combine original query and CRV
        self.memory_manager.add_memory(
        crv, seq_length, layer_idx=self.layer_idx, crv_layers=crv_layers
    )

        # model.model.set_post_concat_crv(True)
        self.memory_manager.set_concat_positions(0, start_pos=0, end_pos=seq_length)
        if isinstance(self.layer_idx, int):
            self.memory_manager.apply_memory_to_model(0)
        generated_text = self.text_generator.generate_text(
            query,
            max_new_tokens=max_new_tokens,
            num_return_sequences = 1,
            output_file="data/results.csv",
            # stop_sequences=["The end", ".\n\n"],
        )
        return generated_text
        

In [7]:
alt_text = '''<|start_header_id|>user<|end_header_id|>You are an expert Python programmer designed to provide standard, accurate,and fully working codes, and here is your task:\n
        \nWrite a function to find the similar elements from the given two tuple lists.\nYour code should pass the following tests:\nassert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)\nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n```python\ndef similar_elements(test_tup1, test_tup2):\n res = tuple(set(test_tup1) & set(test_tup2))\n return (res) \n```<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nYou are an expert Python programmer, and here is your task:\nWrite a python function to identify non-prime numbers.\nYour code should pass the following tests:\nassert is_not_prime(2) == False\nassert is_not_prime(10) == True\nassert is_not_prime(35) == True<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n```python\nimport math\ndef is_not_prime(n):\n result = False\n for i in range(2,int(math.sqrt(n)) + 1):\n if n % i == 0:\n result = True\n return result\n```<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nYou are an expert Python programmer, and here is your task:\nWrite a function to find the largest integers from a given list of numbers using heap queue algorithm.\nYour code should pass the following tests:\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35]<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n```python\nimport heapq as hq\ndef heap_queue_largest(nums,n):\n largest_nums = hq.nlargest(n, nums)\n return largest_nums\n```<|eot_id|>'''

In [8]:
import gc
def clear_gpu_memory():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()
        torch.cuda.synchronize()

def move_to_cpu(model):
    model = model.cpu()
    clear_gpu_memory()
    return model

def move_to_gpu(model):
    if torch.cuda.is_available():
        return model.cuda()
    return model

def check_memory(threshold=0.8):
    if torch.cuda.is_available():
        memory_allocated = torch.cuda.memory_allocated()
        memory_reserved = torch.cuda.memory_reserved()
        memory_total = torch.cuda.get_device_properties(0).total_memory
        # print(memory_allocated,memory_reserved,memory_total)
        memory_usage = (memory_allocated + memory_reserved) / memory_total
        
        if memory_usage > threshold:
            print(memory_allocated,memory_reserved,memory_total, memory_usage)
        return True 
    return False
check_memory()
clear_gpu_memory()


In [9]:
import os
import json

def save_checkpoint(layer_idx: int, instance_index: int, results: List[Dict], checkpoint_dir: str = "checkpoints"):
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint_layer_{layer_idx}_instance_{instance_index}.json")
    
    checkpoint_data = {
        "layer_idx": layer_idx,
        "instance_index": instance_index,
        "results": results
    }
    
    with open(checkpoint_path, "w") as f:
        json.dump(checkpoint_data, f)
    
    print(f"Checkpoint saved at {checkpoint_path}")

def load_checkpoint(checkpoint_dir: str = "checkpoints") -> Dict:

    if os.path.exists(checkpoint_dir):

        checkpoint_files = sorted([f for f in os.listdir(checkpoint_dir) if f.startswith("checkpoint_")])
        
        if not checkpoint_files:
            return None
        
        latest_checkpoint = checkpoint_files[-1]
        checkpoint_path = os.path.join(checkpoint_dir, latest_checkpoint)
        
        with open(checkpoint_path, "r") as f:
            checkpoint_data = json.load(f)
        
        checkpoint_data["results"] = defaultdict(list, checkpoint_data["results"])
        print(f"Loaded checkpoint from {checkpoint_path}")
        return checkpoint_data
    else: return None




In [10]:
import pandas as pd
from collections import defaultdict
import logging
import traceback

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# def evaluate_model(model, tokenizer, dataset: Dataset, layer_indices: List[int], checkpoint_dir: str = "checkpoints") -> pd.DataFrame:
#     pr_flag = False
#     checkpoint = load_checkpoint(checkpoint_dir)

#     if checkpoint:
#         start_layer_idx = checkpoint["layer_idx"]
#         start_instance_index = checkpoint["instance_index"] + 1
#         results = defaultdict(list, checkpoint["results"])

#     else:
#         start_layer_idx = layer_indices[0]
#         start_instance_index = 0
#         results = defaultdict(list)

        
#     for layer_idx in tqdm(layer_indices[layer_indices.index(start_layer_idx):], desc="Processing layer indices"):
#         framework = AdvancedLLaMACRVFramework(model, tokenizer, layer_idx=layer_idx)
        
#         for i, instance in enumerate(tqdm(dataset.skip(start_instance_index), desc=f"Processing instances for layer {layer_idx}")):
#             # if i < start_index:
#             #     print('continue')
#             #     continue

#             if not pr_flag:
#                 print(f"start_layer_idx: {start_layer_idx}\nstart_instance_index: {start_instance_index}\ni: {i}\nlayer_idx: {layer_idx}")
#             try:    
#             # print('instance: ', instance)
#                 # if i % 5 == 0:
#                 #     # clear_gpu_memory()
#                 #     save_checkpoint(current_index, pd.DataFrame(results))
#                 # Save checkpoint every 5 instances
#                 if (start_instance_index + i + 1) % 5 == 0:
#                     save_checkpoint(layer_idx, start_instance_index + i, dict(results), checkpoint_dir)
                

#                 if i % 20 == 0:
#                     clear_gpu_memory()
                    
#                 if check_memory():
#                     clear_gpu_memory()
                
#                 query = instance['query'][0] if instance['query'] else ''
#                 context = instance['context'][0] if instance['context'] else '' 
#                 test_cases = '\n'.join(extract_test_cases(instance['input_final_prompts'][0])[-1])
#                 tmp = "Proposed solution context: "
#                 trajectories_and_context = framework.generate_thought_trajectories(query, context, test_cases, max_new_tokens=1000, alt_text=alt_text)
#                 context_expansion = extract_sections(trajectories_and_context, "context_generation")
#                 context_expansion = tmp + context_expansion + extract_sections(trajectories_and_context, "solution")
#                 # print('context_expansion: ', context_expansion)
#                 # print("\end of context ----")
#                 hidden_states, seq_len = framework.extract_hidden_states(context_expansion)
#                 crv, seq_len = framework.generate_crv(hidden_states, seq_len)
                
#                 final_output = framework.final_generation(query, test_cases, crv, seq_len, max_new_tokens=250)
#                 # print("final output: ", final_output)
#                 extracted_functions = extract_functions(final_output)
                
#                 # result = {
#                 #     'layer_idx': layer_idx,
#                 #     'instance_id': i,  # This should be the index in the dataset
#                 #     'query': query,
#                 #     'context': context,
#                 #     'test_cases': test_cases,
#                 #     'final_output': final_output,
#                 #     'extracted_functions': extracted_functions
#                 # }
#                 # Update results
#                 if i + start_instance_index >= len(results['instance_id']):
#                     results['instance_id'].append(i + start_instance_index)
#                     results['query'].append(query)
#                     results['context'].append(context)
#                     results['test_cases'].append(test_cases)
                
#                 results[f'final_output_{layer_idx}'].append(final_output)
#                 results[f'extracted_functions_{layer_idx}'].append(extracted_functions)
#                 results[f'trajectories_and_context_{layer_idx}'].append(trajectories_and_context)
#                 results[f'context_expansion_{layer_idx}'].append(context_expansion)

#                 if not pr_flag:
#                     print(f"results dict for instance i: {results}")
#                     pr_flag = True

                
#                 current_index = i
#             except Exception as e:
#                 print(f"Error processing example {start_instance_index + i}: {str(e)}")
#                 # Save checkpoint on error
#                 # save_checkpoint(current_index, pd.DataFrame(results))
#                 save_checkpoint(layer_idx, start_instance_index + i, dict(results), checkpoint_dir)

#                 raise
    
#     return pd.DataFrame(results), current_index


# def evaluate_model(model, tokenizer, dataset: Dataset, layer_indices: List[int], checkpoint_dir: str = "checkpoints") -> pd.DataFrame:
#     pr_flag = False
#     checkpoint = load_checkpoint(checkpoint_dir)
#     max_instances = 0
#     dataset_size = len(dataset)

#     if checkpoint:
#         start_layer_idx = checkpoint["layer_idx"]
#         start_instance_index = checkpoint["instance_index"] + 1
#         results = defaultdict(list, checkpoint["results"])
#         max_instances = len(results['instance_id'])
#     else:
#         start_layer_idx = layer_indices[0]
#         start_instance_index = 0
#         results = defaultdict(list)

#     for layer_idx in tqdm(layer_indices[layer_indices.index(start_layer_idx):], desc="Processing layer indices"):
#         framework = AdvancedLLaMACRVFramework(model, tokenizer, layer_idx=layer_idx)

#         if start_instance_index >= dataset_size:
#             print(f"All instances processed for layer {layer_idx}. Moving to next layer.")
#             start_instance_index = 0
#             continue

#         for i, instance in enumerate(tqdm(dataset.skip(start_instance_index), desc=f"Processing instances for layer {layer_idx}"), total=dataset_size-start_instance_index):
#             if not pr_flag:
#                 print(f"start_layer_idx: {start_layer_idx}\nstart_instance_index: {start_instance_index}\ni: {i}\nlayer_idx: {layer_idx}")
#             try:    
#                 if (start_instance_index + i + 1) % 5 == 0:
#                     save_checkpoint(layer_idx, start_instance_index + i, dict(results), checkpoint_dir)
                
#                 if i % 20 == 0:
#                     clear_gpu_memory()
                    
#                 if check_memory():
#                     clear_gpu_memory()
                
#                 query = instance['query'][0] if instance['query'] else ''
#                 context = instance['context'][0] if instance['context'] else '' 
#                 test_cases = '\n'.join(extract_test_cases(instance['input_final_prompts'][0])[-1])
#                 tmp = "Proposed solution context: "
#                 trajectories_and_context = framework.generate_thought_trajectories(query, context, test_cases, max_new_tokens=1000, alt_text=alt_text)
#                 context_expansion = extract_sections(trajectories_and_context, "context_generation")
#                 context_expansion = tmp + context_expansion + extract_sections(trajectories_and_context, "solution")
#                 hidden_states, seq_len = framework.extract_hidden_states(context_expansion)
#                 crv, seq_len = framework.generate_crv(hidden_states, seq_len)
                
#                 final_output = framework.final_generation(query, test_cases, crv, seq_len, max_new_tokens=250)
#                 extracted_functions = extract_functions(final_output)
                
#                 current_instance = i + start_instance_index
#                 if current_instance >= max_instances:
#                     results['instance_id'].append(current_instance)
#                     results['query'].append(query)
#                     results['context'].append(context)
#                     results['test_cases'].append(test_cases)
#                     max_instances = current_instance + 1

#                 results[f'final_output_{layer_idx}'].append(final_output)
#                 results[f'extracted_functions_{layer_idx}'].append(extracted_functions)
#                 results[f'trajectories_and_context_{layer_idx}'].append(trajectories_and_context)
#                 results[f'context_expansion_{layer_idx}'].append(context_expansion)

#                 if not pr_flag:
#                     print(f"results dict for instance i: {results}")
#                     pr_flag = True
                
#             except Exception as e:
#                 print(f"Error processing example {start_instance_index + i}: {str(e)}")
#                 save_checkpoint(layer_idx, start_instance_index + i, dict(results), checkpoint_dir)
#                 raise

#         # Pad shorter arrays with None values
#         for key, value in results.items():
#             if len(value) < max_instances:
#                 results[key].extend([None] * (max_instances - len(value)))

#     max_length = max(len(v) for v in results.values())
#     for key in results:
#     results[key] = results[key] + [None] * (max_length - len(results[key]))

#     # Convert results to DataFrame
#     df = pd.DataFrame(results)
#     df.set_index('instance_id', inplace=True)
#     return df

def evaluate_model(model, tokenizer, dataset: Dataset, layer_indices: List[int], checkpoint_dir: str = "checkpoints", max_retries: int = 3) -> pd.DataFrame:
    pr_flag = False
    checkpoint = load_checkpoint(checkpoint_dir)
    max_instances = 0
    dataset_size = len(dataset)

    if checkpoint:
        start_layer_idx = checkpoint["layer_idx"]
        start_instance_index = checkpoint["instance_index"] + 1
        results = defaultdict(list, checkpoint["results"])
        max_instances = len(results['instance_id'])
    else:
        start_layer_idx = layer_indices[0]
        start_instance_index = 0
        results = defaultdict(list)
    print("layer_indices: ", layer_indices)
    for layer_idx in tqdm(layer_indices[layer_indices.index(start_layer_idx):], desc="Processing layer indices"):
        framework = AdvancedLLaMACRVFramework(model, tokenizer, layer_idx=layer_idx)
        
        if start_instance_index >= dataset_size:
            print(f"All instances processed for layer {layer_idx}. Moving to next layer.")
            start_instance_index = 0
            continue
        
        for i, instance in enumerate(tqdm(dataset.skip(start_instance_index), desc=f"Processing instances for layer {layer_idx}", total=dataset_size-start_instance_index)):
            if not pr_flag:
                print(f"start_layer_idx: {start_layer_idx}\nstart_instance_index: {start_instance_index}\ni: {i}\nlayer_idx: {layer_idx}")
            try:    
                if (start_instance_index + i + 1) % 5 == 0:
                    save_checkpoint(layer_idx, start_instance_index + i, dict(results), checkpoint_dir)
                
                if i % 20 == 0:
                    clear_gpu_memory()
                    
                if check_memory():
                    clear_gpu_memory()
                
                query = instance['query'][0] if instance['query'] else ''
                context = instance['context'][0] if instance['context'] else '' 
                test_cases = '\n'.join(extract_test_cases(instance['input_final_prompts'][0])[-1])
                tmp = "Proposed solution context: "

                for attempt in range(max_retries):
                    try:
                        trajectories_and_context = framework.generate_thought_trajectories(query, context, test_cases, max_new_tokens=1000, alt_text=alt_text)
                        context_expansion = extract_sections(trajectories_and_context, "context_generation")
                        context_expansion = f"Proposed solution context: {context_expansion}{extract_sections(trajectories_and_context, 'solution')}"
                        
                        hidden_states, seq_len = framework.extract_hidden_states(context_expansion)
                        crv, seq_len = framework.generate_crv(hidden_states, seq_len)
                        
                        final_output = framework.final_generation(query, test_cases, crv, seq_len, max_new_tokens=250)
                        extracted_functions = extract_functions(final_output)
                        
                        # generation_successful = True
                        break
                    except Exception as e:
                        logger.warning(f"Generation failed for layer {layer_idx}, instance {i}, attempt {attempt + 1}: {str(e)}\nquery: {query}")
                        if attempt == max_retries - 1:
                            logger.error(f"All retries failed for layer {layer_idx}, instance {i}. Error: {traceback.format_exc()}")
            

                # trajectories_and_context = framework.generate_thought_trajectories(query, context, test_cases, max_new_tokens=1000, alt_text=alt_text)
                # context_expansion = extract_sections(trajectories_and_context, "context_generation")
                # context_expansion = tmp + context_expansion + extract_sections(trajectories_and_context, "solution")
                
                # hidden_states, seq_len = framework.extract_hidden_states(context_expansion)
                # crv, seq_len = framework.generate_crv(hidden_states, seq_len)
                
                # final_output = framework.final_generation(query, test_cases, crv, seq_len, max_new_tokens=250)
                # extracted_functions = extract_functions(final_output)
                
                current_instance = i + start_instance_index
                if current_instance >= max_instances:
                    results['instance_id'].append(current_instance)
                    results['query'].append(query)
                    results['context'].append(context)
                    results['test_cases'].append(test_cases)
                    max_instances = current_instance + 1

                results[f'final_output_{layer_idx}'].append(final_output)
                results[f'extracted_functions_{layer_idx}'].append(extracted_functions)
                results[f'trajectories_and_context_{layer_idx}'].append(trajectories_and_context)
                results[f'context_expansion_{layer_idx}'].append(context_expansion)

                if not pr_flag:
                    print(f"results dict for instance i: {results}")
                    pr_flag = True
                
            except Exception as e:
                print(f"Error processing example {start_instance_index + i}: {str(e)}")
                save_checkpoint(layer_idx, start_instance_index + i, dict(results), checkpoint_dir)
                raise

        start_instance_index = 0

    # Ensure all arrays have the same length
    max_length = max(len(v) for v in results.values())
    for key in results:
        results[key] = results[key] + [None] * (max_length - len(results[key]))

    df = pd.DataFrame(dict(results))  # Convert defaultdict to regular dict
    if 'instance_id' in df.columns:
        df.set_index('instance_id', inplace=True)

    # Convert results to DataFrame
    # df = pd.DataFrame(results)
    # df.set_index('instance_id', inplace=True)
    return df

In [None]:
subset_name = "processed_Meta-Llama-3.1-8B-Instruct-evals__mbpp__details"
loaded_dataset = load_from_disk(f"data/{subset_name}")

num_examples=250
if num_examples is not None:
    loaded_dataset = loaded_dataset.select(range(num_examples))

# Define layer indices to evaluate
layer_indices = [1, 10, 23, 'orig']
# layer_indices = [32, 'orig']
 
checkpoint = load_checkpoint()
start_index = 0
results_df = pd.DataFrame()

if checkpoint:
    start_index = checkpoint["instance_index"]
    results_df = checkpoint["results"]
    print(f"Resuming from example {start_index}")


# Evaluate model
# new_results_df = evaluate_model(model, tokenizer, loaded_dataset, layer_indices)
new_results_df = evaluate_model(model, tokenizer, loaded_dataset, layer_indices)

print("new_results_df info:")
print(new_results_df.info())
print("\nnew_results_df shape:", new_results_df.shape)
print("\nnew_results_df columns:", new_results_df.columns)
print("\nnew_results_df head:")
print(new_results_df.head())

# Combine previous results with new results
results_df = pd.concat([results_df, new_results_df], ignore_index=True)

# Add parsed functions to the dataset
updated_dataset = add_parsed_functions_to_dataset(loaded_dataset, results_df, layer_indices)

print(f"Type of updated_dataset: {type(updated_dataset)}")
print(f"Number of rows in updated_dataset: {len(updated_dataset)}")

# Save the updated dataset
updated_dataset.save_to_disk(f"data/{subset_name}_results")

# os.remove(os.path.join("checkpoints", "checkpoint.pkl"))


Loaded checkpoint from checkpoints/checkpoint_layer_1_instance_9.json
Resuming from example 9
Loaded checkpoint from checkpoints/checkpoint_layer_1_instance_9.json
layer_indices:  [1, 10, 23, 'orig']


Processing layer indices:   0%|          | 0/4 [00:00<?, ?it/s]

Processing instances for layer 1:   0%|          | 0/240 [00:00<?, ?it/s]

start_layer_idx: 1
start_instance_index: 10
i: 0
layer_idx: 1


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Both `max_new_tokens` (=1000) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tok

results dict for instance i: defaultdict(<class 'list'>, {'instance_id': [0, 1, 2, 3, 5, 6, 7, 8, 10], 'query': ['<|start_header_id|>user<|end_header_id|>\n\nYou are an expert Python programmer, and here is your task:\nWrite a function to check if given tuple is distinct or not.\nYour code should pass the following tests:\nassert check_distinct((1, 4, 5, 6, 1, 4)) == False\nassert check_distinct((1, 4, 5, 6)) == True\nassert check_distinct((2, 3, 4, 5, 6)) == True<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n```python', '<|start_header_id|>user<|end_header_id|>\n\nYou are an expert Python programmer, and here is your task:\nWrite a python function to find the first non-repeated character in a given string.\nYour code should pass the following tests:\nassert first_non_repeating_character("abcabc") == None\nassert first_non_repeating_character("abc") == "a"\nassert first_non_repeating_character("ababc") == "c"<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n```pyth

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=250) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=1000) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generatio

Checkpoint saved at checkpoints/checkpoint_layer_1_instance_14.json


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=250) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=1000) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generatio

Checkpoint saved at checkpoints/checkpoint_layer_1_instance_19.json


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=250) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=1000) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generatio

Checkpoint saved at checkpoints/checkpoint_layer_1_instance_24.json


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=250) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=1000) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generatio

Checkpoint saved at checkpoints/checkpoint_layer_1_instance_29.json


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=250) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=1000) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generatio

Checkpoint saved at checkpoints/checkpoint_layer_1_instance_34.json


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=250) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=1000) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generatio

Checkpoint saved at checkpoints/checkpoint_layer_1_instance_39.json


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=250) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=1000) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generatio

Checkpoint saved at checkpoints/checkpoint_layer_1_instance_44.json


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=1000) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=250) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generatio

Checkpoint saved at checkpoints/checkpoint_layer_1_instance_49.json


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=250) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=1000) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generatio

9202326016 11282677760 25217466368 0.8123339385908604


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=250) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=1000) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generatio

Checkpoint saved at checkpoints/checkpoint_layer_1_instance_54.json


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=250) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=1000) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generatio

Checkpoint saved at checkpoints/checkpoint_layer_1_instance_59.json


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=250) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=1000) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generatio

Checkpoint saved at checkpoints/checkpoint_layer_1_instance_64.json


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=250) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=1000) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generatio

In [40]:
def main():
    subset_name = "processed_Meta-Llama-3.1-8B-Instruct-evals__mbpp__details"
    loaded_dataset = load_from_disk(f"data/{subset_name}")
    num_examples = None
    if num_examples is not None:
        loaded_dataset = loaded_dataset.select(range(num_examples))

    # Define layer indices to evaluate
    layer_indices = [1, 10, 15, 20, 32, 'orig']

    # Load checkpoint if exists
    checkpoint = load_checkpoint()
    start_index = 0
    results_df = pd.DataFrame()

    if checkpoint:
        start_index = checkpoint["current_index"]
        results_df = checkpoint["results_df"]
        print(f"Resuming from example {start_index}")

    
    while start_index < len(loaded_dataset):

        clear_gpu_memory()
        # Evaluate model
        new_results_df, end_index = evaluate_model(model, tokenizer, loaded_dataset, layer_indices, num_examples=num_examples, start_index=start_index)

        # Combine previous results with new results
        results_df = pd.concat([results_df, new_results_df], ignore_index=True)

        # Update start_index for next iteration
        start_index = end_index

        # Save checkpoint
        save_checkpoint(start_index, results_df)

    # Add parsed functions to the dataset
    updated_dataset = add_parsed_functions_to_dataset(loaded_dataset, results_df, layer_indices)

    print(f"Type of updated_dataset: {type(updated_dataset)}")
    print(f"Number of rows in updated_dataset: {len(updated_dataset)}")

    # Save the updated dataset
    updated_dataset.save_to_disk(f"data/{subset_name}_results")

    # Clear checkpoint after successful completion
    os.remove(os.path.join("checkpoints", "checkpoint.pkl"))

    # except Exception as e:
    #     print(f"An error occurred: {str(e)}")
    #     print(f"Last processed example: {start_index}")
    #     print("The script will automatically resume from this point when restarted.")
        
        # Checkpoint is already saved in the evaluate_model function, so we don't need to save it here



In [None]:
subset_name = "processed_Meta-Llama-3.1-8B-Instruct-evals__mbpp__details"
loaded_dataset = load_from_disk(f"data/{subset_name}")
num_examples = None
if num_examples is not None:
    loaded_dataset = loaded_dataset.select(range(num_examples))

# Define layer indices to evaluate
layer_indices = [1, 10, 15, 20, 32, 'orig']

# Load checkpoint if exists
checkpoint = load_checkpoint()
start_index = 0
results_df = pd.DataFrame()

if checkpoint:
    start_index = checkpoint["current_index"]
    results_df = checkpoint["results_df"]
    print(f"Resuming from example {start_index}")

# Evaluate model
new_results_df, end_index = evaluate_model(model, tokenizer, loaded_dataset, layer_indices, num_examples=num_examples, start_index=start_index)

# Combine previous results with new results
results_df = pd.concat([results_df, new_results_df], ignore_index=True)

# Add parsed functions to the dataset
updated_dataset = add_parsed_functions_to_dataset(loaded_dataset, results_df, layer_indices)

print(f"Type of updated_dataset: {type(updated_dataset)}")
print(f"Number of rows in updated_dataset: {len(updated_dataset)}")

# Save the updated dataset
updated_dataset.save_to_disk(f"data/{subset_name}_results")

# Clear checkpoint after successful completion
os.remove(os.path.join("checkpoints", "checkpoint.pkl"))


In [15]:
print("\nFirst 2 rows of the dataset:")
for i, example in enumerate(results_df):
    if i < 2:
        print(f"Row {i + 1}:")
        for key, value in example.items():
            print(f"  {key}: {value}")
        print()  # Add a blank line between rows
    else:
        break



First 2 rows of the dataset:
Row 1:


AttributeError: 'str' object has no attribute 'items'

In [None]:

def main():    
    # Print column names
    print("\nColumns in the updated dataset:")
    print(updated_dataset.column_names)
    # loaded_dataset2 = load_from_disk("data/processed_meta_llama_dataset_with_results")
    loaded_dataset2 = load_dataset(
    "meta-llama/Meta-Llama-3.1-8B-Instruct-evals",
    name="Meta-Llama-3.1-8B-Instruct-evals__mbpp__details",
    split="latest"
    )



    # Print selected columns from the first 5 rows of the updated dataset
    print("First 5 rows of the updated dataset (selected columns):")
    for i, example in enumerate(loaded_dataset2.select(range(num_examples))):
        print(f"\nExample {i + 1}:")
        # Print original columns
        print(f"input_correct_responses: {example['input_correct_responses']}...")
        # print(f"extracted_functions_layer_15: {example['extracted_functions_layer_15']}...")
        
        # Print new columns for each layer
        for layer in layer_indices:
            print(f"\nLayer {layer}:")
            final_output = example.get(f'final_output_layer_{layer}')
            extracted_functions = example.get(f'extracted_functions_layer_{layer}')
            
            if final_output:
                print(f"Final Output: {final_output[:100]}...")
            else:
                print("Final Output: None")
            
            if extracted_functions:
                print(f"Extracted Functions: {extracted_functions[:100]}...")
            else:
                print("Extracted Functions: None")


# if __name__ == "__main__":
#     main()
