In [95]:
import os

from datasets import load_from_disk
# from rich import print
from rich.console import Console
from transformers import AutoConfig

import configs
from experimental.controller.memory_manager import MemoryManager
from generator.crv_generator import CRVGenerator
from generator.text_generator import TextGenerator
from utils import set_seed, logger
from utils.loading_model import CustomTransformerLoader


In [96]:
# Set up logging and console
console = Console()

# `logger = logger()` rebinds the imported factory name to whatever the factory
# returns, so re-running this cell would call that returned object instead of
# the factory. Importing the factory under a private alias keeps the cell
# idempotent while still exposing the result as `logger`.
from utils import logger as _make_logger

logger = _make_logger()

In [97]:
# Seed every RNG handled by the project's `set_seed` helper.
seed = 42
set_seed(seed)

# Hugging Face repositories for the supported checkpoints.
model_urls = {
    "llama31": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "llama3": "meta-llama/Meta-Llama-3-8B-Instruct",
}
model_path = model_urls["llama31"]
tokenizer_path = model_path

# Read the access token from the environment so that a real credential never
# has to be written into (and saved with) the notebook. The previous
# placeholder is kept as the fallback when HF_TOKEN is not set.
hf_token = os.environ.get("HF_TOKEN", "your token")

In [98]:
# `token` is the current keyword for authenticated downloads; the previously
# used `use_auth_token` is deprecated in transformers.
config = AutoConfig.from_pretrained(model_path, token=hf_token)

console.rule("[bold red]Loading the Model")

loader = CustomTransformerLoader()



In [99]:
# Load the model and tokenizer through the project loader.
model, tokenizer = loader.load_model(
    model_path=model_path, tokenizer_path=tokenizer_path, hf_token=hf_token
)

# Layers used for CRV extraction, taken from the project configuration.
crv_layers = configs.CRV_LAYERS

print(":warning: model type: ", type(model))
# The value printed here is the number of hidden layers, so label it as such
# (it was previously labelled "config.hidden_size").
print("config.num_hidden_layers: ", config.num_hidden_layers)
print("config._attn_implementation: ", config._attn_implementation)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



config.hidden_size:  32
config._attn_implementation:  eager


In [100]:
import re

def extract_context_expansion(text):
    """Return the body of the first ``<context_generation>`` section of ``text``.

    Args:
        text: Generated text that may contain a
            ``<context_generation>...</context_generation>`` section.

    Returns:
        The section content with surrounding whitespace stripped. When no
        complete section is present, a message string that embeds the
        original ``text`` is returned instead.
    """
    section = re.compile(
        r'<context_generation>(.*?)</context_generation>', re.DOTALL
    ).search(text)

    if section is None:
        return f"Context expansion section not found. The original text: {text}"
    return section.group(1).strip()

In [101]:
def extract_test_cases(text):
    """Collect ``assert`` statements from ``text``, grouped by the function they call.

    Consecutive asserts that call the same function form one group; a new
    group starts whenever the called function name changes.

    Args:
        text: Prompt text containing statements such as ``assert foo(1) == 2``.

    Returns:
        A list of groups, each a list of assert statements (strings), in the
        order they occur in ``text``. The list is empty when nothing matches.

    Side effects:
        Prints the number of groups when at least one assert was found.
    """
    # `assert name(...)` followed by the rest of the line. A match stops before
    # a newline, a "<" (e.g. the "<|eot_id|>" marker in the prompts) or the end
    # of the text, so an assert that contains "<" itself is cut short there.
    pattern = r'assert\s+[\w_]+\(.*?\).*?(?=[\n<]|$)'
    test_cases = re.findall(pattern, text)

    def called_name(test):
        # The pattern guarantees that the text before the first "(" is
        # "assert <name>", so the last whitespace-separated token is the name.
        return test.split('(', 1)[0].split()[-1]

    grouped_tests = []
    current_group = []
    current_name = None

    for test in test_cases:
        name = called_name(test)
        # Compare full names: a prefix check would merge e.g. `is_prime`
        # and `is_prime_fast` into a single group.
        if current_group and name != current_name:
            grouped_tests.append(current_group)
            current_group = []
        current_group.append(test)
        current_name = name

    if current_group:
        grouped_tests.append(current_group)
        print("test cases len: ", len(grouped_tests))

    return grouped_tests

# Smoke test for extract_test_cases: a sample few-shot prompt with four tasks.
# The last group returned holds the asserts of the final task in the prompt.
text = '''<|start_header_id|>user<|end_header_id|>\n\nYou are an expert Python programmer, and here is your task:\nWrite a function to find the similar elements from the given two tuple lists.\nYour code should pass the following tests:\nassert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)\nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n```python\ndef similar_elements(test_tup1, test_tup2):\n res = tuple(set(test_tup1) & set(test_tup2))\n return (res) \n```<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nYou are an expert Python programmer, and here is your task:\nWrite a python function to identify non-prime numbers.\nYour code should pass the following tests:\nassert is_not_prime(2) == False\nassert is_not_prime(10) == True\nassert is_not_prime(35) == True<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n```python\nimport math\ndef is_not_prime(n):\n result = False\n for i in range(2,int(math.sqrt(n)) + 1):\n if n % i == 0:\n result = True\n return result\n```<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nYou are an expert Python programmer, and here is your task:\nWrite a function to find the largest integers from a given list of numbers using heap queue algorithm.\nYour code should pass the following tests:\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35]<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n```python\nimport heapq as hq\ndef heap_queue_largest(nums,n):\n largest_nums = hq.nlargest(n, nums)\n return largest_nums\n```<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nYou are an expert Python programmer, and here is your task:\nWrite a function to create the next bigger 
number by rearranging the digits of a given number.\nYour code should pass the following tests:\nassert rearrange_bigger(12)==21\nassert rearrange_bigger(10)==False\nassert rearrange_bigger(102)==120<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n```python"'''
out = extract_test_cases(text)
# Show only the last group of asserts.
print(out[-1])

test cases len:  4
['assert rearrange_bigger(12)==21', 'assert rearrange_bigger(10)==False', 'assert rearrange_bigger(102)==120']


In [102]:
def extract_functions2(text):
    """Extract function definitions from ``text`` and return them as cleaned code.

    Each match runs from a ``def`` header up to the next line that starts
    with ``def`` or to the end of ``text``. Every match is then cleaned:
    anything from a markdown code fence line (```` ``` ````) onwards is
    dropped, triple-quoted strings and ``#`` comments are removed, blank
    lines are removed and trailing whitespace is stripped.

    Args:
        text: Text (e.g. model output) that contains Python function source.

    Returns:
        The cleaned functions joined by a blank line, or an empty string when
        no function is found.

    Note:
        Cleaning is regex based, so a ``#`` or triple quote inside a string
        literal is treated like a comment or docstring.
    """
    function_pattern = r"(def\s+\w+\s*\(.*?\):(?:\s*['\"][\s\S]*?['\"])?\s*(?:(?!def\s)[\s\S])*?(?=\ndef|\Z))"

    functions = re.findall(function_pattern, text, re.MULTILINE | re.DOTALL)

    def drop_after_fence(func):
        # A match extends to the next `def` or the end of the text, so the
        # closing fence of a markdown code block (and any prose after it)
        # would otherwise end up inside the extracted function.
        kept = []
        for line in func.splitlines():
            if line.lstrip().startswith('```'):
                break
            kept.append(line)
        return '\n'.join(kept)

    def clean_function(func):
        func = drop_after_fence(func)
        # Remove docstrings
        func = re.sub(r'"""[\s\S]*?"""|\'\'\'[\s\S]*?\'\'\'', '', func)
        # Remove comments
        func = re.sub(r'#.*', '', func)
        # Remove empty lines and trailing whitespace (stripping a trailing
        # comment leaves the spaces that preceded it)
        func = '\n'.join(line.rstrip() for line in func.splitlines() if line.strip())
        return func

    cleaned_functions = [clean_function(func) for func in functions]

    # Join functions with two newlines for readability
    return '\n\n'.join(cleaned_functions)

def extract_functions(text):
    """Extract import statements and function definitions from ``text``.

    Imports are matched at the start of a line (``import x`` / ``from x
    import y``, optionally followed by ``as z``). Each function match runs
    from a ``def`` header up to the next line that starts with ``def`` or to
    the end of ``text``; anything from a markdown code fence line
    (```` ``` ````) onwards is dropped from it. Triple-quoted strings, ``#``
    comments, blank lines and trailing whitespace are removed from both.

    Args:
        text: Text (e.g. model output) that contains Python source.

    Returns:
        The imports (one per line), a blank line when both imports and
        functions exist, then the functions separated by blank lines. An
        empty string is returned when nothing is found.

    Note:
        Cleaning is regex based, so a ``#`` or triple quote inside a string
        literal is treated like a comment or docstring.
    """
    # Extract imports
    import_pattern = r'^(?:from\s+[\w.]+\s+import\s+(?:[\w.]+(?:\s*,\s*[\w.]+)*|\*)|import\s+(?:[\w.]+(?:\s*,\s*[\w.]+)*))(?:\s+as\s+[\w.]+)?'
    imports = re.findall(import_pattern, text, re.MULTILINE)

    # Extract functions
    function_pattern = r"(def\s+\w+\s*\(.*?\):(?:\s*['\"][\s\S]*?['\"])?\s*(?:(?!def\s)[\s\S])*?(?=\ndef|\Z))"
    functions = re.findall(function_pattern, text, re.MULTILINE | re.DOTALL)

    def drop_after_fence(code):
        # A function match extends to the next `def` or the end of the text,
        # so the closing fence of a markdown code block (and any prose after
        # it) would otherwise end up inside the extracted function.
        kept = []
        for line in code.splitlines():
            if line.lstrip().startswith('```'):
                break
            kept.append(line)
        return '\n'.join(kept)

    def clean_code(code):
        # Remove docstrings
        code = re.sub(r'"""[\s\S]*?"""|\'\'\'[\s\S]*?\'\'\'', '', code)
        # Remove comments
        code = re.sub(r'#.*', '', code)
        # Remove empty lines and trailing whitespace (stripping a trailing
        # comment leaves the spaces that preceded it)
        code = '\n'.join(line.rstrip() for line in code.splitlines() if line.strip())
        return code

    cleaned_imports = [clean_code(imp) for imp in imports]
    cleaned_functions = [clean_code(drop_after_fence(func)) for func in functions]

    # Combine imports and functions
    cleaned_code = '\n'.join(cleaned_imports)
    if cleaned_imports and cleaned_functions:
        cleaned_code += '\n\n'
    cleaned_code += '\n\n'.join(cleaned_functions)

    return cleaned_code


In [103]:
class AdvancedLLaMACRVFramework:
    """Pipeline that generates a reasoning context, turns it into a CRV and
    injects that CRV into the model for the final code generation.

    The notebook drives it in this order:
    ``generate_thought_trajectories`` -> ``extract_hidden_states`` ->
    ``generate_crv`` -> ``final_generation``.

    Args:
        model: Model handed to the text generator, CRV generator and memory manager.
        tokenizer: Tokenizer handed to the text and CRV generators.
        layer_idx: Layer index passed to ``MemoryManager.add_memory``.
        crv_layers: Layers passed to the CRV generator and memory manager.
            Defaults to ``configs.CRV_LAYERS`` so the class no longer depends
            on a notebook-level ``crv_layers`` variable being defined.
    """

    def __init__(self, model, tokenizer, layer_idx = 10, crv_layers=None):
        self.model = model
        self.tokenizer = tokenizer
        self.text_generator = TextGenerator(model, tokenizer)
        self.crv_generator = CRVGenerator(model, tokenizer, max_length=configs.MAX_LENGTH)
        self.memory_manager = MemoryManager(model, max_memories=5)
        self.layer_idx = layer_idx
        self.crv_layers = configs.CRV_LAYERS if crv_layers is None else crv_layers


    def generate_thought_trajectories(self, input_query, test_cases=None, max_new_tokens=1000):
        """Generate the reasoning text (including a ``<context_generation>`` section).

        Args:
            input_query: Task description inserted into the prompt.
            test_cases: Test cases inserted into the prompt; ``None`` is
                rendered as an empty string rather than the text "None".
            max_new_tokens: Generation budget passed to the text generator.

        Returns:
            Whatever ``self.text_generator.generate_text`` returns for the prompt.
        """
        if test_cases is None:
            test_cases = ""

        prompt_template = f"""
        <|begin_of_text|><|start_header_id|>system<|end_header_id|>
        
        \n\nYou are an expert Python programmer designed to provide standard, accurate,and fully working codes, and here is your task:\n
        \nWrite a function to find the similar elements from the given two tuple lists.\nYour code should pass the following tests:\nassert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)\nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n```python\ndef similar_elements(test_tup1, test_tup2):\n res = tuple(set(test_tup1) & set(test_tup2))\n return (res) \n```<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nYou are an expert Python programmer, and here is your task:\nWrite a python function to identify non-prime numbers.\nYour code should pass the following tests:\nassert is_not_prime(2) == False\nassert is_not_prime(10) == True\nassert is_not_prime(35) == True<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n```python\nimport math\ndef is_not_prime(n):\n result = False\n for i in range(2,int(math.sqrt(n)) + 1):\n if n % i == 0:\n result = True\n return result\n```<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nYou are an expert Python programmer, and here is your task:\nWrite a function to find the largest integers from a given list of numbers using heap queue algorithm.\nYour code should pass the following tests:\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35]<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n```python\nimport heapq as hq\ndef heap_queue_largest(nums,n):\n largest_nums = hq.nlargest(n, nums)\n return largest_nums\n```<|eot_id|>
        Enable code_interpreter tool.
        <|eot_id|><|start_header_id|>user<|end_header_id|>
        
        \n\nYou are an expert Python programmer, and here is your task:\n{input_query}.\nYour code must pass these test cases:{test_cases}
        
        \n\nYour outputs must follow this structure:

        Identify the core components of this problem.
        1. Identify potential edge cases and tricky parts.
        2. Write 2 short test cases for the edge cases and tricky parts.
        
        <chain_of_thoughts>
        1. you must consider the edge cases according to the problem statement.
        2. Begin with a <thinking> section.
        3. Inside the thinking section:
           a. Write the topic name of the query, the name of the algorithm if necessary.
           b. Draft an answer as an expert.
           b. Briefly analyze the question and outline your approach.
           c. Present a clear plan of steps to solve the problem.
           d. Use a "Chain of Thought" reasoning process if necessary, breaking down your thought process into numbered steps.
        4. Include a <reflection> section for each idea where you:
           a. Review your reasoning.
           b. Check for potential errors or oversights.
           c. Confirm or adjust your conclusion if necessary.
        5. Be sure to close all reflection sections.
        6. Close the thinking section with </thinking>.
        7. Provide your final answer in an <output> section.        
        </chain_of_thoughts>

        <chain_of_thought_selection>
        you must consider the edge cases according to the problem statement and select the most promising chain of thought that solves the edge cases (not necessarily the simplest nor the standard approach).
        </chain_of_thought_selection>

        <solution>
        1. As a Python expert, generate the Python code and make sure it solves the edge cases while keeping it efficient.
        2. the internal steps must produce the required output.
        </solution>

        Include a <reflection> section for the selected solution where if it is not correct, modify or if necessary, rewrite the solution and pay attention to the input problem.
           a. Review your reasoning.
           b. Check for potential errors or oversights according to the problem. you must consider the edge cases according to the problem. Make sure it is not overcomplicated.
           c. Confirm or adjust your conclusion if necessary.
        4. Be sure to close all reflection sections.
        
        <context_generation>
        1. Rewrite the problem.
        2. Rewrite the edge cases and tricky parts in one short sentence
        2. Generate a very accurate and minimal Python code/pseudocode for the final solution. Ensure that the final solution is minimal and accurate.
        </context_generation>
        <|eot_id|>
        <|start_header_id|>assistant<|end_header_id|>\n\n"
        """
        generated_text = self.text_generator.generate_text(
            prompt_template,
            max_new_tokens=max_new_tokens,
            num_return_sequences = 1,
            output_file="data/results.csv",
            # stop_sequences=["The end", ".\n\n"],
        )
        return generated_text
    
    def extract_hidden_states(self, context):
        """Run the CRV generator on ``context``.

        Args:
            context: Text whose CRV should be computed.

        Returns:
            The two values returned by ``self.crv_generator.generate_crvs``
            (named ``best_crv`` and ``seq_length`` here).
        """
        best_crv, seq_length = self.crv_generator.generate_crvs(
            context, crv_layers=self.crv_layers, max_length=configs.MAX_LENGTH
        )
        return best_crv, seq_length  # Return the hidden state and its len

    def generate_crv(self, hidden_states, seq_length):
        """Return ``hidden_states`` and ``seq_length`` unchanged.

        Currently a pass-through; a mean-pooling variant was previously
        sketched here but is not applied.
        """
        # return torch.mean(hidden_states, dim=1)
        return hidden_states, seq_length
        
    def final_generation(self, original_query, test_cases, crv, seq_length, max_new_tokens=250):
        """Register ``crv`` with the memory manager and generate the final answer.

        Args:
            original_query: Task description inserted into the prompt.
            test_cases: Test cases inserted into the prompt.
            crv: CRV passed to ``MemoryManager.add_memory``.
            seq_length: Length passed alongside the CRV; also used as the end
                of the concatenation range.
            max_new_tokens: Generation budget passed to the text generator.

        Returns:
            Whatever ``self.text_generator.generate_text`` returns for the prompt.

        Side effects:
            Adds a memory to ``self.memory_manager``, applies memory 0 to the
            model and prints a separator line.
        """
        # NOTE(review): the quote character right after {test_cases} looks
        # unintentional (the few-shot examples have none) - confirm before
        # changing, since it alters the prompt the model sees.
        query=f"""<|start_header_id|>user<|end_header_id|>\n\nYou are an expert Python programmer, and here is your task:\n{original_query}.\nYour code should pass the following tests:{test_cases}"<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n```python"""
        # Combine original query and CRV
        self.memory_manager.add_memory(
            crv, seq_length, layer_idx=self.layer_idx, crv_layers=self.crv_layers
        )

        # NOTE(review): a memory is added on every call but index 0 is always
        # the one configured and applied. If MemoryManager appends new
        # memories, later calls would re-apply the first CRV - confirm
        # against MemoryManager.
        # model.model.set_post_concat_crv(True)
        self.memory_manager.set_concat_positions(0, start_pos=0, end_pos=seq_length)
        self.memory_manager.apply_memory_to_model(0)
        generated_text = self.text_generator.generate_text(
            query,
            max_new_tokens=max_new_tokens,
            num_return_sequences = 1,
            output_file="data/results.csv",
            # stop_sequences=["The end", ".\n\n"],
        )
        print('==' * 50)
        return generated_text

In [None]:
# Walk through the first few dataset instances and print every pipeline stage.
framework = AdvancedLLaMACRVFramework(model, tokenizer, layer_idx = 15)

loaded_dataset = load_from_disk("data/processed_meta_llama_dataset")
num_examples = 3

for i, instance in enumerate(loaded_dataset):
    if i >= num_examples:
        break

    query = instance['query'][0]
    # Keep only the last group of asserts found in the final prompt.
    test_cases = '\n'.join(extract_test_cases(instance['input_final_prompts'][0])[-1])
    print("test_cases: ", test_cases)
    print('====' * 15)

    # Instance 1: generate the reasoning text and pull out its context section
    trajectories_and_context = framework.generate_thought_trajectories(query, test_cases, max_new_tokens=1000)
    print("End of Trajectories and Context: ", trajectories_and_context)
    context_expansion = extract_context_expansion(trajectories_and_context)
    print('====' * 15)

    print("the extracted context: ", context_expansion)
    print('----' * 15)

    # Instance 2: Extract hidden states from generated context
    hidden_states, seq_len = framework.extract_hidden_states(context_expansion)

    # Generate CRV from hidden states
    crv, seq_len = framework.generate_crv(hidden_states, seq_len)
    print("Generated CRV:")

    # Instance 3: Final generation using original query and CRV
    final_output = framework.final_generation(query, test_cases, crv, seq_len, max_new_tokens=250) #todo: change query to the instructed query
    print("Final Output:", final_output)
    print('test cases: ', test_cases)
    print("extracted functions: ", extract_functions(final_output))
    print('****|' * 15)
    

In [104]:
from datasets import load_from_disk, Dataset
from tqdm import tqdm
import pandas as pd
from typing import List


def evaluate_model(model, tokenizer, dataset: Dataset, layer_indices: List[int], num_examples: int = -1) -> pd.DataFrame:
    """Run the CRV pipeline over ``dataset`` once per layer index.

    For every layer a fresh ``AdvancedLLaMACRVFramework`` is built, and for
    every instance the reasoning text is generated, its context section is
    converted into a CRV and the final answer is generated with that CRV.

    Args:
        model: Model passed to the framework.
        tokenizer: Tokenizer passed to the framework.
        dataset: Iterable of instances providing ``query``, ``context`` and
            ``input_final_prompts`` (the first element of each is used).
        layer_indices: Layer indices to evaluate.
        num_examples: Number of instances to process per layer; ``-1``
            processes all of them.

    Returns:
        A DataFrame with one row per (layer, instance) and the columns
        ``layer_idx``, ``instance_id``, ``query``, ``context``,
        ``test_cases``, ``final_output`` and ``extracted_functions``.
    """
    unlimited = num_examples == -1
    records = []

    layer_progress = tqdm(layer_indices, desc="Processing layer indices")
    for layer_idx in layer_progress:
        framework = AdvancedLLaMACRVFramework(model, tokenizer, layer_idx=layer_idx)

        instance_progress = tqdm(dataset, desc=f"Processing instances for layer {layer_idx}")
        for position, instance in enumerate(instance_progress):
            # Stop once the requested number of instances has been handled.
            if not unlimited and position >= num_examples:
                break

            query = instance['query'][0]
            context = instance['context'][0]
            # Only the last group of asserts in the final prompt is used.
            assert_groups = extract_test_cases(instance['input_final_prompts'][0])
            test_cases = '\n'.join(assert_groups[-1])

            # Stage 1: reasoning text and its context section.
            reasoning = framework.generate_thought_trajectories(query, test_cases, max_new_tokens=1000)
            expanded_context = extract_context_expansion(reasoning)

            # Stage 2: CRV of the context section.
            states, length = framework.extract_hidden_states(expanded_context)
            crv, length = framework.generate_crv(states, length)

            # Stage 3: final answer generated with the CRV applied.
            final_output = framework.final_generation(query, test_cases, crv, length, max_new_tokens=250)

            records.append({
                'layer_idx': layer_idx,
                'instance_id': position,
                'query': query,
                'context': context,
                'test_cases': test_cases,
                'final_output': final_output,
                'extracted_functions': extract_functions(final_output),
            })

    return pd.DataFrame(records)


In [105]:
def add_parsed_functions_to_dataset(dataset: "Dataset", results_df: pd.DataFrame, layer_indices=(5, 10, 15, 20)) -> "Dataset":
    """Attach per-layer outputs from ``results_df`` to ``dataset`` as new columns.

    For every layer in ``layer_indices`` two columns are added,
    ``final_output_layer_<layer>`` and ``extracted_functions_layer_<layer>``.
    Row ``i`` receives the values of the first result whose ``instance_id``
    is ``i`` and whose ``layer_idx`` is that layer, or ``None`` when there is
    no such result.

    Columns are created only for the requested layers. With the columns
    fixed to layers 5/10/15/20, any other layer raised ``KeyError`` and any
    unrequested layer was left with a zero-length column.

    Args:
        dataset: Object supporting ``len()`` and ``add_column(name, values)``.
        results_df: Results with the columns ``instance_id``, ``layer_idx``,
            ``final_output`` and ``extracted_functions``.
        layer_indices: Layers to add columns for; duplicates are ignored.

    Returns:
        The object returned by the last ``add_column`` call (``dataset``
        itself when ``layer_indices`` is empty).
    """
    # De-duplicate while keeping the order, so no column is added twice.
    layers = list(dict.fromkeys(layer_indices))

    # Index the results once by (instance, layer); `setdefault` keeps the
    # first record for a key, matching a first-match lookup.
    results_by_key = {}
    for record in results_df.to_dict('records'):
        results_by_key.setdefault((record['instance_id'], record['layer_idx']), record)

    num_rows = len(dataset)

    def column_values(layer, field):
        values = []
        for row in range(num_rows):
            record = results_by_key.get((row, layer))
            values.append(record[field] if record is not None else None)
        return values

    # Build all final-output columns first, then all extracted-function columns.
    new_columns = {}
    for layer in layers:
        new_columns[f'final_output_layer_{layer}'] = column_values(layer, 'final_output')
    for layer in layers:
        new_columns[f'extracted_functions_layer_{layer}'] = column_values(layer, 'extracted_functions')

    # Add new columns to the dataset
    for column_name, column_data in new_columns.items():
        dataset = dataset.add_column(column_name, column_data)

    return dataset


In [108]:
def main():
    """Evaluate the model on a few instances and save the dataset with the results.

    Loads the processed dataset from disk, runs ``evaluate_model`` for the
    selected layers on the first three instances, adds the outputs as
    columns, saves the result to a new directory and prints its size and
    column names. Uses the notebook-level ``model`` and ``tokenizer``.
    """
    # Layers whose CRV injection is evaluated in this run.
    layers_to_evaluate = [15]

    source_dataset = load_from_disk("data/processed_meta_llama_dataset")

    evaluation = evaluate_model(
        model, tokenizer, source_dataset, layers_to_evaluate, num_examples=3
    )

    # Attach the per-layer outputs as columns and persist the result so that
    # it can be used for further metric calculations.
    enriched_dataset = add_parsed_functions_to_dataset(
        source_dataset, evaluation, layers_to_evaluate
    )
    enriched_dataset.save_to_disk("data/processed_meta_llama_dataset_with_results")

    # Short summary of what was written.
    print("Dataset size:", len(enriched_dataset))
    print("Columns:", enriched_dataset.column_names)


if __name__ == "__main__":
    main()


Processing layer indices:   0%|                           | 0/1 [00:00<?, ?it/s]
Processing instances for layer 15:   0%|                | 0/500 [00:00<?, ?it/s][AThe attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=1000) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Processing instances for layer 15:   0%|                | 0/500 [00:00<?, ?it/s]
Processing layer indices:   0%|                           | 0/1 [00:00<?, ?it/s]


test cases len:  4


OutOfMemoryError: CUDA out of memory. Tried to allocate 72.00 MiB. GPU  has a total capacity of 23.49 GiB of which 8.25 MiB is free. Including non-PyTorch memory, this process has 0 bytes memory in use. Of the allocated memory 23.15 GiB is allocated by PyTorch, and 52.23 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)