In [None]:
%pip install transformers accelerate
%pip install evaluate datasets

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import torch
from tqdm.auto import tqdm # Import tqdm
import pandas as pd
import evaluate
import os

In [None]:
import gc
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Start from a clean slate: drop cached CUDA blocks and collect Python garbage.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Load the tokenizer and the causal LM from the same Hub checkpoint.
checkpoint = "deepseek-ai/deepseek-coder-6.7b-instruct"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Request bfloat16 weights, then move the model onto the selected device.
model = AutoModelForCausalLM.from_pretrained(checkpoint, dtype=torch.bfloat16)
model = model.to(device)

In [None]:
# Quick sanity check: encode a fixed prompt, let the model continue it, and
# print the decoded result before starting the long generation run.
inputs = tokenizer.encode("def print_hello_world():", return_tensors="pt").to(device)
# No generation options are passed here, so the model's default settings apply.
outputs = model.generate(inputs)
print(tokenizer.decode(outputs[0]))

## Load the MBPP dataset

Load the sanitized MBPP dataset using the `load_dataset` function from the `datasets` library. (The variable is still called `humaneval_dataset` in the code below.)


In [None]:
# Note: despite the variable name, this loads the "sanitized" configuration of
# the MBPP dataset, not HumanEval. The name is kept because later cells use it.
humaneval_dataset = load_dataset("mbpp", "sanitized")

## Generate solutions

Iterate through the dataset and generate several sampled code solutions for each problem using the loaded DeepSeek Coder model.


In [None]:
def extract_function_code(completion_text):
    """Pull the generated function out of one decoded model completion.

    Args:
        completion_text: Decoded text of a single sampled completion
            (continuation only, without the prompt).

    Returns:
        The text starting at the first ``"def "``, cut off at the next Markdown
        code fence if one follows, or ``""`` when no ``"def "`` is present.
    """
    start_index = completion_text.find("def ")
    if start_index == -1:
        return ""
    generated_code = completion_text[start_index:]
    # Instruct models usually wrap code in ``` fences and add prose afterwards;
    # anything from the closing fence onwards is not Python.
    fence_index = generated_code.find("```")
    if fence_index != -1:
        generated_code = generated_code[:fence_index]
    return generated_code


# Sampling is stochastic; fix the seed so a re-run draws the same samples.
GENERATION_SEED = 0
torch.manual_seed(GENERATION_SEED)

generated_solutions = []

# Number of samples to generate for each problem (kept small to limit memory usage)
num_samples = 15

test_split = humaneval_dataset['test']
for example in tqdm(test_split, desc="Generating Solutions", total=len(test_split)):
    prompt = "# Complete the Python function below\n" + example['prompt']

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_length = inputs["input_ids"].shape[1]

    generated_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=512,  # cap on continuation length, also bounds memory usage
        num_return_sequences=num_samples,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,  # sampling is required to get distinct sequences
    )

    # generate() returns prompt + continuation for each sequence. Decode only the
    # continuation so the "def " search cannot match text inside the prompt.
    completion_ids = generated_ids[:, prompt_length:]
    decoded_outputs = tokenizer.batch_decode(completion_ids, skip_special_tokens=True)

    solutions_for_task = [extract_function_code(text) for text in decoded_outputs]

    generated_solutions.append({
        'task_id': example['task_id'],
        'prompt': example['prompt'],
        'solutions': solutions_for_task  # one entry per sampled completion
    })

    # Drop the tensor references first so the cache clear can actually release them.
    del inputs, generated_ids, completion_ids
    torch.cuda.empty_cache()
    gc.collect()

print(f"Code generation complete for {len(generated_solutions)} tasks with {num_samples} samples each!")

In [None]:
import json

# Persist the results as JSON Lines: one JSON object (one task) per line.
output_file = "generated_solutions_test_multisample.jsonl"
with open(output_file, 'w') as jsonl_file:
    jsonl_file.writelines(json.dumps(entry) + '\n' for entry in generated_solutions)

print(f"Generated solutions saved to {output_file}")

## Set up evaluation environment

The libraries needed for evaluating the generated solutions (`evaluate`, `datasets`) are installed in the first cell of this notebook, so no additional setup is required here.


## Load generated solutions

Load the generated solutions from the JSONL file `generated_solutions_test_multisample.jsonl`.


In [None]:
import pandas as pd

# Read the JSONL file back into a DataFrame (lines=True: one JSON object per line),
# so the cells below work from what was actually written to disk.
generated_solutions_df = pd.read_json("generated_solutions_test_multisample.jsonl", lines=True)

display(generated_solutions_df.head())

## Run evaluation

Use the `code_eval` metric from the `evaluate` library to run the generated solutions against the MBPP test cases and compute pass@k.


In [None]:
import evaluate
import os
from collections import defaultdict

# code_eval executes model-generated code; it refuses to run unless this
# opt-in environment variable is set.
os.environ["HF_ALLOW_CODE_EVAL"] = "1"

# Load the code execution (pass@k) metric
code_eval_metric = evaluate.load("code_eval")

# predictions: one inner list per task, holding that task's generated solutions
predictions = generated_solutions_df['solutions'].tolist()

# references: one test program per task. code_eval runs each candidate followed
# by its reference, so the reference must be the task's assertions (plus the
# imports they need), not the reference solution in example['code'].
tests_by_task_id = {
    example['task_id']: "\n".join(list(example['test_imports']) + list(example['test_list']))
    for example in humaneval_dataset['test']
}

# Align by task_id rather than by position, so a partial or reordered
# predictions file is still scored against the right tests. A task_id that is
# missing from the dataset raises KeyError instead of being silently mis-scored.
references = [tests_by_task_id[task_id] for task_id in generated_solutions_df['task_id']]

# NOTE(review): the MBPP assertions call a specific function name; this assumes
# the generated solutions define a function with that name - confirm the prompt
# gives the model enough information to choose it.

# Compute the evaluation results with multiple pass@k
evaluation_results = code_eval_metric.compute(
    references=references,
    predictions=predictions,
    k=[1, 5, 10, 15] # Specify the k values for pass@k
)

display(evaluation_results)

In [None]:
# The first element of the metric output is the pass@k score dictionary.
pass_at_k_scores = evaluation_results[0]
for k in (1, 5, 10, 15):
    display(f"Pass@{k} score: {pass_at_k_scores[f'pass@{k}']}")

## Final Results Summary

Here are the evaluation results and a summary of the syntax analysis.

In [None]:
import ast
import pandas as pd

def analyze_syntax(code):
    """Classify one generated solution as parseable Python or not.

    Args:
        code: Source text of a single generated solution.

    Returns:
        ``"Syntax OK"`` when ``ast.parse`` accepts the text; otherwise a string
        starting with ``"Syntax Error: "`` followed by the reason.
    """
    # An empty string parses without error, but it means no code was produced
    # for this sample, so it must not be counted as a valid solution.
    if isinstance(code, str) and not code.strip():
        return "Syntax Error: empty solution (no code to parse)"
    try:
        ast.parse(code)
    except (SyntaxError, ValueError) as e:
        # ValueError covers source containing null bytes on Python versions
        # where ast.parse reports that case separately from SyntaxError.
        return f"Syntax Error: {e}"
    return "Syntax OK"

# Reuse the DataFrame already in memory; only fall back to the JSONL file on
# disk when this cell is run without the loading cell having been executed.
if 'generated_solutions_df' not in locals():
    try:
        generated_solutions_df = pd.read_json("generated_solutions_test_multisample.jsonl", lines=True)
    except FileNotFoundError:
        display("Error: generated_solutions_test_multisample.jsonl not found. Please run the code generation cells first.")
        generated_solutions_df = pd.DataFrame(columns=['task_id', 'solutions'])


# One list of verdicts per task, in the same order as that task's solutions.
syntax_analysis_results = [
    [analyze_syntax(candidate) for candidate in candidates]
    for candidates in generated_solutions_df['solutions']
]

# Attach the verdict lists as a new column next to the solutions.
generated_solutions_df['syntax_analysis'] = syntax_analysis_results

# Preview the per-task verdict lists for the first few tasks.
display("\n--- Syntax Analysis Summary (First Solution per Task) ---")
display(generated_solutions_df[['task_id', 'syntax_analysis']].head())

# Flatten to one record per (task, solution) pair for the detailed CSV export.
all_syntax_results = [
    {
        'task_id': task_id,
        'solution_index': position,
        'syntax_result': verdict,
        'solution_code': candidate  # Include the solution code
    }
    for task_id, candidates, verdicts in zip(
        generated_solutions_df['task_id'],
        generated_solutions_df['solutions'],
        generated_solutions_df['syntax_analysis'],
    )
    for position, (candidate, verdict) in enumerate(zip(candidates, verdicts))
]

syntax_analysis_detailed_df = pd.DataFrame(all_syntax_results)
output_csv_file = "syntax_analysis_detailed.csv"
syntax_analysis_detailed_df.to_csv(output_csv_file, index=False)

display(f"\nDetailed syntax analysis saved to {output_csv_file}")

In [None]:
# Aggregate the per-task verdict lists into overall counts.
verdicts_per_task = generated_solutions_df['syntax_analysis']
total_solutions = sum(len(verdicts) for verdicts in verdicts_per_task)
syntax_ok_solutions = sum(verdicts.count('Syntax OK') for verdicts in verdicts_per_task)

display(f"Total number of generated solutions: {total_solutions}")
display(f"Number of solutions with OK syntax: {syntax_ok_solutions}")
display(f"Number of solutions with Syntax Errors: {total_solutions - syntax_ok_solutions}")