In [None]:
%pip install torch transformers datasets evaluate accelerate bitsandbytes

Install the latest `transformers` package directly from its GitHub source (this replaces the released version installed above).

In [None]:
%pip install git+https://github.com/huggingface/transformers.git

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import torch
from tqdm.auto import tqdm # Import tqdm
import pandas as pd
import evaluate
import os

In [None]:
import gc

# Use the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Collect unreachable Python objects first so that any tensors they still hold
# are returned to PyTorch's caching allocator, and only then release the cached
# blocks. Doing it in the opposite order leaves the freshly freed memory cached.
gc.collect()
if device.type == "cuda":
    torch.cuda.empty_cache()

## Download Model

Download a pre-trained CUDA-LLM model.
Use the `transformers` library to download a pre-trained language model.

In [None]:
# Hub identifier of the checkpoint used for both the tokenizer and the model.
checkpoint = "ByteDance-Seed/cudaLLM-8B"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Load the weights in half precision, then move the model to the selected device.
model = AutoModelForCausalLM.from_pretrained(checkpoint, dtype=torch.float16)
model = model.to(device)

Download a CUDA-LLM model and the NVIDIA compute-eval dataset, then evaluate the model against the dataset and report the results.
Note: this is a gated repository. You must request access on Hugging Face and be authenticated before the download will succeed.

In [None]:
from datasets import load_dataset

# Hub identifier of the benchmark dataset.
dataset_name = "nvidia/compute-eval"

# Best-effort download: a failure is reported but does not stop the notebook,
# because the dataset can also be loaded from the GitHub clone further below.
try:
    dataset = load_dataset(dataset_name)
    print(f"Successfully downloaded dataset: {dataset_name}")
    print(dataset)
except Exception as download_error:
    print(
        f"Error downloading dataset: {download_error}",
        "Please ensure you have the correct dataset name and are connected to the internet.",
        sep="\n",
    )

## Alternatively, download the dataset from GitHub
Clone the NVIDIA compute-eval repository from GitHub.


In [None]:
# Clone the benchmark repository into ./compute-eval; the cells below read
# their files from that directory.
!git clone https://github.com/nvidia/compute-eval.git

## Generate Solutions and Save

Generate solutions for the compute-eval dataset using the loaded model and save them to a JSONL file.

Iterate through the dataset, run inference with the loaded model for each input, and save the generated solutions in a JSONL file.

In [None]:
import os

# Show the top-level entries of the cloned repository.
repo_dir = "compute-eval"  # Directory created by the clone step
print(f"Contents of {repo_dir}:")
repo_entries = os.listdir(repo_dir)
print(repo_entries)

Contents of compute-eval:
['example_config_gen_samples.yaml', '.git', 'data', 'README.md', 'compute_eval', '.DS_Store', 'pyproject.toml', '.gitignore', 'DATASET_CARD.md', 'CONTRIBUTING.md', 'LICENSE', 'example_config_evalcorrectness.yaml', 'poetry.lock']


In [None]:
from datasets import load_dataset
import os

# Location of the problem set inside the cloned repository.
data_dir = os.path.join("compute-eval", "data")
dataset_file_path = os.path.join(data_dir, 'cuda_problems_073025.jsonl')

# Load the JSONL file with the generic 'json' builder. Failures are reported
# rather than raised, so the notebook keeps running.
try:
    dataset = load_dataset('json', data_files=dataset_file_path)
    print("\nSuccessfully loaded dataset from file:")
    print(dataset)
except FileNotFoundError:
    print(f"\nDataset file not found at {dataset_file_path}. Please check the path.")
except Exception as load_error:
    print(f"\nError loading dataset from file: {load_error}")

In [None]:
import os

# Show which data files ship with the cloned repository.
data_dir = os.path.join("compute-eval", "data")
print(f"Contents of {data_dir}:")
data_entries = os.listdir(data_dir)
print(data_entries)

In [None]:
import json
import os

output_file = "compute_eval_solutions.jsonl"
output_dir = "cuda"  # Directory that receives one .cu file per generated solution
solutions = []
count = 0  # Number of solutions generated so far; also numbers the .cu files

# Generation settings. MAX_NEW_TOKENS limits only the generated continuation,
# so a long prompt no longer eats into (or exhausts) the generation budget the
# way a total-length limit does.
MAX_NEW_TOKENS = 1024
NUM_BEAMS = 5

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Ensure the model is in evaluation mode
model.eval()

# The JSONL file is opened before the loop and flushed after every record so
# that an interruption part-way through keeps the solutions produced so far.
with torch.no_grad(), open(output_file, 'w', encoding='utf-8') as jsonl_file:
    for item in tqdm(dataset['train'], desc="Generating Solutions"):  # Assuming 'train' split, adjust if needed
        prompt = item['prompt']  # Assuming 'prompt' is the key for input text
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        prompt_length = inputs["input_ids"].shape[1]

        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            num_beams=NUM_BEAMS,
            early_stopping=True,
        )

        # A causal LM returns the prompt tokens followed by the continuation.
        # Keep only the continuation so the prompt text is not written into
        # the .cu file.
        completion_ids = outputs[0][prompt_length:]
        solution = tokenizer.decode(completion_ids, skip_special_tokens=True)
        # NOTE: the decoded text is used as-is; if the model wraps code in
        # markdown fences, extra parsing would be needed here.

        # task_id (None when the dataset row has none) lets the evaluation
        # step match solutions to problems without comparing prompt text.
        entry = {"task_id": item.get("task_id"), "prompt": prompt, "solution": solution}
        solutions.append(entry)
        json.dump(entry, jsonl_file)
        jsonl_file.write('\n')
        jsonl_file.flush()

        # Save the solution to a .cu file
        solution_filename = os.path.join(output_dir, f"solution_{count}.cu")
        with open(solution_filename, 'w', encoding='utf-8') as f:
            f.write(solution)

        count += 1

print(f"\nGenerated solutions saved to {output_file} and individual .cu files in the '{output_dir}' directory.")

## Print Generated Solutions from File

Open and read the JSON Lines file containing the generated solutions and print each entry.

In [None]:
import json

# JSON Lines file produced by the generation step.
output_file = "compute_eval_solutions.jsonl"

# Stream the file record by record; each line is parsed and printed as soon
# as it is read, so records before a malformed line are still shown.
try:
    with open(output_file, 'r') as solutions_handle:
        print(f"Contents of {output_file}:")
        for raw_line in solutions_handle:
            print(json.loads(raw_line))
except FileNotFoundError:
    print(f"Error: The file {output_file} was not found.")
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from {output_file}. Ensure it's a valid JSON Lines file.")
except Exception as read_error:
    print(f"An error occurred: {read_error}")

## Evaluation

In [None]:
import os

repo_dir = "compute-eval"
print(f"Contents of {repo_dir}:")
print(os.listdir(repo_dir))

# Names probed inside the repository: documentation files and directories
# that might hold evaluation code.
docs_files = ["README.md", "DATASET_CARD.md", "CONTRIBUTING.md"]
eval_dirs = ["scripts", "eval"]

print("\nChecking for documentation files:")
for doc_file in docs_files:
    doc_present = os.path.exists(os.path.join(repo_dir, doc_file))
    print(f"- Found: {doc_file}" if doc_present else f"- Not found: {doc_file}")

print("\nChecking for evaluation directories:")
for eval_dir in eval_dirs:
    if not os.path.exists(os.path.join(repo_dir, eval_dir)):
        print(f"- Not found: {eval_dir}")
        continue

    print(f"- Found: {eval_dir}")
    # The path exists; list it, allowing for it being a regular file.
    print(f"  Contents of {os.path.join(repo_dir, eval_dir)}:")
    try:
        print(os.listdir(os.path.join(repo_dir, eval_dir)))
    except NotADirectoryError:
        print(f"  {os.path.join(repo_dir, eval_dir)} is not a directory.")

In [None]:
import os

repo_dir = "compute-eval"
docs_files = ["README.md", "DATASET_CARD.md", "CONTRIBUTING.md"]

# Print every documentation file that is present; missing files are skipped
# silently and read errors are reported per file.
print("Reading documentation files:")
for doc_file in docs_files:
    file_path = os.path.join(repo_dir, doc_file)
    if not os.path.exists(file_path):
        continue

    print(f"\n--- Contents of {doc_file} ---")
    try:
        with open(file_path, 'r') as doc_handle:
            print(doc_handle.read())
    except Exception as read_error:
        print(f"Error reading {doc_file}: {read_error}")

In [None]:
import os

repo_dir = "compute-eval"
eval_config_file = "example_config_evalcorrectness.yaml"
file_path = os.path.join(repo_dir, eval_config_file)

# Print the example evaluation config; a missing or unreadable file is
# reported instead of raising.
print(f"\n--- Contents of {eval_config_file} ---")
try:
    with open(file_path, 'r') as config_handle:
        config_text = config_handle.read()
        print(config_text)
except FileNotFoundError:
    print(f"Error: The file {eval_config_file} was not found.")
except Exception as read_error:
    print(f"Error reading {eval_config_file}: {read_error}")

In [None]:
import os

repo_dir = "compute-eval"
compute_eval_dir = os.path.join(repo_dir, "compute_eval")

# List the package directory, then walk it looking for Python files whose
# name suggests evaluation logic.
print(f"\nContents of {compute_eval_dir}:")
try:
    print(os.listdir(compute_eval_dir))
    print("\nChecking for potential evaluation files in compute_eval directory:")
    name_hints = ("eval", "correctness", "metric")
    for root, _, files in os.walk(compute_eval_dir):
        for filename in files:
            if not filename.endswith(".py"):
                continue
            if any(hint in filename for hint in name_hints):
                print(f"- Found: {os.path.join(root, filename)}")
except FileNotFoundError:
    print(f"Error: The directory {compute_eval_dir} was not found.")
except Exception as scan_error:
    print(f"An error occurred: {scan_error}")

## Prepare environment for compilation/execution

Ensure the environment has the necessary CUDA toolkit and compilers to build and run the generated code.


Check that the `nvcc` and `g++` compilers are available by invoking them from the shell, to confirm the environment is set up for compiling CUDA code.



In [None]:
import os

# Probe each compiler by running its --version command through the shell;
# an exit status of 0 is taken to mean the tool is available.
print("Checking for nvcc compiler:")
nvcc_check = os.system("nvcc --version")
print(
    "nvcc found."
    if nvcc_check == 0
    else "nvcc not found. CUDA Toolkit may need to be installed or configured."
)

print("\nChecking for g++ compiler:")
gpp_check = os.system("g++ --version")
print(
    "g++ found."
    if gpp_check == 0
    else "g++ not found. A C++ compiler may need to be installed."
)

## Extract test cases



Extract the test cases from the 'test' column of the 'train' split of the loaded dataset and store them in a list.



In [None]:
# Collect the 'test' field of every row in the 'train' split, in dataset order.
test_cases = []
for item in dataset['train']:
    test_cases.append(item['test'])

print(f"Extracted {len(test_cases)} test cases.")
# To spot-check the extraction, print a few entries, e.g. test_cases[:5].

## Implement evaluation logic

Implement evaluation logic to iterate through generated solutions and test cases, compile and execute the combined code, and capture the results.


In [None]:
import json
import os
import shlex
import subprocess
import tempfile

output_file = "compute_eval_solutions.jsonl"
eval_results = []
num_solutions_to_eval = 3 # Evaluate the first 3 generated solutions
COMPILE_TIMEOUT_S = 30  # Seconds allowed for one nvcc invocation
EXEC_TIMEOUT_S = 30     # Seconds allowed for one compiled test binary


def _text_field(entry, key):
    """Return entry[key] as text, mapping a missing or null value to ''.

    `entry.get(key, '')` is not enough here: a key that is present with a
    null value yields None, which would be written into the source as the
    text "None" or would break flag splitting.
    """
    value = entry.get(key)
    return "" if value is None else str(value)


def _compile_and_run(source_path, binary_path, cc_flags, ld_flags):
    """Compile `source_path` with nvcc and, if that succeeds, run the binary.

    Returns a tuple (compile_success, compile_output, exec_success,
    exec_output). Errors and timeouts are reported through the output
    strings rather than raised.
    """
    compile_success = False
    compile_output = ""
    exec_success = False
    exec_output = ""

    try:
        # shlex keeps quoted arguments together; a malformed flag string is
        # reported as a compilation error by the generic handler below.
        compile_command = [
            "nvcc", source_path, "-o", binary_path,
            *shlex.split(cc_flags), *shlex.split(ld_flags),
        ]
        compile_result = subprocess.run(
            compile_command, capture_output=True, text=True, timeout=COMPILE_TIMEOUT_S
        )
        compile_output = compile_result.stdout + compile_result.stderr
        if compile_result.returncode == 0:
            compile_success = True
            # Execute the compiled binary if compilation was successful
            try:
                exec_result = subprocess.run(
                    [binary_path], capture_output=True, text=True, timeout=EXEC_TIMEOUT_S
                )
                exec_output = exec_result.stdout + exec_result.stderr
                if exec_result.returncode == 0:
                    exec_success = True
            except FileNotFoundError:
                exec_output = f"Error: Executable {binary_path} not found."
            except subprocess.TimeoutExpired:
                exec_output = "Execution timed out."
            except Exception as e:
                exec_output = f"Error during execution: {e}"
        else:
            compile_output = f"Compilation failed with return code {compile_result.returncode}:\n" + compile_output
    except FileNotFoundError:
        compile_output = "Error: nvcc command not found. Is CUDA Toolkit installed and in PATH?"
    except subprocess.TimeoutExpired:
        compile_output = "Compilation timed out."
    except Exception as e:
        compile_output = f"Error during compilation: {e}"

    return compile_success, compile_output, exec_success, exec_output


# Load generated solutions (blank lines are ignored)
try:
    with open(output_file, 'r', encoding='utf-8') as f:
        generated_solutions = [json.loads(line) for line in f if line.strip()]
except FileNotFoundError:
    print(f"Error: The file {output_file} was not found.")
    generated_solutions = []
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from {output_file}. Ensure it's a valid JSON Lines file.")
    generated_solutions = []
except Exception as e:
    print(f"An error occurred while loading generated solutions: {e}")
    generated_solutions = []

num_to_eval = min(num_solutions_to_eval, len(generated_solutions))

# Index the dataset once instead of scanning it for every solution.
# setdefault keeps the first row for a given key, matching a first-match scan.
entries_by_prompt = {}
entries_by_task_id = {}
if num_to_eval > 0:
    for item in dataset['train']:
        entries_by_prompt.setdefault(item['prompt'], item)
        item_task_id = item.get('task_id')
        if isinstance(item_task_id, (str, int)):
            entries_by_task_id.setdefault(item_task_id, item)

for i in tqdm(range(num_to_eval), desc="Evaluating Solutions"):
    solution_entry = generated_solutions[i]

    # Prefer matching on task_id when the solution file carries one; fall
    # back to matching on the prompt text.
    dataset_entry = None
    solution_task_id = solution_entry.get('task_id')
    if isinstance(solution_task_id, (str, int)):
        dataset_entry = entries_by_task_id.get(solution_task_id)
    if dataset_entry is None:
        dataset_entry = entries_by_prompt.get(solution_entry['prompt'])

    if dataset_entry is None:
        print(f"Warning: Could not find dataset entry for solution {i+1}. Skipping.")
        continue

    task_id = dataset_entry.get('task_id')
    if task_id is None:
        task_id = f"Task {i+1}"
    solution = solution_entry['solution']
    declaration = _text_field(dataset_entry, 'declaration')
    test = _text_field(dataset_entry, 'test')
    cc_flags = _text_field(dataset_entry, 'cc_flags')
    ld_flags = _text_field(dataset_entry, 'ld_flags')

    # Combine code
    combined_code = f"{declaration}\n{solution}\n{test}"

    # Build inside a private temporary directory so nothing is left behind in
    # the working directory, even if an unexpected error escapes.
    with tempfile.TemporaryDirectory(prefix=f"compute_eval_{i}_") as work_dir:
        temp_file_name = os.path.join(work_dir, f"temp_code_{i}.cu")
        binary_path = os.path.join(work_dir, f"temp_output_{i}")
        with open(temp_file_name, 'w', encoding='utf-8') as f:
            f.write(combined_code)

        compile_success, compile_output, exec_success, exec_output = _compile_and_run(
            temp_file_name, binary_path, cc_flags, ld_flags
        )

    # Store results
    eval_results.append({
        "task_id": task_id,
        "compile_success": compile_success,
        "compile_output": compile_output,
        "exec_success": exec_success,
        "exec_output": exec_output
    })


print("\nEvaluation Complete.")
# Optionally, print the evaluation results
print("\nEvaluation Results:")
for result in eval_results:
    print(json.dumps(result, indent=2))

## Calculate metrics

Calculate evaluation metrics based on the compilation and execution results.


In [None]:
# Count how many evaluated solutions compiled and how many also ran successfully.
total_solutions_evaluated = len(eval_results)
compile_success_count = sum(
    1 for outcome in eval_results if outcome.get('compile_success', False)
)
exec_success_count = sum(
    1 for outcome in eval_results if outcome.get('exec_success', False)
)

# Pass rates are percentages of all evaluated solutions; 0 when none were evaluated.
if total_solutions_evaluated > 0:
    compile_pass_rate = (compile_success_count / total_solutions_evaluated) * 100
    exec_pass_rate = (exec_success_count / total_solutions_evaluated) * 100
else:
    compile_pass_rate = 0
    exec_pass_rate = 0

# Print the metrics
print("\nEvaluation Metrics:")
print(f"Total solutions evaluated: {total_solutions_evaluated}")
print(f"Successful compilations: {compile_success_count}")
print(f"Successful executions: {exec_success_count}")
print(f"Compilation Pass Rate: {compile_pass_rate:.2f}%")
print(f"Execution Pass Rate: {exec_pass_rate:.2f}%")

## Report evaluation results

Report the calculated evaluation metrics and provide examples of successful and failed evaluations.


In [None]:
def _first_result(results, predicate):
    """Return the first entry of `results` for which `predicate` is true, or None."""
    return next((entry for entry in results if predicate(entry)), None)


# Print a summary of the evaluation
print("\n--- Evaluation Summary ---")
print(f"Total solutions evaluated: {total_solutions_evaluated}")
print(f"Successful compilations: {compile_success_count}")
print(f"Successful executions: {exec_success_count}")
print(f"Compilation Pass Rate: {compile_pass_rate:.2f}%")
print(f"Execution Pass Rate: {exec_pass_rate:.2f}%")

print("\n--- Evaluation Examples ---")

# Pick one representative result for each outcome.
failed_compile_example = _first_result(
    eval_results, lambda r: not r.get('compile_success', True)
)
# A solution that failed to compile was never executed, so only results that
# compiled count as execution failures. Otherwise the first compile failure
# would be chosen here as well and a real runtime failure would never be shown.
failed_exec_example = _first_result(
    eval_results,
    lambda r: r.get('compile_success', False) and not r.get('exec_success', True),
)
successful_compile_example = _first_result(
    eval_results, lambda r: r.get('compile_success', False)
)
successful_exec_example = _first_result(
    eval_results, lambda r: r.get('exec_success', False)
)

if failed_compile_example:
    print("\nExample of a Failed Compilation:")
    print(json.dumps(failed_compile_example, indent=2))

if failed_exec_example:
    print("\nExample of a Failed Execution:")
    print(json.dumps(failed_exec_example, indent=2))

if successful_compile_example:
    print("\nExample of a Successful Compilation:")
    print(json.dumps(successful_compile_example, indent=2))

# Avoid printing the same example twice if it both compiled and ran successfully.
if successful_exec_example and (successful_exec_example != successful_compile_example or not successful_compile_example):
    print("\nExample of a Successful Execution:")
    print(json.dumps(successful_exec_example, indent=2))

if not failed_compile_example and not failed_exec_example and not successful_compile_example and not successful_exec_example:
    print("No evaluation results available to display examples.")

In [None]:
# Bundle the cuda/ directory (where the generation step writes its .cu files)
# into cuda.zip.
!zip -r cuda.zip cuda/