In [13]:
!pip install openai datasets nltk pandas tqdm transformers accelerate torch

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [14]:
import os
import random
import time
import openai
from datasets import load_dataset, concatenate_datasets
from tqdm import tqdm
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import pandas as pd
import torch
import time
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import nltk
import numpy as np

from nltk.tokenize import word_tokenize
from nltk.tokenize import TreebankWordTokenizer

In [15]:
# Download NLTK data
nltk.download('punkt', quiet=True)

try:
    nltk.data.find('tokenizers/punkt')
    print("'punkt' tokenizer is available.")
except LookupError:
    print("'punkt' tokenizer is not available.")

def get_device():
    if torch.cuda.is_available():
        device = torch.device('cuda')
    elif torch.backends.mps.is_available():
        device = torch.device('mps')
    else:
        device = torch.device('cpu')
    print(f"Device: {device}")
    return device



'punkt' tokenizer is available.


In [16]:
# Step 1: Load the datasets
def load_and_sample_datasets(fraction=0.1, dataset_name='codeparrot/xlcost-text-to-code'):
    # Load datasets
    datasets = []
    if dataset_name == 'codeparrot/xlcost-text-to-code':
        try:
            code_x_glue_dataset = load_dataset("codeparrot/xlcost-text-to-code", split='train')
            #code_x_glue_dataset = load_dataset('code_x_glue_ct_code_to_text', 'python', split='train')
            datasets.append(code_x_glue_dataset)
        except Exception as e:
            print(f"Failed to load xlcost-text-to-code dataset: {e}")
    elif dataset_name == 'codeparrot/apps':
        try:
            apps_dataset = load_dataset('codeparrot/apps', split='all', trust_remote_code=True)
            datasets.append(apps_dataset)
        except Exception as e:
            print(f"Failed to load codeparrot/apps dataset: {e}")
    elif dataset_name == 'codeparrot/codeparrot-clean': 
        try:
            codeparrot_clean_dataset = load_dataset('codeparrot/codeparrot-clean', split='train')
            datasets.append(codeparrot_clean_dataset)
        except Exception as e:
            print(f"Failed to load codeparrot/codeparrot-clean dataset: {e}")

    if not datasets:
        raise ValueError("No datasets were loaded successfully.")

    # Combine datasets
    combined_dataset = concatenate_datasets(datasets)

    # Sample 10% of the data
    num_samples = int(len(combined_dataset) * fraction)
    sampled_dataset = combined_dataset.shuffle(seed=42).select(range(num_samples))
    return sampled_dataset



In [17]:

# Load the tokenizer and model
def load_model_and_tokenizer(model_name='Salesforce/codegen-350M-mono', sampled_dataset=None):
    total_examples = 0
    exact_matches = 0
    bleu_scores = []
    response_times = []
    results = []
    no_text_or_code = 0
    smoothie = SmoothingFunction().method4
    # Load the model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    device = get_device()
    model.to(device)

    # Ensure pad_token_id is set
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    print(f"Loaded model: {model_name}")
    print(f"Vocabulary size: {tokenizer.vocab_size}")
    print("dataset name : " + sampled_dataset['name'])
    print(f"Number of examples: {len(sampled_dataset['data'])}")


    response_times = []  # Initialize response_times if not already
    nltk_treebank_tokenizer = TreebankWordTokenizer()
    # Step 3: Iterate over the sampled data
    for example in tqdm(sampled_dataset['data'], desc="Processing examples"):
        # Extract the natural language description and reference code
        text_input = example.get('nl') or example.get('question') or example.get('text')
        reference_code = example.get('code') or example.get('solutions') or example.get('answer')

        if not text_input or not reference_code:
            no_text_or_code += 1
            continue  # Skip if required fields are missing

        # Prepare the prompt
        prompt = f"Write code for the following description:\n{text_input}"

        # Tokenize the prompt
        inputs = tokenizer(prompt, return_tensors='pt').to(device)

        # Generate code using the model
        try:
            start_time = time.time()
            output_sequences = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs.get('attention_mask', None),
                max_length=inputs['input_ids'].shape[1] + 256,  # Adjust max_length as needed
                temperature=0.7,
                top_p=0.95,
                do_sample=True,
                num_return_sequences=1,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )
            end_time = time.time()
            response_time = end_time - start_time
            response_times.append(response_time)

            # Decode the generated code
            generated_code = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
            # Remove the prompt from the generated code
            generated_code = generated_code[len(prompt):].strip()
        except Exception as e:
            print(f"Inference error: {e}")
            continue

        # Update counters
        total_examples += 1

        # Check for exact match
        is_exact_match = generated_code.strip() == reference_code.strip()
        if is_exact_match:
            exact_matches += 1

        # Tokenize the reference and generated code
        reference_tokens = nltk_treebank_tokenizer.tokenize(reference_code)
        candidate_tokens = nltk_treebank_tokenizer.tokenize(generated_code)

        # Calculate BLEU score
        bleu_score = sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=smoothie)
        bleu_scores.append(bleu_score)

        # Store the results
        results.append({
            'text_input': text_input,
            'reference_code': reference_code,
            'generated_code': generated_code,
            'is_exact_match': is_exact_match,  # Include exact match result
            'bleu_score': bleu_score,
            'response_time': response_time
        })

    print(f"Processed {total_examples} examples.")
    print(f"no_text_or_code: {no_text_or_code}")

    # Assuming bleu_scores is a list of BLEU scores
    if bleu_scores:
        # Calculate the average BLEU score
        average_bleu = sum(bleu_scores) / len(bleu_scores)
        print(f"\nAverage BLEU score: {average_bleu:.4f}")
        
        # Calculate percentiles
        percentiles = [25, 50, 75, 90, 95, 99]
        percentile_values = np.percentile(bleu_scores, percentiles)
        
        print("\nBLEU Score Percentiles:")
        for p, value in zip(percentiles, percentile_values):
            print(f"{p}th percentile: {value:.4f}")
    else:
        print("\nNo BLEU scores were calculated.")


    if response_times:
        average_time = sum(response_times) / len(response_times)
        total_time = sum(response_times)
        max_time = max(response_times)
        min_time = min(response_times)
        throughput = total_examples / total_time if total_time > 0 else 0
        print(f"Average API response time: {average_time:.2f} seconds")
        print(f"Total API response time: {total_time:.2f} seconds")
        print(f"Max API response time: {max_time:.2f} seconds")
        print(f"Min API response time: {min_time:.2f} seconds")
        print(f"Throughput: {throughput:.2f} requests per second")
    else:
        print("\nNo response times were recorded.")

    if total_examples > 0:
        exact_match_rate = exact_matches / total_examples * 100
        print(f"Exact match rate: {exact_match_rate:.2f}% ({exact_matches}/{total_examples})")
    else:
        print("\nNo examples were processed.")

In [18]:
model_names = ['Salesforce/codegen-350M-mono','codeparrot/codeparrot-small']
fraction=0.001
#dataset_name=['codeparrot/apps'
dataset_name = 'codeparrot/xlcost-text-to-code'


sampled_dataset = load_and_sample_datasets(fraction=fraction, dataset_name=dataset_name)
sampled_dataset = {
    'data': sampled_dataset,
    'name': 'codeparrot/apps'
}

print("length of sampled dataset", len(sampled_dataset['data']))
for model_name in model_names:
    load_model_and_tokenizer(model_name, sampled_dataset=sampled_dataset)



Generating train split: 100%|██████████| 93847/93847 [00:00<00:00, 1665994.47 examples/s]
Generating test split: 100%|██████████| 8118/8118 [00:00<00:00, 2333426.53 examples/s]
Generating validation split: 100%|██████████| 4432/4432 [00:00<00:00, 2322773.38 examples/s]


ValueError: Dataset at position 0 has at least one split: ['train', 'test', 'validation']
Please pick one to interleave with the other datasets, for example: dataset['train']