### Create Subsets for Inference

In [1]:
import json
import random

# Seed handed to the sampler so repeated runs draw the same subset
RANDOM_SEED: int = 42
# Share of the input lines kept in the sampled subset (0.1 keeps 10%)
FRACTION: float = 0.1

def create_random_subset_jsonl(input_filepath, output_filepath, seed, fraction=None):
    """Write a reproducible random sample of a JSONL file's records to a new file.

    Args:
        input_filepath: Path of the JSONL file to sample from.
        output_filepath: Path the sampled records are written to; an existing
            file at this path is overwritten.
        seed: Value passed to ``random.seed`` so the same subset is drawn on
            every run.
        fraction: Share of the records to keep, between 0 and 1. When omitted,
            the module-level ``FRACTION`` constant is used.

    Raises:
        ValueError: If the fraction is outside the range [0, 1].
    """
    if fraction is None:
        fraction = FRACTION
    if not 0 <= fraction <= 1:
        raise ValueError(f"fraction must be between 0 and 1, got {fraction!r}")

    random.seed(seed)  # Seed the shared generator for reproducible sampling

    # Step 1: Read the records. Blank lines are skipped: they are not JSON
    # records, would inflate the subset size, and would break json.loads later.
    with open(input_filepath, 'r') as infile:
        lines = [stripped for stripped in (raw.strip() for raw in infile) if stripped]

    # Step 2: Randomly select the subset (size is rounded down)
    subset_size = int(fraction * len(lines))
    random_subset = random.sample(lines, subset_size)

    # Step 3: Write one record per line
    with open(output_filepath, 'w') as outfile:
        outfile.writelines(line + '\n' for line in random_subset)

# Dataset to sample from and the location of the sampled subset
# (edit these to point at your own files)
base_jsonl_name = "rg-one-gram-ws-20-ss-2-one-line"
input_jsonl_fp = f"datasets/{base_jsonl_name}.jsonl"
output_jsonl_fp = f"subsets/{base_jsonl_name}_{FRACTION}.jsonl"

# Draw the seeded random subset and save it as a new JSONL file
create_random_subset_jsonl(
    input_filepath=input_jsonl_fp,
    output_filepath=output_jsonl_fp,
    seed=RANDOM_SEED,
)

### Edit Prompts to be the Same as Shown in the Paper

In [2]:
# Let's make sure to tell the model what it actually has to do as well (as given in paper)
def rreplace(s, old, new, occurrence):
    """Replace up to ``occurrence`` right-most occurrences of ``old`` in ``s`` with ``new``.

    The string is split from the right at most ``occurrence`` times on ``old``
    and the pieces are glued back together with ``new``.
    """
    return new.join(s.rsplit(old, occurrence))

In [3]:
# Divider comment found in each prompt; the instruction is inserted on a new
# line right after the divider's last occurrence.
PROMPT_DIVIDER = "# --------------------------------------------------"
INSTRUCTION = '"""Based on the above, complete the following code:"""'

with open("subsets/rg-one-gram-ws-20-ss-2-one-line_0.1.jsonl", 'r') as original_file, open("subsets/rg-one-gram-ws-20-ss-2-one-line_0.1_instruct.jsonl", 'w') as output_file:
    for line in original_file:
        # A blank line is not a JSON record; skip it instead of crashing in json.loads
        if not line.strip():
            continue

        entry = json.loads(line.strip())
        metadata = entry["metadata"]

        # Only the last divider is rewritten (occurrence=1); a prompt without
        # the divider is passed through unchanged.
        output_entry = {
            "prompt": rreplace(entry['prompt'], PROMPT_DIVIDER, PROMPT_DIVIDER + "\n" + INSTRUCTION, 1),
            "metadata": metadata
        }

        output_file.write(json.dumps(output_entry) + "\n")

### Run Inference with API

In [8]:
import os
import openai
import json

# Build the client with the API key taken from the environment. The key is
# passed at construction time because assigning `openai.api_key` afterwards
# does not reconfigure a client that already exists.
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# The prompts come from the instruct subset written by the prompt-editing cell,
# which carries no model tag; the model tag only labels the generated outputs.
dataset_name = "rg-one-gram-ws-20-ss-2-one-line_0.1_instruct"
model_tag = "gpt-3.5-0301"
base_jsonl_name = dataset_name + "_" + model_tag

use_system_message = True

# Runs without the system message get a distinguishing file suffix
if use_system_message:
    ending = ".jsonl"
else:
    ending = "_no_system.jsonl"

# Raw responses, input prompts and processed generations
original_responses_path = 'raw_generations/' + base_jsonl_name + '_raw_generations' + ending
input_jsonl_file_path = 'subsets/' + dataset_name + '.jsonl'
output_jsonl_file_path = "processed_generations/" + base_jsonl_name + "_generations" + ending

# Predefined system message
system_message = '''Respond with only the next line completion'''

### Non-Multithreaded Request Code

In [6]:
# Function to process a single JSONL entry
def process_entry(entry, model_name="gpt-3.5-turbo-0613"):
    """Request a single chat completion for one JSONL entry and return its text.

    Args:
        entry: Mapping with a 'prompt' key holding the user message.
        model_name: Model identifier forwarded to the chat completions call.
            NOTE(review): this default differs from the `gpt-3.5-0301` tag
            used in the output file names - confirm which model is intended.

    Returns:
        The message content of the first returned choice.

    Relies on the notebook globals `client`, `use_system_message` and
    `system_message`.
    """
    # The user prompt is always sent; the system message is prepended on demand
    chat_messages = [{"role": "user", "content": entry['prompt']}]
    if use_system_message:
        chat_messages.insert(0, {"role": "system", "content": system_message})

    # One completion, at most 250 tokens, temperature 0 and a fixed seed
    request_options = dict(max_tokens=250, temperature=0, seed=1, n=1)
    api_response = client.chat.completions.create(
        model=model_name,
        messages=chat_messages,
        **request_options,
    )

    first_choice = api_response.choices[0]
    return first_choice.message.content

# Generate a completion for every prompt, one request at a time, and store the
# raw responses as JSONL (the output file is recreated on each run)
with open(input_jsonl_file_path, 'r') as input_file, open(original_responses_path, 'w') as original_file:
    for line in input_file:
        entry = json.loads(line.strip())
        metadata = entry["metadata"]
        output_entry = process_entry(entry)

        # One JSON object per line: the prompt, the model's answer and the metadata
        raw_record = {"prompt": entry['prompt'], "completion": output_entry, "metadata" : metadata}
        print(json.dumps(raw_record), file=original_file)

KeyboardInterrupt: 

### Multithreaded Version

In [9]:
import json
import concurrent.futures
import time

# Function to process a single JSONL entry
def process_entry(entry, model_name="gpt-3.5-turbo-0301", max_retries=None, retry_delay=15):
    """Request a chat completion for one JSONL entry, retrying on rate limits.

    This definition replaces the earlier `process_entry` in the notebook
    namespace once its cell has run.

    Args:
        entry: Mapping with a 'prompt' key holding the user message.
        model_name: Model identifier forwarded to the chat completions call.
        max_retries: Maximum number of retries after a rate-limit error.
            ``None`` (the default) retries indefinitely.
        retry_delay: Seconds to wait before each retry.

    Returns:
        The message content of the first returned choice.

    Raises:
        Exception: Any error whose message does not contain "Rate limit" is
            re-raised immediately; a rate-limit error is re-raised once
            ``max_retries`` retries have been used up.

    Relies on the notebook globals `client`, `use_system_message` and
    `system_message`.
    """
    prompt = entry['prompt']
    if use_system_message:
        messages = [{"role": "system", "content": system_message}, {"role": "user", "content": prompt}]
    else:
        messages = [{"role": "user", "content": prompt}]

    retries_used = 0
    while True:
        try:
            # Generate the completion with the OpenAI API
            response = client.chat.completions.create(model=model_name,
                                                      messages=messages,
                                                      max_tokens=250,
                                                      temperature=0,
                                                      seed=1,
                                                      n=1)
            # Extract the text of the completion generated by the model
            return response.choices[0].message.content
        except Exception as exc:
            # Only rate-limit failures are worth retrying; anything else is a real error
            if "Rate limit" not in str(exc):
                raise
            if max_retries is not None and retries_used >= max_retries:
                raise
            retries_used += 1
            print(f"Rate limit reached, retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)

# Function to write the result to a file
def write_result(entry, output_entry, original_responses_path):
    """Append one prompt/completion/metadata record to a JSONL file.

    Args:
        entry: Mapping providing the 'prompt' and 'metadata' values.
        output_entry: Completion stored under the 'completion' key.
        original_responses_path: File that receives the record; it is opened
            in append mode, so existing content is kept.
    """
    with open(original_responses_path, 'a') as sink:
        record = {"prompt": entry['prompt'], "completion": output_entry, "metadata": entry['metadata']}
        print(json.dumps(record), file=sink)

# Main code to read the input JSONL file and generate completions.
# Blank lines are skipped so they cannot crash json.loads.
with open(input_jsonl_file_path, 'r') as input_file:
    entries = [json.loads(line.strip()) for line in input_file if line.strip()]

# write_result() appends, so start from an empty file; otherwise rows left by
# an interrupted or earlier run would be duplicated in the results.
with open(original_responses_path, 'w'):
    pass

failed_entries = []

with concurrent.futures.ThreadPoolExecutor() as executor:
    # Submit all the tasks to the executor and start them concurrently
    future_to_entry = {executor.submit(process_entry, entry): entry for entry in entries}

    # As each future completes, write the result (rows land in completion
    # order, not input order). Writing happens only in this thread.
    for future in concurrent.futures.as_completed(future_to_entry):
        entry = future_to_entry[future]
        try:
            output_entry = future.result()
            write_result(entry, output_entry, original_responses_path)
        except Exception as exc:
            print(f'Generated an exception: {exc}')
            failed_entries.append(entry)

# Make dropped entries visible instead of silently producing a short file
if failed_entries:
    print(f"{len(failed_entries)} of {len(entries)} entries failed and were not written")

Rate limit reached, retrying in 30 seconds...
Rate limit reached, retrying in 30 seconds...
Rate limit reached, retrying in 30 seconds...
Rate limit reached, retrying in 30 seconds...
Rate limit reached, retrying in 30 seconds...
Rate limit reached, retrying in 30 seconds...
Rate limit reached, retrying in 30 seconds...
Rate limit reached, retrying in 30 seconds...
Rate limit reached, retrying in 30 seconds...
Rate limit reached, retrying in 30 seconds...
Rate limit reached, retrying in 30 seconds...
Rate limit reached, retrying in 30 seconds...
Rate limit reached, retrying in 30 seconds...
Rate limit reached, retrying in 30 seconds...
Rate limit reached, retrying in 30 seconds...
Rate limit reached, retrying in 30 seconds...
Rate limit reached, retrying in 30 seconds...
Rate limit reached, retrying in 30 seconds...
Rate limit reached, retrying in 30 seconds...
Rate limit reached, retrying in 30 seconds...


### Evaluate Subsets

Use `compute_score.py` to evaluate the generated completions.

### Check if Generated Code is Syntactically Correct

In [None]:
# This part is not done and still a work in progress
import ast
from pylint import epylint as lint
from io import StringIO

def check_generated_code(file_code, gen_code):
    """Check that generated code appended to existing code parses and passes pylint's error checks.

    Args:
        file_code: Source text of the existing file.
        gen_code: Generated source text, appended after ``file_code`` on a new line.

    Returns:
        A human-readable message: a syntax/indentation error description, the
        pylint output when pylint reports anything, or a success message.
    """
    import os
    import tempfile

    combined_code = file_code + "\n" + gen_code

    # Syntax & Indentation Check
    try:
        # Attempt to parse the combined code into an AST
        ast.parse(combined_code)
    except SyntaxError as e:  # IndentationError is a subclass of SyntaxError
        return f"Syntax or indentation error: {e}"

    # Static Analysis
    # Use a uniquely named scratch file in the working directory so no existing
    # file is overwritten; a relative name keeps the pylint command free of
    # path-quoting problems.
    fd, temp_file_path = tempfile.mkstemp(suffix=".py", prefix="generated_code_", dir=os.getcwd())
    try:
        with os.fdopen(fd, 'w') as temp_file:
            temp_file.write(combined_code)

        # Only errors are of interest (convention/refactor/warning messages are
        # ignored), so pylint is run in errors-only mode.
        command_options = f"{os.path.basename(temp_file_path)} --errors-only"
        (pylint_stdout, _pylint_stderr) = lint.py_run(command_options, return_std=True)
        stdout = pylint_stdout.getvalue()
    finally:
        # Always remove the scratch file, even when pylint fails to run
        os.remove(temp_file_path)

    if stdout.strip():
        return f"Pylint found issues with the code:\n{stdout}"

    return "Generated code passed all checks."

# Demo: code already present in a file, followed by a generated snippet
file_code = """
def existing_function():
    pass
"""

gen_code = """
for i in range(10):
    print(i)
"""

# Run the checks on the combined code and show the resulting message
result = check_generated_code(file_code=file_code, gen_code=gen_code)
print(result)