### Create Subsets for Inference

In [60]:
import json
import random

# Seed handed to `random.seed` so the sampled subset is the same on every run
RANDOM_SEED = 42
# Fraction of the input lines kept in the random subset (0.1 keeps 10%);
# the value is also embedded in the name of the subset file written below
FRACTION = 0.1

def create_random_subset_jsonl(input_filepath, output_filepath, seed, fraction=None):
    """Write a reproducible random sample of a JSONL file's records to a new file.

    Args:
        input_filepath: Path of the JSONL file to sample from.
        output_filepath: Path of the JSONL file to create (overwritten if present).
        seed: Value passed to `random.seed`; the same seed and input always
            select the same records.
        fraction: Share of the records to keep, between 0 and 1. When omitted,
            the notebook-level `FRACTION` setting is used.

    Raises:
        ValueError: If `fraction` is outside the range [0, 1].
    """
    if fraction is None:
        fraction = FRACTION  # fall back to the notebook-wide setting
    if not 0 <= fraction <= 1:
        raise ValueError(f"fraction must be between 0 and 1, got {fraction!r}")

    random.seed(seed)  # Set the seed for reproducible results

    # Step 1: Read the records. Blank lines (e.g. a trailing empty line) are
    # skipped so they are neither counted nor sampled into the output, where
    # they would later break `json.loads`.
    with open(input_filepath, 'r', encoding='utf-8') as infile:
        lines = [stripped for stripped in (raw.strip() for raw in infile) if stripped]

    # Step 2: Randomly select a subset of the records (size is rounded down)
    subset_size = int(fraction * len(lines))
    random_subset = random.sample(lines, subset_size)

    # Step 3: Write the selected records, one per line
    with open(output_filepath, 'w', encoding='utf-8') as outfile:
        outfile.writelines(line + '\n' for line in random_subset)

# Dataset to sample from; change the base name to point at another file
base_jsonl_name = "line_level_completion_2k_context_codegen.test"
input_jsonl_fp = f'datasets/{base_jsonl_name}.jsonl'
# The sampled fraction is appended to the name of the subset file
output_jsonl_fp = f'subsets/{base_jsonl_name}_{FRACTION}.jsonl'

# Sample the dataset and write the subset to disk
create_random_subset_jsonl(
    input_jsonl_fp,
    output_jsonl_fp,
    RANDOM_SEED,
)

In [29]:
## View Prompt Examples
import json

# Function to read a JSON Lines file and return the prompt of a desired line
def get_prompt_at_line(jsonl_file_path, desired_line_no):
    """Return the 'prompt' field of the record on a given line of a JSONL file.

    Args:
        jsonl_file_path: Path of the JSON Lines file to read.
        desired_line_no: 1-based number of the line whose prompt is wanted.

    Returns:
        The value stored under 'prompt' on that line, or None when the file has
        no such line or the record has no 'prompt' key.

    Raises:
        json.JSONDecodeError: If the requested line is not valid JSON.
    """
    # Decode explicitly as UTF-8: prompts hold source code that may contain
    # non-ASCII characters, and the platform default encoding is not reliable.
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line_no, line in enumerate(file, start=1):
            if line_no == desired_line_no:
                # Only the requested line is parsed; reading stops here
                return json.loads(line).get('prompt')
    return None  # Return None if the desired line was not found

# Specify the .jsonl file path and the desired (1-based) line number
jsonl_file_path = 'datasets/api_level_completion_1k_context_codegen.test.jsonl'
desired_line_no = 5  # For example, we want the prompt at line 5

# Get the prompt at the desired line
prompt_at_desired_line = get_prompt_at_line(jsonl_file_path, desired_line_no)

# Compare against None (the "not found" result) rather than relying on
# truthiness, so a record whose prompt is an empty string is still shown
if prompt_at_desired_line is not None:
    print(prompt_at_desired_line)
else:
    print(f"No prompt found at line {desired_line_no}")


        timesteps = np.array([timesteps] * batch_size * num_images_per_prompt)

        # add noise to latents using the timesteps
        noise = generator.randn(*init_latents.shape).astype(latents_dtype)
        init_latents = self.scheduler.add_noise(
            torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps)
        )
        init_latents = init_latents.numpy()

        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
        # eta (?) is only used with the DDIMScheduler, it will be ignored for other schedulers.
        # eta corresponds to? in DDIM paper: https://arxiv.org/abs/2010.02502
        # and should be between [0, 1]
        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
        extra_step_kwargs = {}
        if accepts_eta:
            extra_step_kwargs["eta"] = eta

        latents = init_latents

        t_start = max(num_inference_steps - init_ti

### Run Inference with the OpenAI API

In [55]:
# Function to preprocess the completion string
def preprocess_completion(completion):
    # Extract content within ``
    if '```python' in completion:
        start = completion.find('```python') + 9  # Start index of content inside ``
        end = completion.rfind('```')  # End index of content inside ``
        completion = completion[start:end]
    elif '```' in completion:
        start = completion.find('```') + 3  # Start index of content inside ``
        end = completion.rfind('```')  # End index of content inside ``
        completion = completion[start:end]
    elif '`' in completion:
        start = completion.find('`') + 1  # Start index of content inside ``
        end = completion.rfind('`')  # End index of content inside ``
        completion = completion[start:end]

    # Remove lines starting with '#'
    lines = [line.split('#', 1)[0].rstrip() for line in completion.split('\n')]
    # Save only the first non-empty line
    for line in lines:
        if line.strip():
            return line
    return ""  # Return empty string if no non-empty lines are found

In [51]:
# Sanity check: a response whose code is wrapped in single backticks;
# the displayed result should be the first line of code between them
preprocess_completion('''
## Assistant:

`
safety_checker_input = self.feature_extractor(image)
safety_score = self.safety_checker(safety_checker_input)
`''')

'safety_checker_input = self.feature_extractor(image)'

In [58]:
import os
import openai
import json

# Read the OpenAI API key from the environment first, then hand it to the
# client explicitly. Assigning `openai.api_key` after the client has been
# constructed does not change the key that client uses.
openai.api_key = os.getenv("OPENAI_API_KEY")
client = openai.OpenAI(api_key=openai.api_key)

# Base name shared by the subset file and both generation files
base_jsonl_name = "line_level_completion_2k_context_codegen.test_0.1"

# Files used by the pipeline: subset to read, raw responses, cleaned completions
input_jsonl_file_path = f'subsets/{base_jsonl_name}.jsonl'
original_responses_path = f'raw_generations/{base_jsonl_name}_raw_generations.jsonl'
output_jsonl_file_path = f"processed_generations/{base_jsonl_name}_generations.jsonl"

# System prompt sent with every request; it asks for the answer in a python code fence
system_message = '''Return your proposed next line completion inside of a code block

```python
YOUR_CODE_HERE
```'''

# Function to process a single JSONL entry
def process_entry(entry, model_name="gpt-3.5-turbo-0613"):
    """Ask the chat model to complete the prompt of one dataset entry.

    Uses the notebook-level `client` and `system_message`.

    Args:
        entry: Parsed JSONL record; its 'prompt' value is sent as the user message.
        model_name: Name of the chat model to query.

    Returns:
        The text of the first returned choice, or "" when the response carries
        no text content.

    Raises:
        KeyError: If `entry` has no 'prompt' key.
    """
    prompt = entry['prompt']
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": prompt},
    ]

    # Generate the completion with the OpenAI API
    response = client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=250,  # Limit the number of generated tokens (adjust as needed)
        temperature=1,  # Adjust for creativity of the response
        seed=1,  # Fixed seed requested for repeatable sampling
        n=1,  # Number of completions to generate
    )

    # Extract the text of the completion generated by the model. The content
    # may be None; normalise it to "" so later string processing does not fail.
    generated_completion = response.choices[0].message.content
    return generated_completion or ""

# Make sure the output folders exist, so opening the result files for writing
# does not fail when the notebook is run in a fresh checkout
for output_path in (original_responses_path, output_jsonl_file_path):
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

# Pass 1: read the input JSONL file, generate a completion for every record,
# and save the raw model responses
with open(input_jsonl_file_path, 'r', encoding='utf-8') as input_file, \
        open(original_responses_path, 'w', encoding='utf-8') as original_file:
    for line in input_file:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines instead of failing in json.loads
        entry = json.loads(line)
        metadata = entry["metadata"]
        output_entry = process_entry(entry)

        # Save the original response to a new JSONL
        original_file.write(json.dumps({"prompt": entry['prompt'], "completion": output_entry, "metadata" : metadata}) + "\n")
        # Each record is the result of an API call: flush it right away so
        # responses already received are on disk if a later request fails
        original_file.flush()

# Pass 2: read the raw responses back, reduce each one to a single code line,
# and write the records with the completion under "choices"
with open(original_responses_path, 'r', encoding='utf-8') as original_file, \
        open(output_jsonl_file_path, 'w', encoding='utf-8') as output_file:
    for line in original_file:
        line = line.strip()
        if not line:
            continue
        entry = json.loads(line)
        completion = entry['completion'] or ""  # a null completion is treated as empty text
        metadata = entry["metadata"]
        preprocessed_completion = preprocess_completion(completion)

        output_entry = {
            "prompt": entry['prompt'],
            "choices": [{"text": preprocessed_completion}],
            "metadata": metadata
        }

        output_file.write(json.dumps(output_entry) + "\n")

### Evaluate Subsets

Use `compute_score.py` to evaluate the generated completions.