### Create Subsets for Inference

In [17]:
import json
import random

# Seed handed to create_random_subset_jsonl so the sampled subset is reproducible
RANDOM_SEED = 42
# Fraction of the input lines kept in the random subset (0.1 keeps 10%);
# the same value is embedded in the subset's output filename
FRACTION = 0.1

def create_random_subset_jsonl(input_filepath, output_filepath, seed, fraction=None):
    """Write a reproducible random sample of a JSONL file's lines to a new file.

    Args:
        input_filepath: Path of the JSONL file to sample from.
        output_filepath: Path of the JSONL file the sample is written to.
        seed: Seed for the sampler; the same seed and input give the same subset.
        fraction: Share of the non-blank input lines to keep, between 0 and 1.
            When omitted, the module-level ``FRACTION`` constant is used.

    Raises:
        ValueError: If ``fraction`` lies outside the range 0..1.
    """
    if fraction is None:
        fraction = FRACTION
    if fraction < 0 or fraction > 1:
        raise ValueError(f"fraction must be between 0 and 1, got {fraction}")

    # A private generator gives the same sample as seeding the module-level one,
    # but leaves the process-wide random state untouched.
    rng = random.Random(seed)

    # Step 1: Read the original JSONL file. Blank lines are skipped because they
    # are not JSON records and would break json.loads in later steps.
    with open(input_filepath, 'r') as infile:
        lines = [stripped for stripped in (line.strip() for line in infile) if stripped]

    # Step 2: Randomly select a subset of the lines
    subset_size = int(fraction * len(lines))
    random_subset = rng.sample(lines, subset_size)

    # Step 3: Write the random subset to the output JSONL file, one record per line
    with open(output_filepath, 'w') as outfile:
        for line in random_subset:
            outfile.write(line + '\n')

import os

# Replace with your input and output file paths
base_jsonl_name = "rg-one-gram-ws-20-ss-2-fixed"
input_jsonl_fp = 'datasets/' + base_jsonl_name + '.jsonl'
output_jsonl_fp = 'subsets/' + base_jsonl_name + "_" + str(FRACTION) + '.jsonl'

# Opening a file for writing does not create missing directories,
# so make sure the output folder exists on a fresh checkout
os.makedirs(os.path.dirname(output_jsonl_fp), exist_ok=True)

# Create a random subset JSONL file
create_random_subset_jsonl(input_jsonl_fp, output_jsonl_fp, RANDOM_SEED)

In [24]:
## View Prompt Examples
import json

def get_prompt_at_line(jsonl_file_path, desired_line_no):
    """Return the 'prompt' value stored on one line of a JSON Lines file.

    Lines are counted starting at 1. The result is None when the file has no
    such line, or when the object on that line has no 'prompt' key.
    """
    with open(jsonl_file_path, 'r') as handle:
        current_no = 0
        for raw_line in handle:
            current_no += 1
            if current_no != desired_line_no:
                continue
            # Only the requested line is ever parsed
            record = json.loads(raw_line)
            return record.get('prompt', None)
    return None

# Specify the .jsonl file path and the desired line number (lines are counted from 1)
jsonl_file_path = 'subsets/rg-one-gram-ws-20-ss-2-fixed_0.1_with_instructions.jsonl'
desired_line_no = 3  # For example, 3 selects the prompt stored on the third line

# Get the prompt at the desired line (None when the line or its 'prompt' key is missing)
prompt_at_desired_line = get_prompt_at_line(jsonl_file_path, desired_line_no)

# NOTE: this is a truthiness test, so an empty-string prompt is also reported as not found
if prompt_at_desired_line:
    print(prompt_at_desired_line)
else:
    print(f"No prompt found at line {desired_line_no}")

# Here are some relevant code fragments from other files of the repo:
# --------------------------------------------------
# the below code fragment can be found in:
# torchrl/envs/transforms/transforms.py
# --------------------------------------------------
# 
#     def empty_cache(self):
#         self.__dict__["_parent"] = None
# 
# 
# class TransformedEnv(EnvBase):
#     """A transformed_in environment.
# 
#     Args:
#         env (EnvBase): original environment to be transformed_in.
#         transform (Transform, optional): transform to apply to the tensordict resulting
#             from :obj:`env.step(td)`. If none is provided, an empty Compose
#             placeholder in an eval mode is used.
#         cache_specs (bool, optional): if True, the specs will be cached once
#             and for all after the first call (i.e. the specs will be
#             transformed_in only once). If the transform changes during
#             training, the original spec transform may not be v

### Run Inference with the OpenAI API

In [27]:
def preprocess_completion(completion):
    """Reduce a completion to its first line of code, whitespace-normalized.

    Steps:
      1. If the text contains a Markdown fence (```python, ``` or a single
         backtick), keep only what lies between the first opening fence and the
         last closing fence. If the fence is never closed (for example because
         the response was cut off), keep everything after the opening fence
         instead of discarding the whole completion.
      2. Cut every line at its first '#'. This drops comment-only lines and
         trailing comments; note that a '#' inside a string literal is cut too.
      3. Return the first line that is not blank, with runs of whitespace
         collapsed to single spaces and the padding inside parentheses removed.

    Args:
        completion: Raw completion text. None is treated like an empty string.

    Returns:
        The normalized first code line, or "" when there is none.
    """
    if not completion:
        return ""

    # Most specific fence first, mirroring an if/elif chain
    for opening, closing in (('```python', '```'), ('```', '```'), ('`', '`')):
        start = completion.find(opening)
        if start == -1:
            continue
        start += len(opening)
        end = completion.rfind(closing)
        if start > end:
            # rfind matched the opening fence itself: there is no closing fence
            end = len(completion)
        completion = completion[start:end]
        break

    for line in completion.split('\n'):
        code = line.split('#', 1)[0]
        if code.strip():
            return " ".join(code.split()).replace("( ", "(").replace(" )", ")")
    return ""  # No non-empty code line was found

In [12]:
def preprocess_api_completion(completion):
    """Flatten a possibly multi-line completion into one normalized line.

    Steps:
      1. If the text contains a Markdown fence (```python, ``` or a single
         backtick), keep only what lies between the first opening fence and the
         last closing fence. If the fence is never closed, keep everything after
         the opening fence instead of discarding the whole completion.
      2. Cut every line at its first '#'. This drops comment-only lines and
         trailing comments; note that a '#' inside a string literal is cut too.
      3. Join all remaining lines with a space, so tokens on neighbouring lines
         never fuse, then collapse runs of whitespace and remove the padding
         inside parentheses.

    Args:
        completion: Raw completion text. None is treated like an empty string.

    Returns:
        The flattened, normalized code as a single line ("" when nothing is left).
    """
    if not completion:
        return ""

    # Most specific fence first, mirroring an if/elif chain
    for opening, closing in (('```python', '```'), ('```', '```'), ('`', '`')):
        start = completion.find(opening)
        if start == -1:
            continue
        start += len(opening)
        end = completion.rfind(closing)
        if start > end:
            # rfind matched the opening fence itself: there is no closing fence
            end = len(completion)
        completion = completion[start:end]
        break

    code_lines = [line.split('#', 1)[0].rstrip() for line in completion.split('\n')]
    flattened = " ".join(code_lines)
    return " ".join(flattened.split()).replace("( ", "(").replace(" )", ")")

In [29]:
# Sanity check: a call written across three lines should collapse into one normalized line
preprocess_api_completion("        learned_classifier_free_sampling_embeddings = LearnedClassifierFreeSamplingEmbeddings(\n            learnable=True, hidden_size=self.text_embedder_hidden_size, length=tokenizer.model_max_length\n        )")

'learned_classifier_free_sampling_embeddings = LearnedClassifierFreeSamplingEmbeddings(learnable=True, hidden_size=self.text_embedder_hidden_size, length=tokenizer.model_max_length)'

In [19]:
# Let's make sure to tell the model what it actually has to do as well (as given in paper)
def rreplace(s, old, new, occurrence):
    """Replace the last `occurrence` occurrences of `old` in `s` with `new`.

    The string is split from the right on `old` at most `occurrence` times and
    the resulting pieces are glued back together with `new`.
    """
    return new.join(s.rsplit(old, occurrence))

In [20]:
# Append an explicit instruction right after the LAST separator line of every
# prompt, so the model is told what to do with the context above it.
fragment_separator = "# --------------------------------------------------"
instruction_suffix = '\n"""Based on the above, complete the following code:"""'

with open("subsets/rg-one-gram-ws-20-ss-2-fixed_0.1.jsonl", 'r') as original_file, open("subsets/rg-one-gram-ws-20-ss-2-fixed_0.1_with_instructions.jsonl", 'w') as output_file:
    for line in original_file:
        if not line.strip():
            continue  # a blank line is not a JSON record; skip it rather than crash in json.loads
        entry = json.loads(line)
        metadata = entry["metadata"]

        output_entry = {
            "prompt": rreplace(entry['prompt'], fragment_separator, fragment_separator + instruction_suffix, 1),
            "metadata": metadata
        }

        output_file.write(json.dumps(output_entry) + "\n")

In [37]:
import os
import openai
import json

client = openai.OpenAI()

# Module-level API key, read from the OPENAI_API_KEY environment variable.
# NOTE(review): the client above is created before this assignment - presumably
# OpenAI() picks up the same environment variable on its own; confirm.
openai.api_key = os.getenv("OPENAI_API_KEY")

# Base name shared by the input subset and both generation output files
base_jsonl_name = "rg-one-gram-ws-20-ss-2-fixed_0.1_with_instructions_temp_0"

# When False, requests carry no system message and output files get a "_no_system" suffix
use_system_message = False
ending = ".jsonl" if use_system_message else "_no_system.jsonl"

# Raw responses, input prompts and post-processed generations live in separate folders
original_responses_path = f"raw_generations/{base_jsonl_name}_raw_generations{ending}"
input_jsonl_file_path = f"subsets/{base_jsonl_name}.jsonl"
output_jsonl_file_path = f"processed_generations/{base_jsonl_name}_generations{ending}"

# Predefined system message (only sent when use_system_message is True)
system_message = "Respond with only the next line completion"

def process_entry(entry, model_name="gpt-3.5-turbo-0613"):
    """Send one entry's prompt to the chat completions API and return the reply.

    Args:
        entry: Mapping with a 'prompt' key holding the text sent as the user message.
        model_name: Name of the chat model to query.

    Returns:
        The message content of the first choice in the API response.
    """
    user_turn = {"role": "user", "content": entry['prompt']}
    if use_system_message:
        # Put the fixed system instruction in front of the prompt
        messages = [{"role": "system", "content": system_message}, user_turn]
    else:
        messages = [user_turn]

    request_options = {
        "model": model_name,
        "messages": messages,
        "max_tokens": 250,   # upper bound on generated tokens
        "temperature": 0,
        "seed": 1,
        "n": 1,              # a single completion per prompt
    }
    response = client.chat.completions.create(**request_options)

    return response.choices[0].message.content

# Query the model once per input line and store the prompt, the raw completion
# and the metadata as one JSON record per line
with open(input_jsonl_file_path, 'r') as input_file, open(original_responses_path, 'w') as original_file:
    for line in input_file:
        entry = json.loads(line.strip())
        metadata = entry["metadata"]
        completion_text = process_entry(entry)

        raw_record = {
            "prompt": entry['prompt'],
            "completion": completion_text,
            "metadata": metadata,
        }
        original_file.write(json.dumps(raw_record) + "\n")

In [36]:
# Read the raw responses, run preprocess_completion on both the completion and
# the ground truth, and write the result in the "choices" layout
processed_path = output_jsonl_file_path.replace(".jsonl", "_processed.jsonl")
with open(original_responses_path, 'r') as original_file, open(processed_path, 'w') as output_file:
    for line in original_file:
        entry = json.loads(line.strip())
        raw_completion = entry['completion']
        metadata = entry["metadata"]
        # The ground truth is normalized in place inside the metadata dict
        metadata["ground_truth"] = preprocess_completion(metadata["ground_truth"])
        cleaned_text = preprocess_completion(raw_completion)

        record = {"prompt": entry['prompt'], "choices": [{"text": cleaned_text}], "metadata": metadata}
        output_file.write(json.dumps(record) + "\n")

In [31]:
# Read the raw responses and rewrite them in the "choices" layout WITHOUT any
# preprocessing: completion and metadata are copied over unchanged
unprocessed_path = output_jsonl_file_path.replace(".jsonl", "_not_processed.jsonl")
with open(original_responses_path, 'r') as original_file, open(unprocessed_path, 'w') as output_file:
    for line in original_file:
        entry = json.loads(line.strip())
        raw_text = entry['completion']
        metadata = entry["metadata"]

        record = {"prompt": entry['prompt'], "choices": [{"text": raw_text}], "metadata": metadata}
        output_file.write(json.dumps(record) + "\n")

In [29]:
import json
import random
# Read a generations file and write one {"completion", "ground_truth"} pair per
# line; the ground truth is passed through preprocess_completion, the completion
# text is copied as stored
generations_path = "processed_generations/rg-one-gram-ws-20-ss-2-fixed_0.1_with_instructions_generations.jsonl"
with open(generations_path, 'r') as original_file, open(generations_path.replace(".jsonl", "_gt.jsonl"), 'w') as output_file:
    for line in original_file:
        entry = json.loads(line.strip())
        generated_text = entry['choices'][0]["text"]
        ground_truth = entry["metadata"]["ground_truth"]

        pair = {"completion": generated_text, "ground_truth": preprocess_completion(ground_truth)}
        output_file.write(json.dumps(pair) + "\n")

In [23]:
print("from __future__ import annotations\n\nimport logging\nfrom typing import Optional, Tuple\n\nimport jax.numpy as jnp\nfrom flax.core import FrozenDict\nfrom jax._src.prng import PRNGKeyArray\nfrom jax.flatten_util import ravel_pytree\n\nfrom fortuna.data.loader import DataLoader, InputsLoader\nfrom fortuna.distribution.gaussian import DiagGaussian\nfrom fortuna.prob_model.fit_config import FitConfig\nfrom fortuna.prob_model.joint.base import Joint\nfrom fortuna.prob_model.joint.state import JointState\nfrom fortuna.prob_model.posterior.base import Posterior\nfrom fortuna.prob_model.posterior.normalizing_flow.advi import ADVI_NAME\nfrom fortuna.prob_model.posterior.normalizing_flow.advi.advi_approximator import \\\n    ADVIPosteriorApproximator\nfrom fortuna.prob_model.posterior.normalizing_flow.advi.advi_architecture import \\\n    ADVIArchitecture\nfrom fortuna.prob_model.posterior.normalizing_flow.advi.advi_state import \\\n    ADVIState\nfrom fortuna.prob_model.posterior.normalizing_flow.advi.advi_trainer import \\\n    ADVITrainer\nfrom fortuna.prob_model.posterior.posterior_state_repository import \\\n    PosteriorStateRepository\nfrom fortuna.training.trainer import JittedMixin, MultiDeviceMixin\nfrom fortuna.typing import Array, Status\nfrom fortuna.utils.device import select_trainer_given_devices\n\n\nclass JittedADVITrainer(JittedMixin, ADVITrainer):\n    pass\n\n\nclass MultiDeviceADVITrainer(MultiDeviceMixin, ADVITrainer):\n    pass\n\n\nclass ADVIPosterior(Posterior):\n    def __init__(\n        self, joint: Joint, posterior_approximator: ADVIPosteriorApproximator,\n    ):\n        \"\"\"\n        Automatic Differentiation Variational Inference (ADVI) approximate posterior class.\n\n        Parameters\n        ----------\n        joint: Joint\n            A joint distribution object.\n        posterior_approximator: ADVI\n            An ADVI posterior approximator.\n        \"\"\"\n        super().__init__(joint=joint, 
posterior_approximator=posterior_approximator)\n\n    def __str__(self):\n        return ADVI_NAME\n\n    def fit(\n        self,\n        train_data_loader: DataLoader,\n        val_data_loader: Optional[DataLoader] = None,\n        fit_config: FitConfig = FitConfig(),\n        **kwargs,\n    ) -> Status:\n        if (\n            fit_config.checkpointer.dump_state is True\n            and not fit_config.checkpointer.save_checkpoint_dir\n        ):\n            raise ValueError(\n                \"`save_checkpoint_dir` must be passed when `dump_state` is set to True.\"\n            )\n\n        init_prob_model_state, n_train_data, n_val_data = self._init(\n            train_data_loader, val_data_loader\n        )\n\n        rav, self.unravel = ravel_pytree(init_prob_model_state.params)\n        size_rav = len(rav)\n        self.base = DiagGaussian(\n            mean=jnp.zeros(size_rav),\n            std=self.posterior_approximator.std_base * jnp.ones(size_rav),\n        )\n        self.architecture = ADVIArchitecture(\n            size_rav, std_init_params=self.posterior_approximator.std_init_params\n        )\n\n        trainer_cls = select_trainer_given_devices(\n            devices=fit_config.processor.devices,\n            BaseTrainer=ADVITrainer,\n            JittedTrainer=JittedADVITrainer,\n            MultiDeviceTrainer=MultiDeviceADVITrainer,\n            disable_jit=fit_config.processor.disable_jit,\n        )\n\n        trainer = trainer_cls(\n            predict_fn=self.joint.likelihood.prob_output_layer.predict,\n            save_checkpoint_dir=fit_config.checkpointer.save_checkpoint_dir,\n            save_every_n_steps=fit_config.checkpointer.save_every_n_steps,\n            keep_top_n_checkpoints=fit_config.checkpointer.keep_top_n_checkpoints,\n            disable_training_metrics_computation=fit_config.monitor.disable_training_metrics_computation,\n            eval_every_n_epochs=fit_config.monitor.eval_every_n_epochs,\n            
early_stopping_monitor=fit_config.monitor.early_stopping_monitor,\n            early_stopping_min_delta=fit_config.monitor.early_stopping_min_delta,\n            early_stopping_patience=fit_config.monitor.early_stopping_patience,\n            base=self.base,\n            architecture=self.architecture,\n        )\n\n        state = None\n        if fit_config.checkpointer.restore_checkpoint_path:\n            state = self.restore_checkpoint(\n                restore_checkpoint_path=fit_config.checkpointer.restore_checkpoint_path,\n                optimizer=fit_config.optimizer.method,\n            )\n\n        if type(state)!= ADVIState:")

from __future__ import annotations

import logging
from typing import Optional, Tuple

import jax.numpy as jnp
from flax.core import FrozenDict
from jax._src.prng import PRNGKeyArray
from jax.flatten_util import ravel_pytree

from fortuna.data.loader import DataLoader, InputsLoader
from fortuna.distribution.gaussian import DiagGaussian
from fortuna.prob_model.fit_config import FitConfig
from fortuna.prob_model.joint.base import Joint
from fortuna.prob_model.joint.state import JointState
from fortuna.prob_model.posterior.base import Posterior
from fortuna.prob_model.posterior.normalizing_flow.advi import ADVI_NAME
from fortuna.prob_model.posterior.normalizing_flow.advi.advi_approximator import \
    ADVIPosteriorApproximator
from fortuna.prob_model.posterior.normalizing_flow.advi.advi_architecture import \
    ADVIArchitecture
from fortuna.prob_model.posterior.normalizing_flow.advi.advi_state import \
    ADVIState
from fortuna.prob_model.posterior.normalizing_flow.advi.advi_trainer imp

### Check if Generated Code is Syntactically Correct

In [4]:
import ast
from pylint import epylint as lint
from io import StringIO

def check_generated_code(file_code, gen_code):
    """Check that generated code is valid when appended to existing file content.

    The two snippets are joined with a newline and checked in two stages:
      1. `ast.parse` must accept the combined source (syntax and indentation).
      2. pylint, restricted to error-level messages, must report nothing.

    Pylint is started as `python -m pylint` in a subprocess rather than through
    the `pylint.epylint` helper, which recent pylint releases no longer ship.
    The combined source is written to a uniquely named temporary file that is
    removed again, so nothing is left behind in the working directory.

    Args:
        file_code: Source text that already exists in the file.
        gen_code: Generated source text to append after `file_code`.

    Returns:
        A human-readable status string: a syntax/indentation error message, the
        pylint error report, a note that pylint could not be run, or
        "Generated code passed all checks.".
    """
    # Local imports keep the function self-contained within its notebook cell
    import os
    import subprocess
    import sys
    import tempfile

    combined_code = file_code + "\n" + gen_code

    # Syntax & Indentation Check
    try:
        ast.parse(combined_code)
    except SyntaxError as e:  # IndentationError is a subclass of SyntaxError
        return f"Syntax or indentation error: {e}"

    # Static Analysis
    fd, temp_file_path = tempfile.mkstemp(suffix=".py")
    try:
        with os.fdopen(fd, 'w') as temp_file:
            temp_file.write(combined_code)
        # --errors-only keeps convention/refactor/warning messages and the
        # score report out of stdout, so any output means a real error
        completed = subprocess.run(
            [sys.executable, "-m", "pylint", "--errors-only", temp_file_path],
            capture_output=True,
            text=True,
            check=False,
        )
    finally:
        os.remove(temp_file_path)

    if completed.stdout.strip():
        return f"Pylint found issues with the code:\n{completed.stdout}"
    if completed.returncode != 0:
        # Nothing reported but a failing exit status: pylint itself did not run
        # (for example, it is not installed in this environment)
        return f"Pylint could not be run:\n{completed.stderr}"

    return "Generated code passed all checks."

# Example usage: an existing stub function followed by a generated loop
file_code = (
    "\n"
    "def existing_function():\n"
    "    pass\n"
)

gen_code = (
    "\n"
    "for i in range(10):\n"
    "    print(i)\n"
)

# The check returns a status message describing the outcome
result = check_generated_code(file_code, gen_code)
print(result)

ModuleNotFoundError: No module named 'pylint'

### Evaluate Subsets

Use `compute_score.py` to evaluate the subsets.