This notebook provides helper functions for inspecting the JSONL files produced by inference, for debugging and general investigation.

### Looking at Prompts

In [32]:
import json
import random

In [111]:
## View Prompt Examples
# Function to read a JSON Lines file and return the prompt of a desired line
def get_prompt_at_line(jsonl_file_path, desired_line_no):
    """Return the ``prompt`` field of one record in a JSON Lines file.

    Args:
        jsonl_file_path: Path of the JSONL file to scan.
        desired_line_no: 1-based number of the line to read.

    Returns:
        The value stored under ``'prompt'`` on that line, or ``None`` when the
        record has no such key or the file has no such line.
    """
    with open(jsonl_file_path, 'r') as handle:
        current_no = 0
        for raw_record in handle:
            current_no += 1
            if current_no != desired_line_no:
                continue
            # Only the requested line is ever parsed.
            return json.loads(raw_record).get('prompt', None)
    # Ran off the end of the file without reaching the requested line.
    return None

# Specify the .jsonl file path and the 1-based line number to inspect
jsonl_file_path = 'temp_subsets/rg-one-gram-ws-20-ss-2-one-line_0.1_instruct_infile_snippets_10.jsonl'
# jsonl_file_path = 'subsets/rg-one-gram-ws-20-ss-2-one-line_0.1_instruct.jsonl'
desired_line_no = 64  # 1-based line number of the record whose prompt is printed

# Get the prompt at the desired line
prompt_at_desired_line = get_prompt_at_line(jsonl_file_path, desired_line_no)

# Compare against None rather than relying on truthiness, so that a record
# whose prompt is an empty string is still printed instead of being reported
# as missing.
if prompt_at_desired_line is not None:
    print(prompt_at_desired_line)
else:
    print(f"No prompt found at line {desired_line_no}")

# Here are some relevant code fragments from other files of the repo:
# --------------------------------------------------
# the below code fragment can be found in:
# fortuna/prob_model/likelihood/regression.py
# --------------------------------------------------
# from typing import Optional, Union
# 
# import jax.numpy as jnp
# import numpy as np
# from jax import vmap
# from jax._src.prng import PRNGKeyArray
# 
# from fortuna.data.loader import InputsLoader
# from fortuna.model.model_manager.regression import RegressionModelManager
# from fortuna.output_calibrator.output_calib_manager.base import \
#     OutputCalibManager
# from fortuna.prob_model.likelihood.base import Likelihood
# from fortuna.prob_output_layer.regression import RegressionProbOutputLayer
# from fortuna.typing import Array, CalibMutable, CalibParams, Mutable, Params
# 
# 
# class RegressionLikelihood(Likelihood):
#     def __init__(
#         self,
#         model_manager: RegressionModelManager,
# --------------

In [12]:
# Base name shared by every JSONL file of this run
base_jsonl_name = "rg-one-gram-ws-20-ss-10_0.1_instruct_temp_0_100"

# Runs made without a system message are stored under a "_no_system" suffix
use_system_message = True
ending = ".jsonl" if use_system_message else "_no_system.jsonl"

# Raw responses, the input subset, and the processed generations
original_responses_path = f"raw_generations/{base_jsonl_name}_raw_generations{ending}"
input_jsonl_file_path = f"subsets/{base_jsonl_name}.jsonl"
output_jsonl_file_path = f"processed_generations/{base_jsonl_name}_generations{ending}"

### Test Processing Functions

In [7]:
# Function to preprocess api completions
def preprocess_api_completion(completion):
    """Flatten an API completion into one whitespace-normalised line of code.

    Steps:
      1. If the text contains a ```` ```python ````, ```` ``` ```` or single
         backtick marker (checked in that order), keep only what lies between
         the first opening marker and the last closing marker. When the
         closing marker is missing (e.g. a truncated response), everything
         after the opening marker is kept instead of discarding the whole
         completion.
      2. Drop everything from the first ``#`` on each line and strip trailing
         whitespace. Note that a ``#`` inside a string literal is cut as well.
      3. Concatenate all lines (no separator is inserted), collapse runs of
         whitespace to single spaces, and remove the space directly after
         ``(`` and directly before ``)``.

    Args:
        completion: Raw completion text returned by the API.

    Returns:
        The flattened single-line string; empty if nothing but whitespace and
        comments remain.
    """
    # (opening marker, closing marker), most specific first.
    markers = (('```python', '```'), ('```', '```'), ('`', '`'))
    for opener, closer in markers:
        opener_at = completion.find(opener)
        if opener_at == -1:
            continue
        start = opener_at + len(opener)
        end = completion.rfind(closer)
        if end < start:
            # rfind landed on the opening marker itself, i.e. the block was
            # never closed: keep the remainder of the text.
            end = len(completion)
        completion = completion[start:end]
        break

    # Cut trailing comments and whitespace from every line.
    code_parts = [line.split('#', 1)[0].rstrip() for line in completion.split('\n')]
    # Leading indentation of continuation lines is what separates the joined
    # tokens; it is collapsed to a single space below.
    flattened = "".join(code_parts)
    return " ".join(flattened.split()).replace("( ", "(").replace(" )", ")")

In [8]:
# Function to preprocess the completion string
def preprocess_completion(completion):
    """Reduce a completion to the first code line of its fenced block.

    The first ```` ```python ````, ```` ```py ```` or bare ```` ``` ```` fence
    (checked in that order, so the language tag never leaks into the result)
    opens the block; the next ```` ``` ```` after it closes the block. If no
    closing fence exists, the block runs to the end of the text.

    Args:
        completion: Raw completion text.

    Returns:
        * ``completion`` unchanged when it has no fence or the fenced block is
          empty;
        * otherwise the first line of the block that is non-empty after
          cutting everything from the first ``#``, with whitespace runs
          collapsed to single spaces;
        * the extracted block itself when it holds only blank or comment
          lines.
    """
    start = end = None
    # Longest tag first: a bare ``` test would also match ```py / ```python
    # and leave the language tag at the start of the extracted text.
    for opener in ('```python', '```py', '```'):
        opener_at = completion.find(opener)
        if opener_at == -1:
            continue
        start = opener_at + len(opener)
        end = completion.find('```', start)
        if end == -1:
            # No closing fence: treat the rest of the text as the block.
            end = len(completion)
        break

    if end is None or start >= end:
        return completion  # No fence found, or nothing between the fences

    block = completion[start:end]
    for line in block.split('\n'):
        code = line.split('#', 1)[0].rstrip()
        if code.strip():
            return " ".join(code.split())

    return block  # Block contained only blank or comment lines

In [9]:
# Perform the same processing on the ground truth and completion as the evaluation code
def process_completion_and_ground_truth(target, predictions, passk):
    """Normalise a target and a prediction into comparable lists of lines.

    Blank lines are dropped and every remaining line is stripped. The
    prediction is truncated to as many lines as the target has.

    Args:
        target: Ground-truth text.
        predictions: Sequence of predicted texts.
        passk: Number of leading predictions to consider.

    Returns:
        ``(prediction_lines, target_lines)``. Only the last of the first
        ``passk`` predictions is returned; ``prediction_lines`` is an empty
        list when there is no prediction to consider (empty ``predictions``
        or ``passk == 0``) instead of raising ``UnboundLocalError``.
    """
    target_lines = [line.strip() for line in target.splitlines() if line.strip()]

    considered = predictions[:passk]
    if not considered:
        return [], target_lines

    # Equivalent to looping over `considered` and keeping the final result.
    prediction_lines = [
        line.strip() for line in considered[-1].splitlines() if line.strip()
    ][:len(target_lines)]
    return prediction_lines, target_lines

### JSONL Processing Functions

In [48]:
# Create a jsonl with just the ground truth and the completion to compare the two

# jsonl_path = "processed_generations/rg-one-gram-ws-20-ss-2-fixed_0.1_with_instructions_temp_0_generations_processed.jsonl"
# jsonl_path = "processed_generations/rg-one-gram-ws-20-ss-2-one-line_0.1_instruct_generations_processed.jsonl"

# For the generations file and its "_processed" sibling, write a "_gt.jsonl"
# companion holding only each record's completion and ground truth.
for jsonl_path in (output_jsonl_file_path, output_jsonl_file_path.replace(".jsonl","_processed.jsonl")):
    gt_jsonl_path = jsonl_path.replace(".jsonl","_gt.jsonl")
    with open(jsonl_path, 'r') as original_file, open(gt_jsonl_path, 'w') as output_file:
        for line in original_file:
            entry = json.loads(line.strip())
            # The text is read from choices[0]["text"] when a "choices" key
            # exists, otherwise from the top-level "completion" key.
            completion = entry["choices"][0]["text"] if "choices" in entry else entry["completion"]
            ground_truth = entry["metadata"]["ground_truth"]

            output_entry = dict(completion=completion, ground_truth=ground_truth)
            output_file.write(json.dumps(output_entry) + "\n")

In [49]:
# Process in the same way that is done for eval to see if there are any easy fixes here
# jsonl_path = "processed_generations/rg-one-gram-ws-20-ss-2-one-line_0.1_instruct_generations.jsonl"

# For the generations file and its "_processed" sibling, write an
# "_eval_gt.jsonl" companion in which completion and ground truth have been
# normalised by process_completion_and_ground_truth (lists of stripped lines).
for jsonl_path in [output_jsonl_file_path, output_jsonl_file_path.replace(".jsonl","_processed.jsonl")]:
    with open(jsonl_path, 'r') as original_file, open(jsonl_path.replace(".jsonl","_eval_gt.jsonl"), 'w') as output_file:
        for line in original_file:
            entry = json.loads(line.strip())
            # Accept the text either under choices[0]["text"] or under a
            # top-level "completion" key, so records of the second layout do
            # not raise KeyError.
            if "choices" in entry:
                completion = [entry["choices"][0]["text"]]
            else:
                completion = [entry["completion"]]
            ground_truth = entry["metadata"]["ground_truth"]
            completion_processed, ground_truth_processed = process_completion_and_ground_truth(ground_truth, completion, 1)

            output_entry = {
                "completion": completion_processed,
                "ground_truth": ground_truth_processed
            }

            output_file.write(json.dumps(output_entry) + "\n")

### JSONL Inference Functions

In [37]:
import json

# Define the function to compare two JSONL files
def compare_jsonl(file1, file2):
    """Print records that match their ground truth in ``file1`` but not in ``file2``.

    Both files are read fully and paired line by line. A record of ``file1``
    is printed (as JSON) when its ``'completion'`` equals its
    ``'ground_truth'`` while the record on the same line of ``file2`` has a
    ``'completion'`` that differs from its ``'ground_truth'``.

    Args:
        file1: Path of the JSONL file whose matching records are reported.
        file2: Path of the JSONL file to compare against.

    Returns:
        None. If the files differ in line count, a message is printed and
        nothing is compared.
    """
    def _load_records(path):
        # One parsed dictionary per line of the file.
        with open(path, 'r') as handle:
            return [json.loads(raw.strip()) for raw in handle]

    first_records = _load_records(file1)
    second_records = _load_records(file2)

    # A line-by-line pairing only makes sense for files of equal length.
    if len(first_records) != len(second_records):
        print("Files have different number of lines and cannot be compared.")
        return

    for first, second in zip(first_records, second_records):
        if first['completion'] != first['ground_truth']:
            continue  # not a match in file1
        if second['completion'] == second['ground_truth']:
            continue  # also a match in file2, nothing to report
        print(json.dumps(first))

# Example usage: print the records whose completion equals the ground truth in
# the "_eval_gt" file (file1) but not in the "_gt" file (file2).
# NOTE(review): both paths are hard-coded and presumably produced by the
# "_eval_gt.jsonl" / "_gt.jsonl" writer cells for a different base name than
# the one configured above — confirm the files exist before running.
file1 = 'processed_generations/rg-one-gram-ws-20-ss-2-one-line_0.1_instruct_generations_processed_eval_gt.jsonl'
file2 = 'processed_generations/rg-one-gram-ws-20-ss-2-one-line_0.1_instruct_generations_processed_gt.jsonl'

compare_jsonl(file1, file2)

{"completion": ["yield from self._targets_loader()"], "ground_truth": ["yield from self._targets_loader()"]}
{"completion": ["prohibited.add(\"done\")"], "ground_truth": ["prohibited.add(\"done\")"]}
{"completion": ["parser.add_argument("], "ground_truth": ["parser.add_argument("]}
{"completion": ["}"], "ground_truth": ["}"]}
{"completion": ["calib_data_loader=calib_data_loader,"], "ground_truth": ["calib_data_loader=calib_data_loader,"]}
{"completion": ["scheduler_config = self.get_scheduler_config()"], "ground_truth": ["scheduler_config = self.get_scheduler_config()"]}
{"completion": ["conv: ModuleDef = nn.Conv"], "ground_truth": ["conv: ModuleDef = nn.Conv"]}
{"completion": ["{"], "ground_truth": ["{"]}
{"completion": ["prob_output_layer: ProbOutputLayer,"], "ground_truth": ["prob_output_layer: ProbOutputLayer,"]}
{"completion": ["model must have been calibrated beforehand."], "ground_truth": ["model must have been calibrated beforehand."]}
{"completion": ["self, tensordict: Optiona

In [39]:
import json

# Define the function to find matching completion and ground_truth
def print_matching_lines(jsonl_file):
    """Print every record whose completion equals its ground truth.

    Args:
        jsonl_file: Path of a JSONL file whose records have ``'completion'``
            and ``'ground_truth'`` keys.

    Returns:
        None. Each matching record is printed as a JSON string.
    """
    with open(jsonl_file, 'r') as handle:
        for raw_record in handle:
            record = json.loads(raw_record.strip())
            if record['completion'] != record['ground_truth']:
                continue  # skip mismatches
            print(json.dumps(record))

# Example usage: list the records of an "_eval_gt" file whose completion
# equals the ground truth.
# NOTE(review): the path is hard-coded — confirm the file exists before running.
jsonl_file = 'processed_generations/rg-one-gram-ws-20-ss-2-one-line_0.1_instruct_generations_eval_gt.jsonl'
print_matching_lines(jsonl_file)

{"completion": ["yield from self._targets_loader()"], "ground_truth": ["yield from self._targets_loader()"]}
{"completion": ["prohibited.add(\"done\")"], "ground_truth": ["prohibited.add(\"done\")"]}
{"completion": ["parser.add_argument("], "ground_truth": ["parser.add_argument("]}
{"completion": ["}"], "ground_truth": ["}"]}
{"completion": ["calib_data_loader=calib_data_loader,"], "ground_truth": ["calib_data_loader=calib_data_loader,"]}
{"completion": ["scheduler_config = self.get_scheduler_config()"], "ground_truth": ["scheduler_config = self.get_scheduler_config()"]}
{"completion": ["conv: ModuleDef = nn.Conv"], "ground_truth": ["conv: ModuleDef = nn.Conv"]}
{"completion": ["{"], "ground_truth": ["{"]}
{"completion": ["prob_output_layer: ProbOutputLayer,"], "ground_truth": ["prob_output_layer: ProbOutputLayer,"]}
{"completion": ["model must have been calibrated beforehand."], "ground_truth": ["model must have been calibrated beforehand."]}
{"completion": ["self, tensordict: Optiona

In [7]:
import json

# Define the function to find matching completion and ground_truth
def print_matching_lines(jsonl_file):
    """Print, as JSON, each record whose ``'completion'`` equals its ``'ground_truth'``.

    NOTE(review): a function with the same name and behaviour is defined in an
    earlier cell of this notebook; this definition replaces it when the cell
    runs.

    Args:
        jsonl_file: Path of the JSONL file to scan.

    Returns:
        None.
    """
    with open(jsonl_file, 'r') as f:
        # Parse lazily so that records are printed as the file is read.
        parsed_records = (json.loads(raw.strip()) for raw in f)
        for data in parsed_records:
            if data['completion'] == data['ground_truth']:
                print(json.dumps(data))
                
# List the matching records of the "ss-2-fixed ... with_instructions" run's
# processed "_eval_gt" file.
# NOTE(review): the path is hard-coded — confirm the file exists before running.
jsonl_file = 'processed_generations/rg-one-gram-ws-20-ss-2-fixed_0.1_with_instructions_temp_0_generations_processed_eval_gt.jsonl'
print_matching_lines(jsonl_file)

{"completion": ["parser.add_argument("], "ground_truth": ["parser.add_argument("]}
{"completion": ["yield from self._targets_loader()"], "ground_truth": ["yield from self._targets_loader()"]}
{"completion": ["prob_output_layer: ProbOutputLayer,"], "ground_truth": ["prob_output_layer: ProbOutputLayer,"]}
{"completion": ["{"], "ground_truth": ["{"]}
{"completion": ["scheduler_config = self.get_scheduler_config()"], "ground_truth": ["scheduler_config = self.get_scheduler_config()"]}
{"completion": ["if self.is_functional and params is None:"], "ground_truth": ["if self.is_functional and params is None:"]}
{"completion": ["prohibited.add(\"done\")"], "ground_truth": ["prohibited.add(\"done\")"]}
{"completion": ["calib_data_loader=calib_data_loader,"], "ground_truth": ["calib_data_loader=calib_data_loader,"]}
{"completion": ["("], "ground_truth": ["("]}
{"completion": ["self, tensordict: Optional[TensorDictBase] = None, **kwargs"], "ground_truth": ["self, tensordict: Optional[TensorDictBase