First, implement prompting-based versions of the ideas to see whether they have potential.

### Pass at k = ?

In [140]:
import json

def replicate_jsonl_lines(input_file, output_file, at):
    """Write ``at`` copies of every JSONL record, each with its own instance ID.

    Each copy is identical to the source record except for
    ``metadata['instance_id']``, which is set to
    ``"<task_id>/instance_<i>"`` for ``i`` in ``range(at)``.

    Args:
        input_file: Path of the JSONL file to read. Every record must have a
            ``metadata`` dict containing ``task_id``.
        output_file: Path of the JSONL file to (over)write.
        at: Number of copies to emit per input record.

    Raises:
        KeyError: If a record has no ``metadata`` or ``metadata['task_id']``.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            # Blank lines (typically a trailing newline at EOF) are not records.
            if not line.strip():
                continue
            line_data = json.loads(line)
            metadata = line_data['metadata']
            task_id = metadata['task_id']

            # Replicate the line with different instance IDs
            for i in range(at):
                # Build a fresh metadata dict for each copy: a shallow copy of
                # the record would share the nested dict with the parsed input.
                new_metadata = {**metadata, 'instance_id': f"{task_id}/instance_{i}"}
                new_line_data = {**line_data, 'metadata': new_metadata}
                json.dump(new_line_data, outfile)
                outfile.write('\n')

# Usage
                
# Number of copies to write for each input record.
at = 10

input_jsonl_path = 'temp_subsets/rg-one-gram-ws-20-ss-2-one-line_0.1_instruct.jsonl'  # Update this to your input file path
# Output path is derived from the input path: "<name>_pass_at_<at>.jsonl".
output_jsonl_path = input_jsonl_path.replace(".jsonl",f"_pass_at_{at}.jsonl")

# Reads the input file and (over)writes the output file.
replicate_jsonl_lines(input_jsonl_path, output_jsonl_path, at)

### Subsets of retrievals

In [178]:
import random
import re

def random_combinations_of_fragments(fragments, num_subsets):
    """Distribute the central fragments randomly over ``num_subsets`` snippets.

    The first and last entries of ``fragments`` are treated as a header and a
    footer and are included in every snippet. The entries in between are
    shuffled and dealt out in contiguous slices, one slice per snippet; the
    last snippet also receives any remainder. If there are fewer central
    fragments than subsets, randomly chosen central fragments are duplicated
    until every subset can receive at least one.

    Args:
        fragments: List of strings ``[header, central..., footer]``. The list
            itself is not modified.
        num_subsets: Number of snippets to produce.

    Returns:
        A list of ``num_subsets`` strings, each
        ``header + '\\n' + <joined central slice> + '\\n' + footer``. When there
        are no central fragments each snippet is just ``header + '\\n' + footer``.
        Returns ``[]`` when ``num_subsets`` is zero or negative.

    Raises:
        IndexError: If ``fragments`` is empty and ``num_subsets`` is positive.
    """
    # Nothing to build; also avoids a division by zero further down.
    if num_subsets <= 0:
        return []

    # First and last fragments assumed to be the header and footer
    header_fragment = fragments[0]
    footer_fragment = fragments[-1]

    # The central fragments will be randomized and distributed
    original_central = fragments[1:-1]

    # With nothing to distribute there is nothing to pad from either
    # (random.choice on an empty list would raise), so emit header + footer.
    if not original_central:
        return [header_fragment + '\n' + footer_fragment for _ in range(num_subsets)]

    central_fragments = list(original_central)
    random.shuffle(central_fragments)

    # If the number of subsets is greater than the number of central fragments,
    # warn and copy existing fragments to make up the numbers
    if num_subsets > len(central_fragments):
        print("Warning: The number of subsets must be less than the number of unique code fragments.")
        print("Existing fragments will be copied randomly to ensure sufficient fragments.")
        while num_subsets > len(central_fragments):
            central_fragments.append(random.choice(original_central))

    # Calculate the number of fragments in each subset
    fragments_per_subset = len(central_fragments) // num_subsets

    # Generate the random code snippets
    snippets = []
    for i in range(num_subsets):
        start_idx = i * fragments_per_subset
        # For the last subset, take all remaining fragments to handle cases not evenly divisible.
        end_idx = (start_idx + fragments_per_subset) if i < num_subsets - 1 else None
        subset_fragments = central_fragments[start_idx:end_idx]

        snippets.append(
            header_fragment + '\n' + '\n'.join(subset_fragments) + '\n' + footer_fragment
        )

    return snippets

In [179]:
import re

ENDING_COMMAND = '# --------------------------------------------------\n"""Based on the above, complete the following code from the main script file:"""'

def split_code_fragments(input_string):
    """Split a retrieval-augmented prompt into its labelled parts.

    The prompt is cut at two kinds of delimiter: the "the below code fragment
    can be found in:" intro (preceded by a dashed comment line) and
    ``ENDING_COMMAND``. Each non-empty piece is stripped and gets back the
    delimiter that actually preceded it, so a piece is only labelled with
    ``ENDING_COMMAND`` when that command was really present in the input.

    Args:
        input_string: The full prompt text.

    Returns:
        A list of strings in input order: the text before the first delimiter
        (if non-empty, without any prefix), then one entry per non-empty piece,
        prefixed with either the intro delimiter or ``ENDING_COMMAND``.
    """
    intro_line = '# --------------------------------------------------\n# the below code fragment can be found in:'

    # The capturing group makes re.split() return the matched delimiters too:
    # [text0, delim1, text1, delim2, text2, ...]. ENDING_COMMAND is a literal,
    # so it is escaped rather than interpreted as a regular expression.
    delimiter_pattern = (
        r'(# -+\s+# the below code fragment can be found in:|'
        + re.escape(ENDING_COMMAND)
        + r')'
    )
    pieces = re.split(delimiter_pattern, input_string)

    fragments = []

    # Text before the first delimiter (normally the prompt's header line).
    leading_text = pieces[0].strip()
    if leading_text:
        fragments.append(leading_text)

    for delimiter, text in zip(pieces[1::2], pieces[2::2]):
        text = text.strip()
        # Filter out any empty strings that may have been added during the split
        if not text:
            continue
        # Re-attach a normalized form of the delimiter that preceded this piece.
        prefix = ENDING_COMMAND if delimiter == ENDING_COMMAND else intro_line
        fragments.append(prefix + '\n' + text)

    return fragments

input_string = '''# Here are some relevant code fragments from other files of the repo:
# --------------------------------------------------
# the below code fragment can be found in:
# torchrl/envs/vec_env.py
# --------------------------------------------------
#     @_check_start
#     def _shutdown_workers(self) -> None:
#         if self.is_closed:
#             raise RuntimeError(
#                 "calling {self.__class__.__name__}._shutdown_workers only allowed when env.is_closed = False"
#             )
#         for i, channel in enumerate(self.parent_channels):
#             if self._verbose:
#                 print(f"closing {i}")
#             # try:
#             channel.send(("close", None))
#             # except:
#             #     raise RuntimeError(f"closing {channel} number {i} failed")
#             msg, _ = channel.recv()
#             if msg != "closing":
#                 raise RuntimeError(
#                     f"Expected 'closing' but received {msg} from worker {i}"
#                 )
# 
#         del self.shared_tensordicts, self.shared_tensordict_parent
# --------------------------------------------------
# the below code fragment can be found in:
# torchrl/envs/vec_env.py
# --------------------------------------------------
#         ).clone()
# 
#     @_check_start
#     def _shutdown_workers(self) -> None:
#         if self.is_closed:
#             raise RuntimeError(
#                 "calling {self.__class__.__name__}._shutdown_workers only allowed when env.is_closed = False"
#             )
#         for i, channel in enumerate(self.parent_channels):
#             if self._verbose:
#                 print(f"closing {i}")
#             # try:
#             channel.send(("close", None))
#             # except:
#             #     raise RuntimeError(f"closing {channel} number {i} failed")
#             msg, _ = channel.recv()
#             if msg != "closing":
#                 raise RuntimeError(
#                     f"Expected 'closing' but received {msg} from worker {i}"
#                 )
# --------------------------------------------------
# the below code fragment can be found in:
# build_tools/setup_helpers/extension.py
# --------------------------------------------------
#     and "ATen parallel backend: OpenMP" in torch.__config__.parallel_info()
# )
# _TORCH_CUDA_ARCH_LIST = os.environ.get("TORCH_CUDA_ARCH_LIST", None)
# 
# 
# def get_ext_modules():
#     return [
#         Extension(name="torchrl._torchrl", sources=[]),
#     ]
# 
# 
# # Based off of
# # https://github.com/pybind/cmake_example/blob/580c5fd29d4651db99d8874714b07c0c49a53f8a/setup.py
# class CMakeBuild(build_ext):
#     def run(self):
#         try:
#             subprocess.check_output(["cmake", "--version"])
#         except OSError:
#             raise RuntimeError("CMake is not available.") from None
#         super().run()
# --------------------------------------------------
# the below code fragment can be found in:
# build_tools/setup_helpers/extension.py
# --------------------------------------------------
#     return [
#         Extension(name="torchrl._torchrl", sources=[]),
#     ]
# 
# 
# # Based off of
# # https://github.com/pybind/cmake_example/blob/580c5fd29d4651db99d8874714b07c0c49a53f8a/setup.py
# class CMakeBuild(build_ext):
#     def run(self):
#         try:
#             subprocess.check_output(["cmake", "--version"])
#         except OSError:
#             raise RuntimeError("CMake is not available.") from None
#         super().run()
# 
#     def build_extension(self, ext):
#         # Since two library files (libtorchrl and _torchrl) need to be
#         # recognized by setuptools, we instantiate `Extension` twice. (see `get_ext_modules`)
#         # This leads to the situation where this `build_extension` method is called twice.
#         # However, the following `cmake` command will build all of them at the same time,
# --------------------------------------------------
# the below code fragment can be found in:
# build_tools/setup_helpers/extension.py
# --------------------------------------------------
# 
# def get_ext_modules():
#     return [
#         Extension(name="torchrl._torchrl", sources=[]),
#     ]
# 
# 
# # Based off of
# # https://github.com/pybind/cmake_example/blob/580c5fd29d4651db99d8874714b07c0c49a53f8a/setup.py
# class CMakeBuild(build_ext):
#     def run(self):
#         try:
#             subprocess.check_output(["cmake", "--version"])
#         except OSError:
#             raise RuntimeError("CMake is not available.") from None
#         super().run()
# 
#     def build_extension(self, ext):
#         # Since two library files (libtorchrl and _torchrl) need to be
#         # recognized by setuptools, we instantiate `Extension` twice. (see `get_ext_modules`)
# --------------------------------------------------
# the below code fragment can be found in:
# build_tools/setup_helpers/extension.py
# --------------------------------------------------
#                 stderr=STDOUT,
#             )
#         except CalledProcessError as exc:
#             print(exc.output)
# 
#         try:
#             check_output(
#                 ["cmake", "--build", "."] + build_args,
#                 cwd=self.build_temp,
#                 stderr=STDOUT,
#             )
#         except CalledProcessError as exc:
#             print(exc.output)
# 
#     def get_ext_filename(self, fullname):
#         ext_filename = super().get_ext_filename(fullname)
#         ext_filename_parts = ext_filename.split(".")
#         without_abi = ext_filename_parts[:-2] + ext_filename_parts[-1:]
#         ext_filename = ".".join(without_abi)
#         return ext_filename
# --------------------------------------------------
# the below code fragment can be found in:
# build_tools/setup_helpers/extension.py
# --------------------------------------------------
#                 ["cmake", str(_ROOT_DIR)] + cmake_args,
#                 cwd=self.build_temp,
#                 stderr=STDOUT,
#             )
#         except CalledProcessError as exc:
#             print(exc.output)
# 
#         try:
#             check_output(
#                 ["cmake", "--build", "."] + build_args,
#                 cwd=self.build_temp,
#                 stderr=STDOUT,
#             )
#         except CalledProcessError as exc:
#             print(exc.output)
# 
#     def get_ext_filename(self, fullname):
#         ext_filename = super().get_ext_filename(fullname)
#         ext_filename_parts = ext_filename.split(".")
#         without_abi = ext_filename_parts[:-2] + ext_filename_parts[-1:]
# --------------------------------------------------
"""Based on the above, complete the following code:"""

# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import distutils.command.clean
import glob
import os
import shutil
import subprocess
import sys
from datetime import date
from pathlib import Path
from typing import List

from setuptools import find_packages, setup
from torch.utils.cpp_extension import BuildExtension, CppExtension

cwd = os.path.dirname(os.path.abspath(__file__))
try:
    sha = (
        subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=cwd)
        .decode("ascii")
        .strip()
    )
except Exception:
    sha = "Unknown"


def get_version():
    version_txt = os.path.join(cwd, "version.txt")
    with open(version_txt, "r") as f:
        version = f.readline().strip()
    if os.getenv("BUILD_VERSION"):
        version = os.getenv("BUILD_VERSION")
    elif sha != "Unknown":
        version += "+" + sha[:7]
    return version


ROOT_DIR = Path(__file__).parent.resolve()


package_name = "torchrl"


def get_nightly_version():
    today = date.today()
    return f"{today.year}.{today.month}.{today.day}"


def parse_args(argv: List[str]) -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="torchrl setup")
    parser.add_argument(
        "--package_name",
        type=str,
        default="torchrl",
        help="the name of this output wheel",
    )
    return parser.parse_known_args(argv)


def write_version_file(version):
    version_path = os.path.join(cwd, "torchrl", "version.py")
    with open(version_path, "w") as f:
        f.write("__version__ = '{}'\n".format(version))
        f.write("git_version = {}\n".format(repr(sha)))


def _get_pytorch_version():
    # if "PYTORCH_VERSION" in os.environ:
    #     return f"torch=={os.environ['PYTORCH_VERSION']}"
    return "torch"


def _get_packages():
    exclude = [
        "build*",
        "test*",
        "torchrl.csrc*",
        "third_party*",
        "tools*",
    ]
    return find_packages(exclude=exclude)


ROOT_DIR = Path(__file__).parent.resolve()


class clean(distutils.command.clean.clean):
    def run(self):
        # Run default behavior first
        distutils.command.clean.clean.run(self)

        # Remove torchrl extension
        for path in (ROOT_DIR / "torchrl").glob("**/*.so"):
            print(f"removing '{path}'")
            path.unlink()
        # Remove build directory
        build_dirs = [
            ROOT_DIR / "build",
        ]
        for path in build_dirs:
            if path.exists():
                print(f"removing '{path}' (and everything under it)")
                shutil.rmtree(str(path), ignore_errors=True)


# def _run_cmd(cmd):
#     try:
#         return subprocess.check_output(cmd, cwd=ROOT_DIR).decode("ascii").strip()
#     except Exception:
#         return None


def get_extensions():
    extension = CppExtension

    extra_link_args = []
    extra_compile_args = {
        "cxx": [
            "-O3",
            "-std=c++14",
            "-fdiagnostics-color=always",
'''

# Split the example prompt above into fragments. `fragments` is a notebook-level
# variable that later cells read (the subset demo and the merge demo).
fragments = split_code_fragments(input_string)

# Print every fragment with a 1-based label for visual inspection.
for i, fragment in enumerate(fragments):
    print(f"Fragment {i+1}:\n{fragment}\n")

Fragment 1:
# Here are some relevant code fragments from other files of the repo:

Fragment 2:
# --------------------------------------------------
# the below code fragment can be found in:
# torchrl/envs/vec_env.py
# --------------------------------------------------
#     @_check_start
#     def _shutdown_workers(self) -> None:
#         if self.is_closed:
#             raise RuntimeError(
#                 "calling {self.__class__.__name__}._shutdown_workers only allowed when env.is_closed = False"
#             )
#         for i, channel in enumerate(self.parent_channels):
#             if self._verbose:
#                 print(f"closing {i}")
#             # try:
#             channel.send(("close", None))
#             # except:
#             #     raise RuntimeError(f"closing {channel} number {i} failed")
#             msg, _ = channel.recv()
#             if msg != "closing":
#                 raise RuntimeError(
#                     f"Expected 'closing' but received {msg} fr

In [25]:
# Build 3 randomized prompt variants from the example `fragments` and print them.
# Note: `i` is a whole snippet string here, not an index.
for i in random_combinations_of_fragments(fragments, 3):
    print(i)

# Here are some relevant code fragments from other files of the repo:
# --------------------------------------------------
# the below code fragment can be found in:
# build_tools/setup_helpers/extension.py
# --------------------------------------------------
#     return [
#         Extension(name="torchrl._torchrl", sources=[]),
#     ]
# 
# 
# # Based off of
# # https://github.com/pybind/cmake_example/blob/580c5fd29d4651db99d8874714b07c0c49a53f8a/setup.py
# class CMakeBuild(build_ext):
#     def run(self):
#         try:
#             subprocess.check_output(["cmake", "--version"])
#         except OSError:
#             raise RuntimeError("CMake is not available.") from None
#         super().run()
# 
#     def build_extension(self, ext):
#         # Since two library files (libtorchrl and _torchrl) need to be
#         # recognized by setuptools, we instantiate `Extension` twice. (see `get_ext_modules`)
#         # This leads to the situation where this `build_extension` method is

In [138]:
import re
import random
import json
from typing import List

def split_snippets(text: str, num_splits) -> List[str]:
    """Split a prompt into fragments and regroup them into ``num_splits`` variants.

    Thin composition of ``split_code_fragments`` (parsing) and
    ``random_combinations_of_fragments`` (random regrouping).
    """
    return random_combinations_of_fragments(split_code_fragments(text), num_splits)


def process_jsonl_and_write(file_path, output_jsonl_path, num_splits):
    """Expand every JSONL record into ``num_splits`` prompt variants.

    For each input record the prompt is passed to ``split_snippets`` and one
    output record is written per returned variant. Output records carry the
    variant prompt plus a metadata dict with ``task_id``, ``instance_id``
    (``"<task_id>_<i>"``), ``ground_truth``, ``fpath_tuple``,
    ``context_start_lineno`` and ``line_no`` copied from the input record.

    Args:
        file_path: Path of the input JSONL file.
        output_jsonl_path: Path of the JSONL file to (over)write.
        num_splits: Number of variants to request per record.

    Raises:
        KeyError: If a record lacks ``prompt`` or one of the metadata keys above.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    output_variants = []

    # Explicit UTF-8, matching the other JSONL helpers in this notebook.
    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            # Blank lines (typically a trailing newline at EOF) are not records.
            if not line.strip():
                continue
            data = json.loads(line)
            prompt = data['prompt']
            metadata = data['metadata']
            task_id = metadata['task_id']

            # Construct a new data entry for each variant and add it to the output list
            for i, variant in enumerate(split_snippets(prompt, num_splits)):
                output_variants.append({
                    "prompt": variant,
                    "metadata": {
                        "task_id": f"{task_id}",
                        "instance_id": f"{task_id}_{i}",
                        "ground_truth": metadata['ground_truth'],  # Keep the same ground truth
                        "fpath_tuple": metadata['fpath_tuple'],
                        "context_start_lineno": metadata['context_start_lineno'],
                        "line_no": metadata['line_no'],
                    }
                })

    # Written only after the whole input was processed, one JSON object per line.
    with open(output_jsonl_path, 'w', encoding='utf-8') as outfile:
        for variant in output_variants:
            json.dump(variant, outfile)
            outfile.write('\n')

# Number of prompt variants to generate per input record.
num_variants = 10

input_jsonl_path = 'subsets/rg-one-gram-ws-20-ss-2-fixed_0.1_with_instructions_temp_0.jsonl'  # Update this to your input file path
output_jsonl_path = f'temp_subsets/rg-one-gram-ws-20-ss-2-one-line_0.1_instruct_{num_variants}_variants.jsonl' # Update this to your desired output file path

# Reads the input file and (over)writes the output file with the variants.
process_jsonl_and_write(input_jsonl_path, output_jsonl_path, num_variants)

Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will b

### Merge Overlapping Segments

In [151]:
def merge_overlapping_strings_with_header(strings, min_overlap=1):
    """Merge fragments that share a header and whose bodies overlap.

    Every input string is split at the first dashed separator line into a
    header and a (stripped) body. Bodies under the same header are merged
    pairwise until no pair can be merged:

    * if the end of one body equals the start of another, the two are joined
      with the shared text kept once;
    * if one body is entirely contained in the other, only the larger body
      is kept.

    Args:
        strings: Iterable of ``header + separator + body`` strings.
        min_overlap: Minimum number of overlapping characters required for a
            merge. The default of 1 merges on any overlap; since fragment
            bodies typically start with ``#``, a larger value avoids joining
            unrelated fragments on a single shared character. Values below 1
            are treated as 1.

    Returns:
        A list of ``header + separator + body`` strings, grouped by header in
        first-seen order.

    Raises:
        ValueError: If an input string does not contain the separator line.
    """
    body_separator = '\n# --------------------------------------------------\n'
    min_overlap = max(1, min_overlap)

    def get_header_and_body(s):
        # Split the string into header and body at the first separator line.
        header, body = s.split(body_separator, 1)
        return header, body.strip()

    def try_merge(body_a, body_b):
        # Return the merged text of body_a followed by body_b, or None when
        # the two bodies do not overlap by at least `min_overlap` characters.
        shorter = min(len(body_a), len(body_b))
        if shorter < min_overlap:
            return None
        # Longest suffix-of-a / prefix-of-b overlap wins.
        for size in range(shorter, min_overlap - 1, -1):
            if body_a.endswith(body_b[:size]):
                return body_a + body_b[size:]
        # Containment: keep the larger body as-is. Slicing by the overlap
        # length is only valid for suffix/prefix overlaps, not for a body
        # that sits in the middle or at the end of the other one.
        if body_b in body_a:
            return body_a
        if body_a in body_b:
            return body_b
        return None

    def merge_group(bodies):
        # Repeatedly merge the first mergeable ordered pair until none is left.
        merged_any = True
        while merged_any and len(bodies) > 1:
            merged_any = False
            for a_idx in range(len(bodies)):
                for b_idx in range(len(bodies)):
                    if a_idx == b_idx:
                        continue
                    combined = try_merge(bodies[a_idx], bodies[b_idx])
                    if combined is not None:
                        bodies[a_idx] = combined
                        del bodies[b_idx]
                        merged_any = True
                        break
                if merged_any:
                    # Indices are stale after the deletion; rescan from scratch.
                    break
        return bodies

    # Group bodies by their header (dicts preserve first-seen order).
    grouped_by_header = {}
    for s in strings:
        header, body = get_header_and_body(s)
        grouped_by_header.setdefault(header, []).append(body)

    # Merge within each group and re-attach the common header.
    merged_strings = []
    for header, bodies in grouped_by_header.items():
        for body in merge_group(bodies):
            merged_strings.append(header + body_separator + body)

    return merged_strings

# # Example usage:
# strings = [
#     "# --------------------------------------------------\n# the below code fragment can be found in:\n# torchrl/envs/vec_env.py\n# --------------------------------------------------\nhello, wor",
#     "# --------------------------------------------------\n# the below code fragment can be found in:\n# torchrl/envs/vec_env.py\n# --------------------------------------------------\nworld!",
#     "# --------------------------------------------------\n# the below code fragment can be found in:\n# torchrl/envs/vec_env.py\n# --------------------------------------------------\nwor",
#     "# --------------------------------------------------\n# the below code fragment can be found in:\n# torchrl/envs/vec_env.py\n# --------------------------------------------------\norld! My name is John.",
#     "# --------------------------------------------------\n# the below code fragment can be found in:\n# torchrl/envs/vec_env.py\n# --------------------------------------------------\n I am a programmer."
# ]

# merged_strings = merge_overlapping_strings_with_header(strings)
# for ms in merged_strings:
#     print("BEGINNING")
#     print(ms)

# Uses the notebook-level `fragments` produced by the split demo cell.
input_fragments = fragments

# Merge only the central fragments; the first and last entries are left out here.
merged_fragments =  merge_overlapping_strings_with_header(input_fragments[1:-1])
for i, fragment in enumerate(merged_fragments):
    print(f"fragment: {i}")
    print(fragment)

fragment: 0
# --------------------------------------------------
# the below code fragment can be found in:
# torchrl/envs/vec_env.py
# --------------------------------------------------
#         ).clone()
# 
#     @_check_start
#     def _shutdown_workers(self) -> None:
#         if self.is_closed:
#             raise RuntimeError(
#                 "calling {self.__class__.__name__}._shutdown_workers only allowed when env.is_closed = False"
#             )
#         for i, channel in enumerate(self.parent_channels):
#             if self._verbose:
#                 print(f"closing {i}")
#             # try:
#             channel.send(("close", None))
#             # except:
#             #     raise RuntimeError(f"closing {channel} number {i} failed")
#             msg, _ = channel.recv()
#             if msg != "closing":
#                 raise RuntimeError(
#                     f"Expected 'closing' but received {msg} from worker {i}"
#                 )
# 
#         del self.sha

In [116]:
# Reassemble a prompt: first fragment, merged central fragments, last fragment.
new_frag = [fragments[0]] + merged_fragments + [fragments[-1]]
print("\n".join(new_frag))

# Here are some relevant code fragments from other files of the repo:
# --------------------------------------------------
# the below code fragment can be found in:
# torchrl/envs/vec_env.py
# --------------------------------------------------
#         ).clone()
# 
#     @_check_start
#     def _shutdown_workers(self) -> None:
#         if self.is_closed:
#             raise RuntimeError(
#                 "calling {self.__class__.__name__}._shutdown_workers only allowed when env.is_closed = False"
#             )
#         for i, channel in enumerate(self.parent_channels):
#             if self._verbose:
#                 print(f"closing {i}")
#             # try:
#             channel.send(("close", None))
#             # except:
#             #     raise RuntimeError(f"closing {channel} number {i} failed")
#             msg, _ = channel.recv()
#             if msg != "closing":
#                 raise RuntimeError(
#                     f"Expected 'closing' but received {msg} from

In [121]:
import json

def merge_overlapping_jsonl_lines(input_file, output_file):
    """Rewrite each record's prompt with its overlapping fragments merged.

    For every JSONL record the prompt is split with ``split_code_fragments``;
    everything between the first and the last fragment is passed through
    ``merge_overlapping_strings_with_header``; the pieces are then joined with
    newlines again. All other fields of the record are written unchanged.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the JSONL file to (over)write.
    """
    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
        for raw_line in infile:
            record = json.loads(raw_line)
            pieces = split_code_fragments(record["prompt"])

            # First and last pieces are kept as they are; only the middle is merged.
            merged_middle = merge_overlapping_strings_with_header(pieces[1:-1])
            rebuilt_prompt = "\n".join([pieces[0]] + merged_middle + [pieces[-1]])

            # Shallow copy is enough: only the top-level "prompt" key is replaced.
            updated_record = dict(record)
            updated_record["prompt"] = rebuilt_prompt

            json.dump(updated_record, outfile)
            outfile.write('\n')

# Usage
input_jsonl_path = 'subsets/rg-one-gram-ws-20-ss-2-one-line_0.1_instruct.jsonl'  # Update this to your input file path
# Output path is derived from the input path: "<name>_merged.jsonl".
output_jsonl_path = input_jsonl_path.replace(".jsonl","_merged.jsonl")

# Reads the input file and (over)writes the output file.
merge_overlapping_jsonl_lines(input_jsonl_path, output_jsonl_path)

### Merge and then create subsets

In [132]:
# The input path has the "_merged.jsonl" suffix that the merge cell's output path uses.
input_jsonl_path = 'subsets/rg-one-gram-ws-20-ss-2-one-line_0.1_instruct_merged.jsonl'  # Update this to your input file path
output_jsonl_path = 'subsets/rg-one-gram-ws-20-ss-2-one-line_0.1_instruct_merged_3_variants.jsonl' # Update this to your desired output file path

# Generate 3 prompt variants per record (matches the "_3_variants" suffix above).
process_jsonl_and_write(input_jsonl_path, output_jsonl_path, 3)

Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will b

### Subset the in-file context

In [133]:
def get_bottom_percent_lines(input_string, percent):
    """Return the last ``percent`` percent of the lines of ``input_string``.

    The input is stripped of surrounding whitespace and split on ``'\\n'``.
    The number of lines kept is ``percent`` percent of the line count,
    rounded down.

    Args:
        input_string: Text to take the trailing lines from.
        percent: Share of lines to keep, between 0 and 100 inclusive.

    Returns:
        The selected trailing lines joined with ``'\\n'``; an empty string
        when the rounded-down count is zero.

    Raises:
        ValueError: If ``percent`` is outside ``[0, 100]``.
    """
    if not 0 <= percent <= 100:
        raise ValueError("The percent must be between 0 and 100")

    # Split the string into lines
    lines = input_string.strip().split('\n')

    # Multiply before dividing: (percent / 100) * n can land just below the
    # exact value in floating point (0.29 * 100 == 28.999999999999996) and
    # would then be truncated to one line too few.
    cutoff_index = int(percent * len(lines) / 100)

    # lines[-0:] would select everything, so zero is handled explicitly.
    if cutoff_index <= 0:
        return ''

    return '\n'.join(lines[-cutoff_index:])

# Example usage
# Note: this rebinds the notebook-level names `input_string` and `percent`.
input_string = "Line 1\nLine 2\nLine 3\nLine 4\nLine 5"
percent = 40  # Suppose we want the bottom 40% of lines

result = get_bottom_percent_lines(input_string, percent)
print("The bottom {}% of lines:".format(percent))
print(result)

The bottom 40% of lines:
Line 4
Line 5


In [135]:
fragments

['# Here are some relevant code fragments from other files of the repo:',
 '# --------------------------------------------------\n# the below code fragment can be found in:\n# torchrl/envs/vec_env.py\n# --------------------------------------------------\n#     @_check_start\n#     def _shutdown_workers(self) -> None:\n#         if self.is_closed:\n#             raise RuntimeError(\n#                 "calling {self.__class__.__name__}._shutdown_workers only allowed when env.is_closed = False"\n#             )\n#         for i, channel in enumerate(self.parent_channels):\n#             if self._verbose:\n#                 print(f"closing {i}")\n#             # try:\n#             channel.send(("close", None))\n#             # except:\n#             #     raise RuntimeError(f"closing {channel} number {i} failed")\n#             msg, _ = channel.recv()\n#             if msg != "closing":\n#                 raise RuntimeError(\n#                     f"Expected \'closing\' but received {msg}

In [137]:
import json

def subset_infile_jsonl_lines(input_file, output_file, percent):
    """Shrink the last prompt fragment of every record to a subset of its lines.

    The prompt is split with ``split_code_fragments``. The last fragment is
    replaced by its first three lines followed by the output of
    ``get_bottom_percent_lines`` on that same fragment; the other fragments
    are kept. The pieces are joined with newlines and written back under the
    ``prompt`` key, leaving all other fields of the record unchanged.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the JSONL file to (over)write.
        percent: Percentage passed to ``get_bottom_percent_lines``.
    """
    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
        for raw_line in infile:
            record = json.loads(raw_line)
            pieces = split_code_fragments(record["prompt"])

            last_piece = pieces[-1]
            # Always keep the first three lines of the last fragment.
            leading_lines = last_piece.strip().split('\n')[:3]
            trailing_part = get_bottom_percent_lines(last_piece, percent)
            reduced_last_piece = "\n".join(leading_lines) + "\n" + trailing_part

            updated_record = dict(record)
            updated_record["prompt"] = "\n".join(pieces[:-1] + [reduced_last_piece])

            json.dump(updated_record, outfile)
            outfile.write('\n')

# Usage
# Percentage of trailing lines of the last fragment to keep.
percent = 50
input_jsonl_path = 'subsets/rg-one-gram-ws-20-ss-2-one-line_0.1_instruct_merged.jsonl'  # Update this to your input file path
# Output path is derived from the input path: "<name>_infile_subset_<percent>.jsonl".
output_jsonl_path = input_jsonl_path.replace(".jsonl",f"_infile_subset_{percent}.jsonl")

# Reads the input file and (over)writes the output file.
subset_infile_jsonl_lines(input_jsonl_path, output_jsonl_path, percent)

### Create Multiple In-File Context Subsets

In [176]:
def format_code_context(input_fragments, block_size=20, tail_size=10):
    """Rewrite the in-file context as retrieval-style commented fragments.

    The last element of ``input_fragments`` is treated as the in-file context.
    Its first two lines are dropped (presumably the separator and instruction
    line left by the splitter -- confirm against ``split_code_fragments``).
    The final ``tail_size`` lines are kept verbatim as the code to complete,
    and the lines before them are chunked into blocks of ``block_size`` lines.
    Each block gets a "main script file" fragment header and every line in it
    is prefixed with ``# ``.

    Args:
        input_fragments: List of prompt fragments; all but the last are
            copied through unchanged, joined with newlines.
        block_size: Number of context lines per commented block (default 20).
        tail_size: Number of trailing lines left uncommented (default 10).

    Returns:
        The reassembled prompt as a single string.

    Raises:
        ValueError: If ``block_size`` is less than 1 or ``tail_size`` is negative.
    """
    if block_size < 1:
        raise ValueError("block_size must be a positive integer")
    if tail_size < 0:
        raise ValueError("tail_size must be non-negative")

    separator = "# --------------------------------------------------"

    # Drop the first two lines of the in-file fragment, then split into lines.
    context_string = "\n".join(input_fragments[-1].split('\n')[2:])
    lines = context_string.splitlines()

    # An explicit split index (rather than lines[-tail_size:]) keeps
    # tail_size == 0 meaningful and also covers inputs shorter than the tail.
    split_at = max(len(lines) - tail_size, 0)
    body_lines = lines[:split_at]
    tail_lines = lines[split_at:]

    block_header = (
        f"{separator}\n"
        "# the below code fragment can be found in:\n"
        "# the main script file\n"
        f"{separator}\n"
    )
    formatted_blocks = [
        block_header + "# " + "\n# ".join(body_lines[start:start + block_size])
        for start in range(0, len(body_lines), block_size)
    ]

    # The trailing lines stay uncommented: they are the code to be completed.
    completion_block = (
        f"\n{separator}\n"
        '"""Based on the above, complete the following code from the main script file:"""\n'
        + "\n".join(tail_lines)
    )

    return (
        "\n".join(input_fragments[:-1])
        + "\n"
        + "\n".join(formatted_blocks)
        + completion_block
    )

# Example usage:
# `prompt` below holds one full example prompt: the retrieved fragments followed by the long in-file context
prompt = '''# Here are some relevant code fragments from other files of the repo:\n# --------------------------------------------------\n# the below code fragment can be found in:\n# src/diffusers/configuration_utils.py\n# --------------------------------------------------\n# \n#         if cls.has_compatibles:\n#             compatible_classes = [c for c in cls._get_compatibles() if not isinstance(c, DummyObject)]\n#         else:\n#             compatible_classes = []\n# \n#         expected_keys_comp_cls = set()\n#         for c in compatible_classes:\n#             expected_keys_c = cls._get_init_keys(c)\n#             expected_keys_comp_cls = expected_keys_comp_cls.union(expected_keys_c)\n#         expected_keys_comp_cls = expected_keys_comp_cls - cls._get_init_keys(cls)\n#         config_dict = {k: v for k, v in config_dict.items() if k not in expected_keys_comp_cls}\n# \n#         # remove attributes from orig class that cannot be expected\n#         orig_cls_name = config_dict.pop(\"_class_name\", cls.__name__)\n#         if orig_cls_name != cls.__name__ and hasattr(diffusers_library, orig_cls_name):\n#             orig_cls = getattr(diffusers_library, orig_cls_name)\n#             unexpected_keys_from_orig = cls._get_init_keys(orig_cls) - expected_keys\n#             config_dict = {k: v for k, v in config_dict.items() if k not in unexpected_keys_from_orig}\n# \n# --------------------------------------------------\n# the below code fragment can be found in:\n# src/diffusers/configuration_utils.py\n# --------------------------------------------------\n#         # load diffusers library to import compatible and original scheduler\n#         diffusers_library = importlib.import_module(__name__.split(\".\")[0])\n# \n#         if cls.has_compatibles:\n#             compatible_classes = [c for c in cls._get_compatibles() if not isinstance(c, DummyObject)]\n#         else:\n#             compatible_classes = []\n# \n#         expected_keys_comp_cls = set()\n#      
   for c in compatible_classes:\n#             expected_keys_c = cls._get_init_keys(c)\n#             expected_keys_comp_cls = expected_keys_comp_cls.union(expected_keys_c)\n#         expected_keys_comp_cls = expected_keys_comp_cls - cls._get_init_keys(cls)\n#         config_dict = {k: v for k, v in config_dict.items() if k not in expected_keys_comp_cls}\n# \n#         # remove attributes from orig class that cannot be expected\n#         orig_cls_name = config_dict.pop(\"_class_name\", cls.__name__)\n#         if orig_cls_name != cls.__name__ and hasattr(diffusers_library, orig_cls_name):\n#             orig_cls = getattr(diffusers_library, orig_cls_name)\n#             unexpected_keys_from_orig = cls._get_init_keys(orig_cls) - expected_keys\n# --------------------------------------------------\n# the below code fragment can be found in:\n# src/diffusers/configuration_utils.py\n# --------------------------------------------------\n# \n#         return config_dict\n# \n#     @staticmethod\n#     def _get_init_keys(cls):\n#         return set(dict(inspect.signature(cls.__init__).parameters).keys())\n# \n#     @classmethod\n#     def extract_init_dict(cls, config_dict, **kwargs):\n#         # 0. Copy origin config dict\n#         original_dict = {k: v for k, v in config_dict.items()}\n# \n#         # 1. 
Retrieve expected config attributes from __init__ signature\n#         expected_keys = cls._get_init_keys(cls)\n#         expected_keys.remove(\"self\")\n#         # remove general kwargs if present in dict\n#         if \"kwargs\" in expected_keys:\n#             expected_keys.remove(\"kwargs\")\n#         # remove flax internal keys\n#         if hasattr(cls, \"_flax_internal_args\"):\n# --------------------------------------------------\n# the below code fragment can be found in:\n# src/diffusers/configuration_utils.py\n# --------------------------------------------------\n#             compatible_classes = []\n# \n#         expected_keys_comp_cls = set()\n#         for c in compatible_classes:\n#             expected_keys_c = cls._get_init_keys(c)\n#             expected_keys_comp_cls = expected_keys_comp_cls.union(expected_keys_c)\n#         expected_keys_comp_cls = expected_keys_comp_cls - cls._get_init_keys(cls)\n#         config_dict = {k: v for k, v in config_dict.items() if k not in expected_keys_comp_cls}\n# \n#         # remove attributes from orig class that cannot be expected\n#         orig_cls_name = config_dict.pop(\"_class_name\", cls.__name__)\n#         if orig_cls_name != cls.__name__ and hasattr(diffusers_library, orig_cls_name):\n#             orig_cls = getattr(diffusers_library, orig_cls_name)\n#             unexpected_keys_from_orig = cls._get_init_keys(orig_cls) - expected_keys\n#             config_dict = {k: v for k, v in config_dict.items() if k not in unexpected_keys_from_orig}\n# \n#         # remove private attributes\n#         config_dict = {k: v for k, v in config_dict.items() if not k.startswith(\"_\")}\n# \n#         # 3. 
Create keyword arguments that will be passed to __init__ from expected keyword arguments\n# --------------------------------------------------\n# the below code fragment can be found in:\n# src/diffusers/configuration_utils.py\n# --------------------------------------------------\n#             compatible_classes = [c for c in cls._get_compatibles() if not isinstance(c, DummyObject)]\n#         else:\n#             compatible_classes = []\n# \n#         expected_keys_comp_cls = set()\n#         for c in compatible_classes:\n#             expected_keys_c = cls._get_init_keys(c)\n#             expected_keys_comp_cls = expected_keys_comp_cls.union(expected_keys_c)\n#         expected_keys_comp_cls = expected_keys_comp_cls - cls._get_init_keys(cls)\n#         config_dict = {k: v for k, v in config_dict.items() if k not in expected_keys_comp_cls}\n# \n#         # remove attributes from orig class that cannot be expected\n#         orig_cls_name = config_dict.pop(\"_class_name\", cls.__name__)\n#         if orig_cls_name != cls.__name__ and hasattr(diffusers_library, orig_cls_name):\n#             orig_cls = getattr(diffusers_library, orig_cls_name)\n#             unexpected_keys_from_orig = cls._get_init_keys(orig_cls) - expected_keys\n#             config_dict = {k: v for k, v in config_dict.items() if k not in unexpected_keys_from_orig}\n# \n#         # remove private attributes\n#         config_dict = {k: v for k, v in config_dict.items() if not k.startswith(\"_\")}\n# --------------------------------------------------\n\"\"\"Based on the above, complete the following code:\"\"\"\n\n = 0.1 * sample\n\n            with tempfile.TemporaryDirectory() as tmpdirname:\n                scheduler.save_config(tmpdirname)\n                new_scheduler = scheduler_class.from_pretrained(tmpdirname)\n\n            if num_inference_steps is not None and hasattr(scheduler, \"set_timesteps\"):\n                scheduler.set_timesteps(num_inference_steps)\n                
new_scheduler.set_timesteps(num_inference_steps)\n            elif num_inference_steps is not None and not hasattr(scheduler, \"set_timesteps\"):\n                kwargs[\"num_inference_steps\"] = num_inference_steps\n\n            # Set the seed before step() as some schedulers are stochastic like EulerAncestralDiscreteScheduler, EulerDiscreteScheduler\n            if \"generator\" in set(inspect.signature(scheduler.step).parameters.keys()):\n                kwargs[\"generator\"] = torch.manual_seed(0)\n            output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample\n\n            if \"generator\" in set(inspect.signature(scheduler.step).parameters.keys()):\n                kwargs[\"generator\"] = torch.manual_seed(0)\n            new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample\n\n            assert torch.sum(torch.abs(output - new_output)) < 1e-5, \"Scheduler outputs are not identical\"\n\n    def check_over_forward(self, time_step=0, **forward_kwargs):\n        kwargs = dict(self.forward_default_kwargs)\n        kwargs.update(forward_kwargs)\n\n        num_inference_steps = kwargs.pop(\"num_inference_steps\", None)\n\n        for scheduler_class in self.scheduler_classes:\n            if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler):\n                time_step = float(time_step)\n\n            scheduler_config = self.get_scheduler_config()\n            scheduler = scheduler_class(**scheduler_config)\n\n            if scheduler_class == VQDiffusionScheduler:\n                num_vec_classes = scheduler_config[\"num_vec_classes\"]\n                sample = self.dummy_sample(num_vec_classes)\n                model = self.dummy_model(num_vec_classes)\n                residual = model(sample, time_step)\n            else:\n                sample = self.dummy_sample\n                residual = 0.1 * sample\n\n            with tempfile.TemporaryDirectory() as 
tmpdirname:\n                scheduler.save_config(tmpdirname)\n                new_scheduler = scheduler_class.from_pretrained(tmpdirname)\n\n            if num_inference_steps is not None and hasattr(scheduler, \"set_timesteps\"):\n                scheduler.set_timesteps(num_inference_steps)\n                new_scheduler.set_timesteps(num_inference_steps)\n            elif num_inference_steps is not None and not hasattr(scheduler, \"set_timesteps\"):\n                kwargs[\"num_inference_steps\"] = num_inference_steps\n\n            if \"generator\" in set(inspect.signature(scheduler.step).parameters.keys()):\n                kwargs[\"generator\"] = torch.manual_seed(0)\n            output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample\n\n            if \"generator\" in set(inspect.signature(scheduler.step).parameters.keys()):\n                kwargs[\"generator\"] = torch.manual_seed(0)\n            new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample\n\n            assert torch.sum(torch.abs(output - new_output)) < 1e-5, \"Scheduler outputs are not identical\"\n\n    def test_from_save_pretrained(self):\n        kwargs = dict(self.forward_default_kwargs)\n\n        num_inference_steps = kwargs.pop(\"num_inference_steps\", None)\n\n        for scheduler_class in self.scheduler_classes:\n            timestep = 1\n            if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler):\n                timestep = float(timestep)\n\n            scheduler_config = self.get_scheduler_config()\n            scheduler = scheduler_class(**scheduler_config)\n\n            if scheduler_class == VQDiffusionScheduler:\n                num_vec_classes = scheduler_config[\"num_vec_classes\"]\n                sample = self.dummy_sample(num_vec_classes)\n                model = self.dummy_model(num_vec_classes)\n                residual = model(sample, timestep)\n            else:\n   
             sample = self.dummy_sample\n                residual = 0.1 * sample\n\n            with tempfile.TemporaryDirectory() as tmpdirname:\n                scheduler.save_config(tmpdirname)\n                new_scheduler = scheduler_class.from_pretrained(tmpdirname)\n\n            if num_inference_steps is not None and hasattr(scheduler, \"set_timesteps\"):\n                scheduler.set_timesteps(num_inference_steps)\n                new_scheduler.set_timesteps(num_inference_steps)\n            elif num_inference_steps is not None and not hasattr(scheduler, \"set_timesteps\"):\n                kwargs[\"num_inference_steps\"] = num_inference_steps\n\n            if \"generator\" in set(inspect.signature(scheduler.step).parameters.keys()):\n                kwargs[\"generator\"] = torch.manual_seed(0)\n            output = scheduler.step(residual, timestep, sample, **kwargs).prev_sample\n\n            if \"generator\" in set(inspect.signature(scheduler.step).parameters.keys()):\n                kwargs[\"generator\"] = torch.manual_seed(0)\n            new_output = new_scheduler.step(residual, timestep, sample, **kwargs).prev_sample\n\n            assert torch.sum(torch.abs(output - new_output)) < 1e-5, \"Scheduler outputs are not identical\"\n\n    def test_compatibles(self):\n        for scheduler_class in self.scheduler_classes:\n            scheduler_config = self.get_scheduler_config()\n\n            scheduler = scheduler_class(**scheduler_config)\n\n            assert all(c is not None for c in scheduler.compatibles)\n\n            for comp_scheduler_cls in scheduler.compatibles:\n                comp_scheduler = comp_scheduler_cls.from_config(scheduler.config)\n                assert comp_scheduler is not None\n\n            new_scheduler = scheduler_class.from_config(comp_scheduler.config)\n\n            new_scheduler_config = {k: v for k, v in new_scheduler.config.items() if k in scheduler.config}\n            scheduler_diff = {k: v for k, v in 
new_scheduler.config.items() if k not in scheduler.config}\n\n            # make sure that configs are essentially identical\n            assert new_scheduler_config == dict(scheduler.config)\n\n            # make sure that only differences are for configs that are not in init\n            init_keys = inspect.signature(scheduler_class.__init__).parameters.keys()\n            assert set(scheduler_diff.keys()).intersection(set(init_keys)) == set()\n\n    def test_from_pretrained(self):\n        for scheduler_class in self.scheduler_classes:'''
# Split the example prompt into fragments, reformat the last (in-file) fragment
# into commented blocks, and print the rebuilt prompt for visual inspection.
input_fragments = split_code_fragments(prompt)
formatted_code = format_code_context(input_fragments)
print(formatted_code)

# Here are some relevant code fragments from other files of the repo:
# --------------------------------------------------
# the below code fragment can be found in:
# src/diffusers/configuration_utils.py
# --------------------------------------------------
# 
#         if cls.has_compatibles:
#             compatible_classes = [c for c in cls._get_compatibles() if not isinstance(c, DummyObject)]
#         else:
#             compatible_classes = []
# 
#         expected_keys_comp_cls = set()
#         for c in compatible_classes:
#             expected_keys_c = cls._get_init_keys(c)
#             expected_keys_comp_cls = expected_keys_comp_cls.union(expected_keys_c)
#         expected_keys_comp_cls = expected_keys_comp_cls - cls._get_init_keys(cls)
#         config_dict = {k: v for k, v in config_dict.items() if k not in expected_keys_comp_cls}
# 
#         # remove attributes from orig class that cannot be expected
#         orig_cls_name = config_dict.pop("_class_name", cls.__name__

In [177]:
# First, write a function that converts the in-file context into the same format as the out-of-file context
import json

def merge_overlapping_jsonl_lines(input_file, output_file):
    """Write a copy of a JSONL file with every prompt passed through ``format_code_context``.

    Each record's prompt is split with ``split_code_fragments`` and the
    resulting fragments are handed to ``format_code_context``; the returned
    string replaces the prompt. All other record fields are copied unchanged.

    Args:
        input_file: Path of the JSONL file to read (UTF-8).
        output_file: Path of the JSONL file to write (UTF-8, overwritten).
    """
    with open(input_file, 'r', encoding='utf-8') as src:
        with open(output_file, 'w', encoding='utf-8') as dst:
            for raw_record in src:
                record = json.loads(raw_record)
                converted = record.copy()
                pieces = split_code_fragments(record["prompt"])
                converted["prompt"] = format_code_context(pieces)
                dst.write(json.dumps(converted) + '\n')

# Usage: convert the in-file context of every prompt into commented snippet blocks.
input_jsonl_path = 'temp_subsets/rg-one-gram-ws-20-ss-2-one-line_0.1_instruct.jsonl'  # Update this to your input file path
# The output is written next to the input with an "_infile_snippets" suffix.
output_jsonl_path = input_jsonl_path.replace(".jsonl","_infile_snippets.jsonl")

merge_overlapping_jsonl_lines(input_jsonl_path, output_jsonl_path)

In [180]:
# `at` is passed to process_jsonl_and_write as its num_splits argument and is
# also embedded in the output file name.
at = 10
input_jsonl_path = 'temp_subsets/rg-one-gram-ws-20-ss-2-one-line_0.1_instruct_infile_snippets.jsonl'
output_jsonl_path = input_jsonl_path.replace('.jsonl', f'_{at}.jsonl')

# Use the function with the path to your JSONL file
# NOTE: process_jsonl_and_write is defined in a later cell of this notebook;
# that cell must have been run before this one.
process_jsonl_and_write(input_jsonl_path, output_jsonl_path, at)

Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will be copied randomly to ensure sufficient fragments.
Existing fragments will b

### Multiple Infile Contexts with Consistent Length

In [183]:
import random
import re

def random_combinations_of_fragments_fixed_length(fragments, num_subsets):
    """Build three prompt snippets from a header, a footer and the fragments between.

    The first and last entries of ``fragments`` are used as header and footer
    of every snippet. When ``num_subsets`` is greater than 10 and at least ten
    central fragments exist, each snippet contains ten central fragments
    sampled without replacement (in random order). Otherwise every snippet
    contains all central fragments in their original order.

    Args:
        fragments: List of fragment strings; needs at least one element.
        num_subsets: Only compared against 10 to choose between sampling and
            using all fragments; it does not change the number of snippets.

    Returns:
        A list of exactly three snippet strings.
    """
    # First and last fragments are assumed to be the header and footer.
    header_fragment = fragments[0]
    footer_fragment = fragments[-1]
    central_fragments = fragments[1:-1]

    def assemble(middle):
        return header_fragment + '\n' + '\n'.join(middle) + '\n' + footer_fragment

    # NOTE(review): the gate is on num_subsets rather than on the number of
    # central fragments, which looks unintended -- confirm. It is kept as is so
    # existing callers see the same results. The extra length check prevents
    # random.sample from raising ValueError when fewer than ten central
    # fragments are available; in that case all fragments are used instead.
    if num_subsets > 10 and len(central_fragments) >= 10:
        return [assemble(random.sample(central_fragments, 10)) for _ in range(3)]

    # Three copies of the full snippet, for a consistent return shape.
    return [assemble(central_fragments)] * 3

In [None]:
import re
import random
import json
from typing import List

def split_snippets(text: str, num_splits) -> List[str]:
    """Split ``text`` into fragments and return the prompt variants built from them.

    Thin wrapper: ``split_code_fragments`` produces the fragments and
    ``random_combinations_of_fragments_fixed_length`` turns them into variants.
    """
    return random_combinations_of_fragments_fixed_length(
        split_code_fragments(text), num_splits
    )


def process_jsonl_and_write(file_path, output_jsonl_path, num_splits):
    """Expand every record of a JSONL file into its prompt variants.

    For each input record, ``split_snippets(prompt, num_splits)`` produces the
    prompt variants. One output record is written per variant, carrying the
    variant prompt and a metadata dict with the original ``task_id``, a
    per-variant ``instance_id`` (``"<task_id>_<index>"``) and the
    ``ground_truth``, ``fpath_tuple``, ``context_start_lineno`` and ``line_no``
    fields copied from the input. Other input fields are not carried over.

    Both files are opened as UTF-8, matching the other JSONL helpers in this
    notebook instead of relying on the platform default encoding. Blank lines
    in the input are skipped.

    Args:
        file_path: Path of the JSONL file to read.
        output_jsonl_path: Path of the JSONL file to write (overwritten).
        num_splits: Forwarded to ``split_snippets``.

    Raises:
        KeyError: If a record lacks ``prompt`` or one of the metadata fields.
    """
    output_variants = []

    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            if not line.strip():
                # Tolerate blank lines such as a trailing newline at end of file.
                continue

            data = json.loads(line)
            prompt = data['prompt']
            metadata = data['metadata']
            task_id = metadata['task_id']

            for i, variant in enumerate(split_snippets(prompt, num_splits)):
                output_variants.append({
                    "prompt": variant,
                    "metadata": {
                        "task_id": f"{task_id}",
                        "instance_id": f"{task_id}_{i}",
                        "ground_truth": metadata['ground_truth'],  # same ground truth for every variant
                        "fpath_tuple": metadata['fpath_tuple'],
                        "context_start_lineno": metadata['context_start_lineno'],
                        "line_no": metadata['line_no'],
                    }
                })

    # The output is written only after the whole input has been processed,
    # so a failure while reading does not leave a truncated output file.
    with open(output_jsonl_path, 'w', encoding='utf-8') as outfile:
        for variant in output_variants:
            json.dump(variant, outfile)
            outfile.write('\n')

# `num_variants` is passed to process_jsonl_and_write as num_splits and is also
# embedded in the output file name.
# NOTE(review): split_snippets forwards this value to
# random_combinations_of_fragments_fixed_length, whose sampling branch is taken
# only when it is greater than 10 -- with 10 the three variants per prompt
# would be identical. Confirm this is intended.
num_variants = 10

input_jsonl_path = 'subsets/rg-one-gram-ws-20-ss-2-fixed_0.1_with_instructions_temp_0.jsonl'  # Update this to your input file path
output_jsonl_path = f'temp_subsets/rg-one-gram-ws-20-ss-2-one-line_0.1_instruct_{num_variants}_variants.jsonl' # Update this to your desired output file path

# Use the function with the path to your JSONL file
process_jsonl_and_write(input_jsonl_path, output_jsonl_path, num_variants)

In [None]:
# Same call as the earlier "_infile_snippets" cell: `at` is passed to
# process_jsonl_and_write as num_splits and is embedded in the output file name.
at = 10
input_jsonl_path = 'temp_subsets/rg-one-gram-ws-20-ss-2-one-line_0.1_instruct_infile_snippets.jsonl'
output_jsonl_path = input_jsonl_path.replace('.jsonl', f'_{at}.jsonl')

# Use the function with the path to your JSONL file
process_jsonl_and_write(input_jsonl_path, output_jsonl_path, at)

### Subset to only short responses for pass@100

In [142]:
import tiktoken

In [149]:
def num_tokens_from_string(string: str) -> int:
    """Count the tokens of ``string`` under tiktoken's encoding for "gpt-3.5-turbo"."""
    token_ids = tiktoken.encoding_for_model("gpt-3.5-turbo").encode(string)
    return len(token_ids)
# Quick sanity check of the token counter; the recorded cell output is 9.
num_tokens_from_string("This is an example string with several tokens.")

9

In [150]:
import json

def merge_overlapping_jsonl_lines(input_file, output_file, token_limit=10):
    """Copy only the records whose ground truth is shorter than ``token_limit`` tokens.

    Despite its name, this function filters rather than merges: a record is
    written to ``output_file`` unchanged when
    ``num_tokens_from_string(record["metadata"]["ground_truth"])`` is strictly
    less than ``token_limit``. A summary line with the number of records
    written is printed at the end.

    NOTE(review): another cell of this notebook defines a different function
    under the same name; whichever cell ran last is the one in effect.

    Args:
        input_file: Path of the JSONL file to read (UTF-8).
        output_file: Path of the JSONL file to write (UTF-8, overwritten).
        token_limit: Exclusive upper bound on the ground-truth token count
            (default 10, the previously hard-coded value).
    """
    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:

        written = 0

        for line in infile:
            line_data = json.loads(line)
            ground_truth = line_data["metadata"]["ground_truth"]
            if num_tokens_from_string(ground_truth) < token_limit:
                written += 1
                # The record is written back as parsed; no copy is needed
                # because nothing modifies it.
                json.dump(line_data, outfile)
                outfile.write('\n')

        print(f"line {written} of {input_file} written to {output_file}")

# Usage: keep only the records whose ground truth is short (see the filter above).
input_jsonl_path = 'subsets/rg-one-gram-ws-20-ss-2-one-line_0.1_instruct.jsonl'  # Update this to your input file path
# The output is written next to the input with a "_small_gt" suffix.
output_jsonl_path = input_jsonl_path.replace(".jsonl","_small_gt.jsonl")

merge_overlapping_jsonl_lines(input_jsonl_path, output_jsonl_path)

line 85 of subsets/rg-one-gram-ws-20-ss-2-one-line_0.1_instruct.jsonl written to subsets/rg-one-gram-ws-20-ss-2-one-line_0.1_instruct_small_gt.jsonl


### Apply Syntax-Checking Before Eval