In [1]:
import os
from getpass import getpass

# Provide the Anthropic API key without writing it into the notebook:
# reuse the value already exported in the environment, otherwise prompt for it
# (getpass does not echo the input, so the key never lands in a saved output).
# NOTE(review): keep this above the `birds_transforms` import — this cell's log
# output shows `birds_transforms.constants` reading ANTHROPIC_API_KEY,
# presumably at import time; confirm.
if not os.environ.get("ANTHROPIC_API_KEY"):
    os.environ["ANTHROPIC_API_KEY"] = getpass("ANTHROPIC_API_KEY: ")

from typing import DefaultDict
from birds_transforms.utils import (
    find_intervals, 
    identify_code_location, 
    prompt_anthropic,
    extract_tag_list,
)
import shutil
import time
from pathlib import Path

import argparse
from tqdm import tqdm
import json
from core.data import descriptions
from core.data import tokenization

# Model identifiers to pass as `model_id` to `prompt_anthropic`.
# Only OPUS is actually used by the cells visible in this notebook.
# NOTE(review): the "anthropic.<name>-v1:0" forms (and the `_BR` suffix) look
# like AWS Bedrock model ids, while the bare forms look like Anthropic API ids
# — confirm which ones `prompt_anthropic` accepts.
OPUS = "claude-3-opus-20240229"
OPUS_BR = "anthropic.claude-3-opus-20240229-v1:0"
HAIKU = "anthropic.claude-3-haiku-20240307-v1:0"
SONNET = "anthropic.claude-3-sonnet-20240229-v1:0"
SONNET_3_5 = "claude-3-5-sonnet-20240620"

[32m2024-08-03 03:36:45.002[0m | [34m[1mDEBUG   [0m | [36mbirds_transforms.constants[0m:[36mgetenv[0m:[36m16[0m - [34m[1mLoading environment variable ANTHROPIC_API_KEY[0m
[32m2024-08-03 03:36:45.006[0m | [34m[1mDEBUG   [0m | [36mbirds_transforms.constants[0m:[36mgetenv[0m:[36m16[0m - [34m[1mLoading environment variable OPENAI_API_KEY[0m
[32m2024-08-03 03:36:45.007[0m | [34m[1mDEBUG   [0m | [36mbirds_transforms.constants[0m:[36mgetenv[0m:[36m16[0m - [34m[1mLoading environment variable ENVROOT[0m
[32m2024-08-03 03:36:45.008[0m | [34m[1mDEBUG   [0m | [36mbirds_transforms.constants[0m:[36mgetenv[0m:[36m21[0m - [34m[1mUsing default value for environment variable TREE_SITTER_LIBRARY_PATH[0m
  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
# Load the test records from a JSON Lines file: one JSON document per line.
# - The encoding is pinned to UTF-8 so parsing does not depend on the
#   platform's default locale encoding.
# - Whitespace-only lines (e.g. an extra newline at the end of the file) are
#   skipped; json.loads would raise JSONDecodeError on them.
with open('../test_records.jsonl', 'r', encoding='utf-8') as f:
    test_records = [json.loads(line) for line in f if line.strip()]

In [3]:
# Read the id file and parse it in one step. Its entries are later paired by
# position with `test_records` (entry[0] -> problem id, entry[1] -> submission id).
with open('../test-ids.json', 'r') as ids_file:
    test_ids = json.loads(ids_file.read())

In [6]:
# System prompt: passed as `system=` to `prompt_anthropic` when querying the model.
SYSTEM_TEMPLATE = """
You are an experienced program analyzer who can identify potential runtime errors without running the programs.
"""

# User prompt, filled in with `TEMPLATE.format(input=..., implementation=...)`:
#   {input}          -> input information extracted from the problem description
#   {implementation} -> source code of the submission under analysis
# The trailing backslashes inside the triple-quoted string are line
# continuations, so the numbered error-type list (and the final two
# instruction lines) are each joined into a single line of the rendered prompt.
# The numbered list mirrors the `error_types` mapping defined in this notebook;
# keep the two in sync when editing either one.
# The model is asked to answer inside <Conclusion> and <Reasoning> tags; the
# <Conclusion> tag is what gets parsed from the response afterwards.
TEMPLATE = """

Given the description on input and a implemented script, please prediction what kinds of runtime errors the implementation would encounter:

Here is the input:
<Input>
{input}
</Input>

Here is the implementation:
<Implementation>
{implementation}
</Implementation>

Please predict the first runtime error that might crash the program. Potential runtime errors are:\
1: 'No Error', \
2: 'Other', \
3: 'Timeout', \
4: 'AssertionError', \
5: 'AttributeError', \
6: 'decimal', \
7: 'EOFError', \
8: 'FileNotFoundError', \
9: 'ImportError', \
10: 'IndentationError', \
11: 'IndexError', \
12: 'KeyError', \
13: 'MathDomainError', \
14: 'MemoryError', \
15: 'ModuleNotFoundError', \
16: 'NameError', \
17: 'OSError', \
18: 'OverflowError', \
19: 're.error', \
20: 'RecursionError', \
21: 'RuntimeError', \
22: 'StopIteration', \
23: 'SyntaxError', \
24: 'TabError', \
25: 'TypeError', \
26: 'UnboundLocalError', \
27: 'ValueError', \
28: 'ZeroDivisionError', \
29: 'numpy.AxisError' \

Please output your predicted error type in the "Conclusion" section, being wrapped by <Conclusion></Conclusion>; \
and your reasoning in the "Reasoning" section, being wrapped by <Reasoning></Reasoning>. \
"""

In [5]:
# Mapping from integer label id (1-based) to the runtime error name.
# The names are listed in label order; `enumerate(..., start=1)` assigns the ids,
# so id 1 is 'No Error' and id 29 is 'numpy.AxisError'.
error_types = dict(enumerate(
    (
        'No Error',
        'Other',
        'Timeout',
        'AssertionError',
        'AttributeError',
        'decimal',
        'EOFError',
        'FileNotFoundError',
        'ImportError',
        'IndentationError',
        'IndexError',
        'KeyError',
        'MathDomainError',
        'MemoryError',
        'ModuleNotFoundError',
        'NameError',
        'OSError',
        'OverflowError',
        're.error',
        'RecursionError',
        'RuntimeError',
        'StopIteration',
        'SyntaxError',
        'TabError',
        'TypeError',
        'UnboundLocalError',
        'ValueError',
        'ZeroDivisionError',
        'numpy.AxisError',
    ),
    start=1,
))


In [None]:
# Peek at the first few id entries only; displaying the whole list would flood
# the cell output (there is one entry per test record).
test_ids[:5]

In [7]:
# Accumulator for the entries describing submissions with a runtime error.
errors = list()

In [8]:
encoding = 'utf-8'  # not referenced in this cell; kept in case other cells rely on it

# Start from an empty list on every run so that re-executing this cell does not
# append a second copy of every entry to `errors`.
errors = []

# `test_records` and `test_ids` are paired by position: ids[0] is the problem id
# and ids[1] the submission id for the record at the same index.
for record, ids in zip(test_records, test_ids):
    # The label value is indexable; its first element is the error-type id.
    error_type = record['target']['int64List']['value']
    true_error_type = int(error_type[0])
    problem_id = ids[0]
    submission_id = ids[1]
    # Keep only submissions whose label is not 1 ('No Error').
    if true_error_type != 1:
        errors.append({
            'problem_id': problem_id,
            'submission_id': submission_id,
            'true_error_type': true_error_type,
        })

# NOTE: `problem_id`, `submission_id` and `true_error_type` are deliberately
# left as plain loop variables — later cells read the values from the final
# iteration, so do not fold this loop into a comprehension.

In [None]:
import numpy as np

np.random.seed(42)

# Opt-in switch for sampling 'No Error' records.
# `args` is not defined anywhere in the visible notebook (presumably a leftover
# from a script/argparse version — confirm), so fall back to False instead of
# raising NameError. Define `args.no_error` (or edit this flag) to enable it.
_cli_args = globals().get("args")
sample_no_error = bool(getattr(_cli_args, "no_error", False))

safes = []
if sample_no_error and errors:
    # Indices into `test_records` whose label is 1 ('No Error'). The label value
    # is a list, so its first element is taken — same access as in the loop
    # that builds `errors`; calling int() on the list itself raises TypeError.
    safe_indices = [
        index
        for index, record in enumerate(test_records)
        if int(record['target']['int64List']['value'][0]) == 1
    ]
    if not safe_indices:
        # A rejection-sampling loop would spin forever in this situation.
        raise ValueError("test_records contains no 'No Error' records to sample from")

    # Draw as many 'No Error' indices as there are error entries, uniformly and
    # with replacement. Positions are drawn within `safe_indices`, so every
    # record of `test_records` is eligible (not just the first len(errors)).
    # NOTE(review): duplicates are possible; switch to sampling without
    # replacement if each sampled record must be unique.
    picks = np.random.randint(len(safe_indices), size=len(errors))
    safes = [safe_indices[int(pick)] for pick in picks]

In [9]:
# Number of collected entries, i.e. submissions whose label is not 1 ('No Error').
len(errors)

219091

In [None]:
# Persist `errors` as JSON Lines: one serialized entry per line of errors.jsonl.
import json
with open('errors.jsonl', 'w') as sink:
    sink.writelines(json.dumps(entry) + '\n' for entry in errors)

In [29]:
# NOTE(review): nothing in this cell assigns the name; the displayed value is
# presumably the one left by the last iteration of the loop that builds
# `errors` — confirm, since it depends on cell execution order.
true_error_type

1

In [None]:
# NOTE(review): machine-specific absolute path; adjust for your checkout.
root_directory = '/home/ubuntu/mnt/agent/amazon-Q/NGDEBirds/NGDEBirdsScienceTransforms/src/birds_transforms/examples/Project_CodeNet'

# Source code of the submission currently held in `problem_id` / `submission_id`.
implementation_path = os.path.join(
    root_directory, 'data', problem_id, 'Python', '{}.py'.format(submission_id)
)
implementation = Path(implementation_path).read_text()

# HTML statement of the same problem; only its input information is kept.
description_path = os.path.join(
    root_directory, 'problem_descriptions', '{}.html'.format(problem_id)
)
problem_description = Path(description_path).read_text()
info = descriptions.extract_input_information(problem_description)

In [31]:
# Render the user prompt from the extracted input information and the
# submission's source code.
prompt = TEMPLATE.format(implementation=implementation, input=info)

# Query the model and show its raw answer.
response = prompt_anthropic(system=SYSTEM_TEMPLATE, prompt=prompt, model_id=OPUS, temperature=0.1)
print(response)

# Take the first <Conclusion> entry (e.g. "1: 'No Error'") and keep the text
# before the first colon. Note the result is a string, whereas
# `true_error_type` is an int.
first_conclusion = extract_tag_list('Conclusion', response)[0]
error_type = first_conclusion.strip().partition(':')[0]

print('predicted error type', error_type)
print('true error type', true_error_type)

Here is my analysis of the given implementation:

<Reasoning>
The implementation looks correct and should not encounter any runtime errors based on the given input description. Here's why:

1. The input is properly read using int(input()) and map(float, input().split()), which matches the specified input format of an integer N followed by N lines of real numbers.

2. The math operations like hypot and abs are used correctly and should not cause any domain errors or exceptions.

3. The comparisons and if-elif conditions are properly structured and will execute without any errors.

4. There are no signs of potential infinite recursion, out of memory issues, or other runtime errors.

As long as the input strictly follows the specified format, this implementation should run without any runtime errors.
</Reasoning>

<Conclusion>1: 'No Error'</Conclusion>
predicted error type 1
true error type 1
