# In [1]:
import json
import os

# Read the raw JSONL lines from disk. JSON text is UTF-8 by specification, so
# the encoding is given explicitly instead of relying on the platform default
# (which is not UTF-8 on every system and can mis-decode non-ASCII output).
with open('llama3-70b-solution-last.jsonl', 'r', encoding='utf-8') as f:
    jsonl_lines = f.readlines()

import re


def extract_answer(data):
    """Return the text following the first ``"answer`` key in *data*.

    The value is whatever sits between the first colon after ``"answer`` and
    the next occurrence of ``"\\ndata"``, with surrounding whitespace and then
    surrounding double quotes removed. ``None`` is returned when the key, the
    colon, or the ``"\\ndata"`` terminator is missing.
    """
    # Everything after the first '"answer' marker; an empty separator means
    # the marker is absent.
    _, key, after_key = data.partition('"answer')
    if not key:
        return None

    # The value starts right after the first colon that follows the marker.
    _, colon, after_colon = after_key.partition(':')
    if not colon:
        return None

    # The value ends where the next "\ndata" chunk begins.
    raw_value, terminator, _ = after_colon.partition("\ndata")
    if not terminator:
        return None

    return raw_value.strip().strip('"')

# Function to extract Problem ID and Answer
def extract_problem_and_answer(jsonl_lines):
    """Build one ``{problem_id: {"answer": ...}}`` record per usable line.

    Each line is expected to be a JSON object whose first key is the problem
    ID and whose value is a string. The text between the first ``{`` and the
    last ``}`` of that string is passed to ``extract_answer``.

    Blank lines are skipped silently. Lines that are not valid JSON, or that
    are valid JSON but not a non-empty object with a string value, are
    reported on stdout and skipped, so one bad record cannot abort the run.

    Args:
        jsonl_lines: Iterable of raw JSONL lines.

    Returns:
        A list of single-key dicts in input order. The answer is stored via
        ``str()``, so a missing answer appears as the string ``"None"``.
    """
    extracted_data = []
    for line in jsonl_lines:
        # A trailing or separating blank line is not an error worth reporting.
        if not line.strip():
            continue

        # Keep the try body limited to the call that can raise the error.
        try:
            data = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            continue

        # Valid JSON that is not a non-empty object has no problem ID to use.
        if not isinstance(data, dict) or not data:
            print(f"Skipping line without a problem entry: {line.strip()[:80]!r}")
            continue

        # The first key is taken to be the problem ID.
        problem_id = next(iter(data))
        problem_data = data[problem_id]
        if not isinstance(problem_data, str):
            print(f"Skipping problem {problem_id!r}: value is not a string")
            continue

        # Narrow the text to the outermost {...} span before searching it.
        start_index = problem_data.find('{')
        end_index = problem_data.rfind('}') + 1

        answer = extract_answer(problem_data[start_index:end_index])

        # Construct the JSONL format output
        extracted_data.append({problem_id: {"answer": str(answer)}})

    return extracted_data

# Extract data from the JSONL lines
result = extract_problem_and_answer(jsonl_lines)

# Output the extracted problem IDs and answers in JSONL format.
# Create the output directory first so the write does not fail with
# FileNotFoundError after all the parsing work has been done.
os.makedirs('clean', exist_ok=True)
with open('clean/output-last.jsonl', 'w', encoding='utf-8') as f:
    for item in result:
        f.write(json.dumps(item) + '\n')
