In [12]:
import json  
import re  

In [13]:
def extract_last_number(text, min_digits=6):
    r"""Return the final long number found in ``text`` as a digit string.

    Commas are stripped first so that "1,234,567" is read as "1234567".
    Candidates are then looked up in this order:

    1. the LAST ``\boxed{...}`` value made only of digits (optional whitespace
       inside the braces is tolerated) with at least ``min_digits`` digits;
    2. otherwise the last standalone run of at least ``min_digits`` digits
       anywhere in the text.

    Args:
        text: The text to scan. Anything that is not a ``str`` yields ``None``.
        min_digits: Minimum number of digits a candidate must have (default 6).

    Returns:
        The matching digit string, or ``None`` when nothing qualifies.
    """
    if not isinstance(text, str):
        return None

    # Remove commas from numbers
    text = text.replace(',', '')

    # Prefer the last qualifying \boxed{} value: a response may box an
    # intermediate or corrected value before the final one.
    boxed_values = [
        value
        for value in re.findall(r'\\boxed\{\s*(\d+)\s*\}', text)
        if len(value) >= min_digits
    ]
    if boxed_values:
        return boxed_values[-1]

    # Fallback: last standalone number with enough digits
    numbers = re.findall(r'\b\d{%d,}\b' % min_digits, text)
    if numbers:
        return numbers[-1]
    return None
  

In [14]:
# Load the file.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's locale default (which is not UTF-8 on every system).
with open('gemini_responses_complete.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [21]:
# Walk every response, pull the answer out of its 'new_thought' text and,
# when one is found, store it on the record as an int. `results` keeps the
# (position, digit-string) pairs for later inspection.
results = []
for position, record in enumerate(data):
    extracted = extract_last_number(record.get('new_thought', ''))
    if not extracted:
        continue
    record['new_generated_answer'] = int(extracted)
    results.append((position, extracted))

In [19]:

  


  

  
# Preview a handful of extracted answers to eyeball the extraction
print("First 5 extracted answers:")
for idx, answer in results[:5]:
    print("Response " + str(idx) + ": " + answer)

print("\nTotal answers found: " + str(len(results)))

# Tally how many answers exist for each digit length
digit_counts = list(map(len, (answer for _, answer in results)))
length_tally = {}
for length in digit_counts:
    length_tally[length] = length_tally.get(length, 0) + 1

print("\nDigit count distribution:")
for digits in sorted(length_tally):
    print(str(digits) + " digits: " + str(length_tally[digits]) + " answers")

print("done")

First 5 extracted answers:
Response 0: 25817814
Response 1: 32127181
Response 2: 29492222
Response 3: 32447079
Response 4: 25148992

Total answers found: 1170

Digit count distribution:
6 digits: 3 answers
7 digits: 24 answers
8 digits: 1141 answers
9 digits: 2 answers
done


In [17]:
# Dump every (index, answer-string) pair in `results` for manual inspection.
# NOTE(review): this prints the whole list; a slice such as results[:10] would
# keep the notebook output short.
print(results)

[(0, '25817814'), (1, '32127181'), (2, '29492222'), (3, '32447079'), (4, '25148992'), (5, '30059751'), (6, '34487329'), (7, '26251950'), (8, '33517074'), (9, '22532845'), (10, '18471540'), (11, '35985795'), (12, '28209813'), (13, '24912373'), (14, '34290357'), (15, '27514068'), (16, '27941138'), (17, '32276231'), (18, '23988258'), (19, '26308879'), (20, '35890309'), (22, '27972653'), (23, '22829342'), (24, '32009093'), (26, '21588147'), (27, '44883182'), (28, '22299042'), (29, '39621696'), (30, '30419156'), (32, '49680873'), (33, '36165189'), (34, '30052588'), (35, '28160811'), (36, '33379071'), (37, '32726543'), (38, '35199580'), (41, '24141594'), (43, '20550718'), (45, '28372046'), (46, '5402628'), (47, '30010310'), (48, '32487081'), (49, '47448742'), (50, '28374273'), (51, '24578337'), (52, '40405141'), (54, '35001548'), (56, '35911496'), (57, '36279291'), (58, '36310257'), (62, '32775449'), (63, '8427319'), (64, '30046209'), (65, '15599609'), (67, '23457258'), (68, '22129195'), (69

In [22]:
# Write `data` back to disk under a new name so the source file stays untouched.
with open('updated_gemini_responses.json', 'w') as out_file:
    json.dump(data, out_file, indent=4)


In [1]:
import json

def _answers_match(entry):
    """Return True when ``entry`` is a dict whose two answer fields exist and are equal."""
    return (
        isinstance(entry, dict)
        and "actual_answer" in entry
        and "new_generated_answer" in entry
        and entry["actual_answer"] == entry["new_generated_answer"]
    )


def process_json(input_file_path, output_file_path):
    """Copy the entries whose generated answer equals the actual answer.

    Reads JSON from ``input_file_path``. The top-level value may be a list of
    entries or a single entry. Every dict entry that has both
    ``"actual_answer"`` and ``"new_generated_answer"`` with equal values is
    kept; anything else (including non-dict items) is skipped. The kept
    entries are written as a JSON list to ``output_file_path``.

    Args:
        input_file_path: Path of the JSON file to read.
        output_file_path: Path of the JSON file to write.

    Returns:
        None. Progress and errors are reported with ``print``; no exception
        is propagated to the caller.
    """
    try:
        # Load the JSON file. Only a missing INPUT file produces the
        # "Input file not found." message; a bad output path is reported by
        # the generic handler below with its real error text.
        try:
            with open(input_file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
        except FileNotFoundError:
            print("Input file not found.")
            return

        # Treat a single top-level object like a one-element list so both
        # shapes go through the same check.
        candidates = data if isinstance(data, list) else [data]
        matching_entries = [entry for entry in candidates if _answers_match(entry)]

        # Save matching entries to a new JSON file
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            json.dump(matching_entries, output_file, indent=4)

        print(f"Matching entries saved to {output_file_path}")

    except Exception as e:
        # Deliberate best effort: report the problem instead of raising.
        print(f"An error occurred: {e}")

# Example usage: keep only the entries whose extracted answer equals the
# actual answer and write them to output.json.
input_file_path, output_file_path = 'updated_gemini_responses.json', 'output.json'
process_json(input_file_path, output_file_path)


Matching entries saved to output.json


## Formatting the data in Alpaca format for Qwen

In [1]:
import json

# Read output.json back in as the working dataset for formatting.
with open('output.json', 'r') as matches_file:
    data = json.load(matches_file)


In [15]:
# Build three parallel lists, one item per entry in `data`:
# the instruction, the input text and the expected output text.
instructions = []
inputs = []
outputs = []

for entry in data:
    # The f-strings are double-quoted on purpose: reusing the enclosing quote
    # character inside a replacement field is only valid on Python 3.12+
    # (PEP 701). The resulting strings are identical.
    textin = f"Add {entry.get('numbers')} <think>{entry.get('thought')}"
    textout = f"{entry.get('new_thought')}</think><answer>{entry.get('actual_answer')}</answer>"
    instructions.append('Add')
    inputs.append(textin)
    outputs.append(textout)

In [None]:
# Prompt template with three positional "{}" slots, in order: the instruction,
# the input and the response.
# NOTE(review): presumably filled with str.format(instruction, input, output)
# in a later cell; confirm against the caller.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""