In [28]:
import json
from collections import Counter

In [31]:

def load_json(filename):
    """Load and parse a JSON document from disk.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON value, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which would mis-read or reject non-ASCII text on some systems.
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)

def is_approximately_same(str1, str2, threshold=0.99):
    """Decide whether two strings overlap enough, character-wise, to be treated as the same.

    The strings are compared as multisets of characters, so character order
    is ignored. The number of shared characters must reach ``threshold``
    times the length of the shorter string, truncated to an integer.

    Args:
        str1: First string.
        str2: Second string.
        threshold: Fraction of the shorter string's length that must be shared.

    Returns:
        True when the shared-character count meets the required count.
    """
    shorter_len = len(str1) if len(str1) < len(str2) else len(str2)
    needed = int(shorter_len * threshold)

    # Counter intersection keeps, per character, the smaller of the two counts.
    shared = Counter(str1) & Counter(str2)
    overlap = 0
    for occurrences in shared.values():
        overlap += occurrences

    return overlap >= needed

def find_non_matching_contexts(file_a, file_b, prefix_len=20):
    """Return the records of file B whose context does not start like any passage in file A.

    Two texts count as a match when their first ``prefix_len`` characters are
    exactly equal (the comparison is case- and whitespace-sensitive; texts
    shorter than ``prefix_len`` are compared in full).

    Args:
        file_a: Path to a JSON file holding an iterable of records, each with
            a 'passage' entry.
        file_b: Path to a JSON file holding an iterable of records, each with
            a 'context' entry.
        prefix_len: Number of leading characters compared. The default of 20
            reproduces the historical behaviour.

    Returns:
        A list of the records from file B, in file order and unmodified, for
        which no passage in file A shares the same prefix.

    Raises:
        KeyError: If a record lacks the 'passage' / 'context' key.
    """
    data_a = load_json(file_a)
    data_b = load_json(file_b)

    # Index every passage prefix once so each context is a single set lookup
    # rather than a scan over all of file A.
    passage_prefixes = {entry['passage'][:prefix_len] for entry in data_a}

    return [
        element
        for element in data_b
        if element['context'][:prefix_len] not in passage_prefixes
    ]

def save_results(non_matching_contexts, output_file):
    """Write the non-matching contexts to a JSON file.

    The records are wrapped in an object under the key
    "non_matching_contexts" and pretty-printed with a 4-space indent.

    Args:
        non_matching_contexts: JSON-serialisable collection of records.
        output_file: Path of the file to create or overwrite.

    Raises:
        OSError: If the file cannot be opened for writing.
        TypeError: If the records are not JSON-serialisable.
    """
    output_data = {"non_matching_contexts": non_matching_contexts}
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 explicitly; the locale default may be unable to encode them.
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(output_data, file, indent=4, ensure_ascii=False)



In [32]:
# Example usage
# file_a is passed as the first argument (the side whose 'passage' entries are
# indexed) and file_b as the second (the side whose 'context' entries are checked).
# NOTE(review): both inputs are absolute, machine-specific paths; this cell only
# runs where they exist.
file_a = '/home/recovery/Desktop/AGIEal_SFT_tools/data/benchmark_set/en_MCQ_lsat-ar.json'
file_b = '/home/recovery/Desktop/AGIEal_SFT_tools/From_data/complete_lsat_data/train_ar.json'
# Relative path: the result lands in the kernel's current working directory.
output_file = 'non_matching_contexts_ar.json'

# Collect the records of file_b that have no counterpart in file_a, then persist them.
non_matching_contexts = find_non_matching_contexts(file_a, file_b)
save_results(non_matching_contexts, output_file)


In [None]:

# Rebinds file_a, file_b and output_file, replacing any values assigned to
# these names by earlier cells.
# NOTE(review): both inputs are absolute, machine-specific paths.
file_a = '/home/recovery/Desktop/AGIEal_SFT_tools/data/benchmark_set/en_MCQ_logiqa-en.json'
file_b = '/home/recovery/Desktop/AGIEal_SFT_tools/data/LogiQA/SFT_data_for_LogiQA_from_LogiQAofficial_Train_8678.json'
# Relative path: resolved against the kernel's current working directory.
output_file = 'non_duplicates.json'


In [None]:
# Read both JSON documents into memory, decoding them as UTF-8.
with open(file_a, mode='r', encoding='utf-8') as handle_a:
    data_a = json.load(handle_a)
with open(file_b, mode='r', encoding='utf-8') as handle_b:
    data_b = json.load(handle_b)

# Key each record of file A by the first 20 characters of its
# whitespace-stripped passage; a set gives constant-time membership tests.
contents_a = set(record['passage'].strip()[:20] for record in data_a)



In [60]:

# Split the records of file B: count those already present in file A and
# keep the rest for output.
non_duplicates = []
duplicate_count = 0

for item in data_b:
    # Lookup key: first 20 characters of the stripped content of the
    # record's first message.
    content_b = item['messages'][0]['content'].strip()[:20]

    if content_b not in contents_a:
        non_duplicates.append(item)
        continue

    print("matched: ",content_b)
    duplicate_count += 1

# Report how many records were dropped as duplicates.
print(f"Number of duplicates found: {duplicate_count}")

# Persist the surviving records as pretty-printed UTF-8 JSON.
with open(output_file, 'w', encoding='utf-8') as out_file:
    json.dump(non_duplicates, out_file, indent=4, ensure_ascii=False)

print(f"Non-duplicate items written to {output_file}")


matched:  Statistics show that
matched:  Some college teacher
matched:  According to the con
matched:  It is generally beli
matched:  Statistics show that
matched:  Some people think th
matched:  It is generally beli
matched:  It is generally beli
matched:  Statistics show that
matched:  According to statist
matched:  It is generally beli
matched:  The researchers divi
matched:  Some people think th
matched:  In recent years, Chi
matched:  Since the beginning 
matched:  It is generally beli
matched:  In recent years, the
matched:  According to statist
matched:  Research shows that 
matched:  Pollution problems c
matched:  Since the beginning 
matched:  In order to reduce t
matched:  In the planning of a
matched:  The company sent thr
matched:  In a traditional Chi
matched:  In recent years, gra
matched:  A unit conducted the
matched:  Zhang Ming, Li Ying,
matched:  The person in charge
matched:  There are five teams
matched:  Compared with small 
matched:  Researchers recently
matched: