# SPICE Dataset Preparation

In [1]:
import json
from typing import Dict
from Models import *
from Entity_Resolver import *
from Dataset_stats import *
from Import import *
from Export import *

### Resolve Wikidata Entities

In [5]:
# Smoke-test the resolver with two Wikidata IDs.
entity = 'Q81938' #Q15901648
# Deliberately not named `property`: binding that name at notebook level would
# shadow the builtin `property` for every cell executed afterwards.
other_entity = 'Q838948'
for wikidata_id in (entity, other_entity):
    print(f"{wikidata_id}: {retriev_wikidata_entity(wikidata_id)}")

Q81938: pain
Q838948: work of art


### Create Dataframe and Export to csv

In [None]:
# Export the training split to the pre-processed folder.
# NOTE(review): the four numeric arguments are positional; their meaning is
# defined by create_spice_csv and cannot be read off this cell — confirm there.
input_folder_path, output_folder_path = '../SPICE_dataset/train', './SPICE_dataset_pp/train'
create_spice_csv(input_folder_path, output_folder_path, 25, 25, 189, 300)

In [None]:
# Export a random selection of the test split to the pre-processed folder.
# NOTE(review): 172 is passed positionally; its meaning is defined by
# create_spice_csv_random — confirm there.
input_folder_path, output_folder_path = '../SPICE_dataset/test', './SPICE_dataset_pp/test'
create_spice_csv_random(input_folder_path, output_folder_path, 172)

## Create Test Set

In [2]:
# Raw test split (input) and pre-processed test subset (output) used by the cells below
input_path, output_path = "../SPICE_dataset/test", "./SPICE_dataset_pp/test"

### Calculate Number of Question Types and Subcategories in Test Set

In [5]:
# Map every question type found under input_path to the list of its subcategories
question_types = list_question_types_with_categories(input_path)
print(question_types)

{'Clarification': ['Simple Question|Single Entity|Indirect', 'Comparative|Count over More/Less|Single entity type|Indirect', 'Quantitative|Count|Single entity type|Indirect', 'Comparative|More/Less|Mult. entity type|Indirect', 'Quantitative|Count|Logical operators|Indirect', 'Comparative|More/Less|Single entity type|Indirect', 'Comparative|Count over More/Less|Mult. entity type|Indirect'], 'Comparative Reasoning (All)': ['Comparative|More/Less|Single entity type', 'Comparative|More/Less|Single entity type|Incomplete', 'Comparative|More/Less|Mult. entity type', 'Comparative|More/Less|Mult. entity type|Incomplete', 'Comparative|More/Less|Mult. entity type|Indirect', 'Comparative|More/Less|Single entity type|Indirect', ''], 'Comparative Reasoning (Count) (All)': ['', 'Comparative|Count over More/Less|Mult. entity type', 'Comparative|Count over More/Less|Mult. entity type|Incomplete', 'Comparative|Count over More/Less|Single entity type', 'Comparative|Count over More/Less|Mult. entity type

In [6]:
# Flatten the per-type subcategory lists into one list; entries that occur
# under several question types are kept once per type.
categories = []
for subcategories in question_types.values():
    categories.extend(subcategories)
print(categories)
print(f"Number of question types: {len(question_types)}")
print(f"Number of question (sub)categories: {len(categories)}")

['Simple Question|Single Entity|Indirect', 'Comparative|Count over More/Less|Single entity type|Indirect', 'Quantitative|Count|Single entity type|Indirect', 'Comparative|More/Less|Mult. entity type|Indirect', 'Quantitative|Count|Logical operators|Indirect', 'Comparative|More/Less|Single entity type|Indirect', 'Comparative|Count over More/Less|Mult. entity type|Indirect', 'Comparative|More/Less|Single entity type', 'Comparative|More/Less|Single entity type|Incomplete', 'Comparative|More/Less|Mult. entity type', 'Comparative|More/Less|Mult. entity type|Incomplete', 'Comparative|More/Less|Mult. entity type|Indirect', 'Comparative|More/Less|Single entity type|Indirect', '', '', 'Comparative|Count over More/Less|Mult. entity type', 'Comparative|Count over More/Less|Mult. entity type|Incomplete', 'Comparative|Count over More/Less|Single entity type', 'Comparative|Count over More/Less|Mult. entity type|Indirect', 'Comparative|Count over More/Less|Single entity type|Indirect', 'Comparative|Cou

In [3]:
# Number of instances per question type in the raw test split
question_type_instances = count_instances_of_each_question_type(input_path)
print(question_type_instances)

{'Clarification': 11529, 'Comparative Reasoning (All)': 13966, 'Comparative Reasoning (Count) (All)': 13821, 'Logical Reasoning (All)': 21539, 'Quantitative Reasoning (All)': 8361, 'Quantitative Reasoning (Count) (All)': 22820, 'Simple Question (Coreferenced)': 53568, 'Simple Question (Direct)': 80259, 'Simple Question (Ellipsis)': 9804, 'Verification (Boolean) (All)': 25607}


In [4]:
# Number of instances per question subtype, keyed "<question type> [<description>]"
# (the description part is absent when an entry has none)
question_subtype_instances = count_instances_of_question_subtypes(input_path)
print(question_subtype_instances)

{'Simple Question (Direct) [Simple Question|Single Entity]': 46530, 'Simple Question (Coreferenced) [Simple Question|Single Entity|Indirect]': 40352, 'Simple Question (Direct) [Simple Question]': 27769, 'Simple Question (Ellipsis) [only subject is changed, parent and predicate remains same]': 8967, 'Logical Reasoning (All) [Logical|Union|Single_Relation]': 8711, 'Clarification [Simple Question|Single Entity|Indirect]': 7323, 'Simple Question (Coreferenced)': 7323, 'Simple Question (Direct) [Simple Question|Mult. Entity|Indirect]': 5960, 'Simple Question (Coreferenced) [Simple Question|Mult. Entity]': 5893, 'Verification (Boolean) (All) [Verification|2 entities, both direct]': 5484, 'Verification (Boolean) (All) [Verification|2 entities, one direct and one indirect, object is indirect]': 5349, 'Logical Reasoning (All) [Logical|Union|Multiple_Relation]': 5087, 'Verification (Boolean) (All) [Verification|one entity, multiple entities (as object) referred indirectly]': 4898, 'Quantitative 

In [5]:
# Percentage share of every question type; the sort key selects element 0 of
# each item, and the printed result is ordered by question type name.
question_type_percentages = calculate_percentage_for_each_question_type(
    question_type_instances,
    sort_key=lambda pair: pair[0],
)
print(question_type_percentages)

# The rounded shares should add up to roughly 100
print(" => " + str(sum(question_type_percentages.values())))

{'Clarification': 4.4126, 'Comparative Reasoning (All)': 5.3453, 'Comparative Reasoning (Count) (All)': 5.2898, 'Logical Reasoning (All)': 8.2438, 'Quantitative Reasoning (All)': 3.2001, 'Quantitative Reasoning (Count) (All)': 8.7341, 'Simple Question (Coreferenced)': 20.5026, 'Simple Question (Direct)': 30.7183, 'Simple Question (Ellipsis)': 3.7524, 'Verification (Boolean) (All)': 9.8008}
 => 99.99979999999998


In [6]:
# Percentage share of every question subtype; the sort key selects element 1 of
# each item, and the printed result is ordered from smallest to largest share.
question_subtype_percentages = calculate_percentage_for_each_question_type(
    question_subtype_instances,
    sort_key=lambda pair: pair[1],
    sort_reverse=False,
)
print(question_subtype_percentages)

# The rounded shares should add up to roughly 100
print(" => " + str(sum(question_subtype_percentages.values())))

{'Logical Reasoning (All) [Logical|Difference|Single_Relation|Incomplete]': 0.0031, 'Logical Reasoning (All) [Logical|Difference|Single_Relation]': 0.0838, 'Quantitative Reasoning (All) [Quantitative|Min/Max|Single entity type]': 0.1209, 'Logical Reasoning (All) [Logical|Intersection|Single_Relation|Incomplete]': 0.1527, 'Comparative Reasoning (Count) (All) [Comparative|Count over More/Less|Single entity type|Indirect]': 0.1546, 'Logical Reasoning (All) [Logical|Difference|Multiple_Relation]': 0.155, 'Comparative Reasoning (All) [Comparative|More/Less|Single entity type|Indirect]': 0.1565, 'Clarification [Comparative|More/Less|Single entity type|Indirect]': 0.1661, 'Clarification [Comparative|Count over More/Less|Single entity type|Indirect]': 0.173, 'Comparative Reasoning (Count) (All) [Comparative|Count over More/Less|Mult. entity type|Indirect]': 0.1963, 'Comparative Reasoning (All) [Comparative|More/Less|Mult. entity type|Indirect]': 0.2029, 'Clarification [Comparative|More/Less|Mu

### Calculate Required Number of Question Types and Subcategories for Test Subset

In [7]:
# Target number of instances in the test subset — adapt to your needs
sample_size = 1_500

In [8]:
# Split sample_size across the question types according to their percentage share
required_samples_per_id = get_required_samples_for_each_question_type(
    sample_size,
    question_type_percentages,
    sort_key=lambda pair: pair[0],
    reverse=False,
)
print(required_samples_per_id)

# The per-type counts should add up to sample_size
print(" => " + str(sum(required_samples_per_id.values())))

{'Clarification': 66, 'Comparative Reasoning (All)': 80, 'Comparative Reasoning (Count) (All)': 79, 'Logical Reasoning (All)': 124, 'Quantitative Reasoning (All)': 48, 'Quantitative Reasoning (Count) (All)': 131, 'Simple Question (Coreferenced)': 308, 'Simple Question (Direct)': 461, 'Simple Question (Ellipsis)': 56, 'Verification (Boolean) (All)': 147}
 => 1500


In [11]:
# Split sample_size across the question subtypes according to their percentage share
required_samples_per_question_type = get_required_samples_for_each_question_type(
    sample_size,
    question_subtype_percentages,
    sort_key=lambda pair: pair[1],
    reverse=False,
)
print(required_samples_per_question_type)

# The per-subtype counts should add up to (roughly) sample_size
print(" => " + str(sum(required_samples_per_question_type.values())))

{'Logical Reasoning (All) [Logical|Difference|Single_Relation|Incomplete]': 0, 'Logical Reasoning (All) [Logical|Difference|Single_Relation]': 1, 'Quantitative Reasoning (All) [Quantitative|Min/Max|Single entity type]': 2, 'Logical Reasoning (All) [Logical|Intersection|Single_Relation|Incomplete]': 2, 'Comparative Reasoning (Count) (All) [Comparative|Count over More/Less|Single entity type|Indirect]': 2, 'Logical Reasoning (All) [Logical|Difference|Multiple_Relation]': 2, 'Comparative Reasoning (All) [Comparative|More/Less|Single entity type|Indirect]': 2, 'Clarification [Comparative|More/Less|Single entity type|Indirect]': 3, 'Clarification [Comparative|Count over More/Less|Single entity type|Indirect]': 3, 'Comparative Reasoning (Count) (All) [Comparative|Count over More/Less|Mult. entity type|Indirect]': 3, 'Comparative Reasoning (All) [Comparative|More/Less|Mult. entity type|Indirect]': 3, 'Clarification [Comparative|More/Less|Mult. entity type|Indirect]': 3, 'Clarification [Compar

### Calculate Current Number of Instances per Question Subcategory in Test Subset

In [10]:
# Number of instances per question subtype already present in the pre-processed
# CSV files under output_path; printed from the largest count downwards
question_subtype_instances_pp = count_instances_of_question_subtypes_csv(output_path, sort_key=lambda item: item[1], reverse=True)
print(question_subtype_instances_pp)

{'Simple Question (Direct) [Simple Question|Single Entity]': 345, 'Simple Question (Coreferenced) [Simple Question|Single Entity|Indirect]': 304, 'Simple Question (Direct) [Simple Question]': 216, 'Simple Question (Ellipsis) [only subject is changed, parent and predicate remains same]': 76, 'Logical Reasoning (All) [Logical|Union|Single_Relation]': 71, 'Clarification [Simple Question|Single Entity|Indirect]': 60, 'Simple Question (Coreferenced)': 60, 'Simple Question (Coreferenced) [Simple Question|Mult. Entity]': 52, 'Simple Question (Direct) [Simple Question|Mult. Entity|Indirect]': 45, 'Comparative Reasoning (All) [Comparative|More/Less|Mult. entity type]': 40, 'Verification (Boolean) (All) [Verification|2 entities, both direct]': 40, 'Verification (Boolean) (All) [Verification|3 entities, all direct, 2 are query entities]': 39, 'Quantitative Reasoning (Count) (All) [Quantitative|Count|Single entity type]': 38, 'Verification (Boolean) (All) [Verification|2 entities, one direct and o

In [16]:
missing_samples = count_difference_of_required_samples_and_available_samples(
    required_samples_per_question_type,
    question_subtype_instances_pp,
    sort_key=lambda pair: pair[1],
    reverse=True,
)
# Clamp negative differences to 0 so that only real shortfalls are counted
missing_samples = {subtype: max(shortfall, 0) for subtype, shortfall in missing_samples.items()}

print(f"Number of categories with missing samples: {sum(1 for shortfall in missing_samples.values() if shortfall > 0)}")
print(f"Total number of missing instances: {sum(missing_samples.values())}")
print(missing_samples)

Number of categories with missing samples: 0
Total number of missing instances: 0
{'Logical Reasoning (All) [Logical|Difference|Single_Relation|Incomplete]': 0, 'Quantitative Reasoning (All) [Quantitative|Min/Max|Single entity type]': 0, 'Comparative Reasoning (Count) (All) [Comparative|Count over More/Less|Mult. entity type|Indirect]': 0, 'Logical Reasoning (All) [Logical|Intersection|Single_Relation|Incomplete]': 0, 'Comparative Reasoning (All) [Comparative|More/Less|Mult. entity type|Indirect]': 0, 'Clarification [Quantitative|Count|Logical operators|Indirect]': 0, 'Quantitative Reasoning (All) [Quantitative|Atleast/ Atmost/ Approx. the same/Equal|Single entity type]': 0, 'Logical Reasoning (All) [Logical|Difference|Multiple_Relation]': 0, 'Comparative Reasoning (All) [Comparative|More/Less|Single entity type|Indirect]': 0, 'Clarification [Comparative|Count over More/Less|Single entity type|Indirect]': 0, 'Quantitative Reasoning (Count) (All) [Quantitative|Count|Logical operators|In

### Process missing samples

In [33]:
def _question_subtypes_of(data_set) -> set:
    ''' Collects the subtype labels ("<question-type> [<description>]") of all entries in data_set that have a question type '''
    subtypes = set()
    for entry in data_set:
        question_type = entry.get('question-type', '')
        if question_type == '':
            continue

        question_description = entry.get('description', '')
        if question_description != '':
            question_type += f" [{question_description}]"

        subtypes.add(question_type)
    return subtypes


def _count_missing_samples(required_samples_per_question_type: Dict[str, int], output_path: str):
    ''' Returns the difference between the required samples and the samples currently found in output_path '''
    available_samples = count_instances_of_question_subtypes_csv(output_path, sort_key=lambda item: item[1], reverse=False)
    return count_difference_of_required_samples_and_available_samples(required_samples_per_question_type, available_samples, sort_key=lambda item: item[1], reverse=False, include_sufficient_samples=False)


def _import_file_containing(missing_sample: str, candidates: list, subtypes_by_file: dict, input_path: str, output_path: str) -> bool:
    ''' Imports the first candidate file that contains missing_sample; returns False if no candidate does.

    candidates is edited in place: files that get imported, that already have a
    csv in output_path, or that fail to load/import are removed, so no file is
    ever tried again once its outcome is final. subtypes_by_file caches the
    subtype labels per file so each JSON file is parsed at most once.
    '''
    import os

    for file_path in list(candidates):
        # Bound before the try so the error message can never hit an unbound name
        folder, file = '?', file_path
        try:
            path_parts = file_path.split('/')
            folder, file = path_parts[-2], path_parts[-1]

            # Cheap check first: skip files that already have a csv in the output folder
            if os.path.isfile(f"{output_path}/{folder}/{file.split('.')[0]}.csv"):
                print("File already processed")
                candidates.remove(file_path)
                continue

            if file_path not in subtypes_by_file:
                subtypes_by_file[file_path] = _question_subtypes_of(json.loads(get_text_from_file(file_path)))

            # Keep the file as a candidate: it may still serve another missing subtype
            if missing_sample not in subtypes_by_file[file_path]:
                print(f"{folder}/{file} does not contain missing sample")
                continue

            handle_file(input_path, output_path, folder, file)
        except Exception as e:
            print(f"Error importing file {file} in folder {folder}: {e}")
            candidates.remove(file_path)
            continue

        candidates.remove(file_path)
        return True

    return False


def preprocess_missing_samples(required_samples_per_question_type: Dict[str, int], input_path: str, output_path: str, seed=None):
    ''' Imports additional files from input_path until no question subtype is missing samples in output_path.

    Files are tried in random order. After every imported file the missing
    samples are recalculated from the csv files in output_path, and the loop
    runs until that recalculated shortfall is empty. If no remaining file
    contains any of the still-missing subtypes, the function reports the
    remaining shortfall and stops instead of retrying forever.

    Parameters:
        required_samples_per_question_type: required number of samples per question subtype
        input_path: folder with the raw JSON files
        output_path: folder with the pre-processed csv files
        seed: optional seed for the random file order; None (default) gives a different order on every run
    '''
    import random

    candidates = list(get_file_paths(input_path))
    random.Random(seed).shuffle(candidates)
    subtypes_by_file = {}
    imported_files = 0

    missing_samples = _count_missing_samples(required_samples_per_question_type, output_path)
    while any(count > 0 for count in missing_samples.values()):
        # Try the missing subtypes in the order the helper returned them; fall
        # through to the next one if no remaining file contains the current one.
        for missing_sample in [key for key, count in missing_samples.items() if count > 0]:
            print(f"Loop {imported_files}, {missing_sample}")
            if _import_file_containing(missing_sample, candidates, subtypes_by_file, input_path, output_path):
                imported_files += 1
                break
        else:
            still_missing = sum(count for count in missing_samples.values() if count > 0)
            print(f"No remaining file contains a missing sample; {still_missing} instances are still missing")
            return

        # Recalculate available and missing samples
        missing_samples = _count_missing_samples(required_samples_per_question_type, output_path)

    if imported_files > 0:
        print("All missing samples added")

# Top up the pre-processed test subset (output_path) from the raw test split (input_path)
preprocess_missing_samples(required_samples_per_question_type, input_path, output_path)

#### Fix turnID 
* Turn numbering should start at 0
* A question and its answer belong to the same turn
* A turnID has the form "train#QA_0#QA_0#0" -> folder#folder#file#turn

In [None]:
file_paths = get_file_paths("./SPICE_dataset_pp/test")
for file_path in file_paths:
    dataframe = pd.read_csv(file_path)

    # Rows come in pairs that share one turn, so row position // 2 is the turn
    # number (read_csv yields a 0..n-1 index, so position and index agree).
    # Only the part after the last '#' of each turnID is replaced.
    dataframe['turnID'] = [
        turn_id.rsplit('#', 1)[0] + '#' + str(position // 2)
        for position, turn_id in enumerate(dataframe['turnID'])
    ]

    # Overwrite the file in place with the corrected turnIDs
    dataframe.to_csv(file_path, index=False)
    print(f"{file_path} is updated")
