# Remove empty
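Generate a new collection that does not contain any empty dataset: the ids of the empty datasets (those without any extracted file) are saved to a file, and the qrels are filtered so that they no longer reference them.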

In [1]:
import os
import json

In [2]:
source = "../datasets"                          # source folder
source_qrels = "qrels.txt"                      # source qrels file
modified_qrels = "qrels_without_empty.txt"      # output qrels file, without the lines that refer to empty datasets
empty_datasets_ids = "empty.txt"                # output file with the ids of the empty datasets, one per line

In [3]:
# The name of every entry of the source folder must be parsable as an integer:
# the entries are ordered numerically rather than lexicographically
datasets = os.listdir(source)
datasets.sort(key=int)

In [4]:
from enum import Enum

class DatasetType(Enum):
    """Outcome of the analysis of a dataset.

    EMPTY (0):    the dataset does not contain any valid file
    PARTIAL (1):  the dataset contains at least one valid file, but some
                  files have not been downloaded or have not been parsed
    COMPLETE (2): the dataset contains at least one valid file, has been
                  completely downloaded and parsed without errors
    """

    # members are created in this order with the values 0, 1 and 2
    EMPTY, PARTIAL, COMPLETE = range(3)

def analyze_dataset(dataset_path) -> DatasetType:
    """Classify a dataset as complete, partial or empty from its metadata file.

    Args:
        dataset_path: path of the JSON metadata file of the dataset. The JSON
            object must provide the "failedURLs", "extracted" and
            "unusedFiles" entries, each of them supporting len().

    Returns:
        DatasetType.COMPLETE if the dataset contains at least one valid file,
        has been completely downloaded and no file has generated errors while
        parsing.
        DatasetType.PARTIAL if the dataset contains at least one valid file,
        but some files have not been downloaded, have generated errors or
        could not be used.
        DatasetType.EMPTY if the dataset does not contain any valid file.

    Raises:
        OSError: if the metadata file cannot be opened.
        ValueError: if the content of the file is not valid JSON
            (json.JSONDecodeError) or is not valid UTF-8 (UnicodeDecodeError).
        KeyError: if one of the expected entries is missing.
    """
    # JSON is exchanged as UTF-8: decode it explicitly instead of relying on
    # the platform default encoding, which is not UTF-8 everywhere
    with open(dataset_path, "r", encoding="utf-8") as f:
        # strict=False accepts control characters inside the JSON strings
        metadata = json.load(f, strict=False)

    # the file is already closed here: the analysis only needs the parsed data

    # check if the dataset has been downloaded completely
    error_while_downloading = len(metadata["failedURLs"]) > 0

    # check if the dataset contains at least one file that has been parsed
    contains_a_valid_file = len(metadata["extracted"]) > 0

    # check if the dataset has some files that have not been parsed or that
    # have thrown errors while parsing
    error_while_parsing = len(metadata["unusedFiles"]) > 0

    # without any valid file the dataset is empty, whatever else happened
    if not contains_a_valid_file:
        return DatasetType.EMPTY

    # at least one valid file, but something went wrong with the others
    if error_while_downloading or error_while_parsing:
        return DatasetType.PARTIAL

    # at least one valid file, everything downloaded and parsed correctly
    return DatasetType.COMPLETE

In [5]:
# Split the datasets in two groups, according to the result of the analysis
# of their metadata file
non_empty = []
empty_datasets = []

for dataset in datasets:
    metadata_file_path = f"{source}/{dataset}/metadata.json"
    res = analyze_dataset(metadata_file_path)

    if res == DatasetType.EMPTY:
        # the dataset does not contain any valid file
        empty_datasets.append(dataset)
    elif res in (DatasetType.COMPLETE, DatasetType.PARTIAL):
        # the dataset contains at least one valid file
        non_empty.append(dataset)

In [6]:
# number of datasets that have been classified as empty
len(empty_datasets)

5201

In [7]:
# Save the ids of the empty datasets, one per line.
# The generator variable stays local to the expression, so the builtin id()
# is not shadowed in the following cells.
with open(empty_datasets_ids, 'w') as empty_file:
    empty_file.writelines(f"{dataset_id}\n" for dataset_id in empty_datasets)

### Generate the modified QRELS file

In [8]:
# Open the input file for reading
# Load the whole source qrels file in memory: iterating over the file yields
# its lines one by one, each with its line terminator
with open(source_qrels, 'r') as input_file:
    lines = list(input_file)

In [9]:
# Build the set of empty ids once: a membership test on a set takes constant
# time, while on the list it scans all the ids for every single qrels line
empty_ids = set(empty_datasets)

# Keep only the lines whose third whitespace-separated field (the dataset id)
# does not refer to an empty dataset.
# NOTE: a line with fewer than three fields (e.g. a blank line) raises IndexError
filtered_lines = [line for line in lines if line.split()[2] not in empty_ids]

In [10]:
# Open the output file for writing
# Save the lines that passed the filter in the output qrels file; every line
# still carries its own terminator, so they are simply concatenated
with open(modified_qrels, 'w') as output_file:
    output_file.write("".join(filtered_lines))