In [1]:
from typing import Dict, List

class NFDISearchDocumentProcessor:
    """Flatten grouped search results into de-duplicated plain-text documents.

    ``process`` takes a mapping of result group -> list of document dicts and
    returns one text blob per distinct document. Each blob contains the values
    of the fields listed in ``VALID_DATA_KEYS``, one per line, in that order.
    """

    # Top-level result groups that ``process`` skips entirely.
    IN_VALID_KEYS = ["timedout_sources"]
    # Document fields rendered into the text, in output order.
    VALID_DATA_KEYS = [
        "name",
        "author",
        # "description",
        "keywords",
        "source",
        "abstract",
        "license",
        "datePublished",
        "dateModified",
        "dateCreated",
        # "inLanguage",
        "publisher",
        "orcid",
        "affiliation",
        "address",
        "text",
    ]
    # Not referenced by the methods of this class; presumably the fields whose
    # values are lists. Kept because code outside this class may read it.
    VALID_DATA_KEY_LIST_DT = ["inLanguage", "author", "keywords"]

    @staticmethod
    def _join_entries(value, skip_keys=(), skip_values=()) -> str:
        """Render a list of dict entries as a single comma-separated string.

        Only non-blank string values are kept. Values stored under a key in
        ``skip_keys`` and values equal to an item of ``skip_values`` are left
        out. A bare string is returned stripped, a single dict is treated as a
        one-element list, and any other non-list value renders as "".
        The input is never modified.
        """
        if isinstance(value, str):
            return value.strip()
        if isinstance(value, dict):
            value = [value]
        if not isinstance(value, (list, tuple)):
            return ""

        parts = []
        for entry in value:
            if isinstance(entry, dict):
                parts.extend(
                    item
                    for key, item in entry.items()
                    if key not in skip_keys
                    and isinstance(item, str)
                    and item.strip()
                    and item not in skip_values
                )
            elif isinstance(entry, str) and entry.strip():
                parts.append(entry)
        # One flat join, so values of different entries are separated as well.
        return ", ".join(parts)

    @staticmethod
    def _join_scalars(value) -> str:
        """Render a list of scalars (e.g. keywords) as a comma-separated string.

        A plain string is returned unchanged instead of being split into
        characters; ``None`` items are dropped and other items are stringified.
        """
        if isinstance(value, str):
            return value
        if isinstance(value, (list, tuple)):
            return ", ".join(str(item) for item in value if item is not None)
        return str(value)

    def process_single_doc(self, parent_topic: str, input_doc: Dict) -> str:
        """Build the text representation of one document.

        Args:
            parent_topic: Result group the document came from. Currently unused;
                kept so existing callers keep working.
            input_doc: Document dict. It is read but never modified.

        Returns:
            The rendered field values, each preceded by a newline. Fields that
            are missing, ``None``, "none" (any case), empty, or that render to
            an empty string are left out.
        """
        lines = []
        for valid_key in self.VALID_DATA_KEYS:
            value = input_doc.get(valid_key, "NONE")
            if str(value).lower() == "none" or value == "":
                continue

            if valid_key == "author":
                # "Person" is dropped from the author entries' string values.
                value = self._join_entries(value, skip_values=("Person",))
            elif valid_key in ("inLanguage", "keywords"):
                value = self._join_scalars(value)
            elif valid_key == "source":
                # Identifiers are excluded from the text without deleting them
                # from the caller's data.
                value = self._join_entries(value, skip_keys=("identifier",))

            rendered = f"{value}"
            if rendered:
                lines.append(rendered)
        return "".join(f"\n{line}" for line in lines)

    def process(self, items) -> List[str]:
        """Render every document of every valid group and drop duplicates.

        Args:
            items: Mapping of result group name -> iterable of document dicts.
                Groups listed in ``IN_VALID_KEYS`` are ignored.

        Returns:
            Distinct document texts in first-seen order, so repeated runs on
            the same input give the same list.
        """
        processed_docs = []
        for parent_key, docs in items.items():
            if parent_key in self.IN_VALID_KEYS:
                continue
            for doc in docs:
                processed_docs.append(
                    self.process_single_doc(parent_topic=parent_key, input_doc=doc)
                )
        # dict.fromkeys de-duplicates while preserving insertion order; a set
        # would reorder the texts differently from run to run.
        return list(dict.fromkeys(processed_docs))

In [2]:
ls ../assets/evaluation/search_results/raw

100.json  135.json  16.json   203.json  238.json  272.json  33.json  68.json
101.json  136.json  170.json  204.json  239.json  273.json  34.json  69.json
102.json  137.json  171.json  205.json  23.json   274.json  35.json  6.json
103.json  138.json  172.json  206.json  240.json  275.json  36.json  70.json
104.json  139.json  173.json  207.json  241.json  276.json  37.json  71.json
105.json  13.json   174.json  208.json  242.json  277.json  38.json  72.json
106.json  140.json  175.json  209.json  243.json  278.json  39.json  73.json
107.json  141.json  176.json  20.json   244.json  279.json  3.json   74.json
108.json  142.json  177.json  210.json  245.json  27.json   40.json  75.json
109.json  143.json  178.json  211.json  246.json  280.json  41.json  76.json
10.json   144.json  179.json  212.json  247.json  281.json  42.json  77.json
110.json  145.json  17.json   213.json  248.json  282.json  43.json  78.json
111.json  146.json  180.json  214.json  249.json  283.json  44.json  79.json


In [3]:
import json
import os

def load_json(input_path):
    """Read the UTF-8 encoded JSON file at ``input_path`` and return the parsed value."""
    with open(input_path, encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

def write_json(output_path, json_data):
    """Serialize ``json_data`` to ``output_path`` as UTF-8 JSON.

    The output is indented by four spaces and non-ASCII characters are written
    as-is rather than as ``\\u`` escapes. An existing file is overwritten.
    """
    dump_options = {"indent": 4, "ensure_ascii": False}
    with open(output_path, mode="w", encoding="utf-8") as sink:
        json.dump(json_data, sink, **dump_options)


        
def get_stats(data):
    """Count the items in each result group of ``data``.

    The 'timedout_sources' group and groups with no items are omitted.
    Returns a dict mapping group name -> number of items.
    """
    counts = {}
    for group, items in data.items():
        if group == 'timedout_sources':
            continue
        size = len(items)
        if size:
            counts[group] = size
    return counts

In [4]:

# Convert every raw search-result file into one combined, processed JSON file.
raw_dir = "../assets/evaluation/search_results/raw"
processed_dir = "../assets/evaluation/search_results/processed"
keys = []  # result-group names seen across all files (duplicates included)
processed_data = []
processor = NFDISearchDocumentProcessor()

# os.listdir returns entries in arbitrary order; sorting keeps the combined
# output identical across runs and machines.
for json_file in sorted(os.listdir(raw_dir)):
    # Skip hidden entries and anything that is not a JSON file.
    if json_file.startswith(".") or not json_file.endswith(".json"):
        continue
    input_path = os.path.join(raw_dir, json_file)
    json_data = load_json(input_path)
    # Only the first element of each file is used; it must carry a 'results' mapping.
    record = json_data[0]
    # Stats are taken from the raw results before they are replaced by text.
    record['stats'] = get_stats(record['results'])
    keys += list(record['stats'].keys())
    record['results'] = processor.process(record['results'])
    processed_data.append(record)

# Make sure the target directory exists so the write below cannot fail on a fresh checkout.
os.makedirs(processed_dir, exist_ok=True)
write_json(os.path.join(processed_dir, "search_results.json"), processed_data)