# Stats
Statistics about downloaded and processed files

In [None]:
import os

In [None]:
# Read the directory that holds the downloaded datasets; a relative or an
# absolute path can be typed in.
# NOTE(review): later cells hard-code paths that start with "datasets/", so
# other spellings of the same directory may not match them - confirm.
folder = input()

In [None]:
# Dataset IDs grouped by processing status; filled in by the classification loop.
complete_datasets = []  # fully downloaded and parsed
partial_datasets = []  # at least 1 valid file, but some download / parsing problem
empty_datasets = []  # nothing usable, only the metadata is available

In [None]:
# List the entries of the download directory, ordered numerically by name.
# Every entry name must be convertible with int(); any other name makes the
# sort raise ValueError.
datasets = sorted(os.listdir(folder), key=int)
total_datasets = len(datasets)

### Utility functions

In [None]:
import os

# 200 MiB, expressed in bytes.
# NOTE(review): this line used to be labelled "100 MB" - confirm the intended limit.
SIZE_LIMIT = 200 * 1024 * 1024


def is_file_larger_than_size_limit(filepath: str) -> bool:
    """Tell whether the file at *filepath* is at least ``SIZE_LIMIT`` bytes long.

    The comparison is inclusive: a file of exactly ``SIZE_LIMIT`` bytes counts
    as large. Errors raised while reading the file size propagate to the caller.
    """
    byte_count = int(os.stat(str(filepath)).st_size)
    threshold = int(SIZE_LIMIT)
    return byte_count >= threshold

In [None]:
# File extensions (without the dot, lower case) that identify an RDF serialization.
RDF_SUFFIXES = ["rdf", "ttl", "owl", "n3", "nt", "jsonld", "nq", "trig", "trix"]


def check_if_file_name_is_rdf(name: str) -> bool:
    """Return True when *name* ends with one of the known RDF file extensions.

    The extension is the text after the last dot and is compared
    case-insensitively against ``RDF_SUFFIXES``. A name without any dot has no
    extension and is never treated as RDF, even when the whole name happens to
    equal a suffix (for example a file simply called ``"nt"``).
    """
    _, dot, extension = name.rpartition(".")
    # `dot` is empty when the name contains no "." at all.
    return bool(dot) and extension.lower() in RDF_SUFFIXES

In [None]:
import magic


def is_html(filepath: str) -> bool:
    """Guess whether the file at *filepath* is an HTML document.

    The file counts as HTML when the description returned by
    ``magic.from_file`` mentions "html", or, failing that, when its text
    contains an ``<!doctype html`` declaration (case-insensitive). Any error
    raised while reading or scanning the content yields ``False``; errors
    raised by ``magic`` or while opening the file propagate.
    """
    if "html" in magic.from_file(filepath).lower():
        return True

    with open(filepath, "r") as handle:
        try:
            has_doctype = "<!doctype html" in handle.read().lower()
        except Exception:
            # e.g. content that cannot be decoded as text
            has_doctype = False

    return has_doctype

In [None]:
import json


def is_json(filepath: str) -> bool:
    """Tell whether the file at *filepath* can be parsed as JSON.

    Any error raised while reading or parsing the content yields ``False``;
    errors raised while opening the file propagate.
    """
    with open(filepath, "r") as handle:
        try:
            json.load(handle)
        except Exception:
            return False
        else:
            return True

In [None]:
def delete_file(file_path: str):
    """Remove *file_path* from disk when it is an existing regular file.

    Anything else (a missing path, a directory) is left alone without raising.
    The path is printed before the file is removed.
    """
    if not os.path.isfile(file_path):
        return

    print(f"Deleting {file_path}")
    os.remove(file_path)

### Check the processing status for each dataset

In [None]:
from enum import Enum

class DatasetType(Enum):
    """Processing status assigned to a dataset by ``analyze_dataset``."""

    EMPTY = 0  # no valid file at all
    PARTIAL = 1  # at least one valid file, with download or parsing problems
    COMPLETE = 2  # at least one valid file and no problem recorded


def analyze_dataset(dataset_path) -> DatasetType:
    """Classify a dataset from the metadata JSON file at *dataset_path*.

    The metadata must provide the ``failedURLs``, ``extracted`` and
    ``unusedFiles`` entries; a missing entry raises ``KeyError``.

    NOTE(review): other cells of this notebook read ``used_files`` /
    ``unused_files`` (snake_case) from the same metadata files - confirm
    which key names the metadata actually uses.

    Returns:
        ``DatasetType.EMPTY`` when no file was extracted,
        ``DatasetType.COMPLETE`` when at least one file was extracted and
        there is neither a failed URL nor an unused file,
        ``DatasetType.PARTIAL`` otherwise.
    """
    with open(dataset_path, "r") as metadata_file:
        info = json.load(metadata_file, strict=False)

    # some URLs could not be downloaded
    download_failed = len(info["failedURLs"]) > 0

    # at least one file has been parsed
    has_valid_file = len(info["extracted"]) > 0

    # some files have not been parsed or raised errors while parsing
    parsing_failed = len(info["unusedFiles"]) > 0

    # Without any valid file the dataset is empty, whatever else happened.
    if not has_valid_file:
        return DatasetType.EMPTY

    # A valid file plus any download or parsing problem makes it partial.
    if download_failed or parsing_failed:
        return DatasetType.PARTIAL

    # Valid file(s), everything downloaded, nothing left unparsed.
    return DatasetType.COMPLETE

In [None]:
# Route each dataset ID to the list that matches its processing status.
lists_by_status = {
    DatasetType.COMPLETE: complete_datasets,
    DatasetType.PARTIAL: partial_datasets,
    DatasetType.EMPTY: empty_datasets,
}

for dataset in datasets:
    metadata_file_path = f"{folder}/{dataset}/metadata.json"
    res = analyze_dataset(metadata_file_path)

    matching_list = lists_by_status.get(res)
    if matching_list is not None:
        matching_list.append(dataset)

In [None]:
# Summary: number of scanned datasets and how many fall in each status list.
print(f"Total number of datasets: {total_datasets}")
print(f"Complete datasets: {len(complete_datasets)}")
print(f"Partial datasets: {len(partial_datasets)}")
print(f"Empty datasets: {len(empty_datasets)}")

### List datasets with unused files

In [None]:
# For every dataset whose metadata lists unused files, record
# [dataset ID, unused files with an RDF extension, all other unused files].
# NOTE(review): the status check earlier in this notebook reads the camelCase
# key "unusedFiles", this cell reads "unused_files" - confirm the key name.
datasets_with_unused_files = []

for dataset in datasets:
    metadata_file_path = f"{folder}/{dataset}/metadata.json"

    # Only the parsed content is needed, so the file is closed right away.
    with open(metadata_file_path, "r") as f:
        metadata = json.load(f, strict=False)

    # A missing, null or empty entry means there is nothing to report.
    unused_files = metadata.get("unused_files")
    if not unused_files:
        continue

    unparsable_rdf = []
    unparsable_other = []

    for file in unused_files:
        if check_if_file_name_is_rdf(file):
            unparsable_rdf.append(file)
        else:
            unparsable_other.append(file)

    datasets_with_unused_files.append([dataset, unparsable_rdf, unparsable_other])

In [None]:
from IPython.display import display, Markdown

# Render the datasets with unused files as a markdown table: one row per
# dataset, built from the [ID, RDF files, other files] entries.
markdown_table = """
| Dataset ID | RDF not parsable | Other not parsable |
| --- | --- | --- |
"""

markdown_table += "".join(
    "| {} | {} | {} |\n".format(dataset_id, str(rdf_names), str(other_names))
    for dataset_id, rdf_names, other_names in datasets_with_unused_files
)

display(Markdown(markdown_table))

## Analyzing unused files

In [None]:
# Count the used files of every dataset and split the unused ones into
# "big" (at or above the size limit) and "unusable" (below it).
total_used_files = 0
big_files = []
unusable_files = []

for dataset in datasets:
    dataset_folder_path = f"{folder}/{dataset}"
    metadata_file_path = f"{dataset_folder_path}/metadata.json"

    with open(metadata_file_path, "r") as f:
        metadata = json.load(f, strict=False)

    # A dataset without the entry contributes nothing.
    total_used_files += len(metadata.get("used_files", []))

    for uf in metadata.get("unused_files", []):
        file_path = f"{dataset_folder_path}/{uf}"
        destination = big_files if is_file_larger_than_size_limit(file_path) else unusable_files
        destination.append(file_path)

print(f"Total used files: {total_used_files}")
print(f"Big files: {len(big_files)}")
print(f"Unusable files: {len(unusable_files)}")

### Unused files

In [None]:
# Show the paths of the unused files that are below the size limit.
unusable_files

### Big files

In [None]:
# Show the paths of the unused files that reach the size limit.
big_files

Check duplicates

In [None]:
%%bash
# Compare the two copies of ppg-sf-dump.rdf; diff prints nothing when they are identical.
diff datasets/2/ppg-sf-dump.rdf datasets/6/ppg-sf-dump.rdf

In [None]:
%%bash
# Compare the two copies of Govwild_rdf.n3; diff prints nothing when they are identical.
diff datasets/13263/Govwild_rdf.n3 datasets/14364/Govwild_rdf.n3

In [None]:
%%bash
# Compare the two copies of geospecies.rdf; diff prints nothing when they are identical.
diff datasets/13347/geospecies.rdf datasets/14324/geospecies.rdf

In [None]:
%%bash
# Byte-wise comparison of the two copies of all-geonames.rdf; cmp prints nothing when they are identical.
cmp datasets/13368/all-geonames.rdf datasets/14344/all-geonames.rdf

#### Duplicates
- `datasets/2/ppg-sf-dump.rdf` is the same file as `datasets/6/ppg-sf-dump.rdf`
- `datasets/13263/Govwild_rdf.n3` is the same file as `datasets/14364/Govwild_rdf.n3`
- `datasets/13347/geospecies.rdf` is the same file as `datasets/14324/geospecies.rdf`
- `datasets/13368/all-geonames.rdf` is the same file as `datasets/14344/all-geonames.rdf`

#### Invalid files
While being imported into GraphDB, the files below generated errors:
- `datasets/13263/Govwild_rdf.n3` contains a syntax error; GraphDB raises `org.eclipse.rdf4j.sail.SailException: Invalid IRI value`
- `datasets/13347/geospecies.rdf` contains a syntax error; GraphDB raises `org.eclipse.rdf4j.sail.SailException: Invalid IRI value`
- `datasets/13368/all-geonames.rdf` cannot be processed; GraphDB raises `RDF parse error: content is not allowed in prolog`
- `datasets/13565/download-20120123.rdf` contains a syntax error; GraphDB raises `org.eclipse.rdf4j.sail.SailException: Invalid IRI value`
- `datasets/15243/fr.rdf` contains a syntax error; GraphDB raises `RDF parse error`
- `datasets/21532/jrcnames_uri.nt` contains a syntax error; GraphDB raises `org.eclipse.rdf4j.sail.SailException: Invalid IRI value`

In [None]:
# Big files to keep: the cleanup loop at the end of the notebook deletes every
# entry of `big_files` that is not listed here.
# NOTE(review): presumably one copy per duplicate pair, minus the files that
# GraphDB rejected - confirm. Paths are hard-coded relative to "datasets/".
distinct_valid_big_files = [
    "datasets/2/ppg-sf-dump.rdf",
    "datasets/11580/rows.rdf",
    "datasets/14079/eat.nt",
    "datasets/15243/en.rdf",
    "datasets/21023/2016-allievi-partecipanti.nt",
]

In [None]:
# Delete every big file that is not one of the distinct, valid files to keep.
# Paths are compared after resolution with os.path.realpath, so the way
# `folder` was typed ("datasets", "./datasets", "datasets/", an absolute path)
# cannot make a kept file look different from its entry in
# `distinct_valid_big_files` and get it deleted by mistake.
# NOTE(review): the entries of `distinct_valid_big_files` are relative paths,
# so this assumes the notebook runs from the directory containing `datasets/`.
files_to_keep = {os.path.realpath(path) for path in distinct_valid_big_files}

for file in big_files:
    if os.path.realpath(file) not in files_to_keep:
        delete_file(file)