# Stats
Statistics about downloaded and processed files

In [None]:
import os
import json
from enum import Enum

In [None]:
# Ask for the relative or absolute path of the directory in which the datasets have been downloaded.
# A visible prompt tells the reader what the empty input box is for, and surrounding whitespace
# is stripped so that a pasted path with a stray space or newline still resolves.
folder = input("Path to the directory containing the downloaded datasets: ").strip()

In [None]:
# Result buckets, filled by the processing-status check further down.
complete_datasets = []  # completely downloaded
partial_datasets = []  # not completely downloaded / parsed (at least 1 valid file)
empty_datasets = []  # only metadata for these datasets
not_processed = []

In [None]:
# Scan the directory containing the downloaded datasets.
# Only sub-directories with a purely numeric name are kept: any other entry
# (e.g. ".DS_Store" or ".ipynb_checkpoints") would make the numeric sort key raise ValueError,
# and a plain file has no metadata.json inside it.
datasets = sorted(
    (
        entry
        for entry in os.listdir(folder)
        if entry.isdecimal() and os.path.isdir(os.path.join(folder, entry))
    ),
    key=int,
)
total_datasets = len(datasets)

### Check datasets with more than 1 file downloaded

In [None]:
datasets_with_more_than_one_file_downloaded = []

for dataset in datasets:
    dataset_dir = f"{folder}/{dataset}"
    metadata_file_path = f"{dataset_dir}/metadata.json"

    # Parse the metadata, then release the file handle before inspecting it.
    with open(metadata_file_path, "r") as metadata_file:
        metadata = json.load(metadata_file, strict=False)

    downloaded_count = len(metadata["downloaded_urls"])
    if downloaded_count > 1:
        datasets_with_more_than_one_file_downloaded.append(dataset_dir)


In [None]:
# Report how many datasets ended up with more than one downloaded file.
multi_file_dataset_count = len(datasets_with_more_than_one_file_downloaded)
print(f"Datasets with more than one file downloaded: {multi_file_dataset_count}")

In [None]:
# Print the directory of every dataset for which more than one file was downloaded, one per line.
for dataset in datasets_with_more_than_one_file_downloaded:
    print(dataset)

### Check the processing status for each dataset

In [None]:
class DatasetType(Enum):
    """Processing status assigned to a dataset by `analyze_dataset`."""

    EMPTY = 0  # processed, but no file could be used
    NOT_PROCESSED = 1  # the metadata has no "unused_files" entry
    PARTIAL = 2  # at least one used file, but a download failed or some file is unused
    COMPLETE = 3  # no failed download, at least one used file, no unused file


def analyze_dataset(dataset_path) -> DatasetType:
    """Classify a dataset according to the content of its metadata file.

    Args:
        dataset_path: path of the JSON metadata file of the dataset.

    Returns:
        - ``DatasetType.NOT_PROCESSED`` if the metadata has no "unused_files" key;
        - ``DatasetType.COMPLETE`` if "failed_download_urls" is present and empty,
          "used_files" is not empty and "unused_files" is empty;
        - ``DatasetType.PARTIAL`` if "used_files" is not empty but one of the other
          two conditions does not hold;
        - ``DatasetType.EMPTY`` otherwise.

    Raises:
        OSError: if the metadata file cannot be opened.
        json.JSONDecodeError: if the metadata file is not valid JSON.
    """
    with open(dataset_path, "r") as f:
        metadata = json.load(f, strict=False)

    # The "unused_files" key is taken as the marker that the dataset has been mined
    # through the data extractor.
    if "unused_files" not in metadata:
        return DatasetType.NOT_PROCESSED

    # The download is complete only when the failed URLs were recorded and there are none.
    completely_downloaded = (
        "failed_download_urls" in metadata
        and len(metadata["failed_download_urls"]) == 0
    )

    # At least one file has been parsed; a missing "used_files" key counts as no parsed file.
    contains_a_valid_file = len(metadata.get("used_files", [])) > 0

    # Some files have not been parsed or have thrown errors while parsing.
    # (This was previously tested with "== 0", which inverted the meaning of the flag.)
    error_while_parsing = len(metadata["unused_files"]) > 0

    # A dataset is complete only if all these conditions are satisfied:
    # 1) it has been completely downloaded
    # 2) it contains at least one valid file (>0)
    # 3) no file has generated an error while parsing
    if completely_downloaded and contains_a_valid_file and not error_while_parsing:
        return DatasetType.COMPLETE

    # A dataset is partial if it contains at least one valid file, even though some files
    # may not have been downloaded, may have generated errors, or may not be of a usable type.
    if contains_a_valid_file:
        return DatasetType.PARTIAL

    # The dataset does not contain any usable file.
    return DatasetType.EMPTY

In [None]:
# Map every possible status to the list that collects the datasets having that status.
datasets_by_type = {
    DatasetType.COMPLETE: complete_datasets,
    DatasetType.PARTIAL: partial_datasets,
    DatasetType.EMPTY: empty_datasets,
    DatasetType.NOT_PROCESSED: not_processed,
}

# Start from empty lists so that re-running this cell does not append every dataset
# a second time and inflate the counts printed afterwards.
for status_list in datasets_by_type.values():
    status_list.clear()

for dataset in datasets:
    metadata_file_path = f"{folder}/{dataset}/metadata.json"

    res = analyze_dataset(metadata_file_path)
    datasets_by_type[res].append(dataset)

In [None]:
# Summary of the processing status, one line per figure.
print(
    f"Total number of datasets: {total_datasets}",
    f"Complete datasets: {len(complete_datasets)}",  # completely downloaded and parsed
    f"Partial datasets: {len(partial_datasets)}",
    f"Empty datasets: {len(empty_datasets)}",
    f"Not processed datasets: {len(not_processed)}",
    sep="\n",
)

## Analyzing unused files

### Files that are too big to be processed

In [None]:
# List the files whose size is at least SIZE_LIMIT (100 MB); these have not been analyzed.

SIZE_LIMIT = 100 * 1024 * 1024  # 100 MB

# Generated files, excluded because they usually are not a target for extraction.
EXCLUDE = ["metadata.json"]

# Every file below `folder`, at any depth, except the excluded names.
file_list = [
    os.path.join(path, name)
    for path, _subdirs, files in os.walk(folder)
    for name in files
    if name not in EXCLUDE
]

for file_path in file_list:
    size = os.path.getsize(file_path)

    # Report the file together with its size in whole megabytes (size >> 20).
    if size >= SIZE_LIMIT:
        print(f"{file_path} is: {size >> 20}MB")


### Files that potentially can be processed

In [None]:
# File extensions (lower case, without the leading dot) that are treated as RDF serializations.
RDF_SUFFIXES = ["rdf", "ttl", "owl", "n3", "nt", "jsonld"]


def check_if_file_name_is_rdf(name: str) -> bool:
    """Tell whether a file name ends with one of the extensions in `RDF_SUFFIXES`.

    The comparison ignores case, so "DATA.TTL" is recognized like "data.ttl".
    A name without any dot has no extension and is never considered RDF: a file
    that is just called "owl" or "rdf" must not match.

    Args:
        name: file name (or path) to check.

    Returns:
        True if the text after the last dot is a known RDF suffix, False otherwise.
    """
    _stem, dot, suffix = name.rpartition(".")
    return bool(dot) and suffix.lower() in RDF_SUFFIXES

In [None]:
# One entry per dataset that has unused files: [dataset id, unused RDF files, other unused files].
datasets_with_unused_files = []

for dataset in datasets:
    metadata_file_path = f"{folder}/{dataset}/metadata.json"

    # Parse the metadata, then release the file handle before processing it.
    with open(metadata_file_path, "r") as f:
        metadata = json.load(f, strict=False)

    if "unused_files" in metadata and len(metadata["unused_files"]) > 0:
        unparsable_rdf = []
        unparsable_other = []

        # Split the unused files by extension; only the file names are reported here,
        # so no full path is built.
        for file in metadata["unused_files"]:
            if check_if_file_name_is_rdf(file):
                unparsable_rdf.append(file)
            else:
                unparsable_other.append(file)

        datasets_with_unused_files.append([dataset, unparsable_rdf, unparsable_other])

print("{:<10}{:<300}{}".format("ID", "RDF UNPARSABLE", "OTHER UNPARSABLE"))
for d in datasets_with_unused_files:
    print("{:<10}{:<300}{}".format(d[0], str(d[1]), str(d[2])))

### Analyzing non-parsed files

In [None]:
total_used_files = 0

rdf_files_unused = []       # paths of the unused RDF files
other_files_unused = []     # paths of the unused NON RDF files

for dataset in datasets:
    metadata_file_path = f"{folder}/{dataset}/metadata.json"

    # Parse the metadata, then release the file handle before processing it.
    with open(metadata_file_path, "r") as f:
        metadata = json.load(f, strict=False)

    # Adding the length of an empty list is a no-op, so no separate emptiness test is needed.
    if "used_files" in metadata:
        total_used_files += len(metadata["used_files"])

    if "unused_files" in metadata and len(metadata["unused_files"]) > 0:
        for file in metadata["unused_files"]:
            file_with_path = f"{folder}/{dataset}/{file}"

            # Pick the destination list from the file extension.
            destination = (
                rdf_files_unused
                if check_if_file_name_is_rdf(file)
                else other_files_unused
            )
            destination.append(file_with_path)

print(
    f"Total used files: {total_used_files}",
    f"RDF unusable files: {len(rdf_files_unused)}",
    f"NON RDF unusable files: {len(other_files_unused)}",
    sep="\n",
)

Paths of the RDF files that need to be checked (some of them contain errors, such as spaces in the IRI, which is very common).

In [None]:
# Display the collected paths of the unused RDF files (the last expression of a cell is shown as its output).
rdf_files_unused

## Analysis of unused non-RDF files

You can extract archives and zip files using the `tarfile` and `zipfile` modules (see [StackOverflow](https://stackoverflow.com/questions/35690072/how-to-check-if-it-is-a-file-or-folder-for-an-archive-in-python)).

The [magic number signature](https://en.wikipedia.org/wiki/List_of_file_signatures) can hint at the file type.

In [None]:
import magic
import zipfile
import tarfile

In [None]:
def is_gz_file(filepath):
    """Return True if the file starts with the gzip magic number (bytes 0x1f 0x8b)."""
    gzip_signature = b"\x1f\x8b"

    with open(filepath, "rb") as handle:
        header = handle.read(len(gzip_signature))

    return header == gzip_signature


def is_bz2_file(filepath):
    """Return True if the file starts with the three bytes 0x42 0x5a 0x68 ("BZh")."""
    bz2_signature = b"\x42\x5a\x68"

    with open(filepath, "rb") as handle:
        header = handle.read(len(bz2_signature))

    return header == bz2_signature


def maybe_rdf(filepath):
    """Heuristically tell whether the file may be an RDF/XML document.

    The whole file is read as text and searched for the "<rdf:RDF" marker.
    A file whose content cannot be read as text is reported as not RDF.
    """
    with open(filepath, "r") as handle:
        try:
            content = handle.read()
        except Exception:
            # Typically a decoding error on binary content: treat it as "not RDF".
            return False

    return "<rdf:RDF" in content


def maybe_html(filepath):
    """Heuristically tell whether the file may be an HTML document.

    The file is considered HTML when the description returned by
    `magic.from_file` mentions "html" or, failing that, when its text content
    contains "<!doctype html" (case-insensitive). A file whose content cannot
    be read as text is reported as not HTML.
    """
    if "html" in magic.from_file(filepath).lower():
        return True

    with open(filepath, "r") as handle:
        try:
            content = handle.read()
        except Exception:
            # Typically a decoding error on binary content: treat it as "not HTML".
            return False

    return "<!doctype html" in content.lower()

In [None]:
archives = []
probably_html = []
maybe_can_cast_to_rdf = []
other = []

# Archive detectors, tried in this order; the first positive answer stops the search.
archive_checks = (tarfile.is_tarfile, zipfile.is_zipfile, is_gz_file, is_bz2_file)

for file in other_files_unused:
    # Each file lands in exactly one list; the categories are tested in priority order.
    if any(check(file) for check in archive_checks):
        category = archives
    elif maybe_html(file):
        category = probably_html
    elif maybe_rdf(file):
        category = maybe_can_cast_to_rdf
    else:
        category = other

    category.append(file)

In [None]:
# Size of each category of unused NON RDF files, one line per category.
print(
    f"Total number of files recognized as archives: {len(archives)}",
    f"Total number of files that probably are HTML documents: {len(probably_html)}",
    f"Total number of files that probably can be casted to RDF: {len(maybe_can_cast_to_rdf)}",
    f"Total number of unknown files: {len(other)}",
    sep="\n",
)


In [None]:
# Print the path of every file recognized as an archive, one per line.
for file in archives:
    print(file)

In [None]:
# Print the path of every file that probably is an HTML document, one per line.
for file in probably_html:
    print(file)

In [None]:
# Print the path of every file whose content contains the "<rdf:RDF" marker, one per line.
for file in maybe_can_cast_to_rdf:
    print(file)

In [None]:
# Print every file that fell in no other category, followed by the description
# that `magic.from_file` returns for it.
for file in other:
    print(file, magic.from_file(file))