# Stats
Statistics about downloaded and processed files

In [None]:
import os

In [None]:
# Directory in which the datasets have been downloaded (relative or absolute path).
# The value is read interactively from standard input when this cell runs.
# Later cells list this directory and read "<folder>/<dataset>/metadata.json"
# for every entry found in it.
folder = input()

In [None]:
# Buckets of dataset IDs, filled later by the classification loop.
complete_datasets = []  # completely downloaded
partial_datasets = []  # not completely downloaded / parsed (at least 1 valid file)
empty_datasets = []  # only metadata for these datasets
not_processed = []  # datasets the data extractor has not handled yet

In [None]:
# List the download directory; entries are converted with int() for ordering,
# so they are sorted numerically rather than lexicographically.
datasets = os.listdir(folder)
datasets.sort(key=int)
total_datasets = len(datasets)

### Utility functions

In [None]:
import os

SIZE_LIMIT = 200 * 1024 * 1024  # 200 MB


def is_file_larger_than_size_limit(filepath: str) -> bool:
    """Tell whether the file at ``filepath`` is at least ``SIZE_LIMIT`` bytes.

    The comparison is inclusive: a file of exactly ``SIZE_LIMIT`` bytes is
    reported as too large.
    """
    threshold = int(SIZE_LIMIT)
    size_in_bytes = int(os.path.getsize(str(filepath)))
    return size_in_bytes >= threshold

In [None]:
RDF_SUFFIXES = ["rdf", "ttl", "owl", "n3", "nt", "jsonld", "nq", "trig", "trix"]


def check_if_file_name_is_rdf(name: str) -> bool:
    """Return True when ``name`` ends with one of the extensions in ``RDF_SUFFIXES``.

    The extension is the text after the last dot and is compared
    case-insensitively, so ``data.TTL`` is recognised. A name without any dot
    (for example a file literally called ``nt``) has no extension and is
    therefore never reported as RDF.
    """
    _stem, dot, extension = name.rpartition(".")
    # `dot` is empty when the name contains no "." at all; in that case
    # `extension` holds the whole name and must not be matched.
    return bool(dot) and extension.lower() in RDF_SUFFIXES

In [None]:
import magic


def is_html(filepath: str) -> bool:
    """Guess whether the file at ``filepath`` is an HTML document.

    The file is considered HTML when the description returned by
    ``magic.from_file`` mentions "html" (case-insensitive) or, failing that,
    when its text content contains "<!doctype html" in any letter case.
    If the content cannot be read as text, the answer is False.
    """
    description = magic.from_file(filepath).lower()
    if "html" in description:
        return True

    with open(filepath, "r") as handle:
        try:
            content = handle.read()
        except Exception:
            # best effort: unreadable/undecodable content is treated as "not HTML"
            return False

    return "<!doctype html" in content.lower()

In [None]:
import json


def is_json(filepath: str) -> bool:
    """Tell whether the whole file at ``filepath`` parses as a JSON document.

    Any error raised while reading or decoding the content yields False.
    """
    with open(filepath, "r") as handle:
        try:
            json.load(handle)
        except Exception:
            # best effort: anything that does not parse is simply "not JSON"
            return False
    return True

### Check the processing status for each dataset

In [None]:
from enum import Enum

class DatasetType(Enum):
    """Processing status assigned to a dataset by ``analyze_dataset``."""

    EMPTY = 0  # processed, but no file could be used
    NOT_PROCESSED = 1  # metadata has no "unused_files" entry
    PARTIAL = 2  # at least one used file, but something is missing or failed
    COMPLETE = 3  # fully downloaded, at least one used file, no unused file


def analyze_dataset(dataset_path) -> DatasetType:
    """Classify a dataset from the content of its metadata JSON file.

    Args:
        dataset_path: path of the dataset's metadata JSON file.

    Returns:
        ``NOT_PROCESSED`` when the metadata has no "unused_files" key;
        ``COMPLETE`` when "failed_download_urls" is present and empty,
        "used_files" is non-empty and "unused_files" is empty;
        ``PARTIAL`` when "used_files" is non-empty but the dataset is not
        complete; ``EMPTY`` otherwise.

    Raises:
        OSError: if the metadata file cannot be opened.
        json.JSONDecodeError: if the metadata file is not valid JSON.
    """
    with open(dataset_path, "r") as f:
        # strict=False allows control characters inside JSON strings
        metadata = json.load(f, strict=False)

    # check if the dataset has been mined through the data extractor
    if "unused_files" not in metadata:
        return DatasetType.NOT_PROCESSED

    # the dataset has been downloaded completely only when the list of failed
    # downloads is present and empty
    completely_downloaded = (
        "failed_download_urls" in metadata
        and len(metadata["failed_download_urls"]) == 0
    )

    # at least one file has been parsed; a missing "used_files" entry is
    # treated as "no used file" instead of raising KeyError
    contains_a_valid_file = len(metadata.get("used_files") or []) > 0

    # some files have not been parsed or have thrown errors while parsing
    error_while_parsing = len(metadata["unused_files"]) > 0

    # A dataset is complete only if all these conditions are satisfied:
    # 1) it has been completely downloaded
    # 2) it contains at least one valid file (>0)
    # 3) no file has generated an error while parsing
    if completely_downloaded and contains_a_valid_file and not error_while_parsing:
        return DatasetType.COMPLETE

    # A dataset is partial if it contains at least one valid file, even though
    # some files may not have been downloaded, may have generated errors or
    # may not be of a usable type.
    if contains_a_valid_file:
        return DatasetType.PARTIAL

    # The dataset doesn't contain any usable file.
    return DatasetType.EMPTY

In [None]:
# Route every dataset ID into the bucket that matches its processing status.
buckets_by_status = {
    DatasetType.COMPLETE: complete_datasets,
    DatasetType.PARTIAL: partial_datasets,
    DatasetType.EMPTY: empty_datasets,
    DatasetType.NOT_PROCESSED: not_processed,
}

for dataset in datasets:
    metadata_file_path = f"{folder}/{dataset}/metadata.json"
    res = analyze_dataset(metadata_file_path)

    bucket = buckets_by_status.get(res)
    if bucket is not None:
        bucket.append(dataset)

In [None]:
# Summary of the classification: one line per bucket, printed in one go.
summary_lines = [
    f"Total number of datasets: {total_datasets}",
    f"Complete datasets: {len(complete_datasets)}",  # completely downloaded and parsed
    f"Partial datasets: {len(partial_datasets)}",
    f"Empty datasets: {len(empty_datasets)}",
    f"Not processed datasets: {len(not_processed)}",
]
print("\n".join(summary_lines))

### List datasets with unused files

In [None]:
# One entry per dataset that has unused files, shaped as
# [dataset ID, unused files with an RDF extension, other unused files].
datasets_with_unused_files = []

for dataset in datasets:
    metadata_file_path = f"{folder}/{dataset}/metadata.json"

    # Only the parsed content is needed, so the file is closed right after loading.
    with open(metadata_file_path, "r") as f:
        metadata = json.load(f, strict=False)

    # a missing or empty "unused_files" entry means there is nothing to report
    unused_files = metadata.get("unused_files") or []
    if not unused_files:
        continue

    unparsable_rdf = []
    unparsable_other = []

    for file in unused_files:
        # split on the file name only: the extension decides the column
        target = unparsable_rdf if check_if_file_name_is_rdf(file) else unparsable_other
        target.append(file)

    datasets_with_unused_files.append([dataset, unparsable_rdf, unparsable_other])

In [None]:
from IPython.display import display, Markdown

# Render the per-dataset breakdown as a Markdown table: a fixed header
# followed by one row per entry of `datasets_with_unused_files`.
table_header = """
| Dataset ID | RDF not parsable | Other not parsable |
| --- | --- | --- |
"""
row_template = "| {} | {} | {} |\n"

markdown_table = table_header + "".join(
    row_template.format(entry[0], str(entry[1]), str(entry[2]))
    for entry in datasets_with_unused_files
)

display(Markdown(markdown_table))

## Analyzing unused files

In [None]:
rdf_files_unused = []  # paths of the unused files that have an RDF extension
other_files_unused = []  # paths of the unused files without an RDF extension

In [None]:
# Count the used files and collect the full path of every unused file,
# split by whether its name carries an RDF extension.
total_used_files = 0

for dataset in datasets:
    metadata_file_path = f"{folder}/{dataset}/metadata.json"

    with open(metadata_file_path, "r") as f:
        metadata = json.load(f, strict=False)

    if "used_files" in metadata:
        total_used_files += len(metadata["used_files"])

    if "unused_files" not in metadata:
        continue

    for file in metadata["unused_files"]:
        file_with_path = f"{folder}/{dataset}/{file}"
        destination = rdf_files_unused if check_if_file_name_is_rdf(file) else other_files_unused
        destination.append(file_with_path)

print(f"Total used files: {total_used_files}")
print(f"RDF but unusable files: {len(rdf_files_unused)}")
print(f"NON RDF unusable files: {len(other_files_unused)}")

### Files with RDF extension that are not used

In [None]:
# Sort every unused file with an RDF extension into exactly one bucket.
rdf_but_actually_html = []
rdf_but_actually_json = []
rdf_but_too_large = []
rdf_unparsable = []
rdf_missing = []  # listed in the metadata but not found on disk as a regular file

for file in rdf_files_unused:
    if not os.path.isfile(file):
        # Keep track of these paths: dropping them silently would make the
        # consistency check at the end of the cell fail.
        rdf_missing.append(file)
    elif is_file_larger_than_size_limit(file):
        # size is checked first so that huge files are never read in full
        rdf_but_too_large.append(file)
    elif is_html(file):
        rdf_but_actually_html.append(file)
    elif is_json(file):
        rdf_but_actually_json.append(file)
    else:
        rdf_unparsable.append(file)

print(f"RDF but actually HTML: {len(rdf_but_actually_html)}")
print(f"RDF but actually JSON: {len(rdf_but_actually_json)}")
print(f"RDF but too large: {len(rdf_but_too_large)}")
print(f"RDF with syntax error: {len(rdf_unparsable)}")
print(f"RDF listed but missing on disk: {len(rdf_missing)}")

# sanity check: every unused RDF file landed in exactly one bucket
assert len(rdf_files_unused) == (
    len(rdf_but_actually_html)
    + len(rdf_but_actually_json)
    + len(rdf_but_too_large)
    + len(rdf_unparsable)
    + len(rdf_missing)
)

## Analysis of unused non-RDF files

In [None]:
import zipfile
import tarfile
import magic


def is_gz_file(filepath):
    """Return True when the file starts with the two bytes ``1f 8b``."""
    signature = b"\x1f\x8b"
    with open(filepath, "rb") as stream:
        header = stream.read(2)
    return header == signature


def is_bz2_file(filepath):
    """Return True when the file starts with the three bytes ``42 5a 68`` ("BZh")."""
    signature = b"\x42\x5a\x68"
    with open(filepath, "rb") as stream:
        header = stream.read(3)
    return header == signature


def maybe_rdf(filepath):
    """Return True when the text of the file contains the literal ``<rdf:RDF``.

    The match is case-sensitive. If the content cannot be read as text,
    the answer is False.
    """
    with open(filepath, "r") as handle:
        try:
            text = handle.read()
        except Exception:
            # best effort: unreadable content is treated as "not RDF"
            return False
    return "<rdf:RDF" in text

def maybe_ttl(filepath: str) -> bool:
    """Return True when the text of the file contains ``@prefix`` in any letter case.

    If the content cannot be read as text, the answer is False.
    """
    with open(filepath, "r") as handle:
        try:
            lowered = handle.read().lower()
        except Exception:
            # best effort: unreadable content is treated as "not Turtle"
            return False
    return "@prefix" in lowered

In [None]:
# Sort every unused file without an RDF extension into exactly one bucket.
archives = []
probably_json = []
probably_html = []
cast_to_rdf = []
cast_to_ttl = []
other = []
other_missing = []  # listed in the metadata but not found on disk as a regular file

for file in other_files_unused:
    if not os.path.isfile(file):
        # Every detector below opens the file, so a path that is not a regular
        # file is set aside here (same guard as for the unused RDF files).
        other_missing.append(file)

    elif (
        tarfile.is_tarfile(file)
        or zipfile.is_zipfile(file)
        or is_gz_file(file)
        or is_bz2_file(file)
    ):
        archives.append(file)

    elif is_html(file):
        probably_html.append(file)

    elif is_json(file):
        probably_json.append(file)

    elif maybe_rdf(file):
        cast_to_rdf.append(file)

    elif maybe_ttl(file):
        cast_to_ttl.append(file)

    else:
        other.append(file)


print(f"Total number of files recognized as archives: {len(archives)}")
print(f"Total number of files that probably are HTML documents: {len(probably_html)}")
print(f"Total number of files that probably are JSON documents: {len(probably_json)}")
print(f"Total number of files that probably can be casted to RDF: {len(cast_to_rdf)}")
print(f"Total number of files that probably can be casted to TTL: {len(cast_to_ttl)}")
print(f"Total number of unknown files: {len(other)}")
print(f"Total number of files listed but missing on disk: {len(other_missing)}")

# sanity check: every unused non-RDF file landed in exactly one bucket
assert len(other_files_unused) == (
    len(archives)
    + len(probably_html)
    + len(probably_json)
    + len(cast_to_rdf)
    + len(cast_to_ttl)
    + len(other)
    + len(other_missing)
)

## Final output of the analysis

### Files that are going to be deleted

Files that are HTML can be deleted

In [None]:
# Notebook display: unused files with an RDF extension that is_html() recognised as HTML.
rdf_but_actually_html

In [None]:
# Notebook display: unused files without an RDF extension that is_html() recognised as HTML.
probably_html

Files that are JSON can be deleted

In [None]:
# Notebook display: unused files with an RDF extension whose content parses as JSON.
rdf_but_actually_json

In [None]:
# Notebook display: unused files without an RDF extension whose content parses as JSON.
probably_json

### Files that need to be processed

Files that are probably RDF but need to be renamed

In [None]:
# Notebook display: unused files without an RDF extension whose text contains "<rdf:RDF".
cast_to_rdf

Files that are probably TTL but need to be renamed

In [None]:
# Notebook display: unused files without an RDF extension whose text contains "@prefix".
cast_to_ttl

Files that are archives and need to be extracted

In [None]:
# Notebook display: unused files detected as tar, zip, gzip or bzip2 archives.
archives

Files that need to be processed manually because they are too big to be processed by RDFLib

In [None]:
# Notebook display: unused files with an RDF extension whose size is at least SIZE_LIMIT.
rdf_but_too_large

Files that need to be processed manually to assign them an extension

In [None]:
# Print each unclassified file next to the description given by the `magic`
# module; the path is left-aligned and padded to 50 characters.
for file in other:
    file_description = magic.from_file(file)
    print(f"{file:<50} | {file_description}")

RDF files that need to be processed as plain text files

In [None]:
# Notebook display: unused files with an RDF extension that are neither too large, HTML nor JSON.
rdf_unparsable