# Stats
Statistics about downloaded and processed files

In [21]:
import os

In [2]:
# input the relative path or the absolute path pointing to the directory in which datasets have been downloaded
folder = input()

In [3]:
complete_datasets = list()  # completely downloaded
partial_datasets = list()  # not completely downloaded / parsed (at least 1 valid file)
empty_datasets = list()  # only metadata for these datasets
not_processed = list()

In [4]:
# scan the directory containing the downloaded datasets
datasets = sorted(os.listdir(folder), key=lambda i: int(i))
total_datasets = len(datasets)

### Utility functions

In [5]:
import os

SIZE_LIMIT = 100 * 1024 * 1024  # 100 MB


def is_file_larger_than_size_limit(filepath: str) -> bool:
    size = os.path.getsize(str(filepath))
    return int(size) >= int(SIZE_LIMIT)

In [6]:
RDF_SUFFIXES = ["rdf", "ttl", "owl", "n3", "nt", "jsonld", "nq", "trig", "trix"]


def check_if_file_name_is_rdf(name: str) -> bool:
    return name.split(".")[-1] in RDF_SUFFIXES

In [7]:
import magic


def is_html(filepath: str) -> bool:
    mt = magic.from_file(filepath).lower()
    if "html" in mt:
        return True

    with open(filepath, "r") as f:
        try:
            return "<!doctype html" in f.read().lower()
        except Exception:
            return False

In [8]:
import json


def is_json(filepath: str) -> bool:
    with open(filepath, "r") as f:
        try:
            json.load(f)
            return True
        except Exception:
            return False

In [39]:
def delete_file(file_path: str):
    if os.path.isfile(file_path):
        print(f"Deleting {file_path}")
        os.remove(file_path)

### Check datasets with more than 1 file downloaded

In [9]:
datasets_with_more_than_one_file_downloaded = list()

for dataset in datasets:
    metadata_file_path = f"{folder}/{dataset}/metadata.json"

    with open(metadata_file_path, "r") as f:
        metadata = json.load(f, strict=False)

        if(len(metadata["downloaded_urls"]) > 1 ):
            datasets_with_more_than_one_file_downloaded.append(f"{folder}/{dataset}")


In [10]:
print(f"Datasets with more than one file downloaded: {len(datasets_with_more_than_one_file_downloaded)}")

Datasets with more than one file downloaded: 250


In [11]:
datasets_with_more_than_one_file_downloaded

['datasets/1',
 'datasets/2',
 'datasets/6',
 'datasets/7',
 'datasets/83',
 'datasets/559',
 'datasets/561',
 'datasets/562',
 'datasets/10093',
 'datasets/12513',
 'datasets/13252',
 'datasets/13254',
 'datasets/13263',
 'datasets/13279',
 'datasets/13283',
 'datasets/13284',
 'datasets/13288',
 'datasets/13290',
 'datasets/13291',
 'datasets/13299',
 'datasets/13319',
 'datasets/13334',
 'datasets/13339',
 'datasets/13347',
 'datasets/13351',
 'datasets/13356',
 'datasets/13357',
 'datasets/13368',
 'datasets/13369',
 'datasets/13378',
 'datasets/13384',
 'datasets/13388',
 'datasets/13394',
 'datasets/13400',
 'datasets/13412',
 'datasets/13421',
 'datasets/13452',
 'datasets/13461',
 'datasets/13469',
 'datasets/13471',
 'datasets/13482',
 'datasets/13506',
 'datasets/13522',
 'datasets/13541',
 'datasets/13565',
 'datasets/13568',
 'datasets/13577',
 'datasets/13581',
 'datasets/13582',
 'datasets/13590',
 'datasets/13594',
 'datasets/13600',
 'datasets/13622',
 'datasets/13623',

### Check the processing status for each dataset

In [12]:
from enum import Enum

class DatasetType(Enum):
    EMPTY = 0
    NOT_PROCESSED = 1
    PARTIAL = 2
    COMPLETE = 3


def analyze_dataset(dataset_path) -> DatasetType:
    with open(dataset_path, "r") as f:
        metadata = json.load(f, strict=False)

        keys = metadata.keys()

        # check if the dataset has been mined through the data extractor
        if not "unused_files" in keys:
            return DatasetType.NOT_PROCESSED

        # check if the dataset has been downloaded completely
        completely_downloaded = False
        if "failed_download_urls" in keys:
            completely_downloaded = len(metadata["failed_download_urls"]) == 0

        # check if the file dataset contains at least one file that has been parsed
        contains_a_valid_file = len(metadata["used_files"]) > 0

        # check if the dataset has some files that have not been parsed or has thrown errors while parsing
        error_while_parsing = len(metadata["unused_files"]) == 0

        """ 
        A dataset is complete only if all these conditions are satisfied:
        1) has been completely downloaded
        2) contains at least one valid file (>0)
        3) no file has generated error while parsing
        """

        if completely_downloaded and contains_a_valid_file and not error_while_parsing:
            return DatasetType.COMPLETE

        """
        A dataset is partial if:
        1) contains at least one valid file (>0)
        2) some files may not have been downloaded
        3) some files may have generated errors or not being the correct type to be used
        """

        if contains_a_valid_file:
            return DatasetType.PARTIAL

        """
        If a dataset doesn't contain any file
        """
        return DatasetType.EMPTY

In [13]:
for dataset in datasets:
    metadata_file_path = f"{folder}/{dataset}/metadata.json"

    res = analyze_dataset(metadata_file_path)

    if res == DatasetType.COMPLETE:
        complete_datasets.append(dataset)

    if res == DatasetType.PARTIAL:
        partial_datasets.append(dataset)

    if res == DatasetType.EMPTY:
        empty_datasets.append(dataset)

    if res == DatasetType.NOT_PROCESSED:
        not_processed.append(dataset)

In [14]:
print(f"Total number of datasets: {total_datasets}")
print(f"Complete datasets: {len(complete_datasets)}")   # completely downloaded and parsed
print(f"Partial datasets: {len(partial_datasets)}")
print(f"Empty datasets: {len(empty_datasets)}")
print(f"Not processed datasets: {len(not_processed)}")

Total number of datasets: 31589
Complete datasets: 8
Partial datasets: 26377
Empty datasets: 5204
Not processed datasets: 0


### List datasets with unused file

In [15]:
datasets_with_unused_files = list()

for dataset in datasets:
    metadata_file_path = f"{folder}/{dataset}/metadata.json"

    with open(metadata_file_path, "r") as f:
        metadata = json.load(f, strict=False)

        keys = metadata.keys()

        if "unused_files" in keys and len(metadata["unused_files"]) > 0:
            unparsable_rdf = list()
            unparsable_other = list()

            for file in metadata["unused_files"]:
                file_with_path = f"{folder}/{dataset}/{file}"

                if check_if_file_name_is_rdf(file):
                    unparsable_rdf.append(file)
                else:
                    unparsable_other.append(file)
            
            datasets_with_unused_files.append([dataset, unparsable_rdf, unparsable_other])

In [16]:
from IPython.display import display, Markdown

markdown_table = """
| Dataset ID | RDF not parsable | Other not parsable |
| --- | --- | --- |
"""

for d in datasets_with_unused_files:
    markdown_table += ("| {} | {} | {} |\n".format(d[0], str(d[1]), str(d[2])))

display(Markdown(markdown_table))


| Dataset ID | RDF not parsable | Other not parsable |
| --- | --- | --- |
| 2 | ['ppg-sf-dump.rdf'] | [] |
| 6 | ['ppg-sf-dump.rdf'] | [] |
| 11580 | ['rows.rdf'] | [] |
| 13263 | ['Govwild_rdf.n3'] | [] |
| 13283 | ['txn-images.ttl', 'txn-distribution.ttl'] | [] |
| 13284 | ['deck.rdf'] | [] |
| 13347 | ['geospecies.rdf'] | [] |
| 13368 | ['all-geonames.rdf'] | [] |
| 13378 | ['wordnet-partmeronym.rdf', 'wordnet-substancemeronym.rdf', 'wordnet-seealso.rdf', 'wordnet-pertainsto.rdf', 'wordnet-causes.rdf', 'wordnet-sameverbgroupas.rdf', 'wordnet-hyponym.rdf', 'wordnet-classifiedby.rdf', 'wordnet-membermeronym.rdf', 'wordnet-participleof.rdf', 'wordnet-entailment.rdf', 'wordnet-similarity.rdf', 'wordnet-antonym.rdf', 'wordnet-attribute.rdf', 'wordnet-derivationallyrelated.rdf'] | [] |
| 13565 | ['download-20120123.rdf'] | [] |
| 13997 | ['PERSEE_align_wikipedia_2021-09-24.rdf', 'PERSEE_align_ORCID_2021-09-24.rdf', 'PERSEE_align_Idref_2021-09-24.rdf', 'PERSEE_align_All_2021-09-24.rdf', 'PERSEE_align_DBpediaFR_2021-09-24.rdf', 'PERSEE_align_Bnf_2021-09-24.rdf', 'PERSEE_align_Isni_2021-09-24.rdf', 'PERSEE_align_idHAL_2021-09-24.rdf', 'PERSEE_align_viaf_2021-09-24.rdf', 'PERSEE_align_RePEc_2021-09-24.rdf', 'PERSEE_align_wikidata_2021-09-24.rdf', 'PERSEE_align_wikipediaFR_2021-09-24.rdf', 'PERSEE_align_DBpedia_2021-09-24.rdf'] | ['persee-person-align-rdf.tar.gz', 'license.txt'] |
| 14054 | ['rdf.rdf'] | [] |
| 14079 | ['eat.nt', 'wheat.rdf'] | [] |
| 14252 | ['wordnet-partmeronym.rdf', 'wordnet-substancemeronym.rdf', 'wordnet-seealso.rdf', 'wordnet-pertainsto.rdf', 'wordnet-causes.rdf', 'wordnet-sameverbgroupas.rdf', 'wordnet-hyponym.rdf', 'wordnet-classifiedby.rdf', 'wordnet-membermeronym.rdf', 'wordnet-participleof.rdf', 'wordnet-entailment.rdf', 'wordnet-similarity.rdf', 'wordnet-antonym.rdf', 'wordnet-attribute.rdf', 'wordnet-derivationallyrelated.rdf'] | [] |
| 14277 | ['txn-images.ttl', 'txn-distribution.ttl'] | [] |
| 14324 | ['geospecies.rdf'] | [] |
| 14344 | ['all-geonames.rdf'] | [] |
| 14364 | ['Govwild_rdf.n3'] | [] |
| 14417 | ['deck.rdf'] | [] |
| 15243 | ['en.rdf', 'fr.rdf'] | [] |
| 21023 | ['2016-allievi-partecipanti.nt'] | [] |
| 21532 | ['jrcnames_uri.nt'] | [] |


## Analyzing unused files

In [22]:
total_used_files = 0
big_files = list()
unusable_files = list()

for dataset in datasets:
    dataset_folder_path = f"{folder}/{dataset}"
    metadata_file_path = f"{dataset_folder_path}/metadata.json"

    with open(metadata_file_path, "r") as f:
        metadata = json.load(f, strict=False)

        keys = metadata.keys()

        if "used_files" in keys and len(metadata["used_files"]) > 0:
            total_used_files += len(metadata["used_files"])

        if "unused_files" in keys and len(metadata["unused_files"]) > 0:
            for uf in metadata["unused_files"]:
                file_path = f"{dataset_folder_path}/{uf}"
                if is_file_larger_than_size_limit(file_path):
                    big_files.append(file_path)
                else:
                    unusable_files.append(file_path)

print(f"Total used files: {total_used_files}")
print(f"Big files: {len(big_files)}")
print(f"Unusable files: {len(unusable_files)}")

Total used files: 27886
Big files: 15
Unusable files: 53


### Unused files

In [20]:
unusable_files

['datasets/13283/txn-images.ttl',
 'datasets/13283/txn-distribution.ttl',
 'datasets/13284/deck.rdf',
 'datasets/13378/wordnet-partmeronym.rdf',
 'datasets/13378/wordnet-substancemeronym.rdf',
 'datasets/13378/wordnet-seealso.rdf',
 'datasets/13378/wordnet-pertainsto.rdf',
 'datasets/13378/wordnet-causes.rdf',
 'datasets/13378/wordnet-sameverbgroupas.rdf',
 'datasets/13378/wordnet-hyponym.rdf',
 'datasets/13378/wordnet-classifiedby.rdf',
 'datasets/13378/wordnet-membermeronym.rdf',
 'datasets/13378/wordnet-participleof.rdf',
 'datasets/13378/wordnet-entailment.rdf',
 'datasets/13378/wordnet-similarity.rdf',
 'datasets/13378/wordnet-antonym.rdf',
 'datasets/13378/wordnet-attribute.rdf',
 'datasets/13378/wordnet-derivationallyrelated.rdf',
 'datasets/13997/persee-person-align-rdf.tar.gz',
 'datasets/13997/license.txt',
 'datasets/13997/PERSEE_align_wikipedia_2021-09-24.rdf',
 'datasets/13997/PERSEE_align_ORCID_2021-09-24.rdf',
 'datasets/13997/PERSEE_align_Idref_2021-09-24.rdf',
 'datase

In [26]:
for f in unusable_files:
    delete_file(f)

Deleting datasets/13283/txn-images.ttl
Deleting datasets/13283/txn-distribution.ttl
Deleting datasets/13284/deck.rdf
Deleting datasets/13378/wordnet-partmeronym.rdf
Deleting datasets/13378/wordnet-substancemeronym.rdf
Deleting datasets/13378/wordnet-seealso.rdf
Deleting datasets/13378/wordnet-pertainsto.rdf
Deleting datasets/13378/wordnet-causes.rdf
Deleting datasets/13378/wordnet-sameverbgroupas.rdf
Deleting datasets/13378/wordnet-hyponym.rdf
Deleting datasets/13378/wordnet-classifiedby.rdf
Deleting datasets/13378/wordnet-membermeronym.rdf
Deleting datasets/13378/wordnet-participleof.rdf
Deleting datasets/13378/wordnet-entailment.rdf
Deleting datasets/13378/wordnet-similarity.rdf
Deleting datasets/13378/wordnet-antonym.rdf
Deleting datasets/13378/wordnet-attribute.rdf
Deleting datasets/13378/wordnet-derivationallyrelated.rdf
Deleting datasets/13997/persee-person-align-rdf.tar.gz
Deleting datasets/13997/license.txt
Deleting datasets/13997/PERSEE_align_wikipedia_2021-09-24.rdf
Deleting 

### Big files

In [23]:
big_files

['datasets/2/ppg-sf-dump.rdf',
 'datasets/6/ppg-sf-dump.rdf',
 'datasets/11580/rows.rdf',
 'datasets/13263/Govwild_rdf.n3',
 'datasets/13347/geospecies.rdf',
 'datasets/13368/all-geonames.rdf',
 'datasets/13565/download-20120123.rdf',
 'datasets/14079/eat.nt',
 'datasets/14324/geospecies.rdf',
 'datasets/14344/all-geonames.rdf',
 'datasets/14364/Govwild_rdf.n3',
 'datasets/15243/en.rdf',
 'datasets/15243/fr.rdf',
 'datasets/21023/2016-allievi-partecipanti.nt',
 'datasets/21532/jrcnames_uri.nt']

Check duplicates

In [2]:
%%bash
diff datasets/2/ppg-sf-dump.rdf datasets/6/ppg-sf-dump.rdf

In [4]:
%%bash
diff datasets/13263/Govwild_rdf.n3 datasets/14364/Govwild_rdf.n3

In [5]:
%%bash
diff datasets/13347/geospecies.rdf datasets/14324/geospecies.rdf

In [8]:
%%bash
cmp datasets/13368/all-geonames.rdf datasets/14344/all-geonames.rdf

#### Duplicates
- `datasets/2/ppg-sf-dump.rdf` is the same file as `datasets/6/ppg-sf-dump.rdf`
- `datasets/13263/Govwild_rdf.n3` is the same file as `datasets/14364/Govwild_rdf.n3`
- `datasets/13347/geospecies.rdf` is the same file as `datasets/14324/geospecies.rdf`
- `datasets/13368/all-geonames.rdf` is the same file as `datasets/14344/all-geonames.rdf`

#### Invalid files
While importing in GraphDB the files below generated errors
- `datasets/13263/Govwild_rdf.n3` contains syntax error, GraphDB raises `org.eclipse.rdf4j.sail.SailException: Invalid IRI value`
- `datasets/13347/geospecies.rdf` contains syntax error, GraphDB raises `org.eclipse.rdf4j.sail.SailException: Invalid IRI value`
- `datasets/13368/all-geonames.rdf` is not processable, GraphDB raises `RDF parse error: content is not allowed in prolog`
- `datasets/13565/download-20120123.rdf` contains syntax error, GraphDB raises `org.eclipse.rdf4j.sail.SailException: Invalid IRI value`
- `datasets/15243/fr.rdf` contains syntax error, GraphDB raises `RDF parse error`
- `datasets/21532/jrcnames_uri.nt` contains syntax error, GraphDB raises `org.eclipse.rdf4j.sail.SailException: Invalid IRI value`

In [28]:
# TO DELETE

big_files = [
    "datasets/2/ppg-sf-dump.rdf",
    "datasets/6/ppg-sf-dump.rdf",
    "datasets/11580/rows.rdf",
    "datasets/13263/Govwild_rdf.n3",
    "datasets/13347/geospecies.rdf",
    "datasets/13368/all-geonames.rdf",
    "datasets/13565/download-20120123.rdf",
    "datasets/14079/eat.nt",
    "datasets/14324/geospecies.rdf",
    "datasets/14344/all-geonames.rdf",
    "datasets/14364/Govwild_rdf.n3",
    "datasets/15243/en.rdf",
    "datasets/15243/fr.rdf",
    "datasets/21023/2016-allievi-partecipanti.nt",
    "datasets/21532/jrcnames_uri.nt",
]

unusable_files = [
    "datasets/13283/txn-images.ttl",
    "datasets/13283/txn-distribution.ttl",
    "datasets/13284/deck.rdf",
    "datasets/13378/wordnet-partmeronym.rdf",
    "datasets/13378/wordnet-substancemeronym.rdf",
    "datasets/13378/wordnet-seealso.rdf",
    "datasets/13378/wordnet-pertainsto.rdf",
    "datasets/13378/wordnet-causes.rdf",
    "datasets/13378/wordnet-sameverbgroupas.rdf",
    "datasets/13378/wordnet-hyponym.rdf",
    "datasets/13378/wordnet-classifiedby.rdf",
    "datasets/13378/wordnet-membermeronym.rdf",
    "datasets/13378/wordnet-participleof.rdf",
    "datasets/13378/wordnet-entailment.rdf",
    "datasets/13378/wordnet-similarity.rdf",
    "datasets/13378/wordnet-antonym.rdf",
    "datasets/13378/wordnet-attribute.rdf",
    "datasets/13378/wordnet-derivationallyrelated.rdf",
    "datasets/13997/persee-person-align-rdf.tar.gz",
    "datasets/13997/license.txt",
    "datasets/13997/PERSEE_align_wikipedia_2021-09-24.rdf",
    "datasets/13997/PERSEE_align_ORCID_2021-09-24.rdf",
    "datasets/13997/PERSEE_align_Idref_2021-09-24.rdf",
    "datasets/13997/PERSEE_align_All_2021-09-24.rdf",
    "datasets/13997/PERSEE_align_DBpediaFR_2021-09-24.rdf",
    "datasets/13997/PERSEE_align_Bnf_2021-09-24.rdf",
    "datasets/13997/PERSEE_align_Isni_2021-09-24.rdf",
    "datasets/13997/PERSEE_align_idHAL_2021-09-24.rdf",
    "datasets/13997/PERSEE_align_viaf_2021-09-24.rdf",
    "datasets/13997/PERSEE_align_RePEc_2021-09-24.rdf",
    "datasets/13997/PERSEE_align_wikidata_2021-09-24.rdf",
    "datasets/13997/PERSEE_align_wikipediaFR_2021-09-24.rdf",
    "datasets/13997/PERSEE_align_DBpedia_2021-09-24.rdf",
    "datasets/14054/rdf.rdf",
    "datasets/14079/wheat.rdf",
    "datasets/14252/wordnet-partmeronym.rdf",
    "datasets/14252/wordnet-substancemeronym.rdf",
    "datasets/14252/wordnet-seealso.rdf",
    "datasets/14252/wordnet-pertainsto.rdf",
    "datasets/14252/wordnet-causes.rdf",
    "datasets/14252/wordnet-sameverbgroupas.rdf",
    "datasets/14252/wordnet-hyponym.rdf",
    "datasets/14252/wordnet-classifiedby.rdf",
    "datasets/14252/wordnet-membermeronym.rdf",
    "datasets/14252/wordnet-participleof.rdf",
    "datasets/14252/wordnet-entailment.rdf",
    "datasets/14252/wordnet-similarity.rdf",
    "datasets/14252/wordnet-antonym.rdf",
    "datasets/14252/wordnet-attribute.rdf",
    "datasets/14252/wordnet-derivationallyrelated.rdf",
    "datasets/14277/txn-images.ttl",
    "datasets/14277/txn-distribution.ttl",
    "datasets/14417/deck.rdf",
]

In [29]:
distinct_valid_big_files = [
    "datasets/2/ppg-sf-dump.rdf",
    "datasets/11580/rows.rdf",
    "datasets/14079/eat.nt",
    "datasets/15243/en.rdf",
    "datasets/21023/2016-allievi-partecipanti.nt",
]

In [40]:
for file in big_files:
    if file not in distinct_valid_big_files:
        delete_file(file)

Deleting datasets/6/ppg-sf-dump.rdf
Deleting datasets/13263/Govwild_rdf.n3
Deleting datasets/13347/geospecies.rdf
Deleting datasets/13368/all-geonames.rdf
Deleting datasets/13565/download-20120123.rdf
Deleting datasets/14324/geospecies.rdf
Deleting datasets/14344/all-geonames.rdf
Deleting datasets/14364/Govwild_rdf.n3
Deleting datasets/15243/fr.rdf
Deleting datasets/21532/jrcnames_uri.nt
