# Stats
Statistics about downloaded and processed files

In [1]:
import os

In [2]:
# input the relative path or the absolute path pointing to the directory in which datasets have been downloaded
folder = input()

In [3]:
complete_datasets = list()  # completely downloaded
partial_datasets = list()  # not completely downloaded / parsed (at least 1 valid file)
empty_datasets = list()  # only metadata for these datasets
not_processed = list()

In [4]:
# scan the directory containing the downloaded datasets
datasets = sorted(os.listdir(folder), key=lambda i: int(i))
total_datasets = len(datasets)

### Utility functions

In [5]:
import os

SIZE_LIMIT = 100 * 1024 * 1024  # 100 MB


def is_file_larger_than_size_limit(filepath: str) -> bool:
    size = os.path.getsize(str(filepath))
    return int(size) >= int(SIZE_LIMIT)

In [6]:
RDF_SUFFIXES = ["rdf", "ttl", "owl", "n3", "nt", "jsonld", "nq", "trig", "trix"]


def check_if_file_name_is_rdf(name: str) -> bool:
    return name.split(".")[-1] in RDF_SUFFIXES

In [7]:
import magic


def is_html(filepath: str) -> bool:
    mt = magic.from_file(filepath).lower()
    if "html" in mt:
        return True

    with open(filepath, "r") as f:
        try:
            return "<!doctype html" in f.read().lower()
        except Exception:
            return False

In [8]:
import json


def is_json(filepath: str) -> bool:
    with open(filepath, "r") as f:
        try:
            json.load(f)
            return True
        except Exception:
            return False

### Check the processing status for each dataset

In [9]:
from enum import Enum

class DatasetType(Enum):
    EMPTY = 0
    NOT_PROCESSED = 1
    PARTIAL = 2
    COMPLETE = 3


def analyze_dataset(dataset_path) -> DatasetType:
    with open(dataset_path, "r") as f:
        metadata = json.load(f, strict=False)

        keys = metadata.keys()

        # check if the dataset has been mined through the data extractor
        if not "unused_files" in keys:
            return DatasetType.NOT_PROCESSED

        # check if the dataset has been downloaded completely
        completely_downloaded = False
        if "failed_download_urls" in keys:
            completely_downloaded = len(metadata["failed_download_urls"]) == 0

        # check if the file dataset contains at least one file that has been parsed
        contains_a_valid_file = len(metadata["used_files"]) > 0

        # check if the dataset has some files that have not been parsed or has thrown errors while parsing
        error_while_parsing = len(metadata["unused_files"]) == 0

        """ 
        A dataset is complete only if all these conditions are satisfied:
        1) has been completely downloaded
        2) contains at least one valid file (>0)
        3) no file has generated error while parsing
        """

        if completely_downloaded and contains_a_valid_file and not error_while_parsing:
            return DatasetType.COMPLETE

        """
        A dataset is partial if:
        1) contains at least one valid file (>0)
        2) some files may not have been downloaded
        3) some files may have generated errors or not being the correct type to be used
        """

        if contains_a_valid_file:
            return DatasetType.PARTIAL

        """
        If a dataset doesn't contain any file
        """
        return DatasetType.EMPTY

In [10]:
for dataset in datasets:
    metadata_file_path = f"{folder}/{dataset}/metadata.json"

    res = analyze_dataset(metadata_file_path)

    if res == DatasetType.COMPLETE:
        complete_datasets.append(dataset)

    if res == DatasetType.PARTIAL:
        partial_datasets.append(dataset)

    if res == DatasetType.EMPTY:
        empty_datasets.append(dataset)

    if res == DatasetType.NOT_PROCESSED:
        not_processed.append(dataset)

In [11]:
print(f"Total number of datasets: {total_datasets}")
print(f"Complete datasets: {len(complete_datasets)}")   # completely downloaded and parsed
print(f"Partial datasets: {len(partial_datasets)}")
print(f"Empty datasets: {len(empty_datasets)}")
print(f"Not processed datasets: {len(not_processed)}")

Total number of datasets: 31589
Complete datasets: 94
Partial datasets: 26288
Empty datasets: 5207
Not processed datasets: 0


### List datasets with unused file

In [12]:
datasets_with_unused_files = list()

for dataset in datasets:
    metadata_file_path = f"{folder}/{dataset}/metadata.json"

    with open(metadata_file_path, "r") as f:
        metadata = json.load(f, strict=False)

        keys = metadata.keys()

        if "unused_files" in keys and len(metadata["unused_files"]) > 0:
            unparsable_rdf = list()
            unparsable_other = list()

            for file in metadata["unused_files"]:
                file_with_path = f"{folder}/{dataset}/{file}"

                if check_if_file_name_is_rdf(file):
                    unparsable_rdf.append(file)
                else:
                    unparsable_other.append(file)
            
            datasets_with_unused_files.append([dataset, unparsable_rdf, unparsable_other])

In [13]:
from IPython.display import display, Markdown

markdown_table = """
| Dataset ID | RDF not parsable | Other not parsable |
| --- | --- | --- |
"""

for d in datasets_with_unused_files:
    markdown_table += ("| {} | {} | {} |\n".format(d[0], str(d[1]), str(d[2])))

display(Markdown(markdown_table))


| Dataset ID | RDF not parsable | Other not parsable |
| --- | --- | --- |
| 2 | ['ppg-sf-dump.rdf'] | [] |
| 6 | ['ppg-sf-dump.rdf'] | [] |
| 561 | ['covidcountystatistics.ttl', 'covidstatisticsprofile.ttl'] | [] |
| 569 | ['rows.rdf'] | [] |
| 573 | ['rows.rdf'] | [] |
| 574 | ['rows.rdf'] | [] |
| 576 | ['rows.rdf'] | [] |
| 588 | ['rows.rdf'] | [] |
| 599 | ['rows.rdf'] | [] |
| 615 | ['rows.rdf'] | [] |
| 618 | ['rows.rdf'] | [] |
| 622 | ['rows.rdf'] | [] |
| 623 | ['rows.rdf'] | [] |
| 633 | ['rows.rdf'] | [] |
| 635 | ['rows.rdf'] | [] |
| 636 | ['rows.rdf'] | [] |
| 644 | ['rows.rdf'] | [] |
| 659 | ['rows.rdf'] | [] |
| 684 | ['rows.rdf'] | [] |
| 687 | ['rows.rdf'] | [] |
| 725 | ['rows.rdf'] | [] |
| 726 | ['rows.rdf'] | [] |
| 743 | ['rows.rdf'] | [] |
| 748 | ['rows.rdf'] | [] |
| 753 | ['rows.rdf'] | [] |
| 764 | ['rows.rdf'] | [] |
| 769 | ['rows.rdf'] | [] |
| 772 | ['rows.rdf'] | [] |
| 773 | ['rows.rdf'] | [] |
| 780 | ['rows.rdf'] | [] |
| 784 | ['rows.rdf'] | [] |
| 786 | ['rows.rdf'] | [] |
| 794 | ['rows.rdf'] | [] |
| 807 | ['rows.rdf'] | [] |
| 811 | ['rows.rdf'] | [] |
| 818 | ['rows.rdf'] | [] |
| 830 | ['rows.rdf'] | [] |
| 831 | ['rows.rdf'] | [] |
| 837 | ['rows.rdf'] | [] |
| 849 | ['rows.rdf'] | [] |
| 858 | ['rows.rdf'] | [] |
| 859 | ['rows.rdf'] | [] |
| 861 | ['rows.rdf'] | [] |
| 880 | ['rows.rdf'] | [] |
| 888 | ['rows.rdf'] | [] |
| 894 | ['rows.rdf'] | [] |
| 895 | ['rows.rdf'] | [] |
| 2210 | ['rows.rdf'] | [] |
| 2556 | ['rows.rdf'] | [] |
| 4354 | ['rows.rdf'] | [] |
| 5364 | ['rows.rdf'] | [] |
| 5451 | ['rows.rdf'] | [] |
| 5662 | ['rows.rdf'] | [] |
| 5663 | ['rows.rdf'] | [] |
| 5665 | ['rows.rdf'] | [] |
| 5719 | ['rows.rdf'] | [] |
| 5720 | ['rows.rdf'] | [] |
| 5721 | ['rows.rdf'] | [] |
| 5739 | ['rows.rdf'] | [] |
| 5740 | ['rows.rdf'] | [] |
| 5741 | ['rows.rdf'] | [] |
| 5742 | ['rows.rdf'] | [] |
| 5743 | ['rows.rdf'] | [] |
| 5744 | ['rows.rdf'] | [] |
| 6046 | ['rows.rdf'] | [] |
| 8112 | ['rows.rdf'] | [] |
| 9298 | ['rows.rdf'] | [] |
| 9461 | ['rows.rdf'] | [] |
| 9692 | ['rows.rdf'] | [] |
| 10144 | ['rows.rdf'] | [] |
| 10145 | ['rows.rdf'] | [] |
| 10235 | ['rows.rdf'] | [] |
| 10251 | ['rows.rdf'] | [] |
| 10284 | ['rows.rdf'] | [] |
| 10658 | ['rows.rdf'] | [] |
| 10703 | ['rows.rdf'] | [] |
| 10825 | ['rows.rdf'] | [] |
| 10936 | ['rows.rdf'] | [] |
| 10959 | ['rows.rdf'] | [] |
| 11003 | ['rows.rdf'] | [] |
| 11022 | ['rows.rdf'] | [] |
| 11024 | ['rows.rdf'] | [] |
| 11035 | ['rows.rdf'] | [] |
| 11099 | ['rows.rdf'] | [] |
| 11167 | ['rows.rdf'] | [] |
| 11191 | ['rows.rdf'] | [] |
| 11238 | ['rows.rdf'] | [] |
| 11247 | ['rows.rdf'] | [] |
| 11263 | ['rows.rdf'] | [] |
| 11285 | ['rows.rdf'] | [] |
| 11322 | ['rows.rdf'] | [] |
| 11360 | ['rows.rdf'] | [] |
| 11401 | ['rows.rdf'] | [] |
| 11511 | ['rows.rdf'] | [] |
| 11520 | ['rows.rdf'] | [] |
| 11580 | ['rows.rdf'] | [] |
| 11611 | ['rows.rdf'] | [] |
| 11625 | ['rows.rdf'] | [] |
| 11631 | ['rows.rdf'] | [] |
| 11655 | ['rows.rdf'] | [] |
| 11689 | ['rows.rdf'] | [] |
| 11709 | ['rows.rdf'] | [] |
| 11828 | ['rows.rdf'] | [] |
| 12275 | ['rows.rdf'] | [] |
| 12577 | ['foaf.rdf'] | [] |
| 12635 | ['foaf.rdf'] | [] |
| 12704 | ['makxdekkers.rdf'] | [] |
| 12720 | ['foaf.rdf'] | [] |
| 12819 | ['index.rdf'] | [] |
| 12935 | ['foaf-momus.rdf'] | [] |
| 12942 | ['foaf.rdf'] | [] |
| 12949 | ['foaf.rdf'] | [] |
| 12989 | ['aigp-dblp-sameas.rdf'] | [] |
| 13004 | ['foaf.rdf'] | [] |
| 13043 | ['foaf.rdf'] | [] |
| 13109 | ['foaf.rdf'] | [] |
| 13148 | ['foaf.rdf'] | [] |
| 13152 | ['doap.rdf'] | [] |
| 13252 | [] | ['affymetrix-185061-at'] |
| 13254 | [] | ['rdf.xml', 'databnf-all-rdf-xml.tar.gz-user-databnf-password-databnf'] |
| 13263 | [] | ['govwild-rdf-2012-01-30.zip'] |
| 13279 | [] | ['hgnc-7'] |
| 13283 | [] | ['txn-ocs.ttl.gz', 'txn-distribution.ttl.gz', 'txn-images.ttl.gz', 'txn-misc.ttl.gz'] |
| 13284 | [] | ['deck'] |
| 13288 | [] | ['54b30'] |
| 13290 | ['006893251.ttl', '006893251.rdf'] | ['006893251.json'] |
| 13291 | ['territory-environment-section.nt'] | ['society-section', 'population-section', 'economy-section', 'active-population-economic-sector-nace09-timeseries', 'vocab'] |
| 13299 | [] | ['homologene-1000'] |
| 13334 | [] | ['s2377506.xml'] |
| 13347 | [] | ['geospecies.rdf.gz'] |
| 13351 | [] | ['cc195814-83a4-386f-509a-37e2f35a204b.html'] |
| 13356 | [] | ['523', 'spatial'] |
| 13357 | ['owl.owl', 'rdf-schema.rdf', 'contact.nt', '22-rdf-syntax-ns.nt', 'pml-justification.owl', 'event.owl'] | ['data-view', 'wgs84-pos', 'doap', 'cube', 'prov', 'vsr', 'tvcg.2010.181', 'ns', 'void', 'dcat', 'core'] |
| 13368 | [] | ['all-geonames-rdf.zip'] |
| 13369 | [] | ['dbpedia-3.6.owl.bz2', 'linux'] |
| 13378 | ['wordsense-entity-noun-1.nt'] | ['wn20full.zip'] |
| 13384 | [] | ['4bf87ee3-8f52-94bb-2398-44e7af6980e4.html'] |
| 13388 | ['paris.nt'] | ['paris', 'dbpedia-3.6.owl.bz2'] |
| 13394 | [] | ['5f51f368-635f-adb1-5968-f8a8d8471070.html'] |
| 13400 | ['laser-printer.ttl'] | [] |
| 13412 | ['owl.owl', 'rdf-schema.rdf', 'contact.nt', '22-rdf-syntax-ns.nt'] | ['sparql-service-description', 'nif-core', 'doap', 'prov', 'vsr', 'ns', 'void', 'dcat', 'core'] |
| 13421 | [] | ['cb6ceeb7-47af-f243-3e45-3c0a916f6c17.html'] |
| 13452 | [] | ['7d0e4f43-405b-760e-3f62-88874c758714.html'] |
| 13461 | [] | ['dbpedia-3.5.1.owl.bz2'] |
| 13469 | ['interpro-ipr000100.nt'] | [] |
| 13471 | [] | ['omim-603903'] |
| 13482 | [] | ['sgd-s000006169'] |
| 13506 | [] | ['biomodels-biomd0000000048'] |
| 13522 | [] | ['irefindex.irogid-1069566'] |
| 13565 | ['download-20120123.rdf'] | [] |
| 13568 | [] | ['1745'] |
| 13577 | ['lemonentriespointingtosinglefile.nt', 'individualspointingtosinglefile.nt', 'lemonentriespointingtobigfile.nt', 'simpleentries.nt', 'simpleontology.nt', 'individualspointingtobigfile.nt'] | ['psclemon'] |
| 13581 | ['caster-complessi.rdf', 'titolariomaic.rdf', 'entimaic.rdf', 'commissioniparlamentariinchiesta.rdf', 'uodmaic.rdf', 'luoghiacs.rdf', 'assembleacostituente.rdf', 'personemaic.rdf', 'caster-soggettiproduttori.rdf', 'caster-soggetticonservatori.rdf'] | ['cd1700000001'] |
| 13582 | [] | ['p305757'] |
| 13583 | ['gold-2010.owl'] | [] |
| 13590 | ['l99.rdf', 's40.rdf', 'asjp.owl', 'r1234.rdf', 'fromance.rdf', 'd13-468.rdf'] | ['families.rdf.zip', 'senses.rdf.zip', 'renderings.ttl.zip', 'languoids.rdf.zip'] |
| 13594 | [] | ['aanmonstering-del-gem-1890-35'] |
| 13600 | [] | ['drugbank-db00001'] |
| 13623 | [] | ['cc195814-83a4-386f-509a-37e2f35a204b.html'] |
| 13715 | [] | ['gb'] |
| 13723 | ['00100204.ttl'] | [] |
| 13759 | ['owl.owl', 'rdf-schema.rdf', 'contact.nt', '22-rdf-syntax-ns.nt'] | ['doap', 'prov', 'ns', 'void', 'dcat', 'core'] |
| 13796 | ['owl.owl', 'rdf-schema.rdf', 'contact.nt', '22-rdf-syntax-ns.nt'] | ['sparql-service-description', 'doap', 'prov', 'ns', 'void', 'dcat', 'core'] |
| 13822 | [] | ['lsoa.ttl.zip'] |
| 13845 | ['ntn-individuals.owl'] | [] |
| 13950 | ['cell2159549-5275-262-1.rdf'] | [] |
| 13990 | [] | ['examples'] |
| 13997 | [] | ['bulmo-0007-473x-2000-num-158-2-2373', '196565-person', 'persee-person-align-rdf.tar.gz', 'ahess-0395-2649'] |
| 14013 | ['eventkg-r2.ttl'] | [] |
| 14054 | [] | ['rdf.xml'] |
| 14079 | ['mapping-eat-dbpedia.rdf'] | ['eat.nt.gz', 'wheat'] |
| 14080 | ['33383.ttl'] | [] |
| 14160 | ['fulldump.nq'] | [] |
| 14195 | ['nomisma.ttl'] | [] |
| 14196 | ['fro-usc-title-15-chapter-2d-s1-5.ttl', 'fro-banking.ttl'] | [] |
| 14198 | ['data-and-taxonomy.ttl'] | [] |
| 14252 | ['wordsense-entity-noun-1.nt'] | ['wn20full.zip'] |
| 14277 | [] | ['txn-ocs.ttl.gz', 'txn-distribution.ttl.gz', 'txn-images.ttl.gz', 'txn-misc.ttl.gz'] |
| 14324 | [] | ['geospecies.rdf.gz'] |
| 14336 | [] | ['dbpedia-3.6.owl.bz2', 'linux'] |
| 14344 | [] | ['all-geonames-rdf.zip'] |
| 14364 | [] | ['govwild-rdf-2012-01-30.zip'] |
| 14382 | ['laser-printer.ttl'] | [] |
| 14387 | [] | ['hgnc-7'] |
| 14396 | [] | ['omim-603903'] |
| 14411 | [] | ['affymetrix-185061-at'] |
| 14417 | [] | ['deck'] |
| 14418 | ['173390.rdf'] | ['gtaa'] |
| 14430 | [] | ['gb'] |
| 14451 | [] | ['nuts1', 'lv'] |
| 14469 | ['00100204.ttl'] | [] |
| 14472 | ['bio2rdf-ndc-sio-mapping.owl'] | ['ndc-49288-0001-272c9910-2160-4d41-afbd-faab3055ba1d'] |
| 14473 | [] | ['mesh-d018377'] |
| 14486 | [] | ['aanmonstering-del-gem-1890-35'] |
| 14491 | [] | ['p305757'] |
| 14493 | ['caster-complessi.rdf', 'titolariomaic.rdf', 'entimaic.rdf', 'commissioniparlamentariinchiesta.rdf', 'uodmaic.rdf', 'luoghiacs.rdf', 'assembleacostituente.rdf', 'personemaic.rdf', 'caster-soggettiproduttori.rdf', 'caster-soggetticonservatori.rdf'] | ['cd1700000001'] |
| 14517 | [] | ['irefindex.irogid-1069566'] |
| 14519 | [] | ['go-0006915'] |
| 14520 | [] | ['drugbank-db00001'] |
| 14558 | [] | ['taxonomy-9606'] |
| 14580 | [] | ['54b30'] |
| 14583 | [] | ['s2377506.xml'] |
| 14587 | [] | ['4bf87ee3-8f52-94bb-2398-44e7af6980e4.html'] |
| 14599 | [] | ['5f51f368-635f-adb1-5968-f8a8d8471070.html'] |
| 14603 | ['owl.owl', 'rdf-schema.rdf', 'contact.nt', '22-rdf-syntax-ns.nt', 'pml-justification.owl', 'event.owl'] | ['data-view', 'wgs84-pos', 'doap', 'cube', 'prov', 'vsr', 'tvcg.2010.181', 'ns', 'void', 'dcat', 'core'] |
| 14604 | [] | ['e199bcf2-bf94-6b87-81a0-4dacaec11f46.html'] |
| 14636 | ['492dbb62-b569-f6cb-5822-545c474bc3db.rdf'] | ['492dbb62-b569-f6cb-5822-545c474bc3db.html'] |
| 14637 | [] | ['7d0e4f43-405b-760e-3f62-88874c758714.html'] |
| 14735 | ['dump.nt', 'terms.rdf'] | ['data.nobelprize.org', '1'] |
| 14742 | [] | ['sample-20160515.rdf.xml'] |
| 14801 | [] | ['usage.nt.gz'] |
| 14806 | ['owl.owl', 'rdf-schema.rdf', 'contact.nt', '22-rdf-syntax-ns.nt'] | ['sparql-service-description', 'doap', 'prov', 'ns', 'void', 'dcat', 'core'] |
| 14821 | ['gold-2010.owl'] | [] |
| 14921 | ['lemonentriespointingtosinglefile.nt', 'individualspointingtosinglefile.nt', 'lemonentriespointingtobigfile.nt', 'simpleentries.nt', 'simpleontology.nt', 'individualspointingtobigfile.nt'] | ['psclemon'] |
| 15006 | [] | ['138269.3'] |
| 15056 | ['l99.rdf', 's40.rdf', 'asjp.owl', 'r1234.rdf', 'fromance.rdf', 'd13-468.rdf'] | ['families.rdf.zip', 'senses.rdf.zip', 'renderings.ttl.zip', 'languoids.rdf.zip'] |
| 15082 | ['owl.owl', 'rdf-schema.rdf', 'contact.nt', '22-rdf-syntax-ns.nt'] | ['sparql-service-description', 'nif-core', 'doap', 'prov', 'vsr', 'ns', 'void', 'dcat', 'core'] |
| 15144 | ['n3.n3', 'void.ttl'] | ['turtle'] |
| 15150 | ['owl.owl', 'rdf-schema.rdf', 'contact.nt', '22-rdf-syntax-ns.nt'] | ['doap', 'prov', 'ns', 'void', 'dcat', 'core'] |
| 15205 | ['ntn-individuals.owl'] | [] |
| 15243 | ['en.rdf', 'fr.rdf'] | [] |
| 15282 | [] | ['lsoa.ttl.zip'] |
| 15333 | ['territory-environment-section.nt'] | ['society-section', 'population-section', 'economy-section', 'active-population-economic-sector-nace09-timeseries', 'vocab'] |
| 15381 | [] | ['523', 'spatial'] |
| 15414 | ['paris.nt'] | ['paris', 'dbpedia-3.6.owl.bz2'] |
| 15458 | [] | ['2ffc8ca0-4822-36b8-620b-24d356101748.html'] |
| 15632 | ['1920402.rdf'] | [] |
| 15633 | ['applications-by-account.nt'] | [] |
| 15636 | ['dmo-government-staff-and-salary-data-template-march-2011version-4.rdf', 'hmt-staffdata-mar2011.rdf', 'apa-staffdata-oct2011.rdf', '300911-dmo-organogram-ver-2.rdf', 'hmt-treasury-staff-and-salary-data-template-september-2011-for-input-final-v14.rdf', 'apa-staffdata-mar2011.rdf'] | [] |
| 15641 | [] | ['moj-data.rdf.txt'] |
| 15658 | ['ccwater-organogram-30-sept-2011.rdf'] | [] |
| 15715 | ['junior-and-senior-organogram-data-2011-03-31.rdf'] | [] |
| 15736 | ['tsol-staff-data-march-2011.rdf', 'tsol-staff-data-september-2011.rdf'] | [] |
| 15737 | ['wndc-senior-salary-and-organogram-march-2011v24.rdf', '300911-wndc-organogram-ver2.rdf'] | [] |
| 15739 | ['hornimanstaff-and-salary-data.rdf'] | [] |
| 15743 | ['wales-office-organogram.rdf'] | [] |
| 15749 | ['government-staff-and-salary-data-march-2011-gscc.rdf'] | [] |
| 15753 | ['may-2011-government-staff-and-salary-data-partnerships-for-schools-30-march-2011.rdf'] | [] |
| 15757 | ['iwm-staff-and-salary-data-sept-2011.rdf'] | [] |
| 15761 | ['government-staff-and-salary-data-ihol-march-2011-2.rdf'] | [] |
| 15762 | ['transparency-organogram.rdf'] | [] |
| 15766 | ['geffrye-museum-staff-and-salary-data-july-2011.rdf', 'geffrye-museum-staff-and-salary-data-nov-2011.rdf'] | [] |
| 15767 | ['1932550.rdf', '2035652.rdf'] | [] |
| 15771 | ['2011-03-31-nhm-organograms.rdf', '2011-09-30-nhm-organograms.rdf'] | [] |
| 15774 | ['2011-05-26-staff-posts-and-salary-data.rdf', '2011-12-05-ofqual-staff-salary-data-30092011.rdf'] | [] |
| 15779 | ['government-staff-and-salary-data-template-v2.rdf', 'copy-of-government-staff-and-salary-data-blank-template-october-2011-2.rdf'] | [] |
| 15781 | ['1881-senior-and-junior-staff-and-their-pay-grades.rdf'] | [] |
| 15786 | ['uksport-staff-and-salary-data-march2011.rdf'] | [] |
| 15788 | ['nml-organogram2010.rdf'] | ['2011-nml-organogram.rdf-service-wms-request-getcapabilities-version-1.3'] |
| 15798 | ['organisation-data-1321011754.rdf', 'government-staff-and-salary-data-blank-template-1311955022.rdf'] | [] |
| 15803 | ['dfe-20data-20september-202011.rdf'] | [] |
| 15976 | ['ingresos.jsonld', 'gastosfuncionales.jsonld', 'gastos.jsonld'] | [] |
| 15977 | ['ingresos.jsonld', 'gastosfuncionales.jsonld', 'gastos.jsonld'] | [] |
| 15978 | ['ingresos.jsonld', 'gastosfuncionales.jsonld', 'gastos.jsonld'] | [] |
| 15998 | ['rdfdatamodel-bandi-forniture-servizi.rdf', 'rdfdatamodel-altri-bandi.rdf', 'rdfdatamodel-bandi-lavori-pubblici.rdf'] | [] |
| 16001 | ['rdfschema.rdf'] | [] |
| 16002 | ['rdfdatamodel-notizia.rdf'] | [] |
| 16004 | ['rdfdatamodel-concorso.rdf'] | [] |
| 16005 | ['rdfdatamodel-incarico.rdf'] | [] |
| 16006 | ['rdfdatamodel-hotspot.rdf'] | [] |
| 16034 | ['ore-rem.ttl'] | [] |
| 16035 | ['ore-rem.ttl'] | [] |
| 16036 | ['ore-rem.ttl'] | [] |
| 16037 | ['ore-rem.ttl'] | [] |
| 16038 | ['ore-rem.ttl'] | [] |
| 16039 | ['ore-rem.ttl'] | [] |
| 16040 | ['ore-rem.ttl'] | [] |
| 16041 | ['ore-rem.ttl'] | [] |
| 16042 | ['ore-rem.ttl'] | [] |
| 16043 | ['ore-rem.ttl'] | [] |
| 16044 | ['ore-rem.ttl'] | [] |
| 16045 | ['ore-rem.ttl'] | [] |
| 16046 | ['ore-rem.ttl'] | [] |
| 16047 | ['ore-rem.ttl'] | [] |
| 16048 | ['ore-rem.ttl'] | [] |
| 16049 | ['ore-rem.ttl'] | [] |
| 16050 | ['ore-rem.ttl'] | [] |
| 16051 | ['ore-rem.ttl'] | [] |
| 16052 | ['ore-rem.ttl'] | [] |
| 16053 | ['ore-rem.ttl'] | [] |
| 16054 | ['ore-rem.ttl'] | [] |
| 16055 | ['ore-rem.ttl'] | [] |
| 16056 | ['ore-rem.ttl'] | [] |
| 16057 | ['ore-rem.ttl'] | [] |
| 16058 | ['ore-rem.ttl'] | [] |
| 16059 | ['ore-rem.ttl'] | [] |
| 16060 | ['ore-rem.ttl'] | [] |
| 16061 | ['ore-rem.ttl'] | [] |
| 16062 | ['ore-rem.ttl'] | [] |
| 16063 | ['ore-rem.ttl'] | [] |
| 16064 | ['ore-rem.ttl'] | [] |
| 16065 | ['ore-rem.ttl'] | [] |
| 16066 | ['ore-rem.ttl'] | [] |
| 16067 | ['ore-rem.ttl'] | [] |
| 16068 | ['ore-rem.ttl'] | [] |
| 16069 | ['ore-rem.ttl'] | [] |
| 16070 | ['ore-rem.ttl'] | [] |
| 16072 | ['ore-rem.ttl'] | [] |
| 16073 | ['ore-rem.ttl'] | [] |
| 16074 | ['ore-rem.ttl'] | [] |
| 16075 | ['ore-rem.ttl'] | [] |
| 16076 | ['ore-rem.ttl'] | [] |
| 16077 | ['ore-rem.ttl'] | [] |
| 17283 | ['rows.rdf'] | [] |
| 17375 | ['rows.rdf'] | [] |
| 17913 | ['rows.rdf'] | [] |
| 17925 | ['rows.rdf'] | [] |
| 19019 | ['rows.rdf'] | [] |
| 21023 | ['2016-allievi-partecipanti.nt'] | [] |
| 21468 | [] | ['proposte-qtxt-select-20-20where-20-7b-s-20-p-20-o-7d-20limit-201000'] |
| 21522 | ['rdfdatamodel-immobile.rdf'] | [] |
| 21523 | ['rdfschema.rdf'] | [] |
| 21525 | ['rdfdatamodel-incarico.rdf'] | [] |
| 21526 | ['rdfdatamodel-bandi-forniture-servizi.rdf', 'rdfdatamodel-altri-bandi.rdf', 'rdfdatamodel-bandi-lavori-pubblici.rdf'] | [] |
| 21527 | ['rdfdatamodel-concorso.rdf'] | [] |
| 21528 | ['rdfdatamodel-notizia.rdf'] | [] |
| 21529 | ['rdfdatamodel-hotspot.rdf'] | [] |
| 21532 | [] | ['jrcnames-uri.zip'] |
| 21682 | ['mf-portfif-2-th-12.rdf'] | [] |
| 21683 | ['mf-portfif-1-th-12.rdf'] | [] |
| 21685 | ['mf-regis-th-12.rdf'] | [] |
| 24154 | ['tbgy-vw5z.rdf'] | [] |
| 29412 | ['kc9i-wq85.rdf'] | [] |
| 29436 | ['cnfp-tsxc.rdf'] | [] |
| 29783 | ['qizy-d2wf.rdf'] | [] |
| 31118 | ['ukv7-jfmv.rdf'] | [] |
| 31599 | ['54yg-8jz5.rdf'] | [] |
| 38134 | ['2er7-i3zj.rdf'] | [] |
| 39483 | ['adzi-ting.rdf'] | [] |
| 39531 | ['e5vi-a3tx.rdf'] | [] |
| 39552 | ['3ukt-87ju.rdf'] | [] |
| 40758 | ['wrpp-is68.rdf'] | [] |
| 40908 | ['xfij-5ugz.rdf'] | [] |
| 41134 | ['e4mh-a2u3.rdf'] | [] |
| 41151 | ['ty3c-qr7r.rdf'] | [] |
| 41371 | ['9evq-q4ni.rdf'] | [] |
| 41387 | ['t4mm-ynkd.rdf'] | [] |
| 41391 | ['czii-xhj9.rdf'] | [] |
| 41437 | ['ifrs-uzjp.rdf'] | [] |
| 41439 | ['4icd-wtsk.rdf'] | [] |
| 41444 | ['3gk2-vh5k.rdf'] | [] |
| 41445 | ['ipzh-446j.rdf'] | [] |
| 41449 | ['482t-q4rs.rdf'] | [] |
| 41450 | ['uh35-acwz.rdf'] | [] |
| 41454 | ['42yc-giac.rdf'] | [] |
| 41455 | ['v6fm-igu4.rdf'] | [] |
| 41456 | ['crr5-2d65.rdf'] | [] |
| 41465 | ['nvy7-tkh5.rdf'] | [] |
| 41469 | ['i5wn-ri8c.rdf'] | [] |
| 41470 | ['ihqv-a9gs.rdf'] | [] |
| 41485 | ['jqvy-ujks.rdf'] | [] |
| 41501 | ['e5jd-ytjd.rdf'] | [] |
| 41503 | ['pmjs-6uqq.rdf'] | [] |
| 41504 | ['6nsk-9bvz.rdf'] | [] |
| 41506 | ['kcq6-et5q.rdf'] | [] |
| 41507 | ['umf4-5kmu.rdf'] | [] |
| 41515 | ['262v-pqa4.rdf'] | [] |
| 41516 | ['bua2-8m9z.rdf'] | [] |
| 41517 | ['euqj-acaw.rdf'] | [] |
| 41518 | ['u8mi-c6pg.rdf'] | [] |
| 41519 | ['eamz-affj.rdf'] | [] |
| 41520 | ['ckfq-pjk7.rdf'] | [] |
| 41521 | ['thk4-xhpc.rdf'] | [] |
| 41522 | ['edbj-pf5t.rdf'] | [] |
| 41529 | ['bdqp-b69h.rdf'] | [] |
| 41531 | ['7c6w-2qvk.rdf'] | [] |
| 41534 | ['pfin-hytb.rdf'] | [] |
| 41539 | ['8axi-nkrk.rdf'] | [] |
| 41543 | ['jt95-7nbp.rdf'] | [] |
| 41544 | ['ve8x-7wpr.rdf'] | [] |
| 41545 | ['icy5-rmhs.rdf'] | [] |
| 41551 | ['wv9g-2f6t.rdf'] | [] |
| 41553 | ['ebpk-uqj6.rdf'] | [] |
| 41554 | ['peqv-8v8d.rdf'] | [] |
| 41562 | ['9jqf-bv5r.rdf'] | [] |
| 41566 | ['u2pr-yjem.rdf'] | [] |
| 41567 | ['dtbb-ntrk.rdf'] | [] |
| 41575 | ['hrms-kvnm.rdf'] | [] |
| 41579 | ['ceue-hyth.rdf'] | [] |
| 41601 | ['uyqg-fzse.rdf'] | [] |
| 41621 | ['jxnv-8xjh.rdf'] | [] |
| 41623 | ['uinr-uget.rdf'] | [] |
| 41628 | ['hjfz-c27j.rdf'] | [] |
| 41629 | ['yv5n-n5p6.rdf'] | [] |
| 41633 | ['juhd-9k9e.rdf'] | [] |
| 41637 | ['8u94-yfgk.rdf'] | [] |
| 41642 | ['tpe5-6nkd.rdf'] | [] |
| 41646 | ['56wk-ihbf.rdf'] | [] |
| 41650 | ['mei3-ydbi.rdf'] | [] |
| 41653 | ['2qeb-tphe.rdf'] | [] |
| 41654 | ['knjf-uj4v.rdf'] | [] |
| 41658 | ['yrc6-b6sx.rdf'] | [] |
| 41659 | ['azh4-cire.rdf'] | [] |
| 41664 | ['w6du-72j6.rdf'] | [] |
| 41675 | ['yszw-pq5x.rdf'] | [] |
| 41683 | ['32gq-nuwm.rdf'] | [] |
| 41685 | ['hbkv-72qc.rdf'] | [] |
| 41686 | ['fhqq-2mjq.rdf'] | [] |
| 41690 | ['j5h2-6za2.rdf'] | [] |
| 41701 | ['xpng-ppj3.rdf'] | [] |
| 41702 | ['95br-iyzp.rdf'] | [] |
| 41703 | ['7xmy-88ts.rdf'] | [] |
| 41704 | ['ega3-ewab.rdf'] | [] |
| 41705 | ['ahxt-tnnb.rdf'] | [] |
| 41706 | ['gz3r-fk2y.rdf'] | [] |
| 41707 | ['6sdr-8fgz.rdf'] | [] |
| 43154 | ['3e5s-3q2t.rdf'] | [] |
| 44016 | ['6x9d-idz4.rdf'] | [] |
| 44051 | ['cf7e-dhrb.rdf'] | [] |
| 44088 | ['tzak-8e66.rdf'] | [] |
| 44849 | ['h9vb-3tcy.rdf'] | [] |
| 44859 | ['ng43-255b.rdf'] | [] |
| 44871 | ['ud3j-i2ws.rdf'] | [] |
| 44877 | ['f2q7-6uiv.rdf'] | [] |
| 45524 | ['w4sk-nq57.rdf'] | [] |
| 45526 | ['tvq9-ec9w.rdf'] | [] |
| 45529 | ['hz9m-tj6z.rdf'] | [] |
| 45553 | ['vqqm-nsqg.rdf'] | [] |
| 45604 | ['jhsu-2pka.rdf'] | [] |
| 45627 | ['5aye-4rtt.rdf'] | [] |
| 45806 | ['nen9-84ke.rdf'] | [] |
| 45815 | ['nhy6-gqam.rdf'] | [] |
| 46447 | ['xr9u-qg7n.rdf'] | [] |
| 46542 | ['ajtb-tms2.rdf'] | [] |
| 47505 | ['fjs5-35cn.rdf'] | [] |
| 47562 | ['r6vz-x6jf.rdf'] | [] |
| 51693 | ['gpn8-sbpq.rdf'] | [] |
| 51718 | ['3gkq-ags9.rdf'] | [] |
| 51786 | ['w9kt-3hk8.rdf'] | [] |
| 51895 | ['rpzm-6wvf.rdf'] | [] |
| 51932 | ['rgq9-zjqv.rdf'] | [] |
| 52511 | ['dszh-kvzr.rdf'] | [] |
| 52562 | ['wq8k-pnqg.rdf'] | [] |
| 52591 | ['dscg-f8mh.rdf'] | [] |
| 52690 | ['ncz3-ptfk.rdf'] | [] |
| 52743 | ['nwdp-pxq4.rdf'] | [] |
| 52856 | ['yay2-kfah.rdf'] | [] |
| 54958 | ['hqa9-fu65.rdf'] | [] |
| 54959 | ['26n4-t3i2.rdf'] | [] |
| 54960 | ['bxas-tthn.rdf'] | [] |
| 54961 | ['e5yz-rhka.rdf'] | [] |
| 54962 | ['qqc4-zyw5.rdf'] | [] |
| 54963 | ['ncb4-kvyy.rdf'] | [] |
| 54964 | ['2nt5-t4gz.rdf'] | [] |
| 54965 | ['jm5p-4tih.rdf'] | [] |
| 54966 | ['e46j-f6vr.rdf'] | [] |
| 54967 | ['33c2-vvwz.rdf'] | [] |
| 54968 | ['bcmx-n3c4.rdf'] | [] |
| 54969 | ['xay3-emkq.rdf'] | [] |
| 54970 | ['aa8h-5r4x.rdf'] | [] |
| 54971 | ['bn76-emh9.rdf'] | [] |
| 54972 | ['w5r2-3t5j.rdf'] | [] |
| 63637 | ['mdmf-aswt.rdf'] | [] |
| 63805 | ['giq8-gspk.rdf'] | [] |
| 63944 | ['c34u-vnew.rdf'] | [] |
| 63951 | ['d83j-k75n.rdf'] | [] |
| 63958 | ['szs6-ihk8.rdf'] | [] |
| 64157 | ['kptj-tafi.rdf'] | [] |
| 64159 | ['vkyy-2v6z.rdf'] | [] |
| 64163 | ['i85s-twae.rdf'] | [] |
| 64165 | ['vrsm-k79r.rdf'] | [] |
| 64167 | ['4f3r-bvja.rdf'] | [] |
| 64168 | ['3z7i-zwvu.rdf'] | [] |
| 64173 | ['vrkj-8w4d.rdf'] | [] |
| 64174 | ['vnqw-sx9g.rdf'] | [] |
| 64181 | ['mudx-8k94.rdf'] | [] |
| 64186 | ['gd4u-cu57.rdf'] | [] |
| 64194 | ['5t3u-serj.rdf'] | [] |
| 64200 | ['b3bu-fswq.rdf'] | [] |
| 64225 | ['bhrt-29rb.rdf'] | [] |
| 65708 | ['myju-bx58.rdf'] | [] |
| 65952 | ['snxs-8atp.rdf'] | [] |
| 66005 | ['2ynm-erms.rdf'] | [] |
| 66165 | ['khtu-ck6k.rdf'] | [] |
| 66467 | ['tb7s-6pqn.rdf'] | [] |
| 68298 | ['2qji-4zqf.rdf'] | [] |
| 68732 | ['vsbg-t3e9.rdf'] | [] |
| 73786 | ['3wtr-89us.rdf'] | [] |
| 75678 | ['j55h-3upk.rdf'] | [] |
| 76281 | ['hv9n-xgy4.rdf'] | [] |
| 77345 | ['8qjh-sbs9.rdf'] | [] |
| 77951 | ['rj4h-8pn4.rdf'] | [] |
| 80848 | ['r6sk-x3g9.rdf'] | [] |
| 88131 | ['v9dk-xgzr.rdf'] | [] |


## Analyzing unused files

In [14]:
rdf_files_unused = list()       # contains the path of unused RDF files
other_files_unused = list()     # contains the path on unused NON RDF files

In [15]:
total_used_files = 0

for dataset in datasets:
    metadata_file_path = f"{folder}/{dataset}/metadata.json"

    with open(metadata_file_path, "r") as f:
        metadata = json.load(f, strict=False)

        keys = metadata.keys()

        if "used_files" in keys and len(metadata["used_files"]) > 0:
            total_used_files += len(metadata["used_files"])

        if "unused_files" in keys and len(metadata["unused_files"]) > 0:

            for file in metadata["unused_files"]:
                file_with_path = f"{folder}/{dataset}/{file}"

                if check_if_file_name_is_rdf(file):
                    rdf_files_unused.append(file_with_path)
                else:
                    other_files_unused.append(file_with_path)

print(f"Total used files: {total_used_files}")
print(f"RDF but unusable files: {len(rdf_files_unused)}")
print(f"NON RDF unusable files: {len(other_files_unused)}")

Total used files: 27800
RDF but unusable files: 513
NON RDF unusable files: 185


### Files with RDF extension that are not used

In [16]:
rdf_but_actually_html = list()
rdf_but_actually_json = list()
rdf_but_too_large = list()
rdf_unparsable = list()

for file in rdf_files_unused:
    if os.path.isfile(file):
        if is_file_larger_than_size_limit(file):
            rdf_but_too_large.append(file)
        elif is_html(file):
            rdf_but_actually_html.append(file)
        elif is_json(file):
            rdf_but_actually_json.append(file)
        else:
            rdf_unparsable.append(file)

print(f"RDF but actually HTML: {len(rdf_but_actually_html)}")
print(f"RDF but actually JSON: {len(rdf_but_actually_json)}")
print(f"RDF but too large: {len(rdf_but_too_large)}")
print(f"RDF with syntax error: {len(rdf_unparsable)}")

assert len(rdf_files_unused) == len(rdf_but_actually_html) + len(rdf_but_actually_json) + len(rdf_but_too_large) + len(rdf_unparsable)

RDF but actually HTML: 214
RDF but actually JSON: 45
RDF but too large: 7
RDF with syntax error: 247


## Analysis of unused - NON RDF files 

In [17]:
import zipfile
import tarfile
import magic


def is_gz_file(filepath):
    with open(filepath, "rb") as f:
        return f.read(2) == b"\x1f\x8b"


def is_bz2_file(filepath):
    with open(filepath, "rb") as f:
        return f.read(3) == b"\x42\x5a\x68"


def maybe_rdf(filepath):
    with open(filepath, "r") as f:
        try:
            return "<rdf:RDF" in f.read()
        except Exception:
            return False

def maybe_ttl(filepath: str) -> bool:
    with open(filepath, "r") as f:
        try:
            return "@prefix" in f.read().lower()
        except Exception:
            return False

In [18]:
archives = list()
probably_json = list()
probably_html = list()
cast_to_rdf = list()
cast_to_ttl = list()
other = list()

for file in other_files_unused:
    if (
        tarfile.is_tarfile(file)
        or zipfile.is_zipfile(file)
        or is_gz_file(file)
        or is_bz2_file(file)
    ):
        archives.append(file)

    elif is_html(file):
        probably_html.append(file)

    elif is_json(file):
        probably_json.append(file)

    elif maybe_rdf(file):
        cast_to_rdf.append(file)

    elif maybe_ttl(file):
        cast_to_ttl.append(file)

    else:
        other.append(file)


print(f"Total number of files recognized as archives: {len(archives)}")
print(f"Total number of files that probably are HTML documents: {len(probably_html)}")
print(f"Total number of files that probably are JSON documents: {len(probably_json)}")
print(f"Total number of files that probably can be casted to RDF: {len(cast_to_rdf)}")
print(f"Total number of files that probably can be casted to TTL: {len(cast_to_ttl)}")
print(f"Total number of unknown files: {len(other)}")

assert len(other_files_unused) == len(archives) + len(probably_html) + len(probably_json) + len(cast_to_rdf) + len(cast_to_ttl) + len(other)

Total number of files recognized as archives: 27
Total number of files that probably are HTML documents: 85
Total number of files that probably are JSON documents: 1
Total number of files that probably can be casted to RDF: 47
Total number of files that probably can be casted to TTL: 21
Total number of unknown files: 4


## Final output of the analysis

### Files that are going to be deleted

Files that are HTML can be deleted

In [19]:
rdf_but_actually_html

['datasets/561/covidcountystatistics.ttl',
 'datasets/561/covidstatisticsprofile.ttl',
 'datasets/4354/rows.rdf',
 'datasets/5662/rows.rdf',
 'datasets/5663/rows.rdf',
 'datasets/5665/rows.rdf',
 'datasets/5719/rows.rdf',
 'datasets/5720/rows.rdf',
 'datasets/5721/rows.rdf',
 'datasets/5739/rows.rdf',
 'datasets/5740/rows.rdf',
 'datasets/5741/rows.rdf',
 'datasets/5742/rows.rdf',
 'datasets/5743/rows.rdf',
 'datasets/5744/rows.rdf',
 'datasets/9692/rows.rdf',
 'datasets/11003/rows.rdf',
 'datasets/11520/rows.rdf',
 'datasets/12635/foaf.rdf',
 'datasets/12704/makxdekkers.rdf',
 'datasets/12942/foaf.rdf',
 'datasets/12949/foaf.rdf',
 'datasets/13004/foaf.rdf',
 'datasets/13148/foaf.rdf',
 'datasets/13290/006893251.ttl',
 'datasets/13290/006893251.rdf',
 'datasets/13291/territory-environment-section.nt',
 'datasets/13357/pml-justification.owl',
 'datasets/13357/event.owl',
 'datasets/13400/laser-printer.ttl',
 'datasets/13581/caster-complessi.rdf',
 'datasets/13581/titolariomaic.rdf',
 '

In [20]:
probably_html

['datasets/13254/databnf-all-rdf-xml.tar.gz-user-databnf-password-databnf',
 'datasets/13290/006893251.json',
 'datasets/13291/society-section',
 'datasets/13291/population-section',
 'datasets/13291/economy-section',
 'datasets/13291/active-population-economic-sector-nace09-timeseries',
 'datasets/13351/cc195814-83a4-386f-509a-37e2f35a204b.html',
 'datasets/13356/523',
 'datasets/13356/spatial',
 'datasets/13357/data-view',
 'datasets/13357/prov',
 'datasets/13357/tvcg.2010.181',
 'datasets/13357/void',
 'datasets/13357/dcat',
 'datasets/13369/linux',
 'datasets/13384/4bf87ee3-8f52-94bb-2398-44e7af6980e4.html',
 'datasets/13388/paris',
 'datasets/13394/5f51f368-635f-adb1-5968-f8a8d8471070.html',
 'datasets/13412/sparql-service-description',
 'datasets/13412/prov',
 'datasets/13412/void',
 'datasets/13412/dcat',
 'datasets/13421/cb6ceeb7-47af-f243-3e45-3c0a916f6c17.html',
 'datasets/13452/7d0e4f43-405b-760e-3f62-88874c758714.html',
 'datasets/13568/1745',
 'datasets/13581/cd1700000001'

Files that are JSON can be deleted

In [21]:
rdf_but_actually_json

['datasets/569/rows.rdf',
 'datasets/573/rows.rdf',
 'datasets/574/rows.rdf',
 'datasets/576/rows.rdf',
 'datasets/588/rows.rdf',
 'datasets/599/rows.rdf',
 'datasets/615/rows.rdf',
 'datasets/618/rows.rdf',
 'datasets/622/rows.rdf',
 'datasets/623/rows.rdf',
 'datasets/633/rows.rdf',
 'datasets/635/rows.rdf',
 'datasets/636/rows.rdf',
 'datasets/644/rows.rdf',
 'datasets/659/rows.rdf',
 'datasets/684/rows.rdf',
 'datasets/687/rows.rdf',
 'datasets/725/rows.rdf',
 'datasets/726/rows.rdf',
 'datasets/743/rows.rdf',
 'datasets/748/rows.rdf',
 'datasets/753/rows.rdf',
 'datasets/764/rows.rdf',
 'datasets/769/rows.rdf',
 'datasets/772/rows.rdf',
 'datasets/773/rows.rdf',
 'datasets/780/rows.rdf',
 'datasets/784/rows.rdf',
 'datasets/786/rows.rdf',
 'datasets/794/rows.rdf',
 'datasets/807/rows.rdf',
 'datasets/811/rows.rdf',
 'datasets/818/rows.rdf',
 'datasets/830/rows.rdf',
 'datasets/831/rows.rdf',
 'datasets/837/rows.rdf',
 'datasets/849/rows.rdf',
 'datasets/858/rows.rdf',
 'datasets/8

In [22]:
probably_json

['datasets/14418/gtaa']

### Files that needs to be processed

Files that probably are RDF but needs to be renamed

In [23]:
cast_to_rdf

['datasets/13254/rdf.xml',
 'datasets/13284/deck',
 'datasets/13288/54b30',
 'datasets/13291/vocab',
 'datasets/13334/s2377506.xml',
 'datasets/13357/wgs84-pos',
 'datasets/13357/doap',
 'datasets/13357/vsr',
 'datasets/13357/ns',
 'datasets/13357/core',
 'datasets/13412/doap',
 'datasets/13412/vsr',
 'datasets/13412/ns',
 'datasets/13412/core',
 'datasets/13577/psclemon',
 'datasets/13759/doap',
 'datasets/13759/ns',
 'datasets/13759/core',
 'datasets/13796/doap',
 'datasets/13796/ns',
 'datasets/13796/core',
 'datasets/13997/bulmo-0007-473x-2000-num-158-2-2373',
 'datasets/13997/196565-person',
 'datasets/13997/ahess-0395-2649',
 'datasets/14054/rdf.xml',
 'datasets/14079/wheat',
 'datasets/14417/deck',
 'datasets/14580/54b30',
 'datasets/14583/s2377506.xml',
 'datasets/14603/wgs84-pos',
 'datasets/14603/doap',
 'datasets/14603/vsr',
 'datasets/14603/ns',
 'datasets/14603/core',
 'datasets/14806/doap',
 'datasets/14806/ns',
 'datasets/14806/core',
 'datasets/14921/psclemon',
 'datase

Files that probably are TTL but needs to be renamed

In [24]:
cast_to_ttl

['datasets/13252/affymetrix-185061-at',
 'datasets/13279/hgnc-7',
 'datasets/13299/homologene-1000',
 'datasets/13357/cube',
 'datasets/13412/nif-core',
 'datasets/13471/omim-603903',
 'datasets/13482/sgd-s000006169',
 'datasets/13600/drugbank-db00001',
 'datasets/13715/gb',
 'datasets/14387/hgnc-7',
 'datasets/14396/omim-603903',
 'datasets/14411/affymetrix-185061-at',
 'datasets/14430/gb',
 'datasets/14472/ndc-49288-0001-272c9910-2160-4d41-afbd-faab3055ba1d',
 'datasets/14473/mesh-d018377',
 'datasets/14519/go-0006915',
 'datasets/14520/drugbank-db00001',
 'datasets/14558/taxonomy-9606',
 'datasets/14603/cube',
 'datasets/15082/nif-core',
 'datasets/15144/turtle']

Files that are archives and needs to be extracted

In [25]:
archives

['datasets/13263/govwild-rdf-2012-01-30.zip',
 'datasets/13283/txn-ocs.ttl.gz',
 'datasets/13283/txn-distribution.ttl.gz',
 'datasets/13283/txn-images.ttl.gz',
 'datasets/13283/txn-misc.ttl.gz',
 'datasets/13347/geospecies.rdf.gz',
 'datasets/13368/all-geonames-rdf.zip',
 'datasets/13369/dbpedia-3.6.owl.bz2',
 'datasets/13378/wn20full.zip',
 'datasets/13388/dbpedia-3.6.owl.bz2',
 'datasets/13461/dbpedia-3.5.1.owl.bz2',
 'datasets/13822/lsoa.ttl.zip',
 'datasets/13997/persee-person-align-rdf.tar.gz',
 'datasets/14079/eat.nt.gz',
 'datasets/14252/wn20full.zip',
 'datasets/14277/txn-ocs.ttl.gz',
 'datasets/14277/txn-distribution.ttl.gz',
 'datasets/14277/txn-images.ttl.gz',
 'datasets/14277/txn-misc.ttl.gz',
 'datasets/14324/geospecies.rdf.gz',
 'datasets/14336/dbpedia-3.6.owl.bz2',
 'datasets/14344/all-geonames-rdf.zip',
 'datasets/14364/govwild-rdf-2012-01-30.zip',
 'datasets/14801/usage.nt.gz',
 'datasets/15282/lsoa.ttl.zip',
 'datasets/15414/dbpedia-3.6.owl.bz2',
 'datasets/21532/jrcn

Files that needs to be manually processed because they are too big to be processed by RDFLib

In [26]:
rdf_but_too_large

['datasets/2/ppg-sf-dump.rdf',
 'datasets/6/ppg-sf-dump.rdf',
 'datasets/11580/rows.rdf',
 'datasets/13565/download-20120123.rdf',
 'datasets/15243/en.rdf',
 'datasets/15243/fr.rdf',
 'datasets/21023/2016-allievi-partecipanti.nt']

Files that needs to be processed manually to assign them an extension

In [27]:
for file in other:
    print(f"{file:<50} | {magic.from_file(file)}")

datasets/13506/biomodels-biomd0000000048           | ASCII text
datasets/13522/irefindex.irogid-1069566            | ASCII text
datasets/14517/irefindex.irogid-1069566            | ASCII text
datasets/14742/sample-20160515.rdf.xml             | UTF-8 Unicode text, with very long lines


RDF files that needs to be processed as plain text files

In [28]:
rdf_unparsable

['datasets/2210/rows.rdf',
 'datasets/2556/rows.rdf',
 'datasets/5364/rows.rdf',
 'datasets/5451/rows.rdf',
 'datasets/6046/rows.rdf',
 'datasets/8112/rows.rdf',
 'datasets/9298/rows.rdf',
 'datasets/9461/rows.rdf',
 'datasets/10144/rows.rdf',
 'datasets/10145/rows.rdf',
 'datasets/10235/rows.rdf',
 'datasets/10251/rows.rdf',
 'datasets/10284/rows.rdf',
 'datasets/10658/rows.rdf',
 'datasets/10703/rows.rdf',
 'datasets/10825/rows.rdf',
 'datasets/10936/rows.rdf',
 'datasets/10959/rows.rdf',
 'datasets/11022/rows.rdf',
 'datasets/11024/rows.rdf',
 'datasets/11035/rows.rdf',
 'datasets/11099/rows.rdf',
 'datasets/11167/rows.rdf',
 'datasets/11191/rows.rdf',
 'datasets/11238/rows.rdf',
 'datasets/11247/rows.rdf',
 'datasets/11263/rows.rdf',
 'datasets/11285/rows.rdf',
 'datasets/11322/rows.rdf',
 'datasets/11360/rows.rdf',
 'datasets/11401/rows.rdf',
 'datasets/11511/rows.rdf',
 'datasets/11611/rows.rdf',
 'datasets/11625/rows.rdf',
 'datasets/11631/rows.rdf',
 'datasets/11655/rows.rdf',
