# Semi-automatic modifications to the collection

### Utility functions

In [7]:
import os

SIZE_LIMIT = 200 * 1024 * 1024  # 200 MB


def is_file_larger_than_size_limit(filepath: str) -> bool:
    size = os.path.getsize(str(filepath))
    return int(size) >= int(SIZE_LIMIT)


def read_head(filepath: str) -> str:
    head = os.popen(f"head {filepath}").read()
    return head

In [8]:
RDF_SUFFIXES = ["rdf", "ttl", "owl", "n3", "nt", "jsonld", "nq", "trig", "trix"]


def check_if_file_name_is_rdf(name: str) -> bool:
    return name.split(".")[-1] in RDF_SUFFIXES

In [9]:
import time
from pathlib import Path


def delete_file(file_path: str):
    if os.path.isfile(file_path):
        print(f"Deleting {file_path}")
        os.remove(file_path)


def change_extension(filepath: str, new_extension: str):
    suffix = Path(filepath).suffix

    new_name = None

    if len(suffix) > 0:
        new_name = filepath.replace(suffix, f".{new_extension}")
    else:
        new_name = f"{filepath}.{new_extension}"

    if os.path.isfile(new_name):
        time_str = time.strftime("%Y_%m_%d-%I_%M_%S")
        new_name = f"{time_str}-{new_name}"

    print(f"Renaming {filepath} to {new_name}")
    os.rename(filepath, new_name)

In [10]:
def diff_file(file_path_1: str, file_path_2: str) -> str:
    return os.popen(f"diff {file_path_1} {file_path_2}").read()

In [11]:
def diff_dataset(dataset_path_1: str, dataset_path_2: str) -> str:
    return os.popen(f"diff -qr {dataset_path_1} {dataset_path_2}").read()

## Delete HTML files

In [12]:
rdf_but_actually_html = [
    "datasets/561/covidcountystatistics.ttl",
    "datasets/561/covidstatisticsprofile.ttl",
    "datasets/4354/rows.rdf",
    "datasets/5662/rows.rdf",
    "datasets/5663/rows.rdf",
    "datasets/5665/rows.rdf",
    "datasets/5719/rows.rdf",
    "datasets/5720/rows.rdf",
    "datasets/5721/rows.rdf",
    "datasets/5739/rows.rdf",
    "datasets/5740/rows.rdf",
    "datasets/5741/rows.rdf",
    "datasets/5742/rows.rdf",
    "datasets/5743/rows.rdf",
    "datasets/5744/rows.rdf",
    "datasets/9692/rows.rdf",
    "datasets/11003/rows.rdf",
    "datasets/11520/rows.rdf",
    "datasets/12635/foaf.rdf",
    "datasets/12704/makxdekkers.rdf",
    "datasets/12942/foaf.rdf",
    "datasets/12949/foaf.rdf",
    "datasets/13004/foaf.rdf",
    "datasets/13148/foaf.rdf",
    "datasets/13290/006893251.ttl",
    "datasets/13290/006893251.rdf",
    "datasets/13291/territory-environment-section.nt",
    "datasets/13357/pml-justification.owl",
    "datasets/13357/event.owl",
    "datasets/13400/laser-printer.ttl",
    "datasets/13581/caster-complessi.rdf",
    "datasets/13581/titolariomaic.rdf",
    "datasets/13581/entimaic.rdf",
    "datasets/13581/commissioniparlamentariinchiesta.rdf",
    "datasets/13581/uodmaic.rdf",
    "datasets/13581/luoghiacs.rdf",
    "datasets/13581/assembleacostituente.rdf",
    "datasets/13581/personemaic.rdf",
    "datasets/13581/caster-soggettiproduttori.rdf",
    "datasets/13581/caster-soggetticonservatori.rdf",
    "datasets/13590/l99.rdf",
    "datasets/13590/s40.rdf",
    "datasets/13590/asjp.owl",
    "datasets/13590/r1234.rdf",
    "datasets/13590/fromance.rdf",
    "datasets/13590/d13-468.rdf",
    "datasets/13723/00100204.ttl",
    "datasets/13845/ntn-individuals.owl",
    "datasets/13950/cell2159549-5275-262-1.rdf",
    "datasets/14013/eventkg-r2.ttl",
    "datasets/14160/fulldump.nq",
    "datasets/14196/fro-usc-title-15-chapter-2d-s1-5.ttl",
    "datasets/14196/fro-banking.ttl",
    "datasets/14382/laser-printer.ttl",
    "datasets/14469/00100204.ttl",
    "datasets/14493/caster-complessi.rdf",
    "datasets/14493/titolariomaic.rdf",
    "datasets/14493/entimaic.rdf",
    "datasets/14493/commissioniparlamentariinchiesta.rdf",
    "datasets/14493/uodmaic.rdf",
    "datasets/14493/luoghiacs.rdf",
    "datasets/14493/assembleacostituente.rdf",
    "datasets/14493/personemaic.rdf",
    "datasets/14493/caster-soggettiproduttori.rdf",
    "datasets/14493/caster-soggetticonservatori.rdf",
    "datasets/14603/pml-justification.owl",
    "datasets/14603/event.owl",
    "datasets/14636/492dbb62-b569-f6cb-5822-545c474bc3db.rdf",
    "datasets/14735/dump.nt",
    "datasets/14735/terms.rdf",
    "datasets/15056/l99.rdf",
    "datasets/15056/s40.rdf",
    "datasets/15056/asjp.owl",
    "datasets/15056/r1234.rdf",
    "datasets/15056/fromance.rdf",
    "datasets/15056/d13-468.rdf",
    "datasets/15144/n3.n3",
    "datasets/15205/ntn-individuals.owl",
    "datasets/15333/territory-environment-section.nt",
    "datasets/15632/1920402.rdf",
    "datasets/15636/dmo-government-staff-and-salary-data-template-march-2011version-4.rdf",
    "datasets/15636/hmt-staffdata-mar2011.rdf",
    "datasets/15636/apa-staffdata-oct2011.rdf",
    "datasets/15636/300911-dmo-organogram-ver-2.rdf",
    "datasets/15636/hmt-treasury-staff-and-salary-data-template-september-2011-for-input-final-v14.rdf",
    "datasets/15636/apa-staffdata-mar2011.rdf",
    "datasets/15715/junior-and-senior-organogram-data-2011-03-31.rdf",
    "datasets/15736/tsol-staff-data-march-2011.rdf",
    "datasets/15736/tsol-staff-data-september-2011.rdf",
    "datasets/15737/wndc-senior-salary-and-organogram-march-2011v24.rdf",
    "datasets/15737/300911-wndc-organogram-ver2.rdf",
    "datasets/15739/hornimanstaff-and-salary-data.rdf",
    "datasets/15743/wales-office-organogram.rdf",
    "datasets/15749/government-staff-and-salary-data-march-2011-gscc.rdf",
    "datasets/15753/may-2011-government-staff-and-salary-data-partnerships-for-schools-30-march-2011.rdf",
    "datasets/15757/iwm-staff-and-salary-data-sept-2011.rdf",
    "datasets/15761/government-staff-and-salary-data-ihol-march-2011-2.rdf",
    "datasets/15762/transparency-organogram.rdf",
    "datasets/15766/geffrye-museum-staff-and-salary-data-july-2011.rdf",
    "datasets/15766/geffrye-museum-staff-and-salary-data-nov-2011.rdf",
    "datasets/15767/1932550.rdf",
    "datasets/15767/2035652.rdf",
    "datasets/15771/2011-03-31-nhm-organograms.rdf",
    "datasets/15771/2011-09-30-nhm-organograms.rdf",
    "datasets/15774/2011-05-26-staff-posts-and-salary-data.rdf",
    "datasets/15774/2011-12-05-ofqual-staff-salary-data-30092011.rdf",
    "datasets/15779/government-staff-and-salary-data-template-v2.rdf",
    "datasets/15779/copy-of-government-staff-and-salary-data-blank-template-october-2011-2.rdf",
    "datasets/15781/1881-senior-and-junior-staff-and-their-pay-grades.rdf",
    "datasets/15786/uksport-staff-and-salary-data-march2011.rdf",
    "datasets/15788/nml-organogram2010.rdf",
    "datasets/15798/organisation-data-1321011754.rdf",
    "datasets/15798/government-staff-and-salary-data-blank-template-1311955022.rdf",
    "datasets/15998/rdfdatamodel-bandi-forniture-servizi.rdf",
    "datasets/15998/rdfdatamodel-altri-bandi.rdf",
    "datasets/15998/rdfdatamodel-bandi-lavori-pubblici.rdf",
    "datasets/16001/rdfschema.rdf",
    "datasets/16002/rdfdatamodel-notizia.rdf",
    "datasets/16004/rdfdatamodel-concorso.rdf",
    "datasets/16005/rdfdatamodel-incarico.rdf",
    "datasets/16006/rdfdatamodel-hotspot.rdf",
    "datasets/21522/rdfdatamodel-immobile.rdf",
    "datasets/21523/rdfschema.rdf",
    "datasets/21525/rdfdatamodel-incarico.rdf",
    "datasets/21526/rdfdatamodel-bandi-forniture-servizi.rdf",
    "datasets/21526/rdfdatamodel-altri-bandi.rdf",
    "datasets/21526/rdfdatamodel-bandi-lavori-pubblici.rdf",
    "datasets/21527/rdfdatamodel-concorso.rdf",
    "datasets/21528/rdfdatamodel-notizia.rdf",
    "datasets/21529/rdfdatamodel-hotspot.rdf",
    "datasets/41371/9evq-q4ni.rdf",
    "datasets/41387/t4mm-ynkd.rdf",
    "datasets/41391/czii-xhj9.rdf",
    "datasets/41437/ifrs-uzjp.rdf",
    "datasets/41439/4icd-wtsk.rdf",
    "datasets/41444/3gk2-vh5k.rdf",
    "datasets/41445/ipzh-446j.rdf",
    "datasets/41449/482t-q4rs.rdf",
    "datasets/41450/uh35-acwz.rdf",
    "datasets/41454/42yc-giac.rdf",
    "datasets/41455/v6fm-igu4.rdf",
    "datasets/41456/crr5-2d65.rdf",
    "datasets/41465/nvy7-tkh5.rdf",
    "datasets/41469/i5wn-ri8c.rdf",
    "datasets/41470/ihqv-a9gs.rdf",
    "datasets/41485/jqvy-ujks.rdf",
    "datasets/41501/e5jd-ytjd.rdf",
    "datasets/41503/pmjs-6uqq.rdf",
    "datasets/41504/6nsk-9bvz.rdf",
    "datasets/41506/kcq6-et5q.rdf",
    "datasets/41507/umf4-5kmu.rdf",
    "datasets/41515/262v-pqa4.rdf",
    "datasets/41516/bua2-8m9z.rdf",
    "datasets/41517/euqj-acaw.rdf",
    "datasets/41518/u8mi-c6pg.rdf",
    "datasets/41519/eamz-affj.rdf",
    "datasets/41520/ckfq-pjk7.rdf",
    "datasets/41521/thk4-xhpc.rdf",
    "datasets/41522/edbj-pf5t.rdf",
    "datasets/41529/bdqp-b69h.rdf",
    "datasets/41531/7c6w-2qvk.rdf",
    "datasets/41534/pfin-hytb.rdf",
    "datasets/41539/8axi-nkrk.rdf",
    "datasets/41543/jt95-7nbp.rdf",
    "datasets/41544/ve8x-7wpr.rdf",
    "datasets/41545/icy5-rmhs.rdf",
    "datasets/41551/wv9g-2f6t.rdf",
    "datasets/41553/ebpk-uqj6.rdf",
    "datasets/41554/peqv-8v8d.rdf",
    "datasets/41562/9jqf-bv5r.rdf",
    "datasets/41566/u2pr-yjem.rdf",
    "datasets/41567/dtbb-ntrk.rdf",
    "datasets/41575/hrms-kvnm.rdf",
    "datasets/41579/ceue-hyth.rdf",
    "datasets/41601/uyqg-fzse.rdf",
    "datasets/41621/jxnv-8xjh.rdf",
    "datasets/41623/uinr-uget.rdf",
    "datasets/41628/hjfz-c27j.rdf",
    "datasets/41629/yv5n-n5p6.rdf",
    "datasets/41633/juhd-9k9e.rdf",
    "datasets/41637/8u94-yfgk.rdf",
    "datasets/41642/tpe5-6nkd.rdf",
    "datasets/41646/56wk-ihbf.rdf",
    "datasets/41650/mei3-ydbi.rdf",
    "datasets/41653/2qeb-tphe.rdf",
    "datasets/41654/knjf-uj4v.rdf",
    "datasets/41658/yrc6-b6sx.rdf",
    "datasets/41659/azh4-cire.rdf",
    "datasets/41664/w6du-72j6.rdf",
    "datasets/41675/yszw-pq5x.rdf",
    "datasets/41683/32gq-nuwm.rdf",
    "datasets/41685/hbkv-72qc.rdf",
    "datasets/41686/fhqq-2mjq.rdf",
    "datasets/41690/j5h2-6za2.rdf",
    "datasets/41701/xpng-ppj3.rdf",
    "datasets/41702/95br-iyzp.rdf",
    "datasets/41703/7xmy-88ts.rdf",
    "datasets/41704/ega3-ewab.rdf",
    "datasets/41705/ahxt-tnnb.rdf",
    "datasets/41706/gz3r-fk2y.rdf",
    "datasets/41707/6sdr-8fgz.rdf",
    "datasets/64157/kptj-tafi.rdf",
    "datasets/64159/vkyy-2v6z.rdf",
    "datasets/64163/i85s-twae.rdf",
    "datasets/64165/vrsm-k79r.rdf",
    "datasets/64167/4f3r-bvja.rdf",
    "datasets/64168/3z7i-zwvu.rdf",
    "datasets/64173/vrkj-8w4d.rdf",
    "datasets/64174/vnqw-sx9g.rdf",
    "datasets/64181/mudx-8k94.rdf",
    "datasets/64186/gd4u-cu57.rdf",
    "datasets/64194/5t3u-serj.rdf",
    "datasets/64200/b3bu-fswq.rdf",
    "datasets/77951/rj4h-8pn4.rdf",
]

In [13]:
for f in rdf_but_actually_html:
    delete_file(f)

Deleting datasets/561/covidcountystatistics.ttl
Deleting datasets/561/covidstatisticsprofile.ttl
Deleting datasets/4354/rows.rdf
Deleting datasets/5662/rows.rdf
Deleting datasets/5663/rows.rdf
Deleting datasets/5665/rows.rdf
Deleting datasets/5719/rows.rdf
Deleting datasets/5720/rows.rdf
Deleting datasets/5721/rows.rdf
Deleting datasets/5739/rows.rdf
Deleting datasets/5740/rows.rdf
Deleting datasets/5741/rows.rdf
Deleting datasets/5742/rows.rdf
Deleting datasets/5743/rows.rdf
Deleting datasets/5744/rows.rdf
Deleting datasets/9692/rows.rdf
Deleting datasets/11003/rows.rdf
Deleting datasets/11520/rows.rdf
Deleting datasets/12635/foaf.rdf
Deleting datasets/12704/makxdekkers.rdf
Deleting datasets/12942/foaf.rdf
Deleting datasets/12949/foaf.rdf
Deleting datasets/13004/foaf.rdf
Deleting datasets/13148/foaf.rdf
Deleting datasets/13290/006893251.ttl
Deleting datasets/13290/006893251.rdf
Deleting datasets/13291/territory-environment-section.nt
Deleting datasets/13357/pml-justification.owl
Delet

In [14]:
unparsed_html = [
    "datasets/13254/databnf-all-rdf-xml.tar.gz-user-databnf-password-databnf",
    "datasets/13290/006893251.json",
    "datasets/13291/society-section",
    "datasets/13291/population-section",
    "datasets/13291/economy-section",
    "datasets/13291/active-population-economic-sector-nace09-timeseries",
    "datasets/13351/cc195814-83a4-386f-509a-37e2f35a204b.html",
    "datasets/13356/523",
    "datasets/13356/spatial",
    "datasets/13357/data-view",
    "datasets/13357/prov",
    "datasets/13357/tvcg.2010.181",
    "datasets/13357/void",
    "datasets/13357/dcat",
    "datasets/13369/linux",
    "datasets/13384/4bf87ee3-8f52-94bb-2398-44e7af6980e4.html",
    "datasets/13388/paris",
    "datasets/13394/5f51f368-635f-adb1-5968-f8a8d8471070.html",
    "datasets/13412/sparql-service-description",
    "datasets/13412/prov",
    "datasets/13412/void",
    "datasets/13412/dcat",
    "datasets/13421/cb6ceeb7-47af-f243-3e45-3c0a916f6c17.html",
    "datasets/13452/7d0e4f43-405b-760e-3f62-88874c758714.html",
    "datasets/13568/1745",
    "datasets/13581/cd1700000001",
    "datasets/13582/p305757",
    "datasets/13590/families.rdf.zip",
    "datasets/13590/senses.rdf.zip",
    "datasets/13590/renderings.ttl.zip",
    "datasets/13590/languoids.rdf.zip",
    "datasets/13594/aanmonstering-del-gem-1890-35",
    "datasets/13623/cc195814-83a4-386f-509a-37e2f35a204b.html",
    "datasets/13759/prov",
    "datasets/13759/void",
    "datasets/13759/dcat",
    "datasets/13796/sparql-service-description",
    "datasets/13796/prov",
    "datasets/13796/void",
    "datasets/13796/dcat",
    "datasets/13990/examples",
    "datasets/14336/linux",
    "datasets/14451/nuts1",
    "datasets/14451/lv",
    "datasets/14486/aanmonstering-del-gem-1890-35",
    "datasets/14491/p305757",
    "datasets/14493/cd1700000001",
    "datasets/14587/4bf87ee3-8f52-94bb-2398-44e7af6980e4.html",
    "datasets/14599/5f51f368-635f-adb1-5968-f8a8d8471070.html",
    "datasets/14603/data-view",
    "datasets/14603/prov",
    "datasets/14603/tvcg.2010.181",
    "datasets/14603/void",
    "datasets/14603/dcat",
    "datasets/14604/e199bcf2-bf94-6b87-81a0-4dacaec11f46.html",
    "datasets/14636/492dbb62-b569-f6cb-5822-545c474bc3db.html",
    "datasets/14637/7d0e4f43-405b-760e-3f62-88874c758714.html",
    "datasets/14735/data.nobelprize.org",
    "datasets/14735/1",
    "datasets/14806/sparql-service-description",
    "datasets/14806/prov",
    "datasets/14806/void",
    "datasets/14806/dcat",
    "datasets/15006/138269.3",
    "datasets/15056/families.rdf.zip",
    "datasets/15056/senses.rdf.zip",
    "datasets/15056/renderings.ttl.zip",
    "datasets/15056/languoids.rdf.zip",
    "datasets/15082/sparql-service-description",
    "datasets/15082/prov",
    "datasets/15082/void",
    "datasets/15082/dcat",
    "datasets/15150/prov",
    "datasets/15150/void",
    "datasets/15150/dcat",
    "datasets/15333/society-section",
    "datasets/15333/population-section",
    "datasets/15333/economy-section",
    "datasets/15333/active-population-economic-sector-nace09-timeseries",
    "datasets/15381/523",
    "datasets/15381/spatial",
    "datasets/15414/paris",
    "datasets/15458/2ffc8ca0-4822-36b8-620b-24d356101748.html",
    "datasets/15788/2011-nml-organogram.rdf-service-wms-request-getcapabilities-version-1.3",
    "datasets/21468/proposte-qtxt-select-20-20where-20-7b-s-20-p-20-o-7d-20limit-201000",
]

In [15]:
for f in unparsed_html:
    delete_file(f)

Deleting datasets/13254/databnf-all-rdf-xml.tar.gz-user-databnf-password-databnf
Deleting datasets/13290/006893251.json
Deleting datasets/13291/society-section
Deleting datasets/13291/population-section
Deleting datasets/13291/economy-section
Deleting datasets/13291/active-population-economic-sector-nace09-timeseries
Deleting datasets/13351/cc195814-83a4-386f-509a-37e2f35a204b.html
Deleting datasets/13356/523
Deleting datasets/13356/spatial
Deleting datasets/13357/data-view
Deleting datasets/13357/prov
Deleting datasets/13357/tvcg.2010.181
Deleting datasets/13357/void
Deleting datasets/13357/dcat
Deleting datasets/13369/linux
Deleting datasets/13384/4bf87ee3-8f52-94bb-2398-44e7af6980e4.html
Deleting datasets/13388/paris
Deleting datasets/13394/5f51f368-635f-adb1-5968-f8a8d8471070.html
Deleting datasets/13412/sparql-service-description
Deleting datasets/13412/prov
Deleting datasets/13412/void
Deleting datasets/13412/dcat
Deleting datasets/13421/cb6ceeb7-47af-f243-3e45-3c0a916f6c17.html


## Delete JSON files

In [16]:
rdf_but_actually_json = [
    "datasets/569/rows.rdf",
    "datasets/573/rows.rdf",
    "datasets/574/rows.rdf",
    "datasets/576/rows.rdf",
    "datasets/588/rows.rdf",
    "datasets/599/rows.rdf",
    "datasets/615/rows.rdf",
    "datasets/618/rows.rdf",
    "datasets/622/rows.rdf",
    "datasets/623/rows.rdf",
    "datasets/633/rows.rdf",
    "datasets/635/rows.rdf",
    "datasets/636/rows.rdf",
    "datasets/644/rows.rdf",
    "datasets/659/rows.rdf",
    "datasets/684/rows.rdf",
    "datasets/687/rows.rdf",
    "datasets/725/rows.rdf",
    "datasets/726/rows.rdf",
    "datasets/743/rows.rdf",
    "datasets/748/rows.rdf",
    "datasets/753/rows.rdf",
    "datasets/764/rows.rdf",
    "datasets/769/rows.rdf",
    "datasets/772/rows.rdf",
    "datasets/773/rows.rdf",
    "datasets/780/rows.rdf",
    "datasets/784/rows.rdf",
    "datasets/786/rows.rdf",
    "datasets/794/rows.rdf",
    "datasets/807/rows.rdf",
    "datasets/811/rows.rdf",
    "datasets/818/rows.rdf",
    "datasets/830/rows.rdf",
    "datasets/831/rows.rdf",
    "datasets/837/rows.rdf",
    "datasets/849/rows.rdf",
    "datasets/858/rows.rdf",
    "datasets/859/rows.rdf",
    "datasets/861/rows.rdf",
    "datasets/880/rows.rdf",
    "datasets/888/rows.rdf",
    "datasets/894/rows.rdf",
    "datasets/895/rows.rdf",
    "datasets/14418/173390.rdf",
]

In [17]:
for f in rdf_but_actually_json:
    delete_file(f)

Deleting datasets/569/rows.rdf
Deleting datasets/573/rows.rdf
Deleting datasets/574/rows.rdf
Deleting datasets/576/rows.rdf
Deleting datasets/588/rows.rdf
Deleting datasets/599/rows.rdf
Deleting datasets/615/rows.rdf
Deleting datasets/618/rows.rdf
Deleting datasets/622/rows.rdf
Deleting datasets/623/rows.rdf
Deleting datasets/633/rows.rdf
Deleting datasets/635/rows.rdf
Deleting datasets/636/rows.rdf
Deleting datasets/644/rows.rdf
Deleting datasets/659/rows.rdf
Deleting datasets/684/rows.rdf
Deleting datasets/687/rows.rdf
Deleting datasets/725/rows.rdf
Deleting datasets/726/rows.rdf
Deleting datasets/743/rows.rdf
Deleting datasets/748/rows.rdf
Deleting datasets/753/rows.rdf
Deleting datasets/764/rows.rdf
Deleting datasets/769/rows.rdf
Deleting datasets/772/rows.rdf
Deleting datasets/773/rows.rdf
Deleting datasets/780/rows.rdf
Deleting datasets/784/rows.rdf
Deleting datasets/786/rows.rdf
Deleting datasets/794/rows.rdf
Deleting datasets/807/rows.rdf
Deleting datasets/811/rows.rdf
Deleting

In [18]:
unparsed_json = ["datasets/14418/gtaa"]

In [19]:
for f in unparsed_json:
    delete_file(f)

Deleting datasets/14418/gtaa


## Cast from UTF-8 BOM to UTF-8

UTF-8 BOM is a sequence of bytes (0xEF, 0xBB, 0xBF) that is sometimes added at the beginning of a text file to indicate that the file is encoded in UTF-8. The BOM serves as a signature or marker to indicate the encoding. Its primary purpose is to help applications distinguish between different Unicode encodings. When a BOM is present, it indicates that the file is UTF-8 encoded.

In [20]:
def is_utf8_bom(file_path: str) -> bool:
    with open(file_path, "rb") as file:
        # Read the first three bytes of the file
        bom_bytes = file.read(3)
        return bom_bytes == b"\xEF\xBB\xBF"  # Check if the bytes match the UTF-8 BOM


def remove_utf8_bom(file_path: str):
    content = None

    # Read the file contents
    with open(file_path, "r", encoding="utf-8-sig") as file:
        content = file.read()

    # Write the contents back to the file without BOM
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(content)

In [21]:
dataset_folder = "datasets"
datasets = sorted(os.listdir(dataset_folder), key=lambda i: int(i))

for dataset in datasets:
    folder_path = f"{dataset_folder}/{dataset}"
    files = os.listdir(folder_path)
    for f in files:
        file_path = f"{folder_path}/{f}"
        if(is_utf8_bom(file_path)):
            print(f"REMOVING UTF-8 BOM from {file_path}")
            remove_utf8_bom(file_path)

REMOVING UTF-8 BOM from datasets/12513/cst20201211.rdf
REMOVING UTF-8 BOM from datasets/12513/tsb20201211.rdf
REMOVING UTF-8 BOM from datasets/12670/foaf.rdf
REMOVING UTF-8 BOM from datasets/14434/void.ttl
REMOVING UTF-8 BOM from datasets/14822/nlr771515.rdf
REMOVING UTF-8 BOM from datasets/14822/nlr66147783.rdf
REMOVING UTF-8 BOM from datasets/14822/title5.rdf
REMOVING UTF-8 BOM from datasets/15976/ingresos.jsonld
REMOVING UTF-8 BOM from datasets/15976/gastos.jsonld
REMOVING UTF-8 BOM from datasets/15976/gastosfuncionales.jsonld
REMOVING UTF-8 BOM from datasets/15977/ingresos.jsonld
REMOVING UTF-8 BOM from datasets/15977/gastos.jsonld
REMOVING UTF-8 BOM from datasets/15977/gastosfuncionales.jsonld
REMOVING UTF-8 BOM from datasets/15978/ingresos.jsonld
REMOVING UTF-8 BOM from datasets/15978/gastos.jsonld
REMOVING UTF-8 BOM from datasets/15978/gastosfuncionales.jsonld
REMOVING UTF-8 BOM from datasets/21691/2552-2557.rdf
REMOVING UTF-8 BOM from datasets/21692/2553-2558.rdf


## Cast to RDF

In [22]:
cast_to_rdf = [
    "datasets/13254/rdf.xml",
    "datasets/13284/deck",
    "datasets/13288/54b30",
    "datasets/13291/vocab",
    "datasets/13334/s2377506.xml",
    "datasets/13357/wgs84-pos",
    "datasets/13357/doap",
    "datasets/13357/vsr",
    "datasets/13357/ns",
    "datasets/13357/core",
    "datasets/13412/doap",
    "datasets/13412/vsr",
    "datasets/13412/ns",
    "datasets/13412/core",
    "datasets/13577/psclemon",
    "datasets/13759/doap",
    "datasets/13759/ns",
    "datasets/13759/core",
    "datasets/13796/doap",
    "datasets/13796/ns",
    "datasets/13796/core",
    "datasets/13997/bulmo-0007-473x-2000-num-158-2-2373",
    "datasets/13997/196565-person",
    "datasets/13997/ahess-0395-2649",
    "datasets/14054/rdf.xml",
    "datasets/14079/wheat",
    "datasets/14417/deck",
    "datasets/14580/54b30",
    "datasets/14583/s2377506.xml",
    "datasets/14603/wgs84-pos",
    "datasets/14603/doap",
    "datasets/14603/vsr",
    "datasets/14603/ns",
    "datasets/14603/core",
    "datasets/14806/doap",
    "datasets/14806/ns",
    "datasets/14806/core",
    "datasets/14921/psclemon",
    "datasets/15082/doap",
    "datasets/15082/vsr",
    "datasets/15082/ns",
    "datasets/15082/core",
    "datasets/15150/doap",
    "datasets/15150/ns",
    "datasets/15150/core",
    "datasets/15333/vocab",
    "datasets/15641/moj-data.rdf.txt",
]

In [23]:
for f in cast_to_rdf:
    change_extension(f, "rdf")

Renaming datasets/13254/rdf.xml to datasets/13254/rdf.rdf
Renaming datasets/13284/deck to datasets/13284/deck.rdf
Renaming datasets/13288/54b30 to datasets/13288/54b30.rdf
Renaming datasets/13291/vocab to datasets/13291/vocab.rdf
Renaming datasets/13334/s2377506.xml to datasets/13334/s2377506.rdf
Renaming datasets/13357/wgs84-pos to datasets/13357/wgs84-pos.rdf
Renaming datasets/13357/doap to datasets/13357/doap.rdf
Renaming datasets/13357/vsr to datasets/13357/vsr.rdf
Renaming datasets/13357/ns to datasets/13357/ns.rdf
Renaming datasets/13357/core to datasets/13357/core.rdf
Renaming datasets/13412/doap to datasets/13412/doap.rdf
Renaming datasets/13412/vsr to datasets/13412/vsr.rdf
Renaming datasets/13412/ns to datasets/13412/ns.rdf
Renaming datasets/13412/core to datasets/13412/core.rdf
Renaming datasets/13577/psclemon to datasets/13577/psclemon.rdf
Renaming datasets/13759/doap to datasets/13759/doap.rdf
Renaming datasets/13759/ns to datasets/13759/ns.rdf
Renaming datasets/13759/core

## Cast to TTL

In [24]:
cast_to_ttl = [
    "datasets/13252/affymetrix-185061-at",
    "datasets/13279/hgnc-7",
    "datasets/13299/homologene-1000",
    "datasets/13357/cube",
    "datasets/13412/nif-core",
    "datasets/13471/omim-603903",
    "datasets/13482/sgd-s000006169",
    "datasets/13600/drugbank-db00001",
    "datasets/13715/gb",
    "datasets/14387/hgnc-7",
    "datasets/14396/omim-603903",
    "datasets/14411/affymetrix-185061-at",
    "datasets/14430/gb",
    "datasets/14472/ndc-49288-0001-272c9910-2160-4d41-afbd-faab3055ba1d",
    "datasets/14473/mesh-d018377",
    "datasets/14519/go-0006915",
    "datasets/14520/drugbank-db00001",
    "datasets/14558/taxonomy-9606",
    "datasets/14603/cube",
    "datasets/15082/nif-core",
    "datasets/15144/turtle",
]

In [25]:
for f in cast_to_ttl:
    change_extension(f, "ttl")

Renaming datasets/13252/affymetrix-185061-at to datasets/13252/affymetrix-185061-at.ttl
Renaming datasets/13279/hgnc-7 to datasets/13279/hgnc-7.ttl
Renaming datasets/13299/homologene-1000 to datasets/13299/homologene-1000.ttl
Renaming datasets/13357/cube to datasets/13357/cube.ttl
Renaming datasets/13412/nif-core to datasets/13412/nif-core.ttl
Renaming datasets/13471/omim-603903 to datasets/13471/omim-603903.ttl
Renaming datasets/13482/sgd-s000006169 to datasets/13482/sgd-s000006169.ttl
Renaming datasets/13600/drugbank-db00001 to datasets/13600/drugbank-db00001.ttl
Renaming datasets/13715/gb to datasets/13715/gb.ttl
Renaming datasets/14387/hgnc-7 to datasets/14387/hgnc-7.ttl
Renaming datasets/14396/omim-603903 to datasets/14396/omim-603903.ttl
Renaming datasets/14411/affymetrix-185061-at to datasets/14411/affymetrix-185061-at.ttl
Renaming datasets/14430/gb to datasets/14430/gb.ttl
Renaming datasets/14472/ndc-49288-0001-272c9910-2160-4d41-afbd-faab3055ba1d to datasets/14472/ndc-49288-00

## Analysis of unused - NON RDF files

### Files recognized as archives

In [26]:
import magic
import zipfile
import tarfile

def is_gz_file(filepath):
    with open(filepath, "rb") as f:
        return f.read(2) == b"\x1f\x8b"


def is_bz2_file(filepath):
    with open(filepath, "rb") as f:
        return f.read(3) == b"\x42\x5a\x68"

In [27]:
# Files that have been recognized as archives
archives = [
    "datasets/13263/govwild-rdf-2012-01-30.zip",
    "datasets/13283/txn-ocs.ttl.gz",
    "datasets/13283/txn-distribution.ttl.gz",
    "datasets/13283/txn-images.ttl.gz",
    "datasets/13283/txn-misc.ttl.gz",
    "datasets/13347/geospecies.rdf.gz",
    "datasets/13368/all-geonames-rdf.zip",
    "datasets/13369/dbpedia-3.6.owl.bz2",
    "datasets/13378/wn20full.zip",
    "datasets/13388/dbpedia-3.6.owl.bz2",
    "datasets/13461/dbpedia-3.5.1.owl.bz2",
    "datasets/13822/lsoa.ttl.zip",
    "datasets/13997/persee-person-align-rdf.tar.gz",
    "datasets/14079/eat.nt.gz",
    "datasets/14252/wn20full.zip",
    "datasets/14277/txn-ocs.ttl.gz",
    "datasets/14277/txn-distribution.ttl.gz",
    "datasets/14277/txn-images.ttl.gz",
    "datasets/14277/txn-misc.ttl.gz",
    "datasets/14324/geospecies.rdf.gz",
    "datasets/14336/dbpedia-3.6.owl.bz2",
    "datasets/14344/all-geonames-rdf.zip",
    "datasets/14364/govwild-rdf-2012-01-30.zip",
    "datasets/14801/usage.nt.gz",
    "datasets/15282/lsoa.ttl.zip",
    "datasets/15414/dbpedia-3.6.owl.bz2",
    "datasets/21532/jrcnames-uri.zip",
]

In [28]:
tar = list()
zip = list()
tar_gz = list()
bz2 = list()

for file in archives:
    if tarfile.is_tarfile(file):
        tar.append(file)
    elif zipfile.is_zipfile(file):
        zip.append(file)
    elif is_gz_file(file):
        tar_gz.append(file)
    elif is_bz2_file(file):
        bz2.append(file)

assert len(archives) == len(tar) + len(zip) + len(tar_gz) + len(bz2)

print(f"TAR: {len(tar)}")
print(f"ZIP: {len(zip)}")
print(f"TAR GZ: {len(tar_gz)}")
print(f"BZ2: {len(bz2)}")


TAR: 1
ZIP: 9
TAR GZ: 12
BZ2: 5


#### TAR

In [29]:
for a in tar:
    print(a)

datasets/13997/persee-person-align-rdf.tar.gz


In [30]:
%%bash
ls -la datasets/13997/
du -sh datasets/13997/persee-person-align-rdf.tar.gz

totale 9900
drwxr-xr-x     2 riccardo riccardo    4096  7 giu 21.47 .
drwxr-xr-x 31591 riccardo riccardo  655360 17 apr 22.34 ..
-rw-r--r--     1 riccardo riccardo  530900  8 apr 15.12 196565-person.rdf
-rw-r--r--     1 riccardo riccardo   24713  8 apr 15.12 ahess-0395-2649.rdf
-rw-r--r--     1 riccardo riccardo   11627  8 apr 15.12 bulmo-0007-473x-2000-num-158-2-2373.rdf
-rw-r--r--     1 riccardo riccardo    1816  5 giu 12.19 metadata.json
-rw-r--r--     1 riccardo riccardo   96354  8 apr 15.12 persee-ontology.owl
-rw-r--r--     1 riccardo riccardo 8796163  8 apr 15.12 persee-person-align-rdf.tar.gz
8,4M	datasets/13997/persee-person-align-rdf.tar.gz


In [31]:
%%bash
cd datasets/13997/
tar -xf persee-person-align-rdf.tar.gz
rm -rf persee-person-align-rdf.tar.gz
rm license.txt
tree .

[01;34m.[00m
├── 196565-person.rdf
├── ahess-0395-2649.rdf
├── bulmo-0007-473x-2000-num-158-2-2373.rdf
├── define_UMS_Persee.rdf
├── metadata.json
├── PERSEE_align_All_2021-09-24.rdf
├── PERSEE_align_Bnf_2021-09-24.rdf
├── PERSEE_align_DBpedia_2021-09-24.rdf
├── PERSEE_align_DBpediaFR_2021-09-24.rdf
├── PERSEE_align_idHAL_2021-09-24.rdf
├── PERSEE_align_Idref_2021-09-24.rdf
├── PERSEE_align_Isni_2021-09-24.rdf
├── PERSEE_align_ORCID_2021-09-24.rdf
├── PERSEE_align_RePEc_2021-09-24.rdf
├── PERSEE_align_viaf_2021-09-24.rdf
├── PERSEE_align_wikidata_2021-09-24.rdf
├── PERSEE_align_wikipedia_2021-09-24.rdf
├── PERSEE_align_wikipediaFR_2021-09-24.rdf
└── persee-ontology.owl

0 directories, 19 files


#### ZIP

In [32]:
for a in zip:
    print(a)

datasets/13263/govwild-rdf-2012-01-30.zip
datasets/13368/all-geonames-rdf.zip
datasets/13378/wn20full.zip
datasets/13822/lsoa.ttl.zip
datasets/14252/wn20full.zip
datasets/14344/all-geonames-rdf.zip
datasets/14364/govwild-rdf-2012-01-30.zip
datasets/15282/lsoa.ttl.zip
datasets/21532/jrcnames-uri.zip


##### datasets/13263/govwild-rdf-2012-01-30.zip

In [33]:
%%bash
cd datasets/13263
ls -la
du -sh govwild-rdf-2012-01-30.zip


totale 241000
drwxr-xr-x     2 riccardo riccardo      4096  8 mag 17.32 .
drwxr-xr-x 31591 riccardo riccardo    655360 17 apr 22.34 ..
-rw-r--r--     1 riccardo riccardo 246077569  8 apr 15.06 govwild-rdf-2012-01-30.zip
-rw-r--r--     1 riccardo riccardo     33191  8 apr 15.06 gwontology.rdf
-rw-r--r--     1 riccardo riccardo      1283  5 giu 12.18 metadata.json
235M	govwild-rdf-2012-01-30.zip


In [34]:
%%bash 
cd datasets/13263
unzip govwild-rdf-2012-01-30.zip

Archive:  govwild-rdf-2012-01-30.zip
   creating: Govwild_rdf_2012-01-30/
  inflating: Govwild_rdf_2012-01-30/changelog_2012-01-30.txt  
  inflating: Govwild_rdf_2012-01-30/Govwild_rdf.n3  


In [35]:
%%bash
cd datasets/13263
mv Govwild_rdf_2012-01-30/Govwild_rdf.n3 .
mv Govwild_rdf_2012-01-30/changelog_2012-01-30.txt .

In [36]:
%%bash
cd datasets/13263
rm -rf Govwild_rdf_2012-01-30
rm -rf govwild-rdf-2012-01-30.zip
tree

[01;34m.[00m
├── changelog_2012-01-30.txt
├── [01;32mGovwild_rdf.n3[00m
├── gwontology.rdf
└── metadata.json

0 directories, 5 files


In [37]:
%%bash
head datasets/13263/changelog_2012-01-30.txt

- bug fixing:
	- only calculate "euFinanceTotalReceived" for the countries of the EU, analogous for "earmarksTotalReceived" and the states in the US
	- remove duplicate sameAs-links for Freebase
	- huge funds aren't negative any more
- cleansed sourceDetailed (wrongly added EuFinance)
- optimized splitting of FreebaseCompany
- ontology changes:
	- rename "Relationship" to "PersonRelationship"
	- rename "NYTArticle" to "NewsArticle"
	- rename "Place" to "Address"


In [38]:
%%bash
rm -rf datasets/13263/changelog_2012-01-30.txt

In [39]:
%%bash 
tree datasets/13263

[01;34mdatasets/13263[00m
├── [01;32mGovwild_rdf.n3[00m
├── gwontology.rdf
└── metadata.json

0 directories, 4 files


##### datasets/13368/all-geonames-rdf.zip

In [40]:
%%bash 
cd datasets/13368
ls -la
du -sh all-geonames-rdf.zip

totale 725732
drwxr-xr-x     2 riccardo riccardo      4096  8 mag 17.32 .
drwxr-xr-x 31591 riccardo riccardo    655360 17 apr 22.34 ..
-rw-r--r--     1 riccardo riccardo      2833  8 apr 15.06 about.rdf
-rw-r--r--     1 riccardo riccardo 742269561  8 apr 15.06 all-geonames-rdf.zip
-rw-r--r--     1 riccardo riccardo      1244  5 giu 12.19 metadata.json
-rw-r--r--     1 riccardo riccardo    196853  8 apr 15.06 ontology-v2.rdf
708M	all-geonames-rdf.zip


In [41]:
%%bash 
cd datasets/13368
unzip all-geonames-rdf.zip

Archive:  all-geonames-rdf.zip
  inflating: all-geonames-rdf.txt    


In [42]:
%%bash 
cd datasets/13368
tree .

[01;34m.[00m
├── about.rdf
├── all-geonames-rdf.txt
├── [01;31mall-geonames-rdf.zip[00m
├── metadata.json
└── ontology-v2.rdf

0 directories, 5 files


In [43]:
%%bash
du -sh datasets/13368/all-geonames-rdf.txt
head datasets/13368/all-geonames-rdf.txt

18G	datasets/13368/all-geonames-rdf.txt
https://sws.geonames.org/3/
<?xml version="1.0" encoding="UTF-8" standalone="no"?><rdf:RDF xmlns:cc="http://creativecommons.org/ns#" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:gn="http://www.geonames.org/ontology#" xmlns:owl="http://www.w3.org/2002/07/owl#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:wgs84_pos="http://www.w3.org/2003/01/geo/wgs84_pos#">    <gn:Feature rdf:about="https://sws.geonames.org/3/">        <rdfs:isDefinedBy rdf:resource="https://sws.geonames.org/3/about.rdf"/>        <gn:name>Zamīn Sūkhteh</gn:name>        <gn:alternateName xml:lang="fa">زمين سوخته</gn:alternateName>        <gn:alternateName xml:lang="fa">Zamīn Sūkhteh</gn:alternateName>        <gn:featureClass rdf:resource="https://www.geonames.org/ontology#S"/>        <gn:featureCode rdf:resource="https://www.geonames.org/ontology#S.CRRL"/>        <gn:count

In [44]:
%%bash
mv datasets/13368/all-geonames-rdf.txt datasets/13368/all-geonames.rdf
rm -rf datasets/13368/all-geonames-rdf.zip

In [45]:
%%bash
tree datasets/13368

[01;34mdatasets/13368[00m
├── about.rdf
├── all-geonames.rdf
├── metadata.json
└── ontology-v2.rdf

0 directories, 4 files


##### datasets/13378/wn20full.zip

In [46]:
%%bash
ls -la datasets/13378/
du -sh datasets/13378/wn20full.zip

totale 14204
drwxr-xr-x     2 riccardo riccardo     4096  8 mag 17.32 .
drwxr-xr-x 31591 riccardo riccardo   655360 17 apr 22.34 ..
-rw-r--r--     1 riccardo riccardo     1009  5 giu 12.19 metadata.json
-rw-r--r--     1 riccardo riccardo 13832499  8 apr 15.06 wn20full.zip
-rw-r--r--     1 riccardo riccardo    35332  8 apr 15.06 wnfull.rdf
-rw-r--r--     1 riccardo riccardo      661  8 apr 15.06 wordsense-entity-noun-1.nt
14M	datasets/13378/wn20full.zip


In [47]:
%%bash
cd datasets/13378/
unzip wn20full.zip

Archive:  wn20full.zip
  inflating: wnfull.rdfs             
  inflating: wordnet-antonym.rdf     
  inflating: wordnet-attribute.rdf   
  inflating: wordnet-causes.rdf      
  inflating: wordnet-classifiedby.rdf  
  inflating: wordnet-derivationallyrelated.rdf  
  inflating: wordnet-entailment.rdf  
  inflating: wordnet-frame.rdf       
  inflating: wordnet-glossary.rdf    
  inflating: wordnet-hyponym.rdf     
  inflating: wordnet-membermeronym.rdf  
  inflating: wordnet-participleof.rdf  
  inflating: wordnet-partmeronym.rdf  
  inflating: wordnet-pertainsto.rdf  
  inflating: wordnet-sameverbgroupas.rdf  
  inflating: wordnet-seealso.rdf     
  inflating: wordnet-similarity.rdf  
  inflating: wordnet-substancemeronym.rdf  
  inflating: wordnet-synset.rdf      
  inflating: wordnet-wordsensesandwords.rdf  


In [48]:
%%bash
cd  datasets/13378/
rm -rf wn20full.zip wnfull.rdfs
tree .

[01;34m.[00m
├── metadata.json
├── wnfull.rdf
├── wordnet-antonym.rdf
├── wordnet-attribute.rdf
├── wordnet-causes.rdf
├── wordnet-classifiedby.rdf
├── wordnet-derivationallyrelated.rdf
├── wordnet-entailment.rdf
├── wordnet-frame.rdf
├── wordnet-glossary.rdf
├── wordnet-hyponym.rdf
├── wordnet-membermeronym.rdf
├── wordnet-participleof.rdf
├── wordnet-partmeronym.rdf
├── wordnet-pertainsto.rdf
├── wordnet-sameverbgroupas.rdf
├── wordnet-seealso.rdf
├── wordnet-similarity.rdf
├── wordnet-substancemeronym.rdf
├── wordnet-synset.rdf
├── wordnet-wordsensesandwords.rdf
└── wordsense-entity-noun-1.nt

0 directories, 22 files


##### datasets/13822/lsoa.ttl.zip

In [49]:
%%bash
ls -la datasets/13822
du -sh datasets/13822/lsoa.ttl.zip

totale 1588
drwxr-xr-x     2 riccardo riccardo   4096  8 mag 17.32 .
drwxr-xr-x 31591 riccardo riccardo 655360 17 apr 22.34 ..
-rw-r--r--     1 riccardo riccardo   1678  8 apr 15.12 e01000001.nt
-rw-r--r--     1 riccardo riccardo   1465  8 apr 15.12 e01000001.rdf
-rw-r--r--     1 riccardo riccardo    907  8 apr 15.12 e01000001.ttl
-rw-r--r--     1 riccardo riccardo 944805  8 apr 15.12 lsoa.ttl.zip
-rw-r--r--     1 riccardo riccardo   1438  5 giu 12.19 metadata.json
924K	datasets/13822/lsoa.ttl.zip


In [50]:
%%bash
cd datasets/13822
unzip lsoa.ttl.zip
rm lsoa.ttl.zip

Archive:  lsoa.ttl.zip
  inflating: lsoa.ttl                


In [51]:
%%bash
tree datasets/13822

[01;34mdatasets/13822[00m
├── e01000001.nt
├── e01000001.rdf
├── e01000001.ttl
├── lsoa.ttl
└── metadata.json

0 directories, 5 files


##### datasets/14252/wn20full.zip (related to datasets/13378)

In [52]:
%%bash
ls -la datasets/14252/
du -sh datasets/14252/wn20full.zip

totale 14204
drwxr-xr-x     2 riccardo riccardo     4096  8 mag 17.32 .
drwxr-xr-x 31591 riccardo riccardo   655360 17 apr 22.34 ..
-rw-r--r--     1 riccardo riccardo     1122  5 giu 12.19 metadata.json
-rw-r--r--     1 riccardo riccardo 13832499  8 apr 15.12 wn20full.zip
-rw-r--r--     1 riccardo riccardo    35332  8 apr 15.12 wnfull.rdf
-rw-r--r--     1 riccardo riccardo      661  8 apr 15.12 wordsense-entity-noun-1.nt
14M	datasets/14252/wn20full.zip


Seems related to `datasets/13378`

In [53]:
diff_dataset("datasets/14252", "datasets/13378")

'I file datasets/14252/metadata.json e datasets/13378/metadata.json sono diversi\nSolo in datasets/14252: wn20full.zip\nSolo in datasets/13378: wordnet-antonym.rdf\nSolo in datasets/13378: wordnet-attribute.rdf\nSolo in datasets/13378: wordnet-causes.rdf\nSolo in datasets/13378: wordnet-classifiedby.rdf\nSolo in datasets/13378: wordnet-derivationallyrelated.rdf\nSolo in datasets/13378: wordnet-entailment.rdf\nSolo in datasets/13378: wordnet-frame.rdf\nSolo in datasets/13378: wordnet-glossary.rdf\nSolo in datasets/13378: wordnet-hyponym.rdf\nSolo in datasets/13378: wordnet-membermeronym.rdf\nSolo in datasets/13378: wordnet-participleof.rdf\nSolo in datasets/13378: wordnet-partmeronym.rdf\nSolo in datasets/13378: wordnet-pertainsto.rdf\nSolo in datasets/13378: wordnet-sameverbgroupas.rdf\nSolo in datasets/13378: wordnet-seealso.rdf\nSolo in datasets/13378: wordnet-similarity.rdf\nSolo in datasets/13378: wordnet-substancemeronym.rdf\nSolo in datasets/13378: wordnet-synset.rdf\nSolo in dat

In [54]:
%%bash
cd datasets/14252/
unzip wn20full.zip

Archive:  wn20full.zip
  inflating: wnfull.rdfs             
  inflating: wordnet-antonym.rdf     
  inflating: wordnet-attribute.rdf   
  inflating: wordnet-causes.rdf      
  inflating: wordnet-classifiedby.rdf  
  inflating: wordnet-derivationallyrelated.rdf  
  inflating: wordnet-entailment.rdf  
  inflating: wordnet-frame.rdf       
  inflating: wordnet-glossary.rdf    
  inflating: wordnet-hyponym.rdf     
  inflating: wordnet-membermeronym.rdf  
  inflating: wordnet-participleof.rdf  
  inflating: wordnet-partmeronym.rdf  
  inflating: wordnet-pertainsto.rdf  
  inflating: wordnet-sameverbgroupas.rdf  
  inflating: wordnet-seealso.rdf     
  inflating: wordnet-similarity.rdf  
  inflating: wordnet-substancemeronym.rdf  
  inflating: wordnet-synset.rdf      
  inflating: wordnet-wordsensesandwords.rdf  


In [55]:
%%bash
cd datasets/14252/
rm wn20full.zip wnfull.rdfs

In [56]:
%%bash
cd datasets/14252/
tree .

[01;34m.[00m
├── metadata.json
├── wnfull.rdf
├── wordnet-antonym.rdf
├── wordnet-attribute.rdf
├── wordnet-causes.rdf
├── wordnet-classifiedby.rdf
├── wordnet-derivationallyrelated.rdf
├── wordnet-entailment.rdf
├── wordnet-frame.rdf
├── wordnet-glossary.rdf
├── wordnet-hyponym.rdf
├── wordnet-membermeronym.rdf
├── wordnet-participleof.rdf
├── wordnet-partmeronym.rdf
├── wordnet-pertainsto.rdf
├── wordnet-sameverbgroupas.rdf
├── wordnet-seealso.rdf
├── wordnet-similarity.rdf
├── wordnet-substancemeronym.rdf
├── wordnet-synset.rdf
├── wordnet-wordsensesandwords.rdf
└── wordsense-entity-noun-1.nt

0 directories, 22 files


##### datasets/14344/all-geonames-rdf.zip (related to datasets/13368)

In [57]:
%%bash
ls -la datasets/14344/
du -sh datasets/14344/all-geonames-rdf.zip

totale 725732
drwxr-xr-x     2 riccardo riccardo      4096  8 mag 17.32 .
drwxr-xr-x 31591 riccardo riccardo    655360 17 apr 22.34 ..
-rw-r--r--     1 riccardo riccardo      2833  8 apr 15.13 about.rdf
-rw-r--r--     1 riccardo riccardo 742269561  8 apr 15.13 all-geonames-rdf.zip
-rw-r--r--     1 riccardo riccardo      1325  5 giu 12.19 metadata.json
-rw-r--r--     1 riccardo riccardo    196853  8 apr 15.12 ontology-v2.rdf
708M	datasets/14344/all-geonames-rdf.zip


In [58]:
%%bash
cd datasets/14344/
unzip all-geonames-rdf.zip

Archive:  all-geonames-rdf.zip
  inflating: all-geonames-rdf.txt    


In [59]:
%%bash
cd datasets/14344/
rm all-geonames-rdf.zip
mv all-geonames-rdf.txt all-geonames.rdf

In [60]:
diff_dataset("datasets/14344", "datasets/13368")

'I file datasets/14344/metadata.json e datasets/13368/metadata.json sono diversi\n'

In [61]:
%%bash
tree datasets/14344/

[01;34mdatasets/14344/[00m
├── about.rdf
├── all-geonames.rdf
├── metadata.json
└── ontology-v2.rdf

0 directories, 4 files


##### datasets/14364/govwild-rdf-2012-01-30.zip (related to datasets/13263)

In [62]:
%%bash
cd datasets/14364
ls -la
du -sh govwild-rdf-2012-01-30.zip

totale 241000
drwxr-xr-x     2 riccardo riccardo      4096  8 mag 17.32 .
drwxr-xr-x 31591 riccardo riccardo    655360 17 apr 22.34 ..
-rw-r--r--     1 riccardo riccardo 246077569  8 apr 15.13 govwild-rdf-2012-01-30.zip
-rw-r--r--     1 riccardo riccardo     33191  8 apr 15.13 gwontology.rdf
-rw-r--r--     1 riccardo riccardo      1268  5 giu 12.19 metadata.json
235M	govwild-rdf-2012-01-30.zip


In [63]:
%%bash 
cd datasets/14364
unzip govwild-rdf-2012-01-30.zip
mv Govwild_rdf_2012-01-30/* .
rm -rf Govwild_rdf_2012-01-30
rm -rf govwild-rdf-2012-01-30.zip

Archive:  govwild-rdf-2012-01-30.zip
   creating: Govwild_rdf_2012-01-30/
  inflating: Govwild_rdf_2012-01-30/changelog_2012-01-30.txt  
  inflating: Govwild_rdf_2012-01-30/Govwild_rdf.n3  


In [64]:
%%bash
head datasets/14364/changelog_2012-01-30.txt
rm -rf datasets/14364/changelog_2012-01-30.txt

- bug fixing:
	- only calculate "euFinanceTotalReceived" for the countries of the EU, analogous for "earmarksTotalReceived" and the states in the US
	- remove duplicate sameAs-links for Freebase
	- huge funds aren't negative any more
- cleansed sourceDetailed (wrongly added EuFinance)
- optimized splitting of FreebaseCompany
- ontology changes:
	- rename "Relationship" to "PersonRelationship"
	- rename "NYTArticle" to "NewsArticle"
	- rename "Place" to "Address"


In [65]:
%%bash
tree datasets/14364

[01;34mdatasets/14364[00m
├── [01;32mGovwild_rdf.n3[00m
├── gwontology.rdf
└── metadata.json

0 directories, 4 files


In [66]:
diff_dataset("datasets/14364", "datasets/13263")

'I file datasets/14364/metadata.json e datasets/13263/metadata.json sono diversi\n'

In [67]:
diff_file("datasets/14364/metadata.json", "datasets/13263/metadata.json")

'2c2\n<     "id": "14364",\n---\n>     "id": "13263",\n6c6\n<     "tags": "format-rdf;gov;government;lod;lodcloud-diagram-2011-09-19;lodcloud-diagram-2014-08-30;no-deref-vocab;opendatachallenge;published-by-third-party;",\n---\n>     "tags": "LinkedDataCrawl2014;format-owl;format-rdf;format-rdfs;gov;government;lod;no-deref-vocab;opendatachallenge;part-deref-vocab;prop-vocab;published-by-third-party;",\n'

##### datasets/15282/lsoa.ttl.zip (related to datasets/13822)

In [68]:
%%bash
ls -la datasets/15282
du -sh datasets/15282/lsoa.ttl.zip

totale 1588
drwxr-xr-x     2 riccardo riccardo   4096  8 mag 17.32 .
drwxr-xr-x 31591 riccardo riccardo 655360 17 apr 22.34 ..
-rw-r--r--     1 riccardo riccardo   1678  8 apr 15.14 e01000001.nt
-rw-r--r--     1 riccardo riccardo   1465  8 apr 15.14 e01000001.rdf
-rw-r--r--     1 riccardo riccardo    907  8 apr 15.14 e01000001.ttl
-rw-r--r--     1 riccardo riccardo 944805  8 apr 15.14 lsoa.ttl.zip
-rw-r--r--     1 riccardo riccardo   1438  5 giu 12.18 metadata.json
924K	datasets/15282/lsoa.ttl.zip


In [69]:
%%bash
cd datasets/15282
unzip lsoa.ttl.zip
rm lsoa.ttl.zip

Archive:  lsoa.ttl.zip
  inflating: lsoa.ttl                


In [70]:
%%bash
tree datasets/15282

[01;34mdatasets/15282[00m
├── e01000001.nt
├── e01000001.rdf
├── e01000001.ttl
├── lsoa.ttl
└── metadata.json

0 directories, 5 files


In [71]:
diff_dataset("datasets/15282", "datasets/13822")

'I file datasets/15282/metadata.json e datasets/13822/metadata.json sono diversi\n'

In [72]:
diff_file("datasets/15282/metadata.json", "datasets/13822/metadata.json")

'2c2\n<     "id": "15282",\n---\n>     "id": "13822",\n'

##### datasets/21532/jrcnames-uri.zip

In [73]:
%%bash
cd datasets/21532
ls -la
du -sh jrcnames-uri.zip

totale 774816
drwxr-xr-x     2 riccardo riccardo      4096  8 mag 17.32 .
drwxr-xr-x 31591 riccardo riccardo    655360 17 apr 22.34 ..
-rw-r--r--     1 riccardo riccardo      9112  8 apr 15.15 jrcnamesmodel.rdf
-rw-r--r--     1 riccardo riccardo 792724845  8 apr 15.15 jrcnames-uri.zip
-rw-r--r--     1 riccardo riccardo      3529  5 giu 12.19 metadata.json
757M	jrcnames-uri.zip


In [74]:
%%bash 
cd datasets/21532
unzip jrcnames-uri.zip

Archive:  jrcnames-uri.zip
  inflating: jrcnames_uri.nt         


In [75]:
%%bash
rm datasets/21532/jrcnames-uri.zip

In [76]:
%%bash
tree datasets/21532

[01;34mdatasets/21532[00m
├── jrcnamesmodel.rdf
├── jrcnames_uri.nt
└── metadata.json

0 directories, 3 files


#### GZ files

In [77]:
for a in tar_gz:
    print(a)

datasets/13283/txn-ocs.ttl.gz
datasets/13283/txn-distribution.ttl.gz
datasets/13283/txn-images.ttl.gz
datasets/13283/txn-misc.ttl.gz
datasets/13347/geospecies.rdf.gz
datasets/14079/eat.nt.gz
datasets/14277/txn-ocs.ttl.gz
datasets/14277/txn-distribution.ttl.gz
datasets/14277/txn-images.ttl.gz
datasets/14277/txn-misc.ttl.gz
datasets/14324/geospecies.rdf.gz
datasets/14801/usage.nt.gz


##### datasets 13283 and 14277 seems to have the same files

In [78]:
diff_dataset("datasets/13283", "datasets/14277")

'I file datasets/13283/metadata.json e datasets/14277/metadata.json sono diversi\n'

In [79]:
diff_file("diff datasets/13283/metadata.json", "datasets/14277/metadata.json")

diff: extra operando "datasets/14277/metadata.json"
diff: Usare "diff --help" per maggiori informazioni.


''

In [80]:
%%bash
tree datasets/13283

[01;34mdatasets/13283[00m
├── f522444a-2dd9-400e-be59-47213ef38cb9.rdf
├── metadata.json
├── [01;31mtxn-distribution.ttl.gz[00m
├── [01;31mtxn-images.ttl.gz[00m
├── [01;31mtxn-misc.ttl.gz[00m
├── [01;31mtxn-ocs.ttl.gz[00m
└── txn.owl

0 directories, 7 files


In [81]:
%%bash
cd datasets/13283
gzip -d *.gz

In [82]:
%%bash
tree datasets/13283

[01;34mdatasets/13283[00m
├── f522444a-2dd9-400e-be59-47213ef38cb9.rdf
├── metadata.json
├── txn-distribution.ttl
├── txn-images.ttl
├── txn-misc.ttl
├── txn-ocs.ttl
└── txn.owl

0 directories, 7 files


In [83]:
%%bash
tree datasets/14277

[01;34mdatasets/14277[00m
├── f522444a-2dd9-400e-be59-47213ef38cb9.rdf
├── metadata.json
├── [01;31mtxn-distribution.ttl.gz[00m
├── [01;31mtxn-images.ttl.gz[00m
├── [01;31mtxn-misc.ttl.gz[00m
├── [01;31mtxn-ocs.ttl.gz[00m
└── txn.owl

0 directories, 7 files


In [84]:
%%bash
cd datasets/14277
gzip -d *.gz

In [85]:
%%bash
tree datasets/14277

[01;34mdatasets/14277[00m
├── f522444a-2dd9-400e-be59-47213ef38cb9.rdf
├── metadata.json
├── txn-distribution.ttl
├── txn-images.ttl
├── txn-misc.ttl
├── txn-ocs.ttl
└── txn.owl

0 directories, 7 files


##### datasets 13347 and 14324 seems to have the same files

In [86]:
diff_dataset("datasets/13347", "datasets/14324")

'I file datasets/13347/metadata.json e datasets/14324/metadata.json sono diversi\n'

In [87]:
diff_file("datasets/13347/metadata.json", "datasets/14324/metadata.json")

'2c2\n<     "id": "13347",\n---\n>     "id": "14324",\n6c6\n<     "tags": "LinkedDataCrawl2014;alt. access;crawledLinkedDataCloud2014;dataset-level-metadata;dump;format-bibo;format-cc;format-dcterm;format-foaf;format-geosp;format-gn;format-owl;format-rdf;format-rdfs;format-skos;format-void;format-wdrs;format-wlo;license-metadata",\n---\n>     "tags": "bio;biodiversity;ckanupload.esw.200910;deref-vocab;format-bibo;format-cc;format-dbp;format-dc;format-doap;format-foaf;format-geo;format-geonames;format-geospecies;format-owl;format-rdf;format-rdfs;format-skos;format-txn;format-umbel;format-uniprot;license",\n'

In [88]:
%%bash
tree datasets/13347

[01;34mdatasets/13347[00m
├── geospecies.owl
├── [01;31mgeospecies.rdf.gz[00m
└── metadata.json

0 directories, 3 files


In [89]:
%%bash
cd datasets/13347
gzip -d *.gz

In [90]:
%%bash
tree datasets/13347

[01;34mdatasets/13347[00m
├── geospecies.owl
├── geospecies.rdf
└── metadata.json

0 directories, 3 files


In [91]:
%%bash
tree datasets/14324

[01;34mdatasets/14324[00m
├── geospecies.owl
├── [01;31mgeospecies.rdf.gz[00m
└── metadata.json

0 directories, 3 files


In [92]:
%%bash
cd datasets/14324
gzip -d *.gz

In [93]:
%%bash
tree datasets/14324

[01;34mdatasets/14324[00m
├── geospecies.owl
├── geospecies.rdf
└── metadata.json

0 directories, 3 files


##### datasets/14079/eat.nt.gz

In [94]:
%%bash
tree datasets/14079

[01;34mdatasets/14079[00m
├── [01;31meat.nt.gz[00m
├── mapping-eat-dbpedia.rdf
├── metadata.json
├── vocab.ttl
└── wheat.rdf

0 directories, 5 files


In [95]:
%%bash
cd datasets/14079
gzip -d *.gz

In [96]:
%%bash
tree datasets/14079

[01;34mdatasets/14079[00m
├── eat.nt
├── mapping-eat-dbpedia.rdf
├── metadata.json
├── vocab.ttl
└── wheat.rdf

0 directories, 5 files


##### datasets/14801/usage.nt.gz

In [97]:
%%bash
tree datasets/14801

[01;34mdatasets/14801[00m
├── 2050.rdf
├── metadata.json
└── [01;31musage.nt.gz[00m

0 directories, 3 files


In [98]:
%%bash
cd datasets/14801
gzip -d *.gz

In [99]:
%%bash
tree datasets/14801

[01;34mdatasets/14801[00m
├── 2050.rdf
├── metadata.json
└── usage.nt

0 directories, 3 files


#### BZ2

In [100]:
for a in bz2:
    print(a)

datasets/13369/dbpedia-3.6.owl.bz2
datasets/13388/dbpedia-3.6.owl.bz2
datasets/13461/dbpedia-3.5.1.owl.bz2
datasets/14336/dbpedia-3.6.owl.bz2
datasets/15414/dbpedia-3.6.owl.bz2


##### datasets/13369/dbpedia-3.6.owl.bz2

In [101]:
%%bash
tree datasets/13369

[01;34mdatasets/13369[00m
├── [01;31mdbpedia-3.6.owl.bz2[00m
├── linux.n3
├── linux.rdf
└── metadata.json

0 directories, 4 files


In [102]:
%%bash
cd datasets/13369
bzip2 -d dbpedia-3.6.owl.bz2

In [103]:
%%bash
tree datasets/13369

[01;34mdatasets/13369[00m
├── dbpedia-3.6.owl
├── linux.n3
├── linux.rdf
└── metadata.json

0 directories, 4 files


##### datasets/13388/dbpedia-3.6.owl.bz2

In [104]:
%%bash
tree datasets/13388

[01;34mdatasets/13388[00m
├── [01;31mdbpedia-3.6.owl.bz2[00m
├── metadata.json
├── paris.n3
├── paris.nt
└── paris.rdf

0 directories, 5 files


In [105]:
%%bash
cd datasets/13388
bzip2 -d dbpedia-3.6.owl.bz2

In [106]:
%%bash
tree datasets/13388

[01;34mdatasets/13388[00m
├── dbpedia-3.6.owl
├── metadata.json
├── paris.n3
├── paris.nt
└── paris.rdf

0 directories, 5 files


##### datasets/13461/dbpedia-3.5.1.owl.bz2

In [107]:
%%bash
tree datasets/13461

[01;34mdatasets/13461[00m
├── berlin.n3
├── berlin.rdf
├── [01;31mdbpedia-3.5.1.owl.bz2[00m
└── metadata.json

0 directories, 4 files


In [108]:
%%bash
cd datasets/13461
bzip2 -d dbpedia-3.5.1.owl.bz2

In [109]:
%%bash
tree datasets/13461

[01;34mdatasets/13461[00m
├── berlin.n3
├── berlin.rdf
├── dbpedia-3.5.1.owl
└── metadata.json

0 directories, 4 files


##### datasets/14336/dbpedia-3.6.owl.bz2

In [110]:
%%bash
tree datasets/14336

[01;34mdatasets/14336[00m
├── [01;31mdbpedia-3.6.owl.bz2[00m
├── linux.n3
├── linux.rdf
└── metadata.json

0 directories, 4 files


In [111]:
%%bash
cd datasets/14336
bzip2 -d dbpedia-3.6.owl.bz2

In [112]:
%%bash
tree datasets/14336

[01;34mdatasets/14336[00m
├── dbpedia-3.6.owl
├── linux.n3
├── linux.rdf
└── metadata.json

0 directories, 4 files


##### datasets/15414/dbpedia-3.6.owl.bz2

In [113]:
%%bash
tree datasets/15414

[01;34mdatasets/15414[00m
├── [01;31mdbpedia-3.6.owl.bz2[00m
├── metadata.json
├── paris.n3
├── paris.nt
└── paris.rdf

0 directories, 5 files


In [114]:
%%bash
cd datasets/15414
bzip2 -d dbpedia-3.6.owl.bz2

In [115]:
%%bash
tree datasets/15414

[01;34mdatasets/15414[00m
├── dbpedia-3.6.owl
├── metadata.json
├── paris.n3
├── paris.nt
└── paris.rdf

0 directories, 5 files


### Files that needs to be checked manually

In [116]:
assign_extensions = [
    "datasets/13506/biomodels-biomd0000000048",
    "datasets/13522/irefindex.irogid-1069566",
    "datasets/14517/irefindex.irogid-1069566",
    "datasets/14742/sample-20160515.rdf.xml",
]

In [117]:
%%bash
cd datasets/13506
head biomodels-biomd0000000048
rm biomodels-biomd0000000048

# Empty TURTLE


In [118]:
%%bash
cd datasets/13522
head irefindex.irogid-1069566
rm irefindex.irogid-1069566

# Empty TURTLE


In [119]:
%%bash
cd datasets/14517
head irefindex.irogid-1069566
rm irefindex.irogid-1069566

# Empty TURTLE


In [120]:
%%bash
cd datasets/14742
head sample-20160515.rdf.xml
mv sample-20160515.rdf.xml sample-20160515.rdf

<RDF xmlns="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
<rdfs:Resource xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
               xmlns:dc="http://purl.org/dc/elements/1.1/"
               xmlns:dcterms="http://purl.org/dc/terms/"
               xmlns:role="http://www.language-archives.org/vocabulary/role#"
               xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
               rdf:about="http://www.language-archives.org/item/oai:acl.sr.language-archives.org:P87-1033">
   <dc:publisher rdf:resource="http://www.language-archives.org/archive/acl.sr.language-archives.org"/>
</rdfs:Resource>
<rdfs:Resource xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"


### Big files

After extracting all the zips above, look for all the RDF files that are bigger than 200MB

In [121]:
for path, subdirs, files in os.walk("datasets"):
    for name in files:
        file_with_path = os.path.join(path, name)
        if is_file_larger_than_size_limit(file_with_path) and check_if_file_name_is_rdf(name):
            print(file_with_path)

datasets/13565/download-20120123.rdf
datasets/11580/rows.rdf
datasets/14364/Govwild_rdf.n3
datasets/14079/eat.nt
datasets/13263/Govwild_rdf.n3
datasets/21532/jrcnames_uri.nt
datasets/15243/fr.rdf
datasets/15243/en.rdf
datasets/13368/all-geonames.rdf
datasets/14344/all-geonames.rdf
