Goal: reach desired course end state via meandering Jupyter notebook narrative

Desired end state: a [JSON Lines](https://jsonlines.org/) file of [JSON-LD](https://json-ld.org/) objects that represents all or a portion of the Nobel Prize dataset, such that a competency question can be answered efficiently with MongoDB.

For the MongoDB part, perhaps use `jq` to filter JSON Lines by `rdf:type` in order to `mongoimport` to the appropriate Mongo collections. Or use Python for this.

# i. Fetch Nobel Prize data as SPARQL JSON response

1. Go to <https://data.nobelprize.org/sparql>.
2. Enter this query:
    ```sparql
    PREFIX nobel: <http://data.nobelprize.org/terms/>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    
    SELECT ?s ?p ?o WHERE {
      ?s ?p ?o .
    }
    ```
3. Click "Response" results view (default may be "Table")
4. Click "Download result"
5. `gzip` result (~30x compression from ~30MB to ~1MB).

In [None]:
!du -h data/00-raw-sparql-response.json.gz

# ii. Load JSON response as list of statements and serialize as RDF

1. into memory as Python dict
2. map to list of statements
3. load into RDFLib and save as RDF

In [None]:
import gzip
import json

# Decompress the saved SPARQL JSON response and decode it straight from the
# stream into a Python dict.
with gzip.open("data/00-raw-sparql-response.json.gz") as compressed:
    response = json.load(compressed)

In [None]:
# One (subject, predicate, object) tuple per result binding; each element is
# still the raw SPARQL-JSON term dict taken from the binding.
statements = [
    (row["s"], row["p"], row["o"])
    for row in response["results"]["bindings"]
]

In [None]:
from rdflib import Graph, URIRef, Literal, BNode, Namespace

# Convert every raw statement into rdflib terms and add it to a fresh graph.
# Each statement is validated first, so a malformed term raises ValueError
# before anything from that statement is added.
g = Graph()
for subj_json, pred_json, obj_json in statements:
    # --- validation (same order and messages for every statement) ---
    if not (subj_json["type"] == "uri" or subj_json["type"] == "bnode"):
        raise ValueError("subs must be uris or bnodes")
    if pred_json["type"] != "uri":
        raise ValueError("preds must be uris")
    obj_kind = obj_json["type"]
    if obj_kind not in ("uri", "bnode", "literal"):
        raise ValueError("objs must be uris or bnodes or literals")
    if obj_kind == "literal":
        unexpected_keys = set(obj_json) - {"type", "value", "datatype", "xml:lang"}
        if unexpected_keys:
            raise ValueError("literal objs can only have datatype and xml:lang apart from value")

    # --- conversion to rdflib terms ---
    if subj_json["type"] == "bnode":
        subject = BNode(subj_json["value"])
    else:
        subject = URIRef(subj_json["value"])

    predicate = URIRef(pred_json["value"])

    if obj_kind == "literal":
        obj_term = Literal(
            obj_json["value"],
            lang=obj_json.get("xml:lang"),
            datatype=obj_json.get("datatype"),
        )
    elif obj_kind == "bnode":
        obj_term = BNode(obj_json["value"])
    else:  # obj_kind == "uri"
        obj_term = URIRef(obj_json["value"])

    g.add((subject, predicate, obj_term))

In [None]:
# Write the in-memory graph to disk in N-Triples format.
g.serialize("data/01-nobelprize-data.nt", format="nt")
# Compress the file with gzip (-f forces the operation even if the .gz
# output already exists), then report the size of the compressed file.
!gzip -f data/01-nobelprize-data.nt
!du -h data/01-nobelprize-data.nt.gz

# 1. representing facts: RDF

Load into RDF graph using rdflib

In [None]:
import gzip

from toolz import take

# Preview the first 100 N-Triples statements.
# The file is decoded explicitly as UTF-8 so the preview does not depend on
# the platform's default text encoding.
with gzip.open("data/01-nobelprize-data.nt.gz", "rt", encoding="utf-8") as f:
    for line in take(100, f):
        # Each line read from the file already ends with a newline; suppress
        # print's own newline so the output is not double-spaced.
        print(line, end="")

In [None]:
from rdflib import Graph

g = Graph()

# Pass the decompressed bytes as `data` and name the serialization explicitly,
# so parsing does not depend on rdflib guessing what a positional argument is
# or which format it is in.
with gzip.open("data/01-nobelprize-data.nt.gz") as f:
    g.parse(data=f.read(), format="nt")

In [None]:
for s, p, o in take(5, g):
    print(s, p, o)

In [None]:
list(g.namespaces())

In [None]:
for s, p, o in take(5, g):
    print(s, p.n3(g.namespace_manager), o.n3(g.namespace_manager))

In [None]:
def pprint_terms(terms, graph=None):
    """Print RDF terms on one line, abbreviated with a graph's prefixes.

    Args:
        terms: Iterable of objects exposing ``n3(namespace_manager)``.
        graph: Object whose ``namespace_manager`` is used for abbreviation.
            When omitted, the notebook-level graph ``g`` is looked up at call
            time, so rebinding ``g`` later does not leave this function
            pointing at a stale graph.
    """
    if graph is None:
        graph = g
    print(*[t.n3(graph.namespace_manager) for t in terms])

In [None]:
from rdflib import Namespace
from rdflib.namespace import RDF
from toolz import take

NOBEL = Namespace("http://data.nobelprize.org/terms/")
g.namespace_manager.bind("nobel", NOBEL)

for s, p, o in take(5, g.triples((None, RDF.type, NOBEL.Laureate))):
    pprint_terms([s, p, o], g)

In [None]:
for s, p, o in g.triples((NOBEL.Laureate, None, None)):
    pprint_terms([s, p, o], g)

In [None]:
print(NOBEL.Laureate)

In [None]:
g.parse(NOBEL.Laureate)

In [None]:
for s, p, o in g.triples((NOBEL.Laureate, None, None)):
    pprint_terms([s, p, o], g)

In [None]:
from rdflib.namespace import FOAF

g.namespace_manager.bind("foaf", FOAF)

In [None]:
for s, p, o in g.triples((NOBEL.Laureate, None, None)):
    pprint_terms([s, p, o], g)

In [None]:
from rdflib.namespace import OWL

for s, p, o in g.triples((OWL.Class, None, None)):
    pprint_terms([s, p, o], g)

In [None]:
print(OWL.Class)

In [None]:
g.parse(OWL.Class)

In [None]:
from rdflib.namespace import OWL

for s, p, o in g.triples((OWL.Class, None, None)):
    pprint_terms([s, p, o], g)

In [None]:
from rdflib.namespace import OWL

for s, p, o in g.triples((FOAF.Agent, None, None)):
    pprint_terms([s, p, o], g)

In [None]:
print(FOAF.Agent)

In [None]:
# g.parse(FOAF.Agent)
g.parse("data/foaf.ttl")

In [None]:
from rdflib.namespace import OWL

for s, p, o in g.triples((FOAF.Agent, None, None)):
    pprint_terms([s, p, o], g)

In [None]:
list(g.namespaces())

# 2. representing terminology: RDFS and OWL

In [None]:
def term_in_ns(term, ns):
    """Return True when ``term``'s string form starts with ``ns``'s string form.

    Args:
        term: The term to test (anything convertible with ``str``).
        ns: The namespace prefix to test against (anything convertible with
            ``str``).

    Returns:
        bool: Whether ``str(term)`` begins with ``str(ns)``.
    """
    # Test the argument that was passed in, not a variable from the
    # surrounding notebook scope.
    return str(term).startswith(str(ns))

In [None]:
from rdflib.namespace import OWL

for subj, pred, obj in g:
    if term_in_ns(subj, NOBEL):
        pprint_terms([subj, pred, obj], g)

In [None]:
from rdflib.namespace import OWL

datatype_properties = []

for subj, pred, obj in g.triples((None, RDF.type, OWL.DatatypeProperty)):
    if term_in_ns(subj, NOBEL):
        pprint_terms([subj, pred, obj], g)
        datatype_properties.append(subj)

In [None]:
for prop in datatype_properties:
    for pred, obj in g.predicate_objects(prop):
        pprint_terms([prop, pred, obj], g)
    print()

In [None]:
from rdflib.namespace import OWL

object_properties = []

for subj, pred, obj in g.triples((None, RDF.type, OWL.ObjectProperty)):
    if term_in_ns(subj, NOBEL):
        pprint_terms([subj, pred, obj], g)
        object_properties.append(subj)

In [None]:
for prop in object_properties:
    for pred, obj in g.predicate_objects(prop):
        pprint_terms([prop, pred, obj], g)
    print()

In [None]:
from rdflib.namespace import OWL

classes = []

for subj, pred, obj in g.triples((None, RDF.type, OWL.Class)):
    if term_in_ns(subj, NOBEL):
        pprint_terms([subj, pred, obj], g)
        classes.append(subj)

In [None]:
for cls in classes:
    for pred, obj in g.predicate_objects(cls):
        pprint_terms([cls, pred, obj], g)
    print()

In [None]:
categories = next(g.objects(NOBEL.Category, OWL.oneOf))

In [None]:
from rdflib.term import BNode

for p, o in g.predicate_objects(categories):
    pprint_terms([p,o], g)

In [None]:
for p, o in g.predicate_objects(OWL.oneOf):
    pprint_terms([p,o], g)

In [None]:
g.parse(OWL.oneOf)

In [None]:
for p, o in g.predicate_objects(OWL.oneOf):
    pprint_terms([p,o], g)

In [None]:
g.serialize("data/02-nobelprize-data-enriched.nt", format="nt")
!gzip -f data/02-nobelprize-data-enriched.nt
!du -h data/02-nobelprize-data-enriched.nt.gz

# 3. knowledge graph search: SPARQL

What information is there for laureates?

In [None]:
import gzip

from rdflib import Graph

g = Graph()

# Pass the decompressed bytes as `data` and name the serialization explicitly,
# so parsing does not depend on rdflib guessing what a positional argument is
# or which format it is in.
with gzip.open("data/02-nobelprize-data-enriched.nt.gz") as f:
    g.parse(data=f.read(), format="nt")

In [None]:
from rdflib.plugins.sparql import prepareQuery

q = prepareQuery("""
    SELECT ?s ?p ?o WHERE {
        ?s a nobel:Laureate .
        ?s ?p ?o .
    }
""", initNs={"nobel": NOBEL})

In [None]:
for row in take(100, g.query(q)):
    pprint_terms(row, g)

In [None]:
DBO = Namespace("http://dbpedia.org/ontology/")

In [None]:
def prepQ(q: str):
    """Prepare SPARQL query text with the ``nobel:`` and ``dbo:`` prefixes bound.

    Args:
        q: SPARQL query text that may use the ``nobel:`` and ``dbo:`` prefixes.

    Returns:
        Whatever ``prepareQuery`` returns for the given text.
    """
    prefixes = {"nobel": NOBEL, "dbo": DBO}
    return prepareQuery(q, initNs=prefixes)

In [None]:
# Prefix -> namespace pairs, registered on the graph in the listed order so
# that terms print in abbreviated form.
# NOTE(review): "dpb" may be a typo for the conventional DBpedia property
# prefix "dbp" -- confirm before changing, since the prefix is carried into
# anything built from g.namespaces().
prefix_bindings = [
    ("laureate", Namespace("http://data.nobelprize.org/resource/laureate/")),
    ("country", Namespace("http://data.nobelprize.org/resource/country/")),
    ("city", Namespace("http://data.nobelprize.org/resource/city/")),
    ("university", Namespace("http://data.nobelprize.org/resource/university/")),
    ("dpb", Namespace("http://dbpedia.org/property/")),
    ("dbo", DBO),
    ("nobel", NOBEL),
    ("foaf", FOAF),
]
for prefix_label, prefix_ns in prefix_bindings:
    g.namespace_manager.bind(prefix_label, prefix_ns)

What fraction of laureates are affiliated with an institution in a country that is not their country of birth?

In [None]:
q = prepQ("""
    SELECT (COUNT(?laureate) as ?nlaureates) WHERE {
        ?laureate a nobel:Laureate .
        
        ?laureate dbo:birthPlace ?bcountry .
        ?bcountry a dbo:Country .
    }
""")

for row in g.query(q):
    pprint_terms(row, g)

In [None]:
q = prepQ("""
    SELECT (COUNT(?laureate) as ?nlaureates) WHERE {
        ?laureate a nobel:Laureate .
        
        ?laureate dbo:affiliation ?institution .
        ?institution dbo:country ?icountry .
        ?icountry a dbo:Country .
    }
""")

for row in g.query(q):
    pprint_terms(row, g)

In [None]:
q = prepQ("""
    SELECT (COUNT(?laureate) as ?nlaureates) WHERE {
        ?laureate a nobel:Laureate .

        ?laureate dbo:birthPlace ?bcountry .
        ?bcountry a dbo:Country .
        
        ?laureate dbo:affiliation ?institution .
        ?institution dbo:country ?icountry .
    }
""")

for row in g.query(q):
    pprint_terms(row, g)


In [None]:
# Count matches where the laureate's birth place (typed dbo:Country) is the
# very same term as the country of an affiliated institution.
# NOTE(review): COUNT(?laureate) is not DISTINCT, so a laureate who matches
# through several affiliations contributes several rows -- confirm that this
# is the intended count for a "fraction of laureates" figure.
q = prepQ("""
    SELECT (COUNT(?laureate) as ?nlaureates) WHERE {
        ?laureate a nobel:Laureate .

        ?laureate dbo:birthPlace ?bcountry .
        ?bcountry a dbo:Country .
        
        ?laureate dbo:affiliation ?institution .
        ?institution dbo:country ?icountry .
        
        FILTER(sameTerm(?bcountry,?icountry))
    }
""")

# Print the aggregate row(s) using the graph's prefixes.
for row in g.query(q):
    pprint_terms(row, g)

In [None]:
# Count matches where the laureate's birth place (typed dbo:Country) is NOT
# the same term as the country of an affiliated institution.
# NOTE(review): COUNT(?laureate) is not DISTINCT, so a laureate who matches
# through several affiliations contributes several rows -- confirm that this
# is the intended count for a "fraction of laureates" figure.
q = prepQ("""
    SELECT (COUNT(?laureate) as ?nlaureates) WHERE {
        ?laureate a nobel:Laureate .

        ?laureate dbo:birthPlace ?bcountry .
        ?bcountry a dbo:Country .
        
        ?laureate dbo:affiliation ?institution .
        ?institution dbo:country ?icountry .
        
        FILTER(!sameTerm(?bcountry,?icountry))
    }
""")

# Print the aggregate row(s) using the graph's prefixes.
for row in g.query(q):
    pprint_terms(row, g)

In [None]:
525 + 364 == 889

In [None]:
def as_pct(numer, denom):
    """Return ``numer / denom`` formatted as a percentage with one decimal.

    Args:
        numer: Numerator.
        denom: Denominator; a zero value raises ``ZeroDivisionError``.

    Returns:
        str: For example ``"50.0%"`` for ``as_pct(1, 2)``.
    """
    ratio = numer / denom
    return format(ratio, ".1%")

as_pct(364, 889)

In [None]:
print(f"""
    PREFIX nobel: <{NOBEL}>
    PREFIX dbo: <{DBO}>
    
    SELECT (COUNT(?laureate) as ?nlaureates) ?icountry ?bcountry WHERE {{
        ?laureate a nobel:Laureate .

        ?laureate dbo:birthPlace ?bcountry .
        ?bcountry a dbo:Country .
        
        ?laureate dbo:affiliation ?institution .
        ?institution dbo:country ?icountry .
        
        FILTER(!sameTerm(?bcountry,?icountry))
    }}
    GROUP BY ?icountry ?bcountry
    ORDER BY DESC(?nlaureates)
    LIMIT 5
""")

In [None]:
q = prepareQuery(f"""
    PREFIX nobel: <{NOBEL}>
    PREFIX dbo: <{DBO}>
    
    SELECT (COUNT(?laureate) as ?nlaureates) ?icountry ?bcountry WHERE {{
        ?laureate a nobel:Laureate .

        ?laureate dbo:birthPlace ?bcountry .
        ?bcountry a dbo:Country .
        
        ?laureate dbo:affiliation ?institution .
        ?institution dbo:country ?icountry .
        
        FILTER(!sameTerm(?bcountry,?icountry))
    }}
    GROUP BY ?icountry ?bcountry
    ORDER BY DESC(?nlaureates)
    LIMIT 5
""")

for row in g.query(q):
    pprint_terms(row, g)

# 4.  Representing entities: collections and JSON documents

Make one collection for each of the following:
- nobel:NobelPrize
- nobel:LaureateAward
- nobel:Laureate
- nobel:Category
- Institutions (objects of dbo:affiliation triples)
- Countries (objects of dbo:country triples)

In [None]:
from collections import defaultdict

# Maps each class (as a prefixed name usable in SPARQL) to the name of the
# collection that holds its instances.
class_collection = {
    "nobel:NobelPrize": "nobel_prizes",
    "nobel:LaureateAward": "laureate_awards",
    "nobel:Laureate": "laureates",
    "nobel:Category": "categories",
}

# database[collection][subject][predicate] -> list of objects
database = defaultdict(lambda: defaultdict(dict))

for cls, collection in class_collection.items():
    # Fetch every statement about every instance of the class.
    q = prepareQuery(f"""
        SELECT ?sub ?pred ?obj WHERE {{
            ?sub a {cls} .
            ?sub ?pred ?obj
        }}
    """, initNs={"nobel": NOBEL})

    for row in g.query(q):
        sub, pred, obj = row
        # A subject can have several objects for the same predicate.
        # Accumulate them in a list instead of letting later rows silently
        # overwrite earlier ones.
        database[collection][sub].setdefault(pred, []).append(obj)

In [None]:
# Every statement about anything that appears as the object of a
# dbo:affiliation triple. DISTINCT is needed because an institution reached
# from several affiliating subjects would otherwise yield the same
# (institution, pred, obj) row once per subject.
q = prepareQuery("""
    SELECT DISTINCT ?institution ?pred ?obj WHERE {
        ?sub dbo:affiliation  ?institution .
        ?institution ?pred ?obj
    }
""", initNs={"nobel": NOBEL, "dbo": DBO})

for row in g.query(q):
    sub, pred, obj = row
    # Keep every object of a multi-valued predicate rather than only the last
    # one returned.
    database["institutions"][sub].setdefault(pred, []).append(obj)

In [None]:
from rdflib import URIRef

# IRIs (as plain strings) of every individual treated as a country.
individual_countries = set()

# Anything used as the object of a dbo:country triple.
q = prepareQuery("""
    SELECT DISTINCT ?country WHERE {
        ?sub dbo:country  ?country .
    }
""", initNs={"nobel": NOBEL, "dbo": DBO})

for row in g.query(q):
    individual_countries.add(str(row[0]))

# Anything typed dbo:Country that is used as a birth place.
q = prepareQuery("""
    SELECT DISTINCT ?bcountry WHERE {
        ?laureate dbo:birthPlace ?bcountry .
        ?bcountry a dbo:Country .
    }
""", initNs={"nobel": NOBEL, "dbo": DBO})

for row in g.query(q):
    individual_countries.add(str(row[0]))

# Collect every statement about each country. This is a direct graph lookup,
# so no query has to be built by string interpolation and compiled for each
# country.
for country in sorted(individual_countries):
    sub = URIRef(country)
    for pred, obj in g.predicate_objects(sub):
        # Keep every object of a multi-valued predicate rather than only the
        # last one seen.
        database["countries"][sub].setdefault(pred, []).append(obj)

In [None]:
from json.encoder import (_make_iterencode, JSONEncoder,
                          encode_basestring_ascii, INFINITY,
                          encode_basestring)

class CustomObjectEncoder(JSONEncoder):
    """JSONEncoder whose internal type checks can be overridden.

    The encoder always uses the pure-Python encoding path and routes every
    type check through ``self.isinstance``. A subclass can override
    ``isinstance`` to stop selected objects from being recognised as a native
    JSON type, which sends them to ``default`` instead.

    NOTE: this relies on ``json.encoder._make_iterencode``, a private helper
    of the standard library.
    """

    def isinstance(self, o, cls):
        """Type-check hook used while encoding.

        The base implementation defers to the built-in ``isinstance``, so the
        class behaves like a plain ``JSONEncoder`` when used directly.
        """
        return isinstance(o, cls)

    def iterencode(self, o, _one_shot=False):
        """Encode the given object and yield each string
        representation as available.

        For example::

            for chunk in JSONEncoder().iterencode(bigobject):
                mysocket.write(chunk)

        Change from json.encoder.JSONEncoder.iterencode is setting
        _one_shot=False and isinstance=self.isinstance
        in call to `_make_iterencode`.
        And not using `c_make_encoder`.

        """
        if self.check_circular:
            markers = {}
        else:
            markers = None
        if self.ensure_ascii:
            _encoder = encode_basestring_ascii
        else:
            _encoder = encode_basestring

        def floatstr(o, allow_nan=self.allow_nan,
                _repr=float.__repr__, _inf=INFINITY, _neginf=-INFINITY):
            # Check for specials.  Note that this type of test is processor
            # and/or platform-specific, so do tests which don't depend on the
            # internals.

            if o != o:
                text = 'NaN'
            elif o == _inf:
                text = 'Infinity'
            elif o == _neginf:
                text = '-Infinity'
            else:
                return _repr(o)

            if not allow_nan:
                raise ValueError(
                    "Out of range float values are not JSON compliant: " +
                    repr(o))

            return text

        # The caller's _one_shot value is deliberately ignored; the
        # pure-Python encoder is always built so the isinstance hook applies.
        _iterencode = _make_iterencode(
                markers, self.default, _encoder, self.indent, floatstr,
                self.key_separator, self.item_separator, self.sort_keys,
                self.skipkeys, _one_shot=False, isinstance=self.isinstance)
        return _iterencode(o, 0)

In [None]:
import datetime

from rdflib.term import Literal, BNode

class RDFTermEncoder(CustomObjectEncoder):
    """JSON encoder for structures that contain rdflib ``Literal``/``BNode`` terms.

    Literals become ``{"value": ..., "datatype": ..., "lang": ...}`` objects
    (the optional keys appear only when set) and blank nodes become
    ``http://localhost/bnode/<id>`` strings.
    """

    def isinstance(self, o, cls):
        # Report Literal and BNode as matching no type, so the encoder hands
        # them to default() (presumably because they would otherwise be
        # recognised as plain strings -- verify against rdflib).
        is_rdf_term = isinstance(o, (Literal, BNode))
        return False if is_rdf_term else isinstance(o, cls)

    def default(self, o):
        if isinstance(o, Literal):
            encoded = {"value": o.value}
            # Optional parts are emitted only when present, in this order.
            for key, part in (("datatype", o.datatype), ("lang", o.language)):
                if part is not None:
                    encoded[key] = part
            return encoded
        if isinstance(o, BNode):
            return "http://localhost/bnode/" + str(o)
        # datetime must be tested before date: a datetime is also a date.
        if isinstance(o, datetime.datetime):
            return o.isoformat()
        if isinstance(o, datetime.date):
            return str(o)
        # Anything else is unsupported; the base class raises TypeError.
        return super().default(o)

In [None]:
db = json.loads(json.dumps(database, cls=RDFTermEncoder))

In [None]:
with open("data/03-document-database.json", "w") as f:
    json.dump(db, f, indent=2)

In [None]:
!gzip -f data/03-document-database.json
!du -h data/03-document-database.json.gz

In [None]:
# Read the compressed document database back, decoding JSON directly from the
# decompressed stream.
with gzip.open("data/03-document-database.json.gz") as compressed:
    db = json.load(compressed)

In [None]:
from pprint import pprint

for collection_name, collection in db.items():
    for individual, document in take(5, collection.items()):
        print("collection:", collection_name)
        print("id:", individual)
        pprint(document)

# 5. Framing linked-data subgraphs as documents: JSON-LD

In [None]:
from pyld import jsonld

In [None]:
db_ld = json.loads(g.serialize(format='json-ld', indent=2))

In [None]:
len(db_ld)

In [None]:
db_ld[0]

In [None]:
db_ld[-1]

In [None]:
for n in g.namespaces():
    print(n)

In [None]:
context = {
    prefix: str(uri) for prefix, uri in g.namespaces()
}

In [None]:
context

In [None]:
context["category"] = "http://data.nobelprize.org/resource/category/"

In [None]:
compacted = jsonld.compact(db_ld, context)

In [None]:
compacted.keys()

In [None]:
len(compacted["@graph"])

In [None]:
for item in take(5, compacted["@graph"]):
    pprint(item)

In [None]:
frame = {
    "@context": context,
    "@type": "nobel:Laureate",
    "@requireAll": True,
    "@explicit": True,
    "foaf:name": {},
    "dbo:birthPlace": {
        "@requireAll": True,
        "@explicit": True,
        "@embed": "@always",
        "@type": "dbo:Country",
    },
    "dbo:affiliation": {
        "@requireAll": True,
        "@explicit": True,
        "@embed": "@always",
        "dbo:country": {},
    }
}

In [None]:
framed = jsonld.frame(compacted, frame)

In [None]:
len(framed["@graph"])

In [None]:
pprint(list(take(5, framed["@graph"])))

In [None]:
with open("data/04-jsonld-framed-laureates.json", "w") as f:
    json.dump(framed, f, indent=2)

In [None]:
!gzip -f data/04-jsonld-framed-laureates.json

# 6. Document collection search: MongoDB

In [None]:
from pymongo import MongoClient

client = MongoClient()

In [None]:
mdb = client["nobel"]

In [None]:
import gzip
import json

with gzip.open("data/04-jsonld-framed-laureates.json.gz") as f:
    framed = json.load(f)

In [None]:
from toolz import assoc

# Start from an empty collection, then insert one document per framed node,
# each carrying a copy of the shared JSON-LD @context.
mdb.laureates.drop()
laureate_docs = [{**node, "@context": context} for node in framed["@graph"]]
rv = mdb.laureates.insert_many(laureate_docs)

In [None]:
len(rv.inserted_ids)

What fraction of laureates are affiliated with an institution in a country that is not their country of birth?

In [None]:
def as_list(d):
    """Return ``d`` unchanged when it is a list, otherwise wrap it in a list.

    Lets callers iterate uniformly over a value that may be either a single
    item or a list of items.
    """
    if isinstance(d, list):
        return d
    return [d]

In [None]:
from toolz import dissoc

# Count laureate documents that have at least one affiliation country which
# is not among their birth-place ids.
n_affiliated_with_nonbirthcountry_institution = 0

for laureate_doc in mdb.laureates.find():
    # Ids of the countries of every affiliated institution.
    affiliation_country_ids = set()
    for affiliation in as_list(laureate_doc["dbo:affiliation"]):
        for country in as_list(affiliation["dbo:country"]):
            affiliation_country_ids.add(country["@id"])

    # Ids of the laureate's birth place(s).
    birth_country_ids = set()
    for place in as_list(laureate_doc["dbo:birthPlace"]):
        birth_country_ids.add(place["@id"])

    if affiliation_country_ids - birth_country_ids:
        n_affiliated_with_nonbirthcountry_institution += 1

In [None]:
as_pct(n_affiliated_with_nonbirthcountry_institution, mdb.laureates.count_documents({}))

What fraction of laureates are affiliated exclusively with institutions that are not in their country of birth?

In [None]:
from toolz import dissoc

# Count laureate documents whose affiliation countries share no id with their
# birth place(s).
n_affiliated_exclusively_with_nonbirthcountry_institutions = 0

for laureate_doc in mdb.laureates.find():
    # Ids of the countries of every affiliated institution.
    affiliation_country_ids = set()
    for affiliation in as_list(laureate_doc["dbo:affiliation"]):
        for country in as_list(affiliation["dbo:country"]):
            affiliation_country_ids.add(country["@id"])

    # Ids of the laureate's birth place(s).
    birth_country_ids = set()
    for place in as_list(laureate_doc["dbo:birthPlace"]):
        birth_country_ids.add(place["@id"])

    if not (affiliation_country_ids & birth_country_ids):
        n_affiliated_exclusively_with_nonbirthcountry_institutions += 1

In [None]:
as_pct(n_affiliated_exclusively_with_nonbirthcountry_institutions, mdb.laureates.count_documents({}))

In [None]:
list(mdb.laureates.find({"dbo:birthPlace.@id": "country:Denmark"}, {"foaf:name": 1, "_id": 0}))

In [None]:
list(mdb.laureates.find({"dbo:affiliation.dbo:country.@id": "country:Denmark"}, {"foaf:name": 1, "_id": 0}))

In [None]:
len(mdb.laureates.distinct("dbo:birthPlace.@id"))

In [None]:
len(mdb.laureates.distinct("dbo:affiliation.dbo:country.@id"))

In [None]:
!mongoexport -d nobel -c laureates -o data/05-laureates-mongoexport.jsonl

In [None]:
!gzip -f data/05-laureates-mongoexport.jsonl