In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from dotenv import load_dotenv

load_dotenv("../.env")

TODO:
* tab:el_prop (cell filled below)
* tab:flags

In [None]:
# Rows of a LaTeX table (presumably `tab:el_prop` from the TODO above), one per line:
#   name & units & type & shape & description \\
# This must be a raw string: in a normal string literal "\text" and "\times"
# start with a TAB character, and "\h" / "\_" are invalid escape sequences.
raw_table_latex = r"""
    \hline
    becs & $e$ & array & $n_{\text{sites}} \times 3 \times 3$ & Born effective charges \\
    \hline
    eps\_electronic & $-$ & array & $3 \times 3$ & Electronic contribution to the dielectric permittivity tensor \\
    \hline
    eps\_total & $-$ & array & $3 \times 3$ & Total dielectric permittivity tensor\\
    \hline
    cnsr\_breaking & $e$ & number & $-$ & Maximum breaking of the CNSR \\
    \hline
"""

In [None]:
# Maps the type names used in the table's "type" column to the compact IRIs
# that the parser emits as each term's rdfs:range.
datatype = dict(
    string="xsd:string",
    array="qudt:Array",
    number="xsd:double",
)

XSD datatypes supported by OWL 2 RL:

```
rdf:PlainLiteral
rdf:XMLLiteral
rdfs:Literal
xsd:decimal
xsd:integer
xsd:nonNegativeInteger
xsd:nonPositiveInteger
xsd:positiveInteger
xsd:negativeInteger
xsd:long
xsd:int
xsd:short
xsd:byte
xsd:unsignedLong
xsd:unsignedInt
xsd:unsignedShort
xsd:unsignedByte
xsd:float
xsd:double
xsd:string
xsd:normalizedString
xsd:token
xsd:language
xsd:Name
xsd:NCName
xsd:NMTOKEN
xsd:boolean
xsd:hexBinary
xsd:base64Binary
xsd:anyURI
xsd:dateTime
xsd:dateTimeStamp
```

Source: <https://www.w3.org/TR/owl2-profiles/#Entities_3>

In [None]:
from pprint import pprint

# Turn each data row of the LaTeX table into an OWL datatype-property record,
# keyed by term name. Columns: name & units & type & shape & description.
entry = {}
for row in raw_table_latex.splitlines():
    if " & " not in row:
        # \hline rules and blank lines carry no cells.
        continue
    # Strip the LaTeX row terminator, split into cells, and unescape only the
    # LaTeX underscore. Other backslashes are kept on purpose: they belong to
    # commands such as \text or \times that must survive into the comments.
    cells = [
        cell.strip().replace("\\_", "_")
        for cell in row.strip().rstrip("\\").split("&")
    ]
    cells = [cell for cell in cells if cell]
    name, units, kind, shape = cells[:4]

    record = {
        "@type": "owl:DatatypeProperty",
        "rdfs:range": {"@id": datatype[kind]},
        "rdfs:label": name,
        # Anything past the fourth cell is description text.
        "skos:definition": " ".join(cells[4:]),
    }
    # "$-$" is the table's placeholder for "not applicable": only real units
    # and shapes are recorded, and the key is omitted when neither applies.
    comments = []
    if units != "$-$":
        comments.append(f"UNITS: {units}")
    if shape != "$-$":
        comments.append(f"SHAPE: {shape}")
    if comments:
        record["rdfs:comment"] = comments
    entry[name] = record

pprint(entry)

In [None]:
from xyz_polyneme_ns.cli.util import req

In [None]:
# Tag every parsed term with the vocabulary that defines it, then submit it
# through the project's `req` helper (presumably an HTTP client wrapper).
for term, info in entry.items():
    print(term, "...")
    # No trailing comma after the dict: a trailing comma would turn the value
    # into a 1-tuple, which is serialised as a JSON array instead of an object.
    info["rdfs:isDefinedBy"] = {
        "@id": "https://ns.polyneme.xyz/ark:57802/2022/02/marda/phonons"
    }
    rv = req("POST", "/2022/02/marda/phonons", params={"term": term}, json=info)
    print(rv)

In [None]:
# Rows of a LaTeX table (presumably `tab:flags` from the TODO list), one per line:
#   name & type & description \\
# Raw string so that "\_", "\hline" and "\Gamma" are not treated as (invalid)
# escape sequences and every backslash reaches the parser unchanged.
raw_table_latex = r"""
    has\_neg\_fr & boolean & True if negative frequencies are present \\
    \hline
    large\_asr\_break & boolean & True if the breaking of ASR is greater than 30~cm$^{-1}$ \\
    \hline
    large\_cnsr\_break & boolean & True if the breaking of CNSR is greater than 0.2 \\
    \hline
    small\_q\_neg\_fr & boolean & True if negative frequencies are present only close to $\Gamma$ \\
    \hline
"""

In [None]:
# Type-name -> compact IRI lookup used for rdfs:range; this version also
# covers the "boolean" type that the flags table uses.
datatype = dict(
    string="xsd:string",
    array="qudt:Array",
    number="xsd:double",
    boolean="xsd:boolean",
)

In [None]:
from pprint import pprint

# Turn each data row of the LaTeX table into an OWL datatype-property record,
# keyed by term name. Columns: name & type & description.
entry = {}
for row in raw_table_latex.splitlines():
    if " & " not in row:
        # \hline rules and blank lines carry no cells.
        continue
    # Strip the LaTeX row terminator, split into cells, and unescape only the
    # LaTeX underscore. Other backslashes are kept on purpose so that commands
    # such as \Gamma stay intact in the definition text.
    cells = [
        cell.strip().replace("\\_", "_")
        for cell in row.strip().rstrip("\\").split("&")
    ]
    cells = [cell for cell in cells if cell]
    name, kind = cells[:2]
    entry[name] = {
        "@type": "owl:DatatypeProperty",
        "rdfs:range": {"@id": datatype[kind]},
        "rdfs:label": name,
        # Anything past the second cell is description text.
        "skos:definition": " ".join(cells[2:]),
    }

pprint(entry)

In [None]:
# Tag every parsed term with the vocabulary that defines it, then submit it
# through the project's `req` helper (presumably an HTTP client wrapper).
for term, info in entry.items():
    print(term, "...")
    # No trailing comma after the dict: a trailing comma would turn the value
    # into a 1-tuple, which is serialised as a JSON array instead of an object.
    info["rdfs:isDefinedBy"] = {
        "@id": "https://ns.polyneme.xyz/ark:57802/2022/02/marda/phonons"
    }
    rv = req("POST", "/2022/02/marda/phonons", params={"term": term}, json=info)
    print(rv)

In [None]:
from rdflib import Graph, URIRef, RDFS, Literal

# Experiment: attach the same LaTeX shape comment to one node twice -- once
# written with single backslashes (so "\t" is a real TAB character) and once
# with escaped backslashes -- and compare how each serialisation renders them.
node = URIRef("http://example.com/node")
shape_comments = (
    "SHAPE: $n_{\text{sites}} \times 3 \times 3$",
    "SHAPE: $n_{\\text{sites}} \\times 3 \\times 3$",
)

g = Graph()
for text in shape_comments:
    g.add((node, RDFS.comment, Literal(text)))

# (heading printed before the dump, serialisation format passed to rdflib)
outputs = (
    ("JSON-LD:", "application/ld+json"),
    ("\nTurtle:", "text/turtle"),
    ("XML:", "application/rdf+xml"),
    ("N-Triples:", "nt"),
)
for heading, fmt in outputs:
    print(heading)
    print(g.serialize(format=fmt))

In [None]:
from xyz_polyneme_ns.db import mongo_db

# Handle returned by the project's mongo_db() helper. In the cells visible here
# it is only referenced by the commented-out rdfs:comment clean-up snippet.
mdb = mongo_db()

In [None]:
# Prevent `\t` from being rendered as a TAB character.

# for tdoc in mdb.terms.find(
#     {'rdfs:isDefinedBy': {'@id': 'https://ns.polyneme.xyz/ark:57802/2022/02/marda/phonons'}}
# ):
#     if "rdfs:comment" in tdoc:
#         tdoc["rdfs:comment"] = [c.replace("\t", "\\t") for c in tdoc["rdfs:comment"]]
#         print(tdoc["rdfs:comment"])
#         mdb.terms.update_one(
#             {"_id": tdoc["_id"]},
#             {"$set": {"rdfs:comment": tdoc["rdfs:comment"]}}
#         )

# Does the @context work?

In [None]:
import glob

In [None]:
# Source JSON documents to be given an @context.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
source_pattern = "/Users/dwinston/Dropbox/diary/22/02/marda-phonon-data/*.json"
filenames = glob.glob(source_pattern)

In [None]:
# Imported here because no earlier cell in the notebook brings `json` into scope.
import json
from pathlib import Path

# Write a copy of every source document into the `withcontext` directory with
# a JSON-LD @context whose @vocab points at the phonons vocabulary.
context = {
    "@vocab": "https://ns.polyneme.xyz/ark:57802/2022/02/marda/phonons/"
}
out_dir = Path("/Users/dwinston/Dropbox/diary/22/02/marda-phonon-data/withcontext")
if filenames:
    # Only create the output directory when there is something to write; its
    # parent exists already because the glob matched files inside it.
    out_dir.mkdir(exist_ok=True)

for i, fn in enumerate(filenames, start=1):
    print("file", i, "of", len(filenames))
    with open(fn) as src:
        data = json.load(src)
    data["@context"] = context
    # Path(fn).name is the portable way to take the basename.
    with open(out_dir / Path(fn).name, "w") as dst:
        json.dump(data, dst)

In [None]:
# From here on `filenames` refers to the copies that carry an @context.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
withcontext_pattern = "/Users/dwinston/Dropbox/diary/22/02/marda-phonon-data/withcontext/*.json"
filenames = glob.glob(withcontext_pattern)

In [None]:
from rdflib import Graph

# Start from an empty graph. This rebinds `g`, so the small demo graph built
# in the serialisation experiment earlier is no longer reachable by that name.
g = Graph()

In [None]:
# Load every JSON-LD document into the shared graph, reporting progress.
total = len(filenames)
for position, path in enumerate(filenames, start=1):
    print("file", position, "of", total)
    g.parse(path, format="application/ld+json")

In [None]:
# Parse the document served at the vocabulary URL into the same graph
# (presumably the term definitions POSTed earlier -- verify). No `format` is
# passed, so rdflib has to determine the serialisation itself; this needs network access.
g.parse("https://ns.polyneme.xyz/ark:57802/2022/02/marda/phonons")

In [None]:
from rdflib import Namespace
from rdflib.namespace import RDF
from toolz import take

from rdflib.plugins.sparql import prepareQuery

def pprint_terms(terms, graph=None):
    """Print ``terms`` on one line, each rendered with ``term.n3(...)``.

    Args:
        terms: Iterable of objects exposing ``n3(namespace_manager)``.
        graph: Object whose ``namespace_manager`` is passed to ``n3``. When
            omitted, the module-level ``g`` is looked up at call time, so the
            default follows later rebinds of ``g`` instead of staying pinned
            to whichever graph existed when this function was defined.
    """
    if graph is None:
        graph = g
    print(*[t.n3(graph.namespace_manager) for t in terms])

# Namespace of the phonons vocabulary (note the trailing slash). It is bound to
# the `mardap` prefix on the graph's namespace manager, which pprint_terms
# passes to n3() when printing terms.
PHONONS = Namespace("https://ns.polyneme.xyz/ark:57802/2022/02/marda/phonons/")
g.namespace_manager.bind("mardap", PHONONS)

# For every node that has both `metadata` and `dielectric`, select the
# material id and the cnsr_breaking value, keeping only values above 0.2 --
# the same threshold the flags table quotes for `large_cnsr_break`.
# Doubled braces are literal braces inside the f-string.
q = prepareQuery(f"""
    PREFIX mardap: <{PHONONS}>
    
    SELECT ?mpid ?cnsr_breaking WHERE {{
        ?data mardap:metadata ?metadata .

        ?metadata mardap:material_id ?mpid .
        ?data mardap:dielectric ?dielectric .
        ?dielectric mardap:cnsr_breaking ?cnsr_breaking .
        
        FILTER(?cnsr_breaking > 0.2)
    }}
""")

# Each result row holds the selected (?mpid, ?cnsr_breaking) pair.
for row in g.query(q):
    pprint_terms(row, g)

In [None]:
# g.serialize(
#     "/Users/dwinston/Dropbox/diary/22/02/marda-phonon-data/withcontext/dataset.ttl",
#     format="ttl"
# )

In [None]:
# !gzip -f /Users/dwinston/Dropbox/diary/22/02/marda-phonon-data/withcontext/dataset.ttl
# !du -h /Users/dwinston/Dropbox/diary/22/02/marda-phonon-data/withcontext/dataset.ttl.gz

In [None]:
# todo: tar and gz withcontext json files
# tar -cvf dataset.json.tar *.json
# gzip -f dataset.json.tar