# Convert opensalt to RDF

## Setup

- Setup modules, paths, Node class

In [18]:
import pandas as pd
from pathlib import Path
from rdflib import Graph, Literal, Namespace, RDF, URIRef, BNode
from rdflib.namespace import SKOS, DCTERMS, SDO
from pprint import pprint
import json

# path to curriculum data
# NOTE(review): 'Lerhplan' looks like a transposition of 'Lehrplan'. The
# literal has to match the file name on disk, so confirm before renaming.
curriculum_xlsx = Path('./data/Lerhplan_all.xlsx')

# create Node Class
class Node:
    """One entry of the curriculum tree.

    Every field is an optional keyword argument; a field that is not
    supplied is stored as ``None``.  The identifier is passed as ``_id``
    but stored on the instance as ``id``.  ``children`` always starts out
    as a fresh empty list.
    """

    def __init__(self, **kwargs):
        self.id = kwargs.get('_id')
        self.type = kwargs.get('type')
        self.name = kwargs.get('name')
        self.description = kwargs.get('description')
        # creator is of type object
        self.creator = kwargs.get('creator')
        self.publisher = kwargs.get('publisher')
        self.courseCode = kwargs.get('courseCode')
        self.educationalLevel = kwargs.get('educationalLevel')
        self.educationalContext = kwargs.get('educationalContext')
        self.level = kwargs.get('level')
        self.license = kwargs.get('license')
        # corresponds to "hasPart"
        self.children = []

    def __repr__(self):
        """Return the node name, or a placeholder when the name is not a string."""
        # __repr__ has to return a str; a node created without a name
        # (e.g. ``Node()``) would otherwise make repr() raise TypeError.
        if isinstance(self.name, str):
            return self.name
        return '<Node id=%r>' % (self.id,)

## Load the data

- we convert the curriculum file from xlsx to a dict

In [2]:
# sheet_name=1 selects the second sheet of the workbook (integer sheet
# positions are zero-based in pandas)
df = pd.read_excel(curriculum_xlsx, sheet_name=1)
# one dict per spreadsheet row, keyed by column name
data = df.to_dict("records")

- now we have to adjust the names according to our schema

In [132]:
# set creator and publisher for curriculum
# Both dicts carry identical values.  The loop below attaches each of them
# by reference to every record, so all records share the same two objects.

creator = {
    "type": "Organization",
    "id": "https://www.isb.bayern.de/",
    "name": "Staatsinstitut für Schulqualität und Bildungsforschung (ISB)"
}

publisher = {
    "type": "Organization",
    "id": "https://www.isb.bayern.de/",
    "name": "Staatsinstitut für Schulqualität und Bildungsforschung (ISB)"
}

def split_item(item, delimiter, index):
    """Return part ``index`` of ``item`` after splitting it at ``delimiter``.

    Returns ``None`` when the split yields no part at ``index`` or when
    ``item`` is not a string (empty spreadsheet cells arrive as NaN floats).
    """
    if not isinstance(item, str):
        return None
    parts = item.split(delimiter)
    try:
        return parts[index]
    except IndexError:
        return None

# Map the opensalt columns of every record onto the schema fields.
# fullStatement is split at ' - ' into name (part 0) and description
# (part 1); humanCodingScheme is split at '_' into course code (part 0)
# and educational context (part 1).
for item in data:
    item.update({
        '_id': item['identifier'],
        'type': "Course",
        'name': split_item(item['fullStatement'], ' - ', 0),
        'description': split_item(item['fullStatement'], ' - ', 1),
        'creator': creator,
        'publisher': publisher,
        'courseCode': split_item(item['humanCodingScheme'], '_', 0),
        # note: reads the source column 'educationLevel' and writes the
        # schema field 'educationalLevel'
        'educationalLevel': dict(
            type="DefinedTerm",
            inDefinedTermSet="http://w3id.org/openeduhub/vocabs/educationalLevel/",
            url="http://w3id.org/openeduhub/vocabs/educationalLevel/" + str(item['educationLevel']),
            name=str(item['educationLevel']),
        ),
        'educationalContext': dict(
            type="DefinedTerm",
            inDefinedTermSet="http://w3id.org/openeduhub/vocabs/educationalContext/",
            url="http://w3id.org/openeduhub/vocabs/educationalContext/" + str(split_item(item['humanCodingScheme'], '_', 1)),
            name=str(split_item(item['humanCodingScheme'], '_', 1)),
        ),
        # depth in the tree = number of dots in smartLevel
        'level': item['smartLevel'].count('.'),
        'license': 'http://cc0.com',
    })

{'CFItemType': nan,
 '_id': 'f2203a84-89f9-11ea-8413-0242ac1a0003',
 'abbreviatedStatement': nan,
 'conceptKeywords': nan,
 'courseCode': 'D1/2',
 'creator': {'id': 'https://www.isb.bayern.de/',
             'name': 'Staatsinstitut für Schulqualität und Bildungsforschung '
                     '(ISB)',
             'type': 'Organization'},
 'description': None,
 'educationLevel': 1,
 'educationalContext': {'inDefinedTermSet': 'http://w3id.org/openeduhub/vocabs/educationalContext/',
                        'name': 'grundschule',
                        'type': 'DefinedTerm',
                        'url': 'http://w3id.org/openeduhub/vocabs/educationalContext/grundschule'},
 'educationalLevel': {'inDefinedTermSet': 'http://w3id.org/openeduhub/vocabs/educationalLevel/',
                      'name': '1',
                      'type': 'DefinedTerm',
                      'url': 'http://w3id.org/openeduhub/vocabs/educationalLevel/1'},
 'fullStatement': 'Sprechen und Zuhören',
 'humanCodingS

## Build a tree

In [133]:
# Build the tree.  record['level'] is the depth of a record below the root;
# its parent is found by following the most recently appended child that
# many times, so records have to be ordered parent-before-children.
root = Node()

try:
    for record in data:
        last = root
        for _ in range(record['level']):
            last = last.children[-1]

        last.children.append(Node(
            _id = record['_id'], 
            name = record['name'], 
            description = record['description'],
            creator = record['creator'],
            publisher = record['publisher'],
            courseCode = record['courseCode'],
            educationalLevel = record['educationalLevel'],
            educationalContext = record['educationalContext'],
            license = record['license'],
            level = record['level']
        ))
except IndexError:
    # The record sits deeper than the tree built so far, so it has no
    # parent to attach to.  Building stops at the first such record;
    # report it so that a truncated tree does not go unnoticed.
    print('tree building stopped at record %r: no parent for level %r'
          % (record['_id'], record['level']))

In [134]:
# For every node that is not a top-level node (level != 0), remove
# educationalContext and educationalLevel: these properties are inherited
# and can be inferred from the parent node.
def delete_from_children(root):
    """Recursively strip inherited properties from the nodes below ``root``.

    Every descendant whose ``level`` is not 0 loses its
    ``educationalContext`` and ``educationalLevel`` attributes.  Nodes
    with level 0 keep them.  Attributes that are already missing are
    ignored.
    """
    for child in root.children:
        if child.level != 0:
            # Delete each attribute on its own, so that a missing one does
            # not keep the other from being removed.
            for attribute in ('educationalContext', 'educationalLevel'):
                try:
                    delattr(child, attribute)
                except AttributeError:
                    pass
        delete_from_children(child)

In [135]:
def print_tree(root, depth=0):
    """Print the repr of every descendant of ``root``, one per line.

    Each line is indented by two spaces per tree level, starting at
    ``depth`` for the direct children of ``root``.
    """
    indent = '  ' * depth
    for node in root.children:
        print(indent + repr(node))
        print_tree(node, depth=depth + 1)

In [136]:
# print(print_tree(root))

## build the graph

### `context.json`


- `"@container": "@set"`:
    - ensures that even single values are serialized as an array (see https://w3c.github.io/json-ld-syntax/#sets)
- `"@container": "@language"`:
    - is set to ensure easy language accessibility when reading in files. Not sure yet how useful it will be with German curricula, but it should be done in order to follow best practices. Primary language subtags should be used, e.g. "de", "en"... (see https://w3c.github.io/json-ld-syntax/#string-internationalization)
- courseCode:
    - should be set to `"@container": ["@set", "@language"]`, but this currently throws an error in rdflib-jsonld

In [255]:
# TODO add relations?

name_systematik = 'curriculum_bayern'

# output files, one per serialization format
filename_ttl = (Path.cwd() / 'data' / 'curriculum_bayern.ttl')
filename_xml = (Path.cwd() / 'data' / 'curriculum_bayern.xml')
filename_jsonld = (Path.cwd() / 'data' / 'curriculum_bayern.jsonld')

# initialize graph
g = Graph()

# add OpenEduhub Namespace as OEH
OEH = Namespace("http://w3id.org/openeduhub/vocabs/")

#define Namespace for curr
curr = Namespace("http://example-perma-id.com/" + name_systematik + "/")

# define id for curriculum_model: the namespace IRI itself identifies the
# curriculum as a whole, the individual items live below it
curriculum_model = URIRef(curr)


title = Literal(name_systematik, lang="de")
description = Literal(name_systematik, lang="de")
# NOTE(review): this rebinds the module-level name `creator`, which held the
# organization dict attached to every record, to a placeholder literal.  The
# records keep their reference to the dict, so they are not affected.
creator = Literal("<https://creator.com>")

# Bind a few prefix, namespace pairs for more readable output
g.bind("sdo", SDO)
g.bind("oeh", OEH)
g.bind("curr", curr)

# Add triples to curriculum using store's add method.
g.add( (curriculum_model, RDF.type, SDO.Course ) )
g.add( (curriculum_model, SDO.name, title) )
g.add( (curriculum_model, SDO.description, description) )
g.add( (curriculum_model, SDO.creator, creator) )


def _add_organization(subject, predicate, organization):
    """Attach an organization dict ('type', 'id', 'name') to ``subject`` as a blank node."""
    bnode = BNode()
    g.add( (subject, predicate, bnode) )
    g.add( (bnode, RDF.type, SDO + URIRef(organization['type'])) )
    g.add( (bnode, SDO.name, Literal(organization['name'], lang="de")) )
    g.add( (bnode, SDO.id, URIRef(organization['id'])) )


def _add_defined_term(subject, predicate, term):
    """Attach a defined-term dict ('type', 'name', 'url', 'inDefinedTermSet') to ``subject`` as a blank node."""
    bnode = BNode()
    g.add( (subject, predicate, bnode) )
    g.add( (bnode, RDF.type, SDO + URIRef(term['type'])) )
    g.add( (bnode, SDO.name, Literal(term['name'], lang="de")) )
    g.add( (bnode, SDO.url, URIRef(term['url'])) )
    g.add( (bnode, SDO.inDefinedTermSet, URIRef(term['inDefinedTermSet'])) )


def add_items(root):
    """Add every descendant of ``root`` to the module-level graph ``g``.

    Each node becomes an ``SDO.Course`` in the ``curr`` namespace with
    name, course code, license, creator and publisher.  Description,
    educationalLevel and educationalContext are added only when the node
    carries them.  Each node is linked to its direct children through
    ``hasPart`` / ``isPartOf``.  ``root`` itself gets no triples.
    """
    for item in root.children:
        node = curr + URIRef(item.id)

        g.add( (node, RDF.type, SDO.Course) )
        g.add( (node, SDO.name, Literal(item.name, lang="de")) )
        g.add( (node, SDO.courseCode, Literal(item.courseCode, lang="de")) )
        # records without a ' - ' separator have no description; skip the
        # triple instead of building a literal from None
        if item.description is not None:
            g.add( (node, SDO.description, Literal(item.description, lang="de")) )
        g.add( (node, SDO.license, URIRef(item.license)) )

        _add_organization(node, SDO.creator, item.creator)
        _add_organization(node, SDO.publisher, item.publisher)

        # These two properties may have been removed from non-top-level
        # nodes by delete_from_children, so they are optional here.
        educational_level = getattr(item, 'educationalLevel', None)
        if educational_level is not None:
            _add_defined_term(node, SDO.educationalLevel, educational_level)

        educational_context = getattr(item, 'educationalContext', None)
        if educational_context is not None:
            _add_defined_term(node, OEH.educationalContext, educational_context)

        # link the node to its direct children in both directions
        for child in item.children:
            child_node = curr + URIRef(child.id)
            g.add( (node, SDO.hasPart, child_node) )
            g.add( (child_node, SDO.isPartOf, node) )

        add_items(item)
            
# populate the graph from the whole tree
add_items(root)

# add_items only links nodes to their own children, so the top-level nodes
# still have to be linked to the curriculum model itself
for child in root.children:
    node = curr + URIRef(child.id)
    g.add( (curriculum_model, SDO.hasPart, node))
    g.add( (node, SDO.isPartOf, curriculum_model ))

# load context file
with open("context.json", encoding="utf-8") as json_file:
    context = json.load(json_file)


def _serialize_to_text(graph, **kwargs):
    """Serialize ``graph`` and return the result as ``str``.

    ``Graph.serialize`` returns ``bytes`` in some rdflib versions and
    ``str`` in others; both are accepted so the cell runs on either.
    """
    serialized = graph.serialize(**kwargs)
    if isinstance(serialized, bytes):
        return serialized.decode("utf-8")
    return serialized


output_turtle = _serialize_to_text(g, format='turtle')
output_xml = _serialize_to_text(g, format='xml')
output_jsonld = _serialize_to_text(g, format='json-ld', context=context)


def write_file(filename, data):
    """Write the string ``data`` to ``filename``, replacing existing content.

    The file is always written as UTF-8 so that non-ASCII characters
    (e.g. German umlauts) do not depend on the platform default encoding.
    """
    with open(filename, "w", encoding="utf-8") as f:
        f.write(data)

# write each serialization to its file in the data directory
write_file(filename_ttl, output_turtle)
write_file(filename_xml, output_xml)
write_file(filename_jsonld, output_jsonld)

## prepare json files to use with visjs

- [ ] iterate over graph with .triples function: https://rdflib.readthedocs.io/en/stable/intro_to_graphs.html#basic-triple-matching
- [ ] does visjs accept strings as IDs?
- [ ] write a write file function

## create files just containing the nodes and edges

- create nodes and edges using functions (-> do it functional)

In [256]:
def createNodesAndEdges(graph):
    """Collect vis.js nodes and edges from the courses in ``graph``.

    Returns a tuple ``(nodes, edges)``.  ``nodes`` holds one
    ``{"id", "label"}`` dict per subject typed ``SDO.Course``; ``edges``
    holds one ``{"from", "to"}`` dict per ``SDO.hasPart`` triple of such a
    subject.  A course without an ``SDO.name`` is labelled with its id.
    """
    nodes = []
    edges = []

    # read from the graph that was passed in, not from a module-level one
    for s in graph.subjects(RDF.type, SDO.Course):
        node_id = s.toPython()
        name = graph.value(s, SDO.name)

        nodes.append({
            "id": node_id,
            "label": name.value if name is not None else node_id
        })
        for o in graph.objects(s, SDO.hasPart):
            edges.append({
                "from": node_id,
                "to": o.toPython()
            })
    return nodes, edges

# build the vis.js node and edge lists from the curriculum graph
nodes, edges = createNodesAndEdges(g)

### write nodes and edges files for visjs

IDs can also be strings according to the docs: https://visjs.github.io/vis-network/docs/network/nodes.html

In [258]:
# output files for vis.js, inside the data directory of the working directory
filename_nodes = Path.cwd() / "data" / "curriculum_bayern_nodes_visjs.json"
filename_edges = Path.cwd() / "data" / "curriculum_bayern_edges_visjs.json"

def write_json(filename, data):
    """Serialize ``data`` as JSON into ``filename``, replacing existing content."""
    with open(filename, mode="w") as out:
        json.dump(obj=data, fp=out)

# write the node list and the edge list to their own files
write_json(filename_nodes, nodes)
write_json(filename_edges, edges)
