In [34]:

import re
import pickle
import glob, os
import requests, urllib
import json, random

import pandas as pd

from bs4 import BeautifulSoup as bs
from tqdm import tqdm
from typing import List, Any, List, Dict
from textblob import TextBlob
from itertools import combinations, combinations_with_replacement
from collections import Counter
from nltk.corpus import wordnet as wn
from SPARQLWrapper import SPARQLWrapper, JSON

from rdflib import URIRef, BNode, Literal, Namespace, Graph
from rdflib.namespace import XSD, RDF, RDFS, SKOS, NamespaceManager

import utils as util

We would like to express the following features/relations:
* Dictionary definition terms, which are always concepts
  * We'll use the source as namespace, and corresponding concept identifier if it exists
  * SKOS is used to establish a mapping (e.g., skos:exactMatch) and add the definition (skos:definition)
* Special relations (R) or features (F) that we want to capture between words, which may help identify concepts:
  * R: Word is part of MWE
  * R: Morphologically similar words; stemming & Levenshtein distance
  * R:Semantically similar words; distributed similarity (NNs)
  * F: Acronyms
  * F: domain-specificity; foreground or background term following our filtering procedure
 

### Prepare namespaces

* Note: that UNICLASS is not a namespace (yet) only has identifiers 

In [2]:
ROOT = Namespace("http://example.org/top_concept_for_visulisation/#")
WIKI = Namespace("http://www.wikidata.org/entity/#")

In [3]:
IREC_ontology_URL = "http://example.org/irec-schema/"
IREC_instances_URL = "http://example.org/irec-spans/"
IREC_concepts_URL = "http://example.org/irec-concepts/"

In [4]:
# create our custom namespace for the schema to store spans
IREC = Namespace(IREC_ontology_URL)

# create a custom namespace to store spans and concepts
SPANS = Namespace(IREC_instances_URL)
CONCEPTS = Namespace(IREC_concepts_URL)

### graph creation utilities

In [28]:
class UID_assigner:
    def __init__(self):
        self.UIDs = {}
        self.UID = 0
        
    def assign_UID(self, text, namespace: Namespace):
        """
        Determines which type of UID to assign, based on the namespace.
        """
        if namespace == SPANS:
            return self.span_UID(text)
        elif namespace == CONCEPTS:
            return self.concept_UID(text)
        else:
            print("Currently no function implemented for assigning UIDs for this namespace")
    
    def span_UID(self, text):
        """
        NOTE: each text span is a unique identifier in and of itself. We'll simply convert the text span to 
        a URL friendly representation.
        """
        n_space = self.spans.title.__reduce__()[1][0]
        if n_space not in self.UIDs:
            self.UIDs[n_space] = {}
        
        urltext = urllib.parse.quote(text)
        if urltext in self.UIDs[n_space]:
            return self.UIDs[n_space][urltext], False
        else:
            self.UIDs[n_space][urltext] = str(self.UID)
            return self.UIDs[n_space][urltext], True
        
    def concept_UID(self, text):
        """
        For now I'll create my own dumb interger-based UIDs for nodes as a simple shortcut, split per namespace
        """
        n_space = CONCEPTS.title.__reduce__()[1][0]
        
        if n_space not in self.UIDs:
            self.UIDs[n_space] = {}
        
        if text in self.UIDs[n_space]:
            return self.UIDs[n_space][text], False
        else:
            self.UID += 1
            self.UIDs[n_space][text] = str(self.UID)
            return self.UIDs[n_space][text], True
    
    def count_nodes_in_namespace(self, namespace: Namespace = SPANS):
        n_space = namespace.title.__reduce__()[1][0]
        print(f"Number of nodes in '{n_space}': {len(self.UIDs[n_space])}")
        
    def print_node_by_id(self, node_id, namespace: Namespace = SPANS):
        for s, p, o in mygraph.triples((namespace[str(node_id)],  None, None)):
            print(f"{s.split('#')[-1]} ; {p.split('#')[-1]} ; {o.split('#')[-1]}")
        
    def print_node_by_text(self, node_text, namespace: Namespace = SPANS):
        n_space = namespace.title.__reduce__()[1][0]
        node_id = self.UIDs[n_space][node_text]
        # find all triples with subject
        for s, p, o in mygraph.triples((namespace[node_id],  None, None)):
            print(f"{s.split('#')[-1]} ; {p.split('#')[-1]} ; {o.split('#')[-1]}")

In [7]:
# These wrappers only exist to help me quickly and consistently add nodes to the graph


def add_top_concept(graph, node_uid, namespace: Namespace=CONCEPTS):
    """ In some cases we'd like the concept to be linked to the ROOT of the graph, for visualisation. """
    graph.add((namespace[node_uid], SKOS.hasTopConcept, ROOT[top_concept_uid]))
    return graph

# IREC functions and REFERENCE
IREC.Span # A span is a sequence of characters that occurs verbatim in a text, either contiguous or discontiguos as extracted by SPaR.txt (Kruiper et al., 2021).   
IREC.constitutes  # Indicates that a span constitutes another span, e.g., the Multi-Word Expression (MWE) Span `hot water storage system` the Span `storage`.
IREC.isMorphologicallySimilarTo # Indicates that a Span is morphologically similar to another Span, e.g., they may have the same stem or a small Levenshtein distance.
IREC.isSemanticallySimilarTo # Indicates that a Span is semantically similar to another Span, following a cosine similarity between their  embeddings.
IREC.hasAcronym # A Span can have an acronym, e.g., `British Standards Institute` has the acronym `BSI`.
IREC.isAcronymOf # A Span can have an acronym, e.g., `BSI` is the acronym for `British Standards Institute`.
IREC.hasAntonym # Property that relates a Span to another Span, each being each other's antonyms.

def irec_span(graph, node_uid, text, namespace: Namespace=SPANS):
    """ Add a span node in the SPANS namespace, of type IREC.Span and the span text set as its RDF.label """
    # is preflabel a property? I would assume so
    graph.add(namespace[node_uid], RDF.type, IREC.Span)
    graph.add(namespace[node_uid], RDFS.label,  Literal(text, lang='en'))
    return graph

def irec_constitutes(graph, subject_node_uid, object_node_uid,
                     subject_namespace: Namespace=SPANS, object_namespace: Namespace=SPANS):
    """ Indicates that somewhere in the label of the first SPAN node, you can find the second span's label """
    graph.add(namespace[subject_node_uid], IREC.constitutes, namespace[object_node_uid])
    return graph

def irec_morp_sim(graph, subject_node_uid, object_node_uid,
                  subject_namespace: Namespace=SPANS, object_namespace: Namespace=SPANS):
    """ Indicates that the labels of two SPAN nodes are morphologically similar """
    graph.add(namespace[subject_node_uid], IREC.isMorphologicallySimilarTo, namespace[object_node_uid])
    return graph

def irec_sem_sim(graph, subject_node_uid, object_node_uid,
                 subject_namespace: Namespace=SPANS, object_namespace: Namespace=SPANS):
    """ Indicates that the labels of two SPAN nodes are semantically similar, following the distributed semantics hypothesis """
    graph.add(namespace[subject_node_uid], IREC.isSemanticallySimilarTo, namespace[object_node_uid])
    return graph

def irec_has_acronym(graph, subject_node_uid, object_node_uid,
                     subject_namespace: Namespace=SPANS, object_namespace: Namespace=SPANS):
    """ Indicates that the label of the subject node has an acronym, ergo the label of the object node  """
    graph.add(namespace[subject_node_uid], IREC.hasAcronym, namespace[object_node_uid])
    return graph

def irec_is_acronym_of(graph, subject_node_uid, object_node_uid,
                       subject_namespace: Namespace=SPANS, object_namespace: Namespace=SPANS):
    """ Indicates that the label of the subject node is an acronym of the label of the object node  """
    graph.add(namespace[subject_node_uid], IREC.isAcronymOf, namespace[object_node_uid])
    return graph

def irec_antonym(graph, subject_node_uid, object_node_uid,
                 subject_namespace: Namespace=SPANS, object_namespace: Namespace=SPANS):
    """ Indicates that the label of the subject node is an antonym of the label of the object node  """
    graph.add(namespace[subject_node_uid], IREC.hasAntonym, namespace[object_node_uid])
    return graph


# SKOS 
def skos_node(graph, node_uid, text, namespace: Namespace=CONCEPTS):
    """ Add a concept to the graph in the CONCEPTS namespace, of type SKOS.Concept """
    graph.add(namespace[node_uid], RDF.type, SKOS.Concept)
    return graph

def skos_prefLabel(graph, node_uid, text, namespace: Namespace=CONCEPTS):
    """ Add the text label for a node """
    graph.add((namespace[node_uid], SKOS.prefLabel, Literal(text, lang='en')))
    return graph

def skos_altLabel(graph, node_uid, alt_label_uid, namespace: Namespace=CONCEPTS):
    """ Add an alternative text label for a concept node """
    graph.add((namespace[node_uid], SKOS.altLabel, namespace[alt_label_uid]))
    graph.add((namespace[alt_label_uid], SKOS.altLabel, namespace[node_uid]))
    return graph

def skos_related(subject_node_uid, object_node_uid,
                subject_namespace: Namespace=SPANS, object_namespace: Namespace=CONCEPTS):
    """ Denotes a relation between two nodes, would expect the nodes to be in different vocabularies """
    graph.add((subject_namespace[subject_node_uid], SKOS.related, object_namespace[object_node_uid]))
    return graph
    
def skos_narrower(graph, broader_node_uid, narrower_node_uid, 
                 broader_namespace: Namespace=SPANS, narrower_namespace: Namespace=SPANS):
    """ Assuming narrower/broader is always reflexive, would expect the nodes to be in different vocabularies """
    graph.add((namespace[narrower_uid], SKOS.narrower, namespace[broader_uid]))
    graph.add((namespace[broader_uid], SKOS.broader, namespace[narrower_uid]))
    return graph
    
def skos_note(graph, node_uid, note_text, namespace: Namespace=SPANS):
    """ I don't think this is used right now; not sure if there is a use-case at any point """
    graph.add((namespace[node_uid], SKOS.note, Literal(note_text, lang='en')))
    return graph

def skos_definition(graph, node_uid, definition_text, namespace: Namespace=CONCEPTS):
    """ The namespace indidcates the source of the definition? """
    graph.add((namespace[node_uid], SKOS.definition, Literal(definition_text, lang='en')))
    return graph

### Prepare namespaces

In [8]:
irec_graph = Graph()

irec_graph.bind("root", ROOT)
irec_graph.bind("wikipedia", WIKI)

In [9]:
# import our vocabulary
irec_graph.parse("IREC.rdf")

<Graph identifier=Nd907629ec1ca400fbd6ce0ffa3cb84b0 (<class 'rdflib.graph.Graph'>)>

In [13]:
ua = UID_assigner()
top_concept_uid = 'ROOT'

irec_graph = add_prefLabel(irec_graph, top_concept_uid, "NUU_graph_root", namespace = ROOT)

### Add base antonyms

In [15]:
# We want to capture antonyms: dichotomy in meaning of words, 
# For this we'll use NLTK's version of WordNet, which mainly captures antonyms for adjectives and adverbs.
wordnet_antonyms = {}
for i in wn.all_synsets():
    if i.pos() in ['a', 's']:    # If synset is adj or satelite-adj.
        for j in i.lemmas():     # Iterating through lemmas for each synset.
            if j.antonyms():     # If adj has antonym.
                wordnet_antonyms[j.name()] = [x.name() for x in j.antonyms()]

# Example of a useful antonym for us
wordnet_antonyms['hot']

['cold']

In [16]:
wordnet_antonyms['cold']

['hot']

In [17]:
for span in wordnet_antonyms.keys():
    span_uid, new_uid_bool = ua.assign_UID(span, SPANS)
    
    if new_uid_bool:
        # need to add the span to the graph
    
    if (SPANS[url_span], None, None) not in g: 
        irec_graph.add((SPANS[url_span], IREC.Span, Literal(span, lang='en')))
        
    antonyms = wordnet_antonyms[span]
    for antonym in antonyms:
        url_antonym = urllib.parse.quote(antonym)
        
        if (SPANS[url_antonym], None, None) not in g:
            irec_graph.add((SPANS[url_antonym], IREC.Span, Literal(antonym, lang='en')))
            
        # add the antonym relation
        irec_graph.add((SPANS[url_span], IREC.hasAntonym, SPANS[url_antonym]))
       

[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdf

[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdf

[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdf

### Add domain terms extracted from the Approved documents as Spans

In [9]:
domain_terms = pickle.load(open('data/domain_terms.pkl', 'rb'))

In [10]:
domain_terms

['expansion water',
 'the hot water system',
 'the hot water',
 'a hot water system',
 'the hot tap',
 'the water supply',
 'PLY',
 'Water Fittings',
 'Fittings',
 'the storage vessel',
 'the Gas Safety Installation',
 'the Gas Safety Installation Use',
 'Gas Safety Installation',
 'Regulations 1996',
 'Regulations 1992',
 'Regulations 1994',
 'ductwork',
 'ductwork serving',
 'pipework',
 'Electrical safety Dwellings',
 'industrial processes',
 'the stored water',
 'a sanitary conveniences',
 'sanitary conveniences',
 'sanitary fittings',
 'a sanitary convenience',
 'a temperature relief valve',
 'cistern lids',
 'cylinders',
 'ignition',
 'cylinder',
 'steam',
 'engine',
 'thermoplastic material',
 'thermoplastics',
 'thermoplastic product',
 'thermoplastic',
 'thermoplastic core',
 'thermoplastic materials a )',
 'thermoplastic panels',
 'thermoplastic substrate',
 'cistern',
 'cisterns',
 'the cistern',
 'washing facilities',
 'shower facilities',
 'changing facilities',
 'internal

In [11]:
for span in domain_terms:
    url_span = urllib.parse.quote(span)
    # note that each span is a unique identifier in and of itself
    if (SPANS[url_span], None, None) not in g:
        g.add((SPANS[url_span], IREC.Span, Literal(span, lang='en')))

### Add Acronyms that were grabbed from the text

These can help:
* remove terms where the boundary detection is off
* avoid suggesting similar acronyms, e.g., suggest that EPC and EPS are similar 

In [19]:
acronyms = {'PAS': ['ecification', 'Specification'],  'GSIUR': ['Regulations 1998'],  'HSE': ['Regulations 2000',   'water systems',   'Safety Executive',   'Health and Safety Executive'],  'PE': ['Polyethylene', 'polyethylene'],  'DN': ['pipe'],  'DCLG': ['land', 'Local Government', 'England', 'ment'],  'PP': ['Polypropylene'],  'BCB': ['Control Body',   'the building control body',   'Building control body',   'building control body',   'Building Control Body'],  'SRHRV': ['ventilator',   'single room heat recovery ventilator',   'a single room heat recovery ventilator'],  'MVHR': ['blocks', 'heat recovery'],  'WC': ['sets'],  'TFA': ['the total floor area'],  'LRV': ['Light reflectance value'],  'BER': ['Building CO2 Emission Rate', 'CO2 Emission Rate'],  'TER': ['CO2 Emission Rate',   'the Target CO2 Emission Rate',   'Target CO2 Emission Rate'],  'DER': ['CO2 Emission Rate', 'the Dwelling CO2 Emission Rate'],  'EPC': ['energy performance certificate'],  'TFEE': ['Target Fabric Energy Efficiency',   'Fixed building services',   'Energy Efficiency'],  'DHF': ['the Door and Hardware Federation', 'Door and Hardware Federation'],  'REI': ['fire resistance', 'bility'],  'PHE': ['horizontal evacuation'],  'W': ['the final exit', 'final exit'],  'DWELLINGS': ['RESIDENTIAL'],  'OTHER': ['RESIDENTIAL'],  'TSO': ['Office', 'The Stationery Office'],  'FPA': ['the Fire Protection Association', 'Association'],  'A': ['absorption area'],  'AT': ['absorption area'],  'DECC': ['Climate Change'],  'NCM': ['the National Calculation Methodology'],  'ADCAS': ['Allied Services'],  'DFEE': ['Energy Efficiency'],  'LPA': ['the local planning authority', 'planning authority'],  'UKAS': ['the United Kingdom Accreditation Service'],  'BSI': ['the British Standards Institution'],  'EA': ['Accreditation'],  'BGS': ['British Geological Survey'],  'HBN': ['Notes'],  'GGF': ['Glazing Federation'],  'E': ['terms of integrity'],  'TRADA': ['the Timber Research and Development Association', 'Association'],  'ACOP': ['Code of Practice'],  'ATTMA': ['Association'],  'RVA': ['Association', 'the Residential Ventilation Association'],  'TEHVA': ['Association'],  'DSA': ['Association'],  'CIRIA': ['Association'],  'MCRMA': ['Association'],  'DSMA': ['Association'],  'OFTEC': ['Association'],  'WHO': ['Organisation'],  'GAI': ['Architectural Ironmongers'],  'MEV': ['mechanical extract', 'extract ventilation'],  'VST': ['Vicat softening temperature'],  'SCI': ['Guild Steel Construction Institute'],  'FBE': ['the Built Environment', 'ment'],  'DSER': ['Rating'],  'WER': ['Rating'],  'CIWM': ['ment', 'Wastes Management'],  'EOTA': ['ment'],  'GQRA': ['ment'],  'BRE': ['ment', 'the Building Research Establishment'],  'PPS': ['ment'],  'PSV': ['Passive stack ventilation'],  'EST': ['the Energy Saving Trust'],  'CIBSE': ['Ventilation hygiene toolkit', 'Building Services Engineers'],  'AGS': ['Geoenvironmental Specialists'],  'SPAB': ['Ancient Buildings'],  'UF': ['urea formaldehyde'],  'ODPM': ['the Deputy Prime Minister']}

In [21]:
for acronym, spans in acronyms.items():
    url_acronym = urllib.parse.quote(acronym)
    if (SPANS[url_acronym], None, None) not in g:
        g.add((SPANS[url_acronym], IREC.Span, Literal(span, lang='en')))
    
    for span in spans:
        url_span = urllib.parse.quote(span)
        # note that each span should now be in the graph
        if (SPANS[url_span], IREC.hasAcronym, SPANS[url_acronym]) not in g:
            g.add((SPANS[url_span], IREC.hasAcronym, SPANS[url_acronym]))
        if (SPANS[url_acronym], IREC.isAcronymOf, SPANS[url_span]) not in g:
            g.add((SPANS[url_acronym], IREC.isAcronymOf, SPANS[url_span]))
    

### Add CONCEPTS: defined terms from the Approved Documents

In [22]:
# read data from csv file
definitions = pd.read_excel("data/Approved Documents and derived terms.xlsx", sheet_name="Definitions", keep_default_na=False)

In [8]:
definitions[:3]

Unnamed: 0,Term,Definition,Alternative labels,Note
0,Absorption,"Conversion of sound energy to heat, often by t...",,
1,Absorption coefficient,A quantity characterising the effectiveness of...,,See BS EN 20354:1993.
2,Absorptive material,Material that absorbs sound energy.,,


In [9]:
index_terms = pd.read_excel("data/Approved Documents and derived terms.xlsx", sheet_name="Index terms", keep_default_na=False)

In [10]:
index_terms[:3]

Unnamed: 0,Term,AltLabel(s),Related terms,Broader term
0,abbreviated eaves,,,eaves
1,Access floors,access floor,Platform floors,
2,Access for fire service,fire access,,Fire service facilities


In [11]:
# create graph from definitions first
for i, row in definitions.iloc[1:].iterrows():
    term = row['Term'] if row['Term'].isupper() else row['Term'].lower()
    alternative_labels = row['Alternative labels']
    definition = row['Definition']
    note = row['Note']
    
    # add the term 
    jargon_uid, new_uid = ua.assign_UID(term)
    if new_uid:
        mygraph = add_prefLabel(mygraph, jargon_uid, term)
    
    # hacky visualisation solution; connect all terms to the graph root
    mygraph = add_top_concept(mygraph, '0', jargon_uid)
    
    if note: 
        mygraph = add_note(mygraph, jargon_uid, note) 
    
    # always expecting a definition
    mygraph = add_definition(mygraph, jargon_uid, definition) 
    
    if alternative_labels:
        # lowercase if not an abbreviation
        alt_labels = [x.strip() if x.isupper() else x.lower().strip() for x in alternative_labels.split(", ")]  
        for alt_label in alt_labels:
            alt_label_uid, new_uid = ua.assign_UID(alt_label)
            if new_uid:
                mygraph = add_prefLabel(mygraph, alt_label_uid, alt_label)
        
            mygraph = add_altLabel(mygraph, jargon_uid, alt_label_uid)

In [12]:
# add triples from index terms
for i, row in index_terms.iloc[1:].iterrows():
    term = row['Term'].strip() if row['Term'].isupper() else row['Term'].lower().strip()
    alternative_labels = row['AltLabel(s)']
    related_terms = row['Related terms']
    broader_term = row['Broader term']
    
    # add the term 
    jargon_uid, new_uid = ua.assign_UID(term)
    if new_uid:
        mygraph = add_prefLabel(mygraph, jargon_uid, term)
    
        # hacky visualisation solution; connect all terms to the graph root
        mygraph = add_top_concept(mygraph, '0', jargon_uid)

    if alternative_labels:
        # lowercase if not an abbreviation
        alt_labels = [x.strip() if x.isupper() else x.lower().strip() for x in alternative_labels.split(", ")]  
        print(term)
        print(alt_labels)
        
        for alt_label in alt_labels:
            alt_label_uid, new_uid = ua.assign_UID(alt_label)
            if new_uid:
                mygraph = add_prefLabel(mygraph, alt_label_uid, alt_label)
            mygraph = add_altLabel(mygraph, jargon_uid, alt_label_uid)

    if related_terms:
        rel_terms = [x.strip() if x.isupper() else x.lower().strip() for x in related_terms.split(", ")]
        for rel_term in rel_terms:
            rel_term_uid, new_uid = ua.assign_UID(rel_term)
            if new_uid:
                mygraph = add_prefLabel(mygraph, rel_term_uid, rel_term)
            mygraph = add_related(mygraph, jargon_uid, rel_term_uid)
    
    if broader_term:    # expecting 1 broader term at most currently
        b_term = broader_term.strip().lower() if not broader_term.isupper() else broader_term.strip()
        b_term_uid, new_uid = ua.assign_UID(b_term)
        if new_uid:
            mygraph = add_prefLabel(mygraph, b_term_uid, b_term)
        mygraph = add_narrower(mygraph, b_term_uid, jargon_uid)

access floors
['access floor']
access for fire service
['fire access']
access panels
['access panel']
access rooms
['access room']
accessibility
['accessable']
accreditation
['accredited']
acoustic insulation
['sound insulation']
acrylic
['acrylic acid']
acrylic glass
['plexiglass', 'perspex']
adhesive
['ADH']
air and vapour control layer
['AVCL']
air conditioning
['air conditioner', 'airconditioner', 'air-conditioning', 'airconditioning', 'AC']
air infiltration
['airtightness', 'air permeability']
air polution index
['API']
air pressure drop
['APD']
alarm systems
['alarm', 'alarm system']
alterations
['alteration']
alternative approaches
['alternative approach']
alternative escape routes
['alternative escape route']
alternative exits
['alternative exit']
alternative supply of water
['alternative water supply']
aluminium
['ALUM', 'aluminum']
ampere
['amp', 'A']
architraves
['architrave']
assembly
['recreation purpose group']
atria
['atrium']
attic
['top-floor']
automatic doors
['automa

safety signs
['safety signals']
sandwich panel
['structural insulating panel', 'SIP', 'composite panel']
sanitary appliance
['sanitary convenience']
screw
['screws']
shingle roof tile
['shingles', 'shingle roofing', 'shingle']
shower
['showers']
single storey building
['single-storey building']
single storey flat
['single-storey flat']
skirting
['skirting board']
sleeving
['outer casing']
smoke alarm
['smoke alarms', 'smoke detector', 'smoke detectors']
smoke outlet
['smoke vent']
soffit
['soffit board']
soil vent stack
['SVP']
solar control
['solar energy control']
solar hot water system
['solar thermal system']
solar module roof anchors
['roof anchors for solar modules']
solar panel
['solar cell panel', 'solar collector', 'solar roof panel']
solar power
['solar energy']
solar system
['solar array', 'photo-voltaic system']
spiral stair
['spiral stairs']
sprinkler system
['sprinkler']
storey
['floor', 'deck']
stretched-skin ceiling
['stretched-skin ceilings', 'stretch ceiling']
structu

### Step 1: create a graph from the approved documents terms file

In [6]:
mygraph, ua = create_graph_instance()

In [7]:
# read data from csv file
definitions = pd.read_excel("data/Approved Documents and derived terms.xlsx", sheet_name="Definitions", keep_default_na=False)

In [13]:
mygraph.serialize(destination="graph/approved_doc_terms_only.ttl")

<Graph identifier=Nf8a0fc52b88048248d2d66d09191eac0 (<class 'rdflib.graph.Graph'>)>

In [14]:
# nr of nodes

ua.count_nodes_in_namespace(JARGON)

Number of nodes in 'http://example.org/concept/#title': 1642


In [15]:
ua.print_node_by_text('R', namespace=JARGON)

292 ; prefLabel ; R
292 ; altLabel ; 291


In [16]:
ua.print_node_by_id(291, namespace=JARGON)

291 ; prefLabel ; sound reduction index
291 ; hasTopConcept ; 0
291 ; note ; See BS EN ISO 140-3:1995.
291 ; definition ; A quantity, measured in a laboratory, which characterises the sound insulating properties of a material or building element in a stated frequency band.
291 ; altLabel ; 292


### Step 2: grab wikipedia definitions for nodes, and store locally

In [17]:
[jargon] = [x for x in ua.UIDs.keys()]
jargon_terms = ua.UIDs[jargon]
len(jargon_terms)

1642

In [18]:
# set up the SPARQL endpoint for wikidata

sparql_wrapper = SPARQLWrapper("https://query.wikidata.org/sparql")

In [19]:
def get_wiki_matches(graph_sparql_endpoint: SPARQLWrapper,
                     jargon_term_and_uids: Dict[str, str]):

    all_wiki_definitions = {}
    # we want to grab the term (subject), any definition (subjectDescription) and the class (subjectClass)
    sparql_q = """
               SELECT ?subject ?subjectDescription ?classUID ?className WHERE {
                  ?subject rdfs:label "$QUERY"@en.
                  ?subject wdt:P31|wdt:P279 ?classUID.
                  SERVICE wikibase:label { bd:serviceParam wikibase:language "en".}
                  ?classUID  rdfs:label ?className  FILTER(LANG(?className) = "en").
                }
               """
    
    for term, uid in jargon_term_and_uids.items():
        # make the call to 
        temp_q = sparql_q.replace("$QUERY", term)
        graph_sparql_endpoint.setQuery(temp_q)
        graph_sparql_endpoint.setReturnFormat(JSON)
        json_output = graph_sparql_endpoint.query().convert()
        
        # sometimes multiple Wiki UIDs for a single term, we grab them all here
        for v in json_output['results']['bindings']:
            class_uid = v['classUID']['value'] if 'classUID' in v else ""
            class_label = v['className']['value'] if 'className' in v else ""
            
            if 'subjectDescription' in v:
                if uid not in all_wiki_definitions:
                    all_wiki_definitions[uid] = [{'prefLabel': term,
                                                  'class_uid': class_uid,
                                                  'class_label': class_label,
                                                  'WikiUID': v['subject']['value'],
                                                  'WikiDefinition': v['subjectDescription']['value']}]
                else:
                    all_wiki_definitions[uid].append({'prefLabel': term,
                                                      'class_uid': class_uid,
                                                      'class_label': class_label,
                                                      'WikiUID': v['subject']['value'],
                                                      'WikiDefinition': v['subjectDescription']['value']})
            elif 'subject' in v:
                # no description found, simply adding wiki UID if that exists
                if uid not in all_wiki_definitions:
                    all_wiki_definitions[uid] = [{'prefLabel': term,
                                                  'class_uid': class_uid,
                                                  'class_label': class_label,
                                                  'WikiUID': v['subject']['value']}]
                else:
                    all_wiki_definitions[uid].append({'prefLabel': term,
                                                      'class_uid': class_uid,
                                                      'class_label': class_label,
                                                      'WikiUID': v['subject']['value']})
    return all_wiki_definitions


In [20]:
if not os.path.exists("data/wiki_dict.json"):
    wiki_dict = get_wiki_matches(sparql_wrapper, jargon_terms)#{'test': 1, 'conductor':2})
    with open("data/wiki_dict.json", 'w') as f:
        json.dump(wiki_dict, f, indent=2)
else:
    with open("data/wiki_dict.json", 'r') as f:
        wiki_dict = json.load(f)

In [21]:
# check wikipedia classes that are being returned
wiki_class_dict = {}
for my_uid, wiki_matches in wiki_dict.items():
    for match in wiki_matches:
        if match['class_label'] not in wiki_class_dict:
            wiki_class_dict[match['class_label']] = {'UID': [match['class_uid']],
                                                     'examples': [match['prefLabel']]}
            
        elif match['class_uid'] != wiki_class_dict[match['class_label']]:
            wiki_class_dict[match['class_label']]['UID'].append(match['class_uid'])
            wiki_class_dict[match['class_label']]['examples'].append(match['prefLabel'])
        

In [22]:
# Print wikipedia class names for annotation
for k, v in wiki_class_dict.items():
    print(k)

physical quantity
name
metadata
room
courtyard
architectural element
general anatomical term
class of anatomical entity
particular anatomical entity
cardiac chamber
anatomical structure
systems engineering
type of regulation and control
synchronization primitive
story
natural physical object
building
obstacle
closed set
geographical feature
topological property
boundary
Wikimedia list article
human-made landform
operation result
delimitation
chain
structure
organization
Wikimedia disambiguation page
legal person
buildings
architectural structure
facility
method of discovery
occurrence
building of pleasure and sporting boats
priming
concrete object
absence
horizontal structural element
storage room
human-made geographic feature
steps
part
theater space
vehicle component
thoroughfare
support
line of partition
stairs
memory area
common area
real property
exterior ornament of the shield
interior space
cavity
cabin
compartment
mechanical load
non-SI unit mentioned in and accepted with the S

In [23]:
# Print wikipedia class UIDs for annotation
for k, v in wiki_class_dict.items():
    print(list(set([x.rsplit('/',1)[1] for x in v['UID']])))

['Q107715']
['Q82799']
['Q180160']
['Q180516']
['Q309250']
['Q391414']
['Q28843519']
['Q112826905']
['Q112826975']
['Q3368336']
['Q4936952']
['Q682496']
['Q96758092']
['Q96359153']
['Q831691']
['Q16686022']
['Q41176']
['Q264661']
['Q320357']
['Q618123']
['Q625948']
['Q1307347']
['Q13406463']
['Q35145743']
['Q55091441']
['Q3241107']
['Q4504468']
['Q6671777']
['Q43229']
['Q4167410']
['Q3778211']
['Q43694649']
['Q811979']
['Q13226383']
['Q111047616']
['Q1190554']
['Q893164']
['Q66098695']
['Q4406616']
['Q19829125']
['Q30327093']
['Q15710020']
['Q811430']
['Q1454694']
['Q15989253']
['Q42679632']
['Q60673395']
['Q83620']
['Q1058733']
['Q1088910']
['Q12511']
['Q2308577']
['Q5153508']
['Q10494269']
['Q1052804']
['Q1299240']
['Q28367692']
['Q60998096']
['Q12347612']
['Q815284']
['Q3268848']
['Q17342348']
['Q7187']
['Q29023906']
['Q904927']
['Q1924249']
['Q3387041']
['Q21077852']
['Q21294996']
['Q5725005']
['Q11703678']
['Q2996394']
['Q4254955']
['Q112075259']
['Q35473']
['Q37038']
['Q29053744'

In [24]:
# Print examples for each wikipedia class for annotation
for k, v in wiki_class_dict.items():
    print(list(set([x for x in v['examples']][:10])))

['air pressure', 'electric current', 'frequency', 'area', 'sound pressure level', 'absorption coefficient', 'sound reduction index', 'density', 'reverberation time', 'pitch']
['access point']
['access point']
['wet room', 'utility room', 'bathroom', 'bedroom', 'kitchen', 'vestibule', 'hall', 'toilet', 'atrium', 'entrance hall']
['atrium']
['ramp', 'skylight', 'ceiling', 'glazing', 'stairs', 'handrail', 'pier', 'atrium', 'landing']
['compartment', 'column', 'membrane', 'cistern', 'atrium']
['fascia', 'duct', 'joint', 'vestibule', 'valve', 'atrium']
['atrium', 'fascia', 'valve', 'joint']
['atrium']
['atrium', 'compartment']
['automatic control']
['automatic control']
['barrier', 'lock']
['level', 'basement']
['basement']
['garage', 'fireplace', 'bathroom', 'extension', 'building', 'cistern', 'commercial building', 'kitchen', 'dwelling house', 'school']
['boundary', 'hazard', 'door']
['boundary']
['boundary']
['boundary']
['boundary']
['boundary', 'slate']
['boundary', 'garden', 'slope']


In [27]:
annotation_str = "y n n y y y n n n n n y y n y n y y n n n y n y n y n n y n y y y y n y n n y n y y y y n n n y n n y n y y n y y n n y y y n n n y y y y y n n n n y n y n y n n n n n y n n n n n n n n n n n n n n n n n n n y y y y y y y y y y y y n n y n n n y y y y y n y n n n n n n n n y n n n n n n n n n y n n n y n n n n n n y y n n n n n n n n n n n n n n n y n n n n n n n n n n n y n n n y y y y y y y y y y y y y y y y n y y n n y y y y y n n n n n n n n n n n n n n n n n n n n n n n n n n n n n n y y n y y y y y n n n n n y n n y y y y n n y n y n y y y y n y y y y y y y n n n y n y y y y y y y y n y n n n y n n y y y y n n y n n n y n y n n y y y y n y n n n n n n y y y y y n n y y n y n n y n n n n y n y y n y y y n y y n y n n y y y n n n n n n n n n n n n n n n n n n n n n n n n n n n n n n n n n y y y y y n y y n y y n y y n y n n n y y n y y n y n n n n y y y y y y y y y y n n y y y y n n n y y n n n n n y n n n n n n n n n n n n n n n n n n n n y y n y y y y y n y y n n n n y n y n n n y y n n y y y n n n n n n n y n y n y y n n y y n n y y y y y n y y y y n n n n y n n n n y y y y y y y n y y y n y y y y y n y y y y y n n n n y n y y y y y n n n y y n n n n y n n y y y y n n y y y y n y y y n n n n y y n y n n n y n n y n n y n y n y y n n n n y n n n n n n y y y y y y y y y y n n n y n n y n y n n n n n n n n n y n n y n y y y y y n y y y n n y n n y y y y y y n n n n n y y n y n n y y n n y y n y y y n y n n n n n n n y n n n y y n n n n n n n y y y n n n n y n n n n n n y y y y y y y y y n n n n n n n n y y y y y n n n y y n y n n n n n y n y n y y y y y y y y n n y n y n y y y y y y y y y y y y y y y y y y y y y y y n n n n n n y y n n y n y y y n n y n y y y y y y n n n y y n n y n y n n y y y y n n y n n n y n y y n n n n n n y n n n n y y y n n y y y y y n y n y n y y n n n n n n n n y y n y n n n n n y y n n n n n y n n n n n n n n n n n n n y n n n n y n n n y y y y n y n y y n n n n n y n n n n y y n n n n n n y y n n n n y y n n n y n n y n n y n y y n n y n y y y y y n y n y n n n n n y y n n y y n n n n n y y y y n y n n n y n n n n n n n n n y y y y y n n y y y y y y y y y y n y n y y y n n y n y y y y n n y y y y n y y y y y y y y y n n n n n n n n n y y n n y y y n n y y n n n y y y y y y y n n y y y n y y y y n n y y y y y y y y y y y y y y n y y n n n n n y y y y y n y n n y y n n n n n n n n n n n n y n n n y n n n y y y y y y y n n n y y y n n n n n n n n n n n y n y y y y n y"
annotations = annotation_str.split(' ')

In [28]:
assert len(annotations) == len(wiki_class_dict)

In [29]:
for k, a in zip(wiki_class_dict.keys(), annotations):
    wiki_class_dict[k]['relevant'] = a
        

### Step 3: parse all definitions, in order to identify new nodes and links between nodes
* we won't re-define the new nodes (again), because we assume that this would cause too much drift?
* should check to make sure that that's the case

In [31]:
def parse_definition(full_definition):
    definition_sentences = [str(sent) for sent in TextBlob(full_definition).sentences]
    identified_objects = []
    for definition_sent in definition_sentences:
        encoded_def = urllib.parse.quote(definition_sent)
        spar_output = requests.get(f"http://localhost:8000/predict_objects/{encoded_def}").json()
        try:
            spar_objects = spar_output["prediction"]['obj']
            identified_objects += util.custom_cleaning_rules(spar_objects)
        except:
            continue
    return identified_objects

In [None]:
for k, v in complete_dict.items():
    # run spar.txt on definition
    full_definition = v['definition']
    # should break up into sentences
    definition_sentences = [str(sent) for sent in TextBlob(full_definition).sentences]
    complete_dict[k]['def_terms'] = []
    
    for definition_sent in definition_sentences:
        encoded_def = urllib.parse.quote(definition_sent)
        spar_output = requests.get(f"http://localhost:8000/predict_objects/{encoded_def}").json()
        try:
            spar_objects = spar_output["prediction"]['obj']
            cleaned_objs = util.custom_cleaning_rules(spar_objects)
            for obj in cleaned_objs:
                complete_dict[k]['def_terms'] += cleaned_objs
        except:
            continue

In [41]:
# approved_documents
object_dict = {}
for row in definitions.iterrows():
    idx, (term, definition, alt_labels, note) = row
    objects = parse_definition(definition)
    
    object_dict[term] = objects

KeyboardInterrupt: 

In [37]:

#     term = term if term.isupper() else term
    
#     for obj in objects:
#         obj = obj if obj.isupper() else obj
#         # add the object to the graph 
#         obj_uid, new_uid = ua.assign_UID(obj)
#         if new_uid:
#             mygraph = add_prefLabel(mygraph, obj_uid, obj)
        
#         # add relation between term and mygraph
            
      # ====      
#     def add_related(graph, jargon_uid, other_term_uid, jargon_namespace: Namespace=JARGON, related_namespace: Namespace=JARGON):
#     graph.add((jargon_namespace[jargon_uid], SKOS.related, related_namespace[other_term_uid]))
#     graph.add((related_namespace[other_term_uid], SKOS.related, jargon_namespace[jargon_uid]))
#     return graph

0    Conversion of sound energy to heat, often by t...
1    A quantity characterising the effectiveness of...
2                  Material that absorbs sound energy.
Name: Definition, dtype: object

### Step 4: Add relevant wikipedia nodes and to graph

In [None]:
# TODO merge same defintion, us rdfs:a 
# TODO merge same defintion, us rdfs:a 

In [None]:
# - create a new node for the wikipedia term, with prefLabel the term
for uid, wiki_def_dict_list in expanded_dict.items():
    
    # we will only take into account the 1st definition for now!
    # we will only take into account the 1st definition for now!
    # we will only take into account the 1st definition for now!
    idx = 0
    d = wiki_def_dict_list[idx]
    
    term = d['prefLabel']
    wiki_uid = d['WikiUID'].rsplit('/',1)[1]
    if 'WikiDefinition' in d:
        if wiki_definition == "Wikimedia disambiguation page":
            # skip disambiguation pages in general
            continue
        
        definition = d['WikiDefinition']
        spar_objects = d['WikiDef_terms']
        mygraph = add_wiki_exact_match(mygraph, term, uid, wiki_uid, definition, spar_objects)
    else:
        print(f"Term with wiki exact match, but without definition: {term}")
        mygraph = add_wiki_exact_match(mygraph, term, uid, wiki_uid)

In [None]:
# def add_wiki_exact_match(graph, term, mygraph_uid, wiki_uid, wiki_definition=None, spar_objects=None):
#     graph.add((EX[mygraph_uid], SKOS.exactMatch, WIKI[wiki_uid]))
#     graph.add((WIKI[wiki_uid], SKOS.prefLabel, Literal(term, lang='en')))
#     if wiki_definition:
#         graph.add((WIKI[wiki_uid], SKOS.definition, Literal(wiki_definition, lang='en')))
#     if spar_objects:
#         for obj in spar_objects:
#             # UIDs for these terms are assigned within our EXAMPLE namespace
#             def_term_uid, new_uid_bool = ua.assign_UID(def_term)
#             if new_uid_bool:
#                 mygraph = add_prefLabel(mygraph, def_term_uid, def_term)
            
#             graph.add((WIKI[wiki_uid], SKOS.related, EX[def_term_uid]))
#             graph.add((EX[def_term_uid], SKOS.related, WIKI[wiki_uid]))
            
#     return graph