In [1]:
import re
import time
import pickle
import glob, os
import requests, urllib
import json, random

import pandas as pd

from bs4 import BeautifulSoup as bs
from tqdm import tqdm
from typing import List, Any, List, Dict
from pathlib import Path
from textblob import TextBlob
from itertools import combinations, combinations_with_replacement
from collections import Counter
from nltk.corpus import wordnet as wn
from SPARQLWrapper import SPARQLWrapper, JSON

from rdflib import URIRef, BNode, Literal, Namespace, Graph
from rdflib.namespace import XSD, RDF, RDFS, SKOS, NamespaceManager

from utils.spar_utils import TermExtractor
from utils.cleaning_utils import custom_cleaning_rules, remove_unicode_chars

We would like to express the following features/relations:
* Dictionary definition terms, which are always concepts
  * We'll use the source as namespace, and corresponding concept identifier if it exists
  * SKOS is used to establish a mapping (e.g., skos:exactMatch) and add the definition (skos:definition)
* Special properties that we want to capture between words, which may help identify concepts:
  * Word is part of MWE
  * Morphologically similar words; stemming & Levenshtein distance
  * Semantically similar words; distributed similarity (NNs)
  * Acronyms
  * Related, this is a generic relation, e.g., an `ampere` is related to `electric current`
  * Domain-specificity; foreground or background term following our filtering procedure
 

### Prepare namespaces

In [2]:
# Namespace that holds the artificial root node of the graph (used for visualisation)
ROOT = Namespace("https://example.org/top_concept_for_visulisation/#")
# Namespace for Wikidata entities
WIKI = Namespace("https://www.wikidata.org/entity/#")
# Note: UNICLASS is not a namespace (yet), it only has identifiers; an example.org base URL is used here
UNICLASS = Namespace("https://www.example.org/uniclass/")

In [3]:
# Base URLs of the three IREC namespaces: the schema (classes/properties), the spans and the concepts
IREC_ontology_URL = "https://example.org/irec-schema#"
IREC_spans_URL = "https://example.org/irec-spans#"
IREC_concepts_URL = "https://example.org/irec-concepts#"

In [4]:
# create our custom namespace for the schema, i.e., the classes and properties such as IREC.Span
IREC = Namespace(IREC_ontology_URL)

# create custom namespaces to store the span nodes and the concept nodes themselves
SPANS = Namespace(IREC_spans_URL)
CONCEPTS = Namespace(IREC_concepts_URL)

### graph creation utilities

In [61]:
class UID_assigner:
    """
    Hands out node identifiers (UIDs) and remembers them, grouped per namespace.

    `self.UIDs` maps a namespace key (the namespace URL without its fragment marker) to a
    dictionary `{text: uid}`. Span UIDs are the URL-quoted span text; concept UIDs are
    consecutive integers (stored as strings) drawn from the shared counter `self.UID`.
    """

    def __init__(self):
        # namespace key -> {text: uid}
        self.UIDs = {}
        # running counter used for the integer-based concept UIDs
        self.UID = 0

    @staticmethod
    def _namespace_key(namespace: Namespace) -> str:
        """
        Derive the dictionary key under which UIDs of `namespace` are stored: the string form of a
        dummy term in that namespace with its fragment (`#...`) removed.
        """
        return str(namespace.placeholder.defrag())

    @staticmethod
    def _print_triples(graph, subject):
        """ Print every triple in `graph` that has `subject` as subject, showing only the part after the last '#'. """
        for s, p, o in graph.triples((subject, None, None)):
            print(f"{s.split('#')[-1]} ; {p.split('#')[-1]} ; {o.split('#')[-1]}")

    def assign_UID(self, text, namespace: Namespace):
        """
        Determines which type of UID to assign, based on the namespace.

        Returns a tuple `(uid, is_new)` for the SPANS and CONCEPTS namespaces. For any other namespace
        a hint is printed and None is returned (so unpacking the result will fail); use
        `keep_track_of_existing_UID()` for namespaces that bring their own identifiers.
        """
        if namespace == SPANS:
            return self.span_UID(text)
        elif namespace == CONCEPTS:
            return self.concept_UID(text)
        else:
            print("UID assignment not set up for this namespace, maybe use UID_assigner.keep_track_of_existing_UID()")
            return None

    def span_UID(self, text):
        """
        NOTE: each text span is a unique identifier in and of itself. We'll simply convert the text span to
        a URL friendly representation.

        Returns `(uid, is_new)`, where `is_new` is False if the text was registered before.
        """
        known_spans = self.UIDs.setdefault(self._namespace_key(SPANS), {})
        if text in known_spans:
            return known_spans[text], False

        known_spans[text] = urllib.parse.quote(text)
        return known_spans[text], True

    def concept_UID(self, text):
        """
        For now I'll create my own dumb integer-based UIDs for nodes as a simple shortcut.

        Returns `(uid, is_new)`, where `is_new` is False if the text was registered before.
        """
        known_concepts = self.UIDs.setdefault(self._namespace_key(CONCEPTS), {})
        if text in known_concepts:
            return known_concepts[text], False

        self.UID += 1
        known_concepts[text] = str(self.UID)
        return known_concepts[text], True

    def keep_track_of_existing_UID(self, text: str, existing_uid: str, namespace: Namespace):
        """
        Simply keep track of UIDs that exist in the provided namespace.

        The first UID registered for a text is kept; later calls for the same text do not overwrite it.
        Always returns `(existing_uid, True)`, also when the text was registered before.
        """
        known_uids = self.UIDs.setdefault(self._namespace_key(namespace), {})
        if text not in known_uids:
            # not yet seen by this UID assigner, so remember it
            known_uids[text] = existing_uid
        return existing_uid, True

    def retrieved_uid_by_text(self, node_text, namespace: Namespace = SPANS):
        """ Return the UID registered for `node_text` in `namespace`, or None if there is none (yet). """
        # .get() on both levels: a namespace without any registered UID simply yields None
        return self.UIDs.get(self._namespace_key(namespace), {}).get(node_text)

    def count_nodes_in_namespace(self, namespace: Namespace = SPANS):
        """ Print and return the number of texts registered in `namespace` (0 if nothing was registered yet). """
        n_space = self._namespace_key(namespace)
        node_count = len(self.UIDs.get(n_space, {}))
        print(f"Number of nodes in '{n_space}': {node_count}")
        return node_count

    def print_node_by_id(self, graph, node_id, namespace: Namespace = SPANS):
        """ Print all triples in `graph` whose subject is the node `node_id` in `namespace`. """
        self._print_triples(graph, namespace[str(node_id)])

    def print_node_by_text(self, graph, node_text, namespace: Namespace = SPANS):
        """
        Print all triples in `graph` whose subject is the node registered for `node_text`.

        Raises a KeyError if no UID was registered for `node_text` in `namespace`.
        """
        node_id = self.retrieved_uid_by_text(node_text, namespace)
        if node_id is None:
            raise KeyError(f"No UID registered for text {node_text!r} in namespace '{self._namespace_key(namespace)}'")
        # find all triples with subject
        self._print_triples(graph, namespace[node_id])

In [62]:
# These wrappers only exist to help me quickly and consistently add nodes to the graph

def add_top_concept(graph, node_uid, namespace: Namespace=CONCEPTS, root_uid=None):
    """
    In some cases we'd like the concept to be linked to the ROOT of the graph, for visualisation.

    :param graph:      graph to add the triple to; the same object is returned.
    :param node_uid:   UID of the node that is linked to the root.
    :param namespace:  namespace that `node_uid` lives in. NOTE: defaults to CONCEPTS, pass SPANS for span UIDs.
    :param root_uid:   UID of the root node inside the ROOT namespace. When omitted, the notebook-level
                       variable `top_concept_uid` is used; it has to be defined before the first call.

    NOTE(review): the triple is (node, skos:hasTopConcept, root); SKOS itself defines hasTopConcept with a
    concept scheme as subject, so the direction here is a visualisation shortcut -- confirm before reuse.
    """
    if root_uid is None:
        # looked up at call time, so the global only needs to exist once nodes are actually added
        root_uid = top_concept_uid
    graph.add((namespace[node_uid], SKOS.hasTopConcept, ROOT[root_uid]))
    return graph

# IREC classes/properties listed for REFERENCE; each line is a bare attribute lookup on the IREC namespace whose result is discarded
IREC.Span # A span is a sequence of characters that occurs verbatim in a text, either contiguous or discontiguous, as extracted by SPaR.txt (Kruiper et al., 2021).
IREC.constitutes  # Relates a Span to another Span whose label can be found inside its own label, e.g., the Multi-Word Expression (MWE) Span `hot water storage system` and the Span `storage`.
IREC.isMorphologicallySimilarTo # Indicates that a Span is morphologically similar to another Span, e.g., they may have the same stem or a small Levenshtein distance.
IREC.isSemanticallySimilarTo # Indicates that a Span is semantically similar to another Span, following a cosine similarity between their embeddings.
IREC.related # General way to indicate some relation between two spans, e.g., `ampere` is related to `electric current`
IREC.hasAcronym # A Span can have an acronym, e.g., `British Standards Institute` has the acronym `BSI`.
IREC.isAcronymOf # A Span can be the acronym of another Span, e.g., `BSI` is the acronym for `British Standards Institute`.
IREC.hasAntonym # Property that relates a Span to another Span, each being the other's antonym.

def irec_span(graph, node_uid, text, namespace: Namespace=SPANS):
    """
    Register a span node: it is typed as IREC.Span and receives `text` as its English RDFS.label.
    The (same) graph is returned.
    """
    span_node = namespace[node_uid]
    graph.add((span_node, RDF.type, IREC.Span))
    span_label = Literal(text, lang='en')
    graph.add((span_node, RDFS.label, span_label))
    return graph

def irec_constitutes(graph, subject_node_uid, object_node_uid,
                     subject_namespace: Namespace=SPANS, object_namespace: Namespace=SPANS):
    """
    Add (subject, IREC.constitutes, object): the label of the object span can be found somewhere
    inside the label of the subject span. The (same) graph is returned.
    """
    containing_span = subject_namespace[subject_node_uid]
    contained_span = object_namespace[object_node_uid]
    graph.add((containing_span, IREC.constitutes, contained_span))
    return graph

def irec_morp_sim(graph, subject_node_uid, object_node_uid,
                  subject_namespace: Namespace=SPANS, object_namespace: Namespace=SPANS):
    """
    Add (subject, IREC.isMorphologicallySimilarTo, object): the labels of both span nodes are
    morphologically similar. The (same) graph is returned.
    """
    source_span = subject_namespace[subject_node_uid]
    target_span = object_namespace[object_node_uid]
    graph.add((source_span, IREC.isMorphologicallySimilarTo, target_span))
    return graph

def irec_sem_sim(graph, subject_node_uid, object_node_uid,
                 subject_namespace: Namespace=SPANS, object_namespace: Namespace=SPANS):
    """
    Add (subject, IREC.isSemanticallySimilarTo, object): the labels of both span nodes are semantically
    similar, following the distributed semantics hypothesis. The (same) graph is returned.
    """
    source_span = subject_namespace[subject_node_uid]
    target_span = object_namespace[object_node_uid]
    graph.add((source_span, IREC.isSemanticallySimilarTo, target_span))
    return graph

def irec_related(graph, subject_node_uid, object_node_uid,
                 subject_namespace: Namespace=SPANS, object_namespace: Namespace=SPANS):
    """
    Add (subject, IREC.related, object): the subject span is related in SOME way to the object span.
    The (same) graph is returned.
    """
    source_span = subject_namespace[subject_node_uid]
    target_span = object_namespace[object_node_uid]
    graph.add((source_span, IREC.related, target_span))
    return graph

def irec_has_acronym(graph, subject_node_uid, object_node_uid,
                     subject_namespace: Namespace=SPANS, object_namespace: Namespace=SPANS):
    """
    Add (subject, IREC.hasAcronym, object): the label of the object node is an acronym for the label
    of the subject node. The (same) graph is returned.
    """
    long_form = subject_namespace[subject_node_uid]
    short_form = object_namespace[object_node_uid]
    graph.add((long_form, IREC.hasAcronym, short_form))
    return graph

def irec_is_acronym_of(graph, subject_node_uid, object_node_uid,
                       subject_namespace: Namespace=SPANS, object_namespace: Namespace=SPANS):
    """
    Add (subject, IREC.isAcronymOf, object): the label of the subject node is an acronym of the label
    of the object node. The (same) graph is returned.
    """
    short_form = subject_namespace[subject_node_uid]
    long_form = object_namespace[object_node_uid]
    graph.add((short_form, IREC.isAcronymOf, long_form))
    return graph

def irec_antonym(graph, subject_node_uid, object_node_uid,
                 subject_namespace: Namespace=SPANS, object_namespace: Namespace=SPANS):
    """
    Add (subject, IREC.hasAntonym, object): the label of the object node is an antonym of the label
    of the subject node. Only this one direction is added. The (same) graph is returned.
    """
    source_span = subject_namespace[subject_node_uid]
    opposite_span = object_namespace[object_node_uid]
    graph.add((source_span, IREC.hasAntonym, opposite_span))
    return graph


# SKOS 
def skos_node(graph, node_uid, text, namespace: Namespace=CONCEPTS):
    """
    Add a concept node of type SKOS.Concept (CONCEPTS namespace by default) and attach `text`
    as its preferred label. The (same) graph is returned.
    """
    concept_node = namespace[node_uid]
    graph.add((concept_node, RDF.type, SKOS.Concept))
    # the label is added through the dedicated wrapper
    return skos_prefLabel(graph, node_uid, text, namespace)

def skos_prefLabel(graph, node_uid, text, namespace: Namespace=CONCEPTS):
    """ Attach `text` as the English SKOS.prefLabel of the node; the (same) graph is returned. """
    labelled_node = namespace[node_uid]
    preferred_label = Literal(text, lang='en')
    graph.add((labelled_node, SKOS.prefLabel, preferred_label))
    return graph

def skos_altLabel(graph, node_uid, alt_label_uid, namespace: Namespace=CONCEPTS,
                  alt_label_namespace: Namespace=None):
    """
    Link a node and the node that represents its alternative label through SKOS.altLabel, in both directions.

    :param graph:                graph to add the two triples to; the same object is returned.
    :param node_uid:             UID of the node that receives the alternative label.
    :param alt_label_uid:        UID of the node that represents the alternative label.
    :param namespace:            namespace of `node_uid`.
    :param alt_label_namespace:  namespace of `alt_label_uid`; defaults to `namespace`. Pass this when the two
                                 nodes live in different namespaces (e.g., a concept in CONCEPTS whose
                                 alternative label is a span in SPANS), otherwise the triples would point to
                                 a node in the wrong namespace.
    """
    if alt_label_namespace is None:
        # backward compatible default: both nodes in the same namespace
        alt_label_namespace = namespace

    node = namespace[node_uid]
    alt_label_node = alt_label_namespace[alt_label_uid]
    graph.add((node, SKOS.altLabel, alt_label_node))
    graph.add((alt_label_node, SKOS.altLabel, node))
    return graph

def skos_exact_match(graph, subject_node_uid, object_node_uid,
                subject_namespace: Namespace=SPANS, object_namespace: Namespace=CONCEPTS):
    """
    Add (subject, SKOS.exactMatch, object); the two nodes are expected to be in different vocabularies
    (by default a span and a concept). The (same) graph is returned.
    """
    matched_subject = subject_namespace[subject_node_uid]
    matched_object = object_namespace[object_node_uid]
    graph.add((matched_subject, SKOS.exactMatch, matched_object))
    return graph

def skos_related(graph, subject_node_uid, object_node_uid,
                subject_namespace: Namespace=SPANS, object_namespace: Namespace=CONCEPTS):
    """
    Add (subject, SKOS.related, object); the two nodes are expected to be in different vocabularies
    (by default a span and a concept). The (same) graph is returned.
    """
    related_subject = subject_namespace[subject_node_uid]
    related_object = object_namespace[object_node_uid]
    graph.add((related_subject, SKOS.related, related_object))
    return graph
    
def skos_broader(graph, narrower_node_uid, broader_node_uid,
                 narrower_namespace: Namespace=CONCEPTS, broader_namespace: Namespace=CONCEPTS):
    """
    Record that one node is broader in meaning than another, adding both inverse SKOS statements.

    In SKOS, `A skos:broader B` states that B is the broader concept of A, and `B skos:narrower A` is its
    inverse. The following two triples are therefore added:
      * (narrower node, SKOS.broader, broader node)
      * (broader node, SKOS.narrower, narrower node)

    :param graph:               graph to add the two triples to; the same object is returned.
    :param narrower_node_uid:   UID of the more specific node.
    :param broader_node_uid:    UID of the more general node.
    :param narrower_namespace:  namespace of `narrower_node_uid`.
    :param broader_namespace:   namespace of `broader_node_uid`.
    """
    # each UID is resolved in its own namespace
    narrower_node = narrower_namespace[narrower_node_uid]
    broader_node = broader_namespace[broader_node_uid]

    graph.add((narrower_node, SKOS.broader, broader_node))
    graph.add((broader_node, SKOS.narrower, narrower_node))
    return graph
    
def skos_note(graph, node_uid, note_text, namespace: Namespace=CONCEPTS):
    """ Attach `note_text` as an English SKOS.note to the node; the (same) graph is returned. """
    annotated_node = namespace[node_uid]
    note_literal = Literal(note_text, lang='en')
    graph.add((annotated_node, SKOS.note, note_literal))
    return graph

def skos_definition(graph, node_uid, definition_text, namespace: Namespace=CONCEPTS):
    """
    Attach `definition_text` as an English SKOS.definition to the node; the (same) graph is returned.
    Open question from the author: does the namespace indicate the source of the definition?
    """
    defined_node = namespace[node_uid]
    definition_literal = Literal(definition_text, lang='en')
    graph.add((defined_node, SKOS.definition, definition_literal))
    return graph

### Create the graph and bind namespaces

In [63]:
# the single graph that all spans, concepts and relations in this notebook are added to
irec_graph = Graph()

# bind prefixes for the root and Wikidata namespaces (note: the Wikidata namespace gets the prefix "wikipedia")
irec_graph.bind("root", ROOT)
irec_graph.bind("wikipedia", WIKI)

In [64]:
# import our vocabulary of classes/relations
# the data folder is resolved relative to the current working directory
graph_data_fp = Path.cwd().joinpath("data", "graph_data")
irec_graph.parse(graph_data_fp.joinpath("IREC.rdf"))

<Graph identifier=N23da844f2e644cff9cf41c3e49e6fdc5 (<class 'rdflib.graph.Graph'>)>

In [65]:
# one UID assigner is shared by all cells below, so UIDs stay consistent across the whole graph
ua = UID_assigner()
# UID of the root node inside the ROOT namespace; add_top_concept() reads this variable
top_concept_uid = 'ROOT'

# give the root node a preferred label
irec_graph = skos_prefLabel(irec_graph, top_concept_uid, "NUU_graph_root", namespace = ROOT)

### Add base antonyms

In [66]:
# Antonyms capture a dichotomy in the meaning of words.
# They are taken from NLTK's version of WordNet, which mainly provides antonyms for adjectives and adverbs.
# If a lemma name occurs in several synsets, the antonyms found last are the ones that are kept.
wordnet_antonyms = {}
for synset in wn.all_synsets():
    if synset.pos() not in ['a', 's']:    # only adjectives and satellite adjectives
        continue
    for lemma in synset.lemmas():
        opposites = lemma.antonyms()
        if not opposites:                 # lemma without antonym
            continue
        lemma_name = str(lemma.name()).strip()
        wordnet_antonyms[lemma_name] = [opposite.name() for opposite in opposites]

# Example of a useful antonym for us
wordnet_antonyms['hot']

['cold']

In [67]:
# the reverse lookup for the example above
wordnet_antonyms['cold']

['hot']

In [68]:
# Every word with an antonym becomes a span node; every antonym becomes a span node as well,
# and the pair is linked through irec:hasAntonym.
for span, antonyms in wordnet_antonyms.items():
    span_uid, span_is_new = ua.assign_UID(span, SPANS)
    if span_is_new:
        # a new UID means that the span is not part of the graph yet
        irec_graph = irec_span(irec_graph, span_uid, span)

    for antonym in antonyms:
        antonym_uid, antonym_is_new = ua.assign_UID(antonym, SPANS)
        if antonym_is_new:
            irec_graph = irec_span(irec_graph, antonym_uid, antonym)

        # add the antonym relation
        irec_graph = irec_antonym(irec_graph, span_uid, antonym_uid)
       

### Add domain terms extracted from the Approved documents as Spans

In [69]:
# Load the list of extracted domain terms.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
# The context manager makes sure the file handle is closed again after loading.
with open(graph_data_fp.joinpath('domain_terms.pkl'), 'rb') as domain_terms_file:
    domain_terms = pickle.load(domain_terms_file)

In [70]:
# NOTE(review): this displays the complete list; consider showing a slice (e.g., domain_terms[:20]) to keep the output short
domain_terms

['expansion water',
 'the hot water system',
 'the hot water',
 'a hot water system',
 'the hot tap',
 'the water supply',
 'PLY',
 'Water Fittings',
 'Fittings',
 'the storage vessel',
 'the Gas Safety Installation',
 'the Gas Safety Installation Use',
 'Gas Safety Installation',
 'Regulations 1996',
 'Regulations 1992',
 'Regulations 1994',
 'ductwork',
 'ductwork serving',
 'pipework',
 'Electrical safety Dwellings',
 'industrial processes',
 'the stored water',
 'a sanitary conveniences',
 'sanitary conveniences',
 'sanitary fittings',
 'a sanitary convenience',
 'a temperature relief valve',
 'cistern lids',
 'cylinders',
 'ignition',
 'cylinder',
 'steam',
 'engine',
 'thermoplastic material',
 'thermoplastics',
 'thermoplastic product',
 'thermoplastic',
 'thermoplastic core',
 'thermoplastic materials a )',
 'thermoplastic panels',
 'thermoplastic substrate',
 'cistern',
 'cisterns',
 'the cistern',
 'washing facilities',
 'shower facilities',
 'changing facilities',
 'internal

In [71]:
# Every extracted domain term becomes a span node, unless a span with the same text is known already
for domain_term in domain_terms:
    term_uid, term_is_new = ua.assign_UID(domain_term, SPANS)
    if not term_is_new:
        continue
    irec_graph = irec_span(irec_graph, term_uid, domain_term)

### Add Acronyms that were grabbed from the text

These can help:
* remove terms where the boundary detection is off
* avoid suggesting similar acronyms, e.g., suggest that EPC and EPS are similar 

In [72]:
# Mapping of acronym -> list of candidate long forms (spans), as grabbed from the text.
# The candidates are unfiltered: several entries are clearly truncated or wrong (e.g., 'ment', 'bility', 'ecification').
acronyms = {'PAS': ['ecification', 'Specification'],  'GSIUR': ['Regulations 1998'],  'HSE': ['Regulations 2000',   'water systems',   'Safety Executive',   'Health and Safety Executive'],  'PE': ['Polyethylene', 'polyethylene'],  'DN': ['pipe'],  'DCLG': ['land', 'Local Government', 'England', 'ment'],  'PP': ['Polypropylene'],  'BCB': ['Control Body',   'the building control body',   'Building control body',   'building control body',   'Building Control Body'],  'SRHRV': ['ventilator',   'single room heat recovery ventilator',   'a single room heat recovery ventilator'],  'MVHR': ['blocks', 'heat recovery'],  'WC': ['sets'],  'TFA': ['the total floor area'],  'LRV': ['Light reflectance value'],  'BER': ['Building CO2 Emission Rate', 'CO2 Emission Rate'],  'TER': ['CO2 Emission Rate',   'the Target CO2 Emission Rate',   'Target CO2 Emission Rate'],  'DER': ['CO2 Emission Rate', 'the Dwelling CO2 Emission Rate'],  'EPC': ['energy performance certificate'],  'TFEE': ['Target Fabric Energy Efficiency',   'Fixed building services',   'Energy Efficiency'],  'DHF': ['the Door and Hardware Federation', 'Door and Hardware Federation'],  'REI': ['fire resistance', 'bility'],  'PHE': ['horizontal evacuation'],  'W': ['the final exit', 'final exit'],  'DWELLINGS': ['RESIDENTIAL'],  'OTHER': ['RESIDENTIAL'],  'TSO': ['Office', 'The Stationery Office'],  'FPA': ['the Fire Protection Association', 'Association'],  'A': ['absorption area'],  'AT': ['absorption area'],  'DECC': ['Climate Change'],  'NCM': ['the National Calculation Methodology'],  'ADCAS': ['Allied Services'],  'DFEE': ['Energy Efficiency'],  'LPA': ['the local planning authority', 'planning authority'],  'UKAS': ['the United Kingdom Accreditation Service'],  'BSI': ['the British Standards Institution'],  'EA': ['Accreditation'],  'BGS': ['British Geological Survey'],  'HBN': ['Notes'],  'GGF': ['Glazing Federation'],  'E': ['terms of integrity'],  'TRADA': ['the Timber Research and Development Association', 
'Association'],  'ACOP': ['Code of Practice'],  'ATTMA': ['Association'],  'RVA': ['Association', 'the Residential Ventilation Association'],  'TEHVA': ['Association'],  'DSA': ['Association'],  'CIRIA': ['Association'],  'MCRMA': ['Association'],  'DSMA': ['Association'],  'OFTEC': ['Association'],  'WHO': ['Organisation'],  'GAI': ['Architectural Ironmongers'],  'MEV': ['mechanical extract', 'extract ventilation'],  'VST': ['Vicat softening temperature'],  'SCI': ['Guild Steel Construction Institute'],  'FBE': ['the Built Environment', 'ment'],  'DSER': ['Rating'],  'WER': ['Rating'],  'CIWM': ['ment', 'Wastes Management'],  'EOTA': ['ment'],  'GQRA': ['ment'],  'BRE': ['ment', 'the Building Research Establishment'],  'PPS': ['ment'],  'PSV': ['Passive stack ventilation'],  'EST': ['the Energy Saving Trust'],  'CIBSE': ['Ventilation hygiene toolkit', 'Building Services Engineers'],  'AGS': ['Geoenvironmental Specialists'],  'SPAB': ['Ancient Buildings'],  'UF': ['urea formaldehyde'],  'ODPM': ['the Deputy Prime Minister']}

In [73]:
# Add every acronym and its candidate long forms as spans, and link them in both directions
# through irec:isAcronymOf / irec:hasAcronym.
for acronym, spans in acronyms.items():

    acronym_uid, acronym_is_new = ua.assign_UID(acronym, SPANS)
    if acronym_is_new:
        # many of these will be in the graph already
        irec_graph = irec_span(irec_graph, acronym_uid, acronym)

    for span in spans:
        # these are expected to be part of the graph already; if one is not, add it as a proper span node,
        # otherwise the relations below would refer to a node without type and label
        span_uid, span_is_new = ua.assign_UID(span, SPANS)
        if span_is_new:
            irec_graph = irec_span(irec_graph, span_uid, span)

        # todo;
        #  could do some filtering here of the clearly erroneous span-acronym combinations
        #  or leave this until later, using the graph...

        if (SPANS[acronym_uid], IREC.isAcronymOf, SPANS[span_uid]) not in irec_graph:
            irec_graph = irec_is_acronym_of(irec_graph, acronym_uid, span_uid)
            irec_graph = irec_has_acronym(irec_graph, span_uid, acronym_uid)
            
    

### Add CONCEPTS: defined terms from the Approved Documents

In [74]:
# read the defined terms from the "Definitions" sheet of the Excel file
# keep_default_na=False: empty cells are read as empty strings instead of NaN
definitions = pd.read_excel(graph_data_fp.joinpath("Approved Documents and derived terms.xlsx"), sheet_name="Definitions", keep_default_na=False)

In [75]:
# preview the first three rows
definitions[:3]

Unnamed: 0,Term,Definition,Alternative labels,Note
0,Absorption,"Conversion of sound energy to heat, often by t...",,
1,Absorption coefficient,A quantity characterising the effectiveness of...,,See BS EN 20354:1993.
2,Absorptive material,Material that absorbs sound energy.,,


In [76]:
# todo; below, perhaps check existence of relations to avoid duplication, e.g.;
# if (SPANS[acronym_uid], IREC.isAcronymOf, SPANS[span_uid]) not in irec_graph:

In [77]:
concepts_definitions_dict = {} # keep track of definitions for parsing later

# create graph from definitions first
# NOTE(review): `.iloc[1:]` skips the first data row (row 0 in the preview above); the header row is not part
#               of the DataFrame, so this may drop a term unintentionally. Left as is, because the integer
#               concept UIDs used further below depend on the order in which terms are added.
for i, row in definitions.iloc[1:].iterrows():
    # lowercase unless the term is fully uppercase (an abbreviation); strip whitespace, like for the index terms
    term = row['Term'].strip() if row['Term'].isupper() else row['Term'].lower().strip()
    alternative_labels = row['Alternative labels']
    definition = row['Definition']
    note = row['Note']

    concept_uid, new_uid = ua.assign_UID(term, CONCEPTS)

    if new_uid: # add the term
        irec_graph = skos_node(irec_graph, concept_uid, term)

    # a term can occur more than once, each time with its own definition. setdefault() also keeps this cell
    # re-runnable: on a second run the UIDs exist already, while this dictionary starts empty again.
    concepts_definitions_dict.setdefault(concept_uid, []).append(
        {'prefLabel': term, 'definition': definition, 'note': note}
    )

    if note:
        irec_graph = skos_note(irec_graph, concept_uid, note)

    # always expecting a definition
    irec_graph = skos_definition(irec_graph, concept_uid, definition)

    if alternative_labels:
        # lowercase if not an abbreviation; skip empty fragments (e.g., caused by a trailing ", ")
        alt_labels = [x.strip() if x.isupper() else x.lower().strip()
                      for x in alternative_labels.split(", ") if x.strip()]

        for alt_label in alt_labels:
            # treated like a span, that refers to the same concept <-- this is a deviation from previously
            alt_label_uid, new_uid = ua.assign_UID(alt_label, SPANS)
            if new_uid:
                irec_graph = irec_span(irec_graph, alt_label_uid, alt_label)

            # The concept lives in CONCEPTS, the alternative label is a span in SPANS. The triples are added
            # directly, so that each UID is resolved in its own namespace (in both directions).
            irec_graph.add((CONCEPTS[concept_uid], SKOS.altLabel, SPANS[alt_label_uid]))
            irec_graph.add((SPANS[alt_label_uid], SKOS.altLabel, CONCEPTS[concept_uid]))

### Add SPANS: glossary/index terms from the Approved Documents

In [78]:
# read the glossary/index terms from the "Index terms" sheet of the same Excel file
# keep_default_na=False: empty cells are read as empty strings instead of NaN
index_terms = pd.read_excel(graph_data_fp.joinpath("Approved Documents and derived terms.xlsx"), sheet_name="Index terms", keep_default_na=False)

In [79]:
# preview the first three rows
index_terms[:3]

Unnamed: 0,Term,AltLabel(s),Related terms,Broader term
0,abbreviated eaves,,,eaves
1,Access floors,access floor,Platform floors,
2,Access for fire service,fire access,,Fire service facilities


In [80]:
# add triples from index terms / glossaries
# we will treat glossary terms like SPANS, relations between their labels occur between the spans

def _normalise_label(label):
    """ Strip surrounding whitespace; lowercase the label unless it is fully uppercase (i.e., an acronym). """
    return label.strip() if label.isupper() else label.lower().strip()


def _split_labels(cell_text):
    """ Split a ', '-separated cell into normalised labels, skipping empty fragments. """
    return [_normalise_label(label) for label in cell_text.split(", ") if label.strip()]


# NOTE(review): `.iloc[1:]` skips the first data row (row 0 in the preview above); the header row is not part
#               of the DataFrame, so this may drop a term unintentionally -- confirm.
for i, row in index_terms.iloc[1:].iterrows():
    term = _normalise_label(row['Term'])
    alternative_labels = row['AltLabel(s)']
    related_terms = row['Related terms']
    broader_term = row['Broader term']

    # add the term
    term_uid, new_uid = ua.assign_UID(term, SPANS)
    if new_uid:
        irec_graph = irec_span(irec_graph, term_uid, term)
        # hacky visualisation solution; connect all terms to the graph root -> need to find a better solution
        # the term is a span, so its UID has to be resolved in SPANS (add_top_concept defaults to CONCEPTS)
        irec_graph = add_top_concept(irec_graph, term_uid, SPANS)

    if alternative_labels:
        for alt_label in _split_labels(alternative_labels):
            # treated like a span, that refers to the same concept <-- this is a deviation from previously
            alt_label_uid, new_uid = ua.assign_UID(alt_label, SPANS)
            if new_uid:
                irec_graph = irec_span(irec_graph, alt_label_uid, alt_label)

            if alt_label.isupper():
                # there are acronyms among the alternative labels
                irec_graph = irec_has_acronym(irec_graph, term_uid, alt_label_uid)
                irec_graph = irec_is_acronym_of(irec_graph, alt_label_uid, term_uid)
            else:
                # open question from the author: use SKOS here, or rather an IREC property?
                # both UIDs are span UIDs, so they have to be resolved in SPANS (skos_altLabel defaults to CONCEPTS)
                irec_graph = skos_altLabel(irec_graph, term_uid, alt_label_uid, SPANS)

    if related_terms:
        for rel_term in _split_labels(related_terms):
            related_uid, new_uid = ua.assign_UID(rel_term, SPANS)
            if new_uid:
                irec_graph = irec_span(irec_graph, related_uid, rel_term)

            if rel_term.isupper():
                # there are acronyms among the related labels as well
                irec_graph = irec_has_acronym(irec_graph, term_uid, related_uid)
                irec_graph = irec_is_acronym_of(irec_graph, related_uid, term_uid)
            else:
                irec_graph = irec_related(irec_graph, term_uid, related_uid)

    if broader_term:
        # We expect 1 broader term at most currently, assuming we'd like a tree structure (DAG with 1 parent at most)
        b_term = _normalise_label(broader_term)
        b_term_uid, new_uid = ua.assign_UID(b_term, SPANS)
        if new_uid:
            irec_graph = irec_span(irec_graph, b_term_uid, b_term)

        # We do not expect that the broader term is necessarily a concept, although this is a feature we may rely on later
        irec_graph = skos_broader(irec_graph, term_uid, b_term_uid, SPANS, SPANS)

In [81]:
# irec_graph.serialize(destination="graph/approved_doc_terms_only.ttl")

### Print some insight in the graph so far

In [82]:
# number of distinct span texts that the UID assigner has registered so far
ua.count_nodes_in_namespace(SPANS)

Number of nodes in 'https://example.org/irec-spans': 12590


12590

In [83]:
# number of distinct concept terms that the UID assigner has registered so far
ua.count_nodes_in_namespace(CONCEPTS)

Number of nodes in 'https://example.org/irec-concepts': 296


296

In [84]:
# print all triples that have the concept with integer UID 291 as subject
ua.print_node_by_id(irec_graph, 291, CONCEPTS)

291 ; type ; Concept
291 ; prefLabel ; wet room
291 ; note ; For the purposes of Part F, sanitary accommodation is also regarded as a wet room.
291 ; definition ; A room used for domestic activities (such as cooking, clothes washing and bathing) which give rise to significant production of airborne moisture, e.g. a kitchen, utility room or bathroom. 
291 ; definition ; WC or bathroom compartment with tanking and drainage laid to fall to a connected gulley capable of draining the floor area when used as a shower.


In [85]:
# span UIDs are the URL-quoted span text, so the text is quoted to look the span up by its id
ua.print_node_by_id(irec_graph, urllib.parse.quote('sanitary accommodation'), SPANS)

sanitary%20accommodation ; type ; Span
sanitary%20accommodation ; label ; sanitary accommodation
sanitary%20accommodation ; related ; sanitary%20appliance


In [86]:
# same lookup as above, but by span text; the UID assigner resolves the text to its UID
ua.print_node_by_text(irec_graph, 'sanitary accommodation', SPANS)

sanitary%20accommodation ; type ; Span
sanitary%20accommodation ; label ; sanitary accommodation
sanitary%20accommodation ; related ; sanitary%20appliance


As you can see in the examples above, the concept `wet room` and the span `sanitary accommodation` are related:
* The concept `wet room` is provided with a note in the merged approved documents.
* The text inside this note describes how, for part F of the approved documents, `sanitary accommodation` is regarded as a `wet room`. 

Based on the above, we'd like to link the span `sanitary accommodation` to the concept `wet room`. While we could parse the note in more detail, and identify that a `skos:altLabel` relation should be added, we'll use a more generic approach:
* Any span that is found inside a definition or note of a concept will be linked through `irec:related`
* Based on the definitions above, potential spans related to the `wet room` concept then become: `sanitary accommodation`, `airborne moisture`, `kitchen`, `utility room`, `bathroom`, `WC`, `tanking`, `drainage`, `gulley`, `shower`.


<div class="alert alert-block alert-info">
We believe that the types of relations described above can be valuable and would like to provide more definitions for more terms, to help interrelate more spans and concepts. To this end, we first try to find WikiData definitions for all concepts and spans. 
</div>

### Grab Wikidata definitions for Concept and Span nodes, and store locally for re-use

* First, we try to grab all wiki definitions for all spans and concepts that are in the graph (so far)

In [87]:
# set up the SPARQL endpoint for wikidata; queries below are sent to the public Wikidata query service

sparql_wrapper = SPARQLWrapper("https://query.wikidata.org/sparql")

In [88]:
def get_wiki_matches(graph_sparql_endpoint: SPARQLWrapper,
                     jargon_term_and_uids: List) -> Dict[Any, List[Dict[str, str]]]:
    """
    Query WikiData for every term and collect the matching entities, grouped by the term's UID.

    For each `(term, uid)` pair the endpoint is asked for all entities whose English `rdfs:label`
    equals the term exactly, together with every class the entity is linked to through
    `wdt:P31` (instance of) or `wdt:P279` (subclass of). A term can match several entities and
    an entity can have several classes, so one UID may end up with multiple entries.

    :param graph_sparql_endpoint: SPARQLWrapper pointing at the WikiData endpoint; its query and
                                  return format are overwritten by this function.
    :param jargon_term_and_uids:  Iterable of `(term, uid)` pairs.
    :return: Dict mapping each uid that had at least one match to a list of dicts with the keys
             'prefLabel', 'class_uid', 'class_label', 'WikiUID' and, only when WikiData provides
             a description, 'WikiDefinition'. UIDs without any match are absent from the dict.
    """
    all_wiki_definitions = {}
    # we want to grab the term (subject), any definition (subjectDescription) and the class (subjectClass)
    sparql_q = """
               SELECT ?subject ?subjectDescription ?classUID ?className WHERE {
                  ?subject rdfs:label "$QUERY"@en.
                  ?subject wdt:P31|wdt:P279 ?classUID.
                  SERVICE wikibase:label { bd:serviceParam wikibase:language "en".}
                  ?classUID  rdfs:label ?className  FILTER(LANG(?className) = "en").
                }
               """

    # the return format is the same for every query, so it only has to be configured once
    graph_sparql_endpoint.setReturnFormat(JSON)

    for term, uid in tqdm(jargon_term_and_uids):
        # escape backslashes and double quotes, otherwise a term containing them would terminate
        # the quoted SPARQL literal early and produce a malformed query
        escaped_term = term.replace("\\", "\\\\").replace('"', '\\"')
        graph_sparql_endpoint.setQuery(sparql_q.replace("$QUERY", escaped_term))
        try:
            json_output = graph_sparql_endpoint.query().convert()
        except Exception as error:
            # `Exception` (not a bare except) so that a KeyboardInterrupt can still stop a long run.
            # On failure, wait 2s; One client is allowed 30 error queries per minute
            print(f"Error for query, you should check what's wrong with the term: {term} ({error!r})")
            time.sleep(2)
            continue

        # sometimes multiple Wiki UIDs for a single term, we grab them all here
        for binding in json_output['results']['bindings']:
            if 'subject' not in binding:
                # nothing to link to without a WikiData entity
                continue

            wiki_entry = {'prefLabel': term,
                          'class_uid': binding['classUID']['value'] if 'classUID' in binding else "",
                          'class_label': binding['className']['value'] if 'className' in binding else "",
                          'WikiUID': binding['subject']['value']}
            # the description is optional; entities without one are still kept for their UID
            if 'subjectDescription' in binding:
                wiki_entry['WikiDefinition'] = binding['subjectDescription']['value']

            all_wiki_definitions.setdefault(uid, []).append(wiki_entry)
    return all_wiki_definitions


In [89]:
# `ua.UIDs` is keyed by namespace string; each entry maps a label to its UID, which gives us the (term, uid) pairs
concepts_and_uids = list(ua.UIDs[CONCEPTS.placeholder.defrag().__reduce__()[1][0]].items())
spans_and_uids = list(ua.UIDs[SPANS.placeholder.defrag().__reduce__()[1][0]].items())

In [90]:
# First run for the Concepts: re-use the locally stored WikiData matches when they exist,
# otherwise query WikiData and store the result for the next run
concept_wiki_dict_fp = graph_data_fp.joinpath("concept_wiki_dict.json")
if concept_wiki_dict_fp.exists():
    concept_wiki_dict = json.loads(concept_wiki_dict_fp.read_text())
else:
    concept_wiki_dict = get_wiki_matches(sparql_wrapper, concepts_and_uids)
    concept_wiki_dict_fp.write_text(json.dumps(concept_wiki_dict, indent=2))

In [91]:
# Now run for the spans: same load-or-query logic as for the concepts, with its own cache file
span_wiki_dict_fp = graph_data_fp.joinpath("span_wiki_dict.json")
if span_wiki_dict_fp.exists():
    span_wiki_dict = json.loads(span_wiki_dict_fp.read_text())
else:
    span_wiki_dict = get_wiki_matches(sparql_wrapper, spans_and_uids)
    span_wiki_dict_fp.write_text(json.dumps(span_wiki_dict, indent=2))

In [92]:
# Share of nodes per namespace for which at least one WikiData match was stored
print(f"Number of concepts with WikiData definitions: {len(concept_wiki_dict)} "
      f"({len(concept_wiki_dict) / ua.count_nodes_in_namespace(CONCEPTS) * 100:.2f}%)")
print(f"Number of spans with WikiData definitions: {len(span_wiki_dict)} "
      f"({len(span_wiki_dict) / ua.count_nodes_in_namespace(SPANS) * 100:.2f}%)")



Number of nodes in 'https://example.org/irec-concepts': 296
Number of concepts with WikiData definitions: 244 (82.43%)
Number of nodes in 'https://example.org/irec-spans': 12590
Number of spans with WikiData definitions: 655 (5.20%)


* Some examples of, and insight into, the definitions that come from the different sources

In [93]:
# List of definitions from approved documents for the concept with UID '1' (one dict per definition)
concepts_definitions_dict['1']

[{'prefLabel': 'absorption coefficient',
  'definition': 'A quantity characterising the effectiveness of a sound absorbing surface. The proportion of sound energy absorbed is given as a number between zero (for a fully reflective surface) and one (for a fully absorptive surface). Note that sound absorption coefficients determined from laboratory measurements may have values slightly larger than one.',
  'note': 'See BS EN 20354:1993.'}]

In [94]:
# List of definitions for the same concept (UID '1'), from WikiData (one dict per matched entity/class pair)
concept_wiki_dict['1']

[{'prefLabel': 'absorption coefficient',
  'class_uid': 'http://www.wikidata.org/entity/Q107715',
  'class_label': 'physical quantity',
  'WikiUID': 'http://www.wikidata.org/entity/Q97368968',
  'WikiDefinition': 'measure for the exponential reduction of a quantity along a path due to absorption',
  'Spans in definitions and notes': ['a path due',
   'a quantity',
   'the exponential reduction']}]

In [95]:
# WikiData definitions for a related span: first list the span UIDs containing 'absor', then show one of them
print([span_uid for span_uid in span_wiki_dict if 'absor' in span_uid])
span_wiki_dict['absorbent']

['absorbent']


[{'prefLabel': 'absorbent',
  'class_uid': 'http://www.wikidata.org/entity/Q3505845',
  'class_label': 'state',
  'WikiUID': 'http://www.wikidata.org/entity/Q110147344',
  'WikiDefinition': 'having the ability or tendency to absorb; able to soak up liquid easily; absorptive.',
  'Spans in definitions and notes': ['liquid easily', 'the ability tendency']}]

### Only keep WikiData definitions that belong to classes that we've annotated
* We have previously annotated the relevance of all WikiData classes returned for the defined terms and index terms in the Approved Documents.

In [96]:
# index_col=1 makes the second CSV column the index; the cell that builds `wikiclass_dict` reads the
# stringified list of WikiData class UIDs for each annotated class from that index
annotated_wikidata_classes_df = pd.read_csv(graph_data_fp.joinpath("wiki_classes_annotated.csv"), index_col=1)

In [97]:
# Preview the first three annotated classes
annotated_wikidata_classes_df.head(3)

Unnamed: 0_level_0,WikiData class,Annotation,Example spans
WikiData UIDs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
['Q107715'],physical quantity,y,"['sound pressure level', 'density', 'area', 's..."
['Q82799'],name,n,['access point']
['Q180160'],metadata,n,['access point']


In [98]:
# Map every individual WikiData class UID (e.g. 'Q107715') to the annotation of its class
wikiclass_dict = {}
for uid_list_string, class_annotations_examples in annotated_wikidata_classes_df.iterrows():
    # The index holds a stringified list such as "['Q107715']". Strip the brackets first and the
    # quotes/whitespace per element, so that rows listing several UIDs ("['Q1', 'Q2']") also
    # yield clean keys instead of keys with stray quotes and spaces.
    uid_list = [part.strip().strip("'\"") for part in str(uid_list_string).strip().strip("[]").split(',')]
    for uid in uid_list:
        if not uid:
            # an empty list ("[]") has no UID to register
            continue
        wikiclass_dict[uid] = {
            'Class': class_annotations_examples['WikiData class'],
            'Annotation': class_annotations_examples['Annotation'],
            'Example spans': class_annotations_examples['Example spans']
        }
    

In [99]:
def filter_wikidata_classes(wiki_class_dict, term_dict):
    """
    Keep only the definitions whose WikiData class was annotated with 'y'.

    :param wiki_class_dict: Dict mapping a WikiData class UID (e.g. 'Q107715') to a dict that
                            holds at least the key 'Annotation'.
    :param term_dict:       Dict mapping a term UID to a list of definition dicts, each with a
                            'class_uid' holding the class URI (the UID is its last path segment).
    :return: Tuple `(new_term_dict, removed_definitions)`:
             * new_term_dict has the same layout as `term_dict`, restricted to definitions whose
               class is annotated 'y'; term UIDs without any such definition are left out.
             * removed_definitions is a flat list of the definitions whose class is annotated
               with anything other than 'y'.
             Definitions whose class does not occur in `wiki_class_dict` end up in neither.
    """
    new_term_dict = {}
    removed_definitions = []
    for uid, definition_dict_list in term_dict.items():
        for definition_dict in definition_dict_list:
            # [-1] instead of [1]: an empty or slash-less class_uid must not raise an IndexError
            class_uid = definition_dict['class_uid'].rsplit("/", 1)[-1]
            # look the class up in the dict that was passed in, not in a notebook-level global
            class_info = wiki_class_dict.get(class_uid)
            if class_info is None:
                # class was never annotated: neither kept nor reported as removed
                continue

            if class_info["Annotation"] == 'y':
                new_term_dict.setdefault(uid, []).append(definition_dict)
            else:
                removed_definitions.append(definition_dict)

    return new_term_dict, removed_definitions

In [100]:
# Keep only the concept definitions whose WikiData class was annotated with 'y'; note that this rebinds concept_wiki_dict
concept_wiki_dict, removed_definitions = filter_wikidata_classes(wikiclass_dict, concept_wiki_dict)

In [101]:
# Same filter for the spans. This rebinds `removed_definitions`, so the definitions removed for the
# concepts in the previous cell are no longer reachable under that name.
span_wiki_dict, removed_definitions = filter_wikidata_classes(wikiclass_dict, span_wiki_dict)

### Parse all definitions (including WikiData) to identify additional spans

In [102]:
# Term extractor that is handed to add_spar_labels to find spans inside definitions and notes
# NOTE(review): max_num_cpu_threads=4 is hard-coded; presumably caps the worker threads — confirm in utils.spar_utils
te = TermExtractor(max_num_cpu_threads=4)

In [103]:
def add_spar_labels(input_dict: Dict[str, List[Dict[str, Any]]],
                    term_extractor: TermExtractor) -> Dict[str, List[Dict[str, Any]]]:
    """
    Add the spans found in the definition texts to every definition dict.

    For each definition dict, the text stored under 'WikiDefinition', 'definition' and 'note'
    (whichever are present) is split into sentences, stripped of non-ASCII characters, run
    through the term extractor and cleaned. The resulting spans are stored under the key
    'Spans in definitions and notes'. Dicts that already have this key are skipped, so cached
    results are not recomputed.

    :param input_dict:     Dict mapping a UID to a list of definition dicts; modified in place.
    :param term_extractor: Extractor providing `split_into_sentences` and `process_sentences`.
    :return: The same `input_dict` object, for convenience.
    """
    text_keys = ('WikiDefinition', 'definition', 'note')
    for uid, definition_dict_list in tqdm(input_dict.items()):
        for definition_dict in definition_dict_list:
            if 'Spans in definitions and notes' in definition_dict:
                # spans already computed for this definition_dict, continuing to check next
                continue

            spartxt_objects = []
            for key, to_be_parsed in definition_dict.items():
                if key not in text_keys or not to_be_parsed:
                    # not a text field, or nothing to parse
                    continue
                sentences = term_extractor.split_into_sentences(to_be_parsed)
                # cleaning spans as well; drop anything that cannot be represented in ASCII
                sentences = [remove_unicode_chars(s).encode("ascii", "ignore").decode() for s in sentences]
                spartxt_objects += custom_cleaning_rules(term_extractor.process_sentences(sentences))

            definition_dict['Spans in definitions and notes'] = spartxt_objects
    return input_dict

In [104]:
# Re-use the stored definitions (with their spans) when available; otherwise compute the spans and store them
concept_definitions_dict_fp = graph_data_fp.joinpath("concepts_definitions_dict.json")
if concept_definitions_dict_fp.exists():
    print("Loading previously computed concepts_definitions_dict with SPaR.txt objects from file")
    concepts_definitions_dict = json.loads(concept_definitions_dict_fp.read_text())
else:
    print("Computing SPaR.txt objects for concepts_definitions_dict")
    concepts_definitions_dict = add_spar_labels(concepts_definitions_dict, te)
    concept_definitions_dict_fp.write_text(json.dumps(concepts_definitions_dict, indent=2))

Loading previously computed concepts_definitions_dict with SPaR.txt objects from file


In [105]:
# Compute the spans for every WikiData definition of the concepts (entries that already have them are skipped)
concept_wiki_dict = add_spar_labels(concept_wiki_dict, te)
# Save the updated concept_wiki_dict, which will be loaded in previous cells anyway
# NOTE(review): at this point concept_wiki_dict is the class-filtered dict, so this replaces the stored query results
concept_wiki_dict_fp.write_text(json.dumps(concept_wiki_dict, indent=2))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 244/244 [00:00<00:00, 190650.18it/s]


In [106]:
# Compute the spans for every WikiData definition of the spans (entries that already have them are skipped)
span_wiki_dict = add_spar_labels(span_wiki_dict, te)
# Save the updated span_wiki_dict, which will be loaded in previous cells anyway
# NOTE(review): at this point span_wiki_dict is the class-filtered dict, so this replaces the stored query results
span_wiki_dict_fp.write_text(json.dumps(span_wiki_dict, indent=2))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 655/655 [00:00<00:00, 367773.64it/s]


### Add any new spans to the graph

In [107]:
# Gather, per defined term, the cleaned spans found in its definitions and notes across all three sources
all_spartxt_objects = {}
for definitions_by_uid in (concepts_definitions_dict, concept_wiki_dict, span_wiki_dict):
    for definition_entries in definitions_by_uid.values():
        for entry in definition_entries:
            label = entry['prefLabel']
            cleaned_spans = custom_cleaning_rules(entry['Spans in definitions and notes'])
            if label in all_spartxt_objects:
                # the same term can be defined by several sources; accumulate its spans
                all_spartxt_objects[label] += cleaned_spans
            else:
                all_spartxt_objects[label] = cleaned_spans

# de-duplicate the spans over all terms
unique_spans_from_defined_terms = list({span for spans in all_spartxt_objects.values() for span in spans})
print("Number of defined terms: ", len(all_spartxt_objects))
print("Number of new spans: ", len(unique_spans_from_defined_terms))

Number of defined terms:  992
Number of new spans:  3429


In [108]:
# Peek at 10 randomly drawn spans; no seed is set in this cell, so expect a different sample per run
# unless `random` was seeded earlier in the notebook
random.sample(unique_spans_from_defined_terms, 10)

['calibration',
 'rock material',
 'head jamb lintel',
 'higher',
 'The International Organization',
 'the circulation',
 'The leading edge',
 'decision parameters',
 'A roof space',
 'fixture']

In [109]:
# Sanity check: look up the UID that is stored for a known span label.
# `ua.UIDs` is indexed by namespace first; presumably the key is the namespace URI without its '#' fragment — confirm
n_space = SPANS.placeholder.defrag().__reduce__()[1][0]
ua.UIDs[n_space]['sanitary accommodation']

'sanitary%20accommodation'

In [110]:
# add a related label between the defined concept or span, and the span found in a definition
for term, related_spans in all_spartxt_objects.items():

    # add the term as a span if needed
    term_uid, new_uid = ua.assign_UID(term, SPANS)
    if new_uid:
        irec_graph = irec_span(irec_graph, term_uid, term)

    # fully upper-case spans (e.g. 'WC') keep their casing, everything else is lower-cased
    rel_spans = [x.strip() if x.isupper() else x.lower().strip() for x in related_spans]
    for rel_term in rel_spans:
        related_uid, new_uid = ua.assign_UID(rel_term, SPANS)
        if new_uid:
            irec_graph = irec_span(irec_graph, related_uid, rel_term)

        # link to the term whose definition contained the span; this has to be `term_uid` from this
        # iteration, not a `concept_uid` variable left over from an earlier cell
        irec_graph = irec_related(irec_graph, term_uid, related_uid)
    

### Add WikiData definitions to graph

In [111]:
# Mirror every WikiData match of a concept in the graph: a node in the WIKI namespace,
# its definition (when there is one) and an exact match with the concept node
for term_uid, definition_dict_list in concept_wiki_dict.items():
    for definition_dict in definition_dict_list:
        term = definition_dict['prefLabel']
        # the WikiData identifier is the last path segment of the entity URI
        wiki_uid = definition_dict['WikiUID'].rsplit('/', 1)[1]

        # keep track of the uid that is added, then add the node itself in the WIKI namespace
        ua.keep_track_of_existing_UID(term, wiki_uid, WIKI)
        irec_graph = skos_node(irec_graph, wiki_uid, term, WIKI)

        # not every match comes with a definition
        if 'WikiDefinition' in definition_dict:
            irec_graph = skos_definition(irec_graph, wiki_uid, definition_dict['WikiDefinition'], namespace=WIKI)

        # exact match between the wiki node and our concept node
        irec_graph = skos_exact_match(irec_graph, term_uid, wiki_uid, CONCEPTS, WIKI)

In [112]:
# Mirror every WikiData match of a span in the graph: a node in the WIKI namespace,
# its definition (when there is one) and an exact match with the span node
for term_uid, definition_dict_list in span_wiki_dict.items():
    for definition_dict in definition_dict_list:
        term = definition_dict['prefLabel']
        # the WikiData identifier is the last path segment of the entity URI
        wiki_uid = definition_dict['WikiUID'].rsplit('/', 1)[1]

        # keep track of the uid that is added, then add the node itself in the WIKI namespace
        ua.keep_track_of_existing_UID(term, wiki_uid, WIKI)
        irec_graph = skos_node(irec_graph, wiki_uid, term, WIKI)

        # not every match comes with a definition
        if 'WikiDefinition' in definition_dict:
            irec_graph = skos_definition(irec_graph, wiki_uid, definition_dict['WikiDefinition'], namespace=WIKI)

        # exact match between the wiki node and our span node (SPANS namespace, not CONCEPTS)
        irec_graph = skos_exact_match(irec_graph, term_uid, wiki_uid, SPANS, WIKI)

### We will also add the Uniclass terms that we found in the text to the graph

In [113]:
# Load the Uniclass terms that were found in the text; the loop that adds them to the graph expects
# a dict that maps a Uniclass UID to a dict with a 'pref_label'
# NOTE(review): pickle.load can execute arbitrary code — only load this file if it was produced locally/trusted
with open(graph_data_fp.joinpath("uniclass_terms_in_text.pkl"), 'rb') as f:
    uniclass_terms_in_text = pickle.load(f)

In [114]:
for uniclass_uid, definition_dict in uniclass_terms_in_text.items():
    # Add the Uniclass node to our graph
    term = definition_dict['pref_label']
    # keep track of uid that is added to the graph
    _, _ = ua.keep_track_of_existing_UID(term, uniclass_uid, UNICLASS)
    # add to graph, in UNICLASS namespace
    irec_graph = skos_node(irec_graph, uniclass_uid, term, UNICLASS)

    # Determine the corresponding span uid: first as is (no lowercasing, despite Uniclass casing),
    # and only if that yields nothing, the lowercased version
    span_uid = ua.retrieved_uid_by_text(term) or ua.retrieved_uid_by_text(term.lower())
    if span_uid:
        # Add an exact match between the span and the Uniclass node; the target has to be
        # `uniclass_uid`, not the `wiki_uid` left over from the WikiData cells
        irec_graph = skos_exact_match(irec_graph, span_uid, uniclass_uid, SPANS, UNICLASS)
    

### Compute properties between spans
* link definitions to spans (if span occurs verbatim; link with irec:related)
* link spans to spans:
  * constitutes; x occurs in y, thus y might be an extended phrase for x and perhaps a subclass, or x may be a material property, and so on
  * morphological similarity, x may be an inflection of y
  * semantic similarity, x and y might be alternative labels or have the same superclass