In [77]:
import re
import time
import pickle
import glob, os
import subprocess
import json, random
import requests, urllib
import concurrent.futures

import numpy as np
import pandas as pd

from bs4 import BeautifulSoup as bs
from tqdm import tqdm
from typing import List, Any, List, Dict, Tuple
from pathlib import Path
from textblob import TextBlob
from threading import current_thread
from itertools import combinations, product
from collections import Counter
from nltk.corpus import wordnet as wn
from transformers import BertTokenizer, BertModel
from SPARQLWrapper import SPARQLWrapper, JSON
from sklearn.neighbors import kneighbors_graph
from scipy.spatial.distance import cosine

from rdflib import URIRef, BNode, Literal, Namespace, Graph
from rdflib.namespace import XSD, RDF, RDFS, SKOS, NamespaceManager

from utils.spar_utils import TermExtractor
from utils.cluster_utils import levenshtein
from utils.embedding_utils import Embedder
from utils.cleaning_utils import split_list, custom_cleaning_rules, remove_unicode_chars

We would like to express the following features/relations:
* Dictionary definition terms, which are always concepts
  * We'll use the source as namespace, and corresponding concept identifier if it exists
  * SKOS is used to establish a mapping (e.g., skos:exactMatch) and add the definition (skos:definition)
* Special properties that we want to capture between words, which may help identify concepts:
  * Word is part of MWE
  * Morphologically similar words; stemming & Levenshtein distance
  * Semantically similar words; distributed similarity (NNs)
  * Acronyms
  * Related, this is a generic relation, e.g., a `ampere` is related to `electric current`
  * Domain-specificity; foreground or background term following our filtering procedure
 

In [2]:
graph_output_fp = Path.cwd().joinpath("data", "graph_output")
graph_output_fp.mkdir(parents=True, exist_ok=True) # create directory if it doesn't exist

### Prepare namespaces

In [3]:
ROOT = Namespace("https://example.org/top_concept_for_visulisation/#")
WIKI = Namespace("https://www.wikidata.org/wiki/#")
# Note: that UNICLASS is not a namespace (yet) only has identifiers 
UNICLASS = Namespace("https://www.example.org/uniclass/#")

In [4]:
PROV = Namespace("http://www.w3.org/ns/prov#")
DCT = Namespace("http://purl.org/dc/terms/#")

In [5]:
PROV.placeholder.defrag().__reduce__()[1][0]

'http://www.w3.org/ns/prov'

In [6]:
# example/placeholder URLs for the IReC project 
IREC_ontology_URL = "https://schema.irec.org/#"
IREC_spans_URL = "https://spans.irec.org/#"
IREC_concepts_URL = "https://concepts.irec.org/#"

# create our custom namespace for the schema to store spans
IREC = Namespace(IREC_ontology_URL)

# create a custom namespace to store spans and concepts
SPANS = Namespace(IREC_spans_URL)
CONCEPTS = Namespace(IREC_concepts_URL)

### graph creation utilities

In [7]:
class UID_assigner:
    def __init__(self):
        self.UIDs = {}
        self.UID = 0
        self.scheme_uids = {}
        
    def get_scheme_UID(self, namespace: Namespace):
        """
        Determines which type of UID to assign, based on the namespace.
        """
        return [x for x in self.UIDs[namespace._.defrag().__reduce__()[1][0]].values() if x == "schemeUID"][0]
        
    def assign_UID(self, text, namespace: Namespace):
        """
        Determines which type of UID to assign, based on the namespace.
        """
        if not text:
            raise Exception("Not text label provided to assign a UID.")
        if namespace == SPANS:
            return self.span_UID(text)
        elif namespace == CONCEPTS:
            return self.concept_UID(text)
        else:
            print("UID assignment not set up for this namespace, maybe use UID_assigner.keep_track_of_existing_UID()")
            
    
    def span_UID(self, text):
        """
        NOTE: each text span is a unique identifier in and of itself. We'll simply convert the text span to 
        a URL friendly representation.
        """
        n_space = SPANS.placeholder.defrag().__reduce__()[1][0]
        if n_space not in self.UIDs:
            self.UIDs[n_space] = {}
        
        urltext = urllib.parse.quote(text)
        if text not in self.UIDs[n_space]:
            self.UIDs[n_space][text] = urltext
            
        return self.UIDs[n_space][text]
        
    def concept_UID(self, text):
        """
        For now I'll create my own dumb interger-based UIDs for nodes as a simple shortcut, split per namespace
        """
        n_space = CONCEPTS.placeholder.defrag().__reduce__()[1][0]
        
        if n_space not in self.UIDs:
            self.UIDs[n_space] = {}
        
        if text not in self.UIDs[n_space]:
            self.UID += 1
            self.UIDs[n_space][text] = str(self.UID)
        return self.UIDs[n_space][text]
        
    def keep_track_of_existing_UID(self, text:str, existing_uid: str, namespace:Namespace):
        """
        Simply keep track of UIDs that exist in the provided namespace.
        """
        n_space = namespace.placeholder.defrag().__reduce__()[1][0]
        if n_space not in self.UIDs:
            self.UIDs[n_space] = {}
            
        if text not in self.UIDs[n_space]:
            # already seen by this UID assigner
            self.UIDs[n_space][text] = existing_uid
            
        return existing_uid
    
    def retrieve_uid_by_text(self, node_text, namespace: Namespace = SPANS):
        n_space = namespace.placeholder.defrag().__reduce__()[1][0]
        if node_text in self.UIDs[n_space]:
            return self.UIDs[n_space][node_text]
        else:
            return None 
        
    def count_nodes_in_namespace(self, namespace: Namespace = SPANS):
        n_space = namespace.placeholder.defrag().__reduce__()[1][0]
        print(f"Number of nodes in '{n_space}': {len(self.UIDs[n_space])}")
        return len(self.UIDs[n_space])
        
    def print_node_by_id(self, graph, node_id, namespace: Namespace = SPANS):
        for s, p, o in graph.triples((namespace[str(node_id)],  None, None)):
            print(f"{s.split('#')[-1]} ; {p.split('#')[-1]} ; {o.split('#')[-1]}")
        
    def print_node_by_text(self, graph, node_text, namespace: Namespace = SPANS):
        n_space = namespace.placeholder.defrag().__reduce__()[1][0]
        node_id = self.UIDs[n_space][node_text]
        # find all triples with subject
        for s, p, o in graph.triples((namespace[node_id],  None, None)):
            print(f"{s.split('#')[-1]} ; {p.split('#')[-1]} ; {o.split('#')[-1]}")

In [8]:
# These wrappers only exist to help me consistently add nodes to the graph

def dct_title(node_uid: str, title: str, namespace: Namespace) -> List[Tuple]:
    return [(namespace[node_uid], DCT.title,  Literal(title, lang='en'))]

def provenance(node_uid: str, source: URIRef, namespace: Namespace) -> List[Tuple]:
    """ temp source attribution """
    return [(namespace[node_uid], PROV.hadPrimarySource, source)]

def prov_agent(node_uid: str, agent: URIRef, namespace: Namespace) -> List[Tuple]:
    """ temp agent attribution (for Spans generated by SPaR.txt) """
    return [(namespace[node_uid], PROV.wasAttributedTo, agent)]


# SKOS 
def skos_scheme(node_uid, namespace: Namespace=CONCEPTS) -> List[Tuple]:
    """ Node that identifies the concep scheme with a URI, expecting/using as scheme root """
    return [(namespace[node_uid], RDF.type, SKOS.ConceptScheme)]

def skos_top_concept(node_uid, top_concept_uid, 
                    namespace: Namespace=CONCEPTS, top_concept_namespace: Namespace=CONCEPTS) -> List[Tuple]:
    """ Currently, we mainly use the top-concept for visualisation. """
    return [(namespace[node_uid], SKOS.hasTopConcept, top_concept_namespace[top_concept_uid]),
            (top_concept_namespace[top_concept_uid], SKOS.topConceptOf, namespace[node_uid])]

def skos_in_scheme(node_uid, scheme_uid, namespace: Namespace=CONCEPTS, scheme_namespace: Namespace=CONCEPTS) -> List[Tuple]:
    """ Keep track of the scheme/source of a node. """
    return [(namespace[node_uid], SKOS.inScheme, scheme_namespace[scheme_uid])]

def skos_node(node_uid, text, namespace: Namespace=CONCEPTS) -> List[Tuple]:
    """ Add a concept with prefLabel to the graph in the CONCEPTS namespace, of type SKOS.Concept """
    return [(namespace[node_uid], RDF.type, SKOS.Concept), 
            (namespace[node_uid], SKOS.prefLabel, Literal(text, lang='en'))]

def skos_prefLabel(node_uid, text, namespace: Namespace=CONCEPTS) -> List[Tuple]:
    """ Add the text label for a node """
    return [(namespace[node_uid], SKOS.prefLabel, Literal(text, lang='en'))]

def skos_altLabel(node_uid, alt_label_uid, namespace: Namespace=CONCEPTS)-> List[Tuple]:
    """ Add an alternative text label for a concept node """
    return [(namespace[node_uid], SKOS.altLabel, namespace[alt_label_uid]), 
            (namespace[alt_label_uid], SKOS.altLabel, namespace[node_uid])]

def skos_exact_match(subject_node_uid, object_node_uid,
                subject_namespace: Namespace=SPANS, object_namespace: Namespace=CONCEPTS) -> List[Tuple]:
    """ Denotes an exact match between two nodes, would expect the nodes to be in different vocabularies """
    return [(subject_namespace[subject_node_uid], SKOS.exactMatch, object_namespace[object_node_uid])]

def skos_related(subject_node_uid, object_node_uid,
                subject_namespace: Namespace=SPANS, object_namespace: Namespace=CONCEPTS) -> List[Tuple]:
    """ Denotes a relation between two nodes, would expect the nodes to be in different vocabularies """
    return [(subject_namespace[subject_node_uid], SKOS.related, object_namespace[object_node_uid])]
    
def skos_broader(narrower_node_uid, broader_node_uid, 
                 narrower_namespace: Namespace=CONCEPTS, broader_namespace: Namespace=CONCEPTS) -> List[Tuple]:
    """ Assuming narrower/broader is always reflexive, would expect the nodes to be in different vocabularies """
    return [(broader_namespace[narrower_node_uid], SKOS.narrower, narrower_namespace[broader_node_uid]),
            (narrower_namespace[broader_node_uid], SKOS.broader, broader_namespace[narrower_node_uid])]
    
def skos_definition(node_uid, definition_text, namespace: Namespace=CONCEPTS) -> List[Tuple]:
    """ Add a definition, if provided in the Merged Approved Documents """
    return [(namespace[node_uid], SKOS.definition, Literal(definition_text, lang='en'))]

def skos_note(node_uid, note_text, namespace: Namespace=CONCEPTS) -> List[Tuple]:
    """ Some notes exist in the approved docs at least, containing useful information """
    return [(namespace[node_uid], SKOS.note, Literal(note_text, lang='en'))]



# IREC functions and REFERENCE
IREC.CharacterSpan # A span is a sequence of characters that occurs verbatim in a text, either contiguous or discontiguos as extracted by SPaR.txt (Kruiper et al., 2021).   
IREC.constitutes  # Indicates that a span constitutes another span, e.g., the Multi-Word Expression (MWE) Span `hot water storage system` the Span `storage`.
IREC.isMorphologicallySimilarTo # Indicates that a Span is morphologically similar to another Span, e.g., they may have the same stem or a small Levenshtein distance.
IREC.isSemanticallySimilarTo # Indicates that a Span is semantically similar to another Span, following a cosine similarity between their  embeddings.
IREC.related # General way to indicate some relation between two spans, e.g., `ampere` is related to `electric current`
IREC.hasAcronym # A Span can have an acronym, e.g., `British Standards Institute` has the acronym `BSI`.
IREC.isAcronymOf # A Span can have an acronym, e.g., `BSI` is the acronym for `British Standards Institute`.
IREC.hasAntonym # Property that relates a Span to another Span, each being each other's antonyms.
IREC.wikiDefinition # One of potentially multiple WikiData definitions for the irec:CharacterSpan node.
IREC.wikiClassLabel # One of potentially multiple WikiData class labels for the irec:CharacterSpan node.

def irec_span(node_uid, text, namespace: Namespace=SPANS) -> List[Tuple]:
    """ Add a span node in the SPANS namespace, of type IREC.CharacterSpan and the span text set as its RDF.label """
    # is preflabel a property? I would assume so
    return [(namespace[node_uid], RDF.type, IREC.CharacterSpan), 
            (namespace[node_uid], RDFS.label,  Literal(text, lang='en'))]

def irec_constitutes(subject_node_uid, object_node_uid,
                     subject_namespace: Namespace=SPANS, object_namespace: Namespace=SPANS) -> List[Tuple]:
    """ Indicates that somewhere in the label of the first SPAN node, you can find the second span's label """
    return [(subject_namespace[subject_node_uid], IREC.constitutes, object_namespace[object_node_uid])]

def irec_morp_sim(subject_node_uid, object_node_uid,
                  subject_namespace: Namespace=SPANS, object_namespace: Namespace=SPANS) -> List[Tuple]:
    """ Indicates that the labels of two SPAN nodes are morphologically similar """
    return [(subject_namespace[subject_node_uid], IREC.isMorphologicallySimilarTo, object_namespace[object_node_uid])]

def irec_sem_sim(subject_node_uid, object_node_uid,
                 subject_namespace: Namespace=SPANS, object_namespace: Namespace=SPANS) -> List[Tuple]:
    """ Indicates that the labels of two SPAN nodes are semantically similar, following the distributed semantics hypothesis """
    return [(subject_namespace[subject_node_uid], IREC.isSemanticallySimilarTo, object_namespace[object_node_uid])]

def irec_related(subject_node_uid, object_node_uid,
                 subject_namespace: Namespace=SPANS, object_namespace: Namespace=SPANS) -> List[Tuple]:
    """ Indicates that a Span is related in SOME way to another Span. """
    return [(subject_namespace[subject_node_uid], IREC.related, object_namespace[object_node_uid])]

def irec_has_acronym(subject_node_uid, object_node_uid,
                     subject_namespace: Namespace=SPANS, object_namespace: Namespace=SPANS) -> List[Tuple]:
    """ Indicates that the label of the subject node has an acronym, ergo the label of the object node  """
    return [(subject_namespace[subject_node_uid], IREC.hasAcronym, object_namespace[object_node_uid]),
            (object_namespace[object_node_uid], IREC.isAcronymOf, subject_namespace[subject_node_uid])]

def irec_antonym(subject_node_uid, object_node_uid,
                 subject_namespace: Namespace=SPANS, object_namespace: Namespace=SPANS) -> List[Tuple]:
    """ Indicates that the label of the subject node is an antonym of the label of the object node  """
    return [(subject_namespace[subject_node_uid], IREC.hasAntonym, object_namespace[object_node_uid])]

def irec_wikidef(node_uid, definition_text, namespace: Namespace=SPANS) -> List[Tuple]:
    """ Add a candidate definition, as retrieved from WikiData for the Span """
    return [(namespace[node_uid], IREC.wikiDefinition, Literal(definition_text, lang='en'))]

def irec_wikiclass(node_uid, definition_text, namespace: Namespace=SPANS) -> List[Tuple]:
    """ Add a candidate class label, as retrieved from WikiData for the Span """
    return [(namespace[node_uid], IREC.wikiDefinition, Literal(definition_text, lang='en'))]



In [9]:
def add_tuples(graph, tuples):
    """
    We'll never add the same tuple twice to a graph
    """
    for t in tuples:
        assert len(t) == 3
    [graph.add(t) for t in tuples if t not in graph]
    return graph

### Prepare graph
* Currently creating a single graph to hold all information. 
* Relevant information gathered from external resources is added; primarily class labels and definitions from WikiData.

In [10]:
irec_graph = Graph()

irec_graph.bind("root", ROOT)
irec_graph.bind("wikipedia", WIKI)
irec_graph.bind("uniclass", UNICLASS)
irec_graph.bind("dct", DCT)
irec_graph.bind("prov", PROV)

In [11]:
# bind our vocabulary of classes/relations
graph_data_fp = Path.cwd().joinpath("data", "graph_data")
irec_graph.parse(graph_data_fp.joinpath("IREC.rdf"))
irec_graph.bind("spans", SPANS)
irec_graph.bind("concepts", CONCEPTS)

In [12]:
# primary sources and agents
irec_IRI = URIRef("https://github.com/rubenkruiper/irec")

merged_approved_documents_IRI = URIRef("https://www.gov.uk/government/collections/approved-documents")
wikidata_IRI = URIRef("https://www.wikidata.org/")
uniclass_IRI = URIRef("https://en.wikipedia.org/wiki/Uniclass")
spart_txt_IRI = URIRef("http://dx.doi.org/10.18653/v1/2021.nllp-1.14")

irec_graph = add_tuples(irec_graph, 
                        [
                            (irec_IRI, PROV.type, PROV.PrimarySource),
                            (merged_approved_documents_IRI, PROV.type, PROV.PrimarySource),
                            (wikidata_IRI, PROV.type, PROV.PrimarySource),
                            (uniclass_IRI, PROV.type, PROV.PrimarySource),
                            (spart_txt_IRI, PROV.type, PROV.SoftwareAgent)       
                        ])


In [13]:
def add_scheme_uid(graph: Graph, primary_source: URIRef, scheme_name: str, scheme_uid_label:str, namespace: Namespace) -> Graph:
    # We'll set the UID ourselves
    scheme_uid = ua.keep_track_of_existing_UID(scheme_name, scheme_uid_label, namespace)
    # add title
    graph = add_tuples(graph, dct_title(scheme_uid, scheme_name, namespace)) 
    # add source note  
    graph = add_tuples(graph, provenance(scheme_uid, primary_source, namespace)) 
    # is of type skos:ConceptScheme
    graph = add_tuples(graph, skos_scheme(scheme_uid, SPANS))
    # self-reference being in scheme
    graph = add_tuples(graph, skos_in_scheme(scheme_uid, scheme_uid, namespace, namespace))
    
    return graph

In [14]:
ua = UID_assigner()

# global UIDs for the schemes we'll be using
irec_graph = add_scheme_uid(irec_graph, irec_IRI, "IREC spans", "schemeUID", SPANS)
irec_graph = add_scheme_uid(irec_graph, irec_IRI, "IREC concepts", "schemeUID", CONCEPTS)

# irec_graph = add_scheme_uid(irec_graph, "IREC WikiData concepts", "schemeUID", WIKI) # NON EXISTENT NODE
# irec_graph = add_scheme_uid(irec_graph, "IREC Uniclass concepts", "schemeUID", UNICLASS)  # NON EXISTENT NODE


### Add base antonyms
* We may want to get a sense of which spans are antonyms
* For this we'll use NLTK's version of WordNet, which mainly captures antonyms for adjectives and adverbs.

In [15]:
wordnet_antonyms = {}
for i in wn.all_synsets():
    if i.pos() in ['a', 's']:    # If synset is adj or satelite-adj.
        for j in i.lemmas():     # Iterating through lemmas for each synset.
            if j.antonyms():     # If adj has antonym.
                wordnet_antonyms[str(j.name()).strip()] = [x.name() for x in j.antonyms()]

# Example of a useful antonym for us
wordnet_antonyms['hot']

['cold']

In [16]:
wordnet_antonyms['cold']

['hot']

In [17]:
# there are cases of multiple antonyms:
[(k, wordnet_antonyms[k]) for k, v in wordnet_antonyms.items() if len(v) > 1]

[('acidic', ['alkaline', 'amphoteric']),
 ('alkaline', ['amphoteric', 'acidic']),
 ('amphoteric', ['acidic', 'alkaline']),
 ('air-to-surface', ['air-to-air', 'surface-to-air']),
 ('air-to-air', ['surface-to-air', 'air-to-surface']),
 ('surface-to-air', ['air-to-surface', 'air-to-air']),
 ('anadromous', ['catadromous', 'diadromous']),
 ('catadromous', ['diadromous', 'anadromous']),
 ('diadromous', ['anadromous', 'catadromous']),
 ('aquatic', ['terrestrial', 'amphibious']),
 ('terrestrial', ['amphibious', 'aquatic']),
 ('amphibious', ['aquatic', 'terrestrial']),
 ('prenatal', ['perinatal', 'postnatal']),
 ('perinatal', ['postnatal', 'prenatal']),
 ('postnatal', ['prenatal', 'perinatal']),
 ('sonic', ['subsonic', 'supersonic']),
 ('subsonic', ['supersonic', 'sonic']),
 ('supersonic', ['sonic', 'subsonic']),
 ('binucleate', ['trinucleate', 'mononuclear']),
 ('mononuclear', ['binucleate', 'trinucleate']),
 ('trinucleate', ['mononuclear', 'binucleate']),
 ('lower-class', ['middle-class', 'up

In [18]:
wordnet_uid = URIRef("https://www.wikidata.org/wiki/Q533822")
for span in wordnet_antonyms.keys():
    span_uid = ua.assign_UID(span, SPANS)
    
    irec_graph = add_tuples(irec_graph, irec_span(span_uid, span))
    irec_graph = add_tuples(irec_graph, skos_in_scheme(span_uid, 'schemeUID', SPANS, SPANS))
    irec_graph = add_tuples(irec_graph, provenance(span_uid, wordnet_uid, SPANS))

    antonyms = wordnet_antonyms[span]
    for antonym in antonyms:
        antonym_uid = ua.assign_UID(antonym, SPANS)

        irec_graph = add_tuples(irec_graph, irec_span(antonym_uid, antonym))
        irec_graph = add_tuples(irec_graph, skos_in_scheme(antonym_uid, 'schemeUID', SPANS, SPANS))
        irec_graph = add_tuples(irec_graph, provenance(antonym_uid, wordnet_uid, SPANS))

        # add the antonym relation
        irec_graph = add_tuples(irec_graph, irec_antonym(span_uid, antonym_uid))
       

### Add domain terms extracted from the Approved documents as Spans

In [19]:
domain_terms = pickle.load(open(graph_data_fp.joinpath('domain_terms.pkl'), 'rb'))

In [20]:
random.sample(domain_terms, 10)

['verandah',
 'parapet gutters',
 'Paragraph cavity trays',
 'decay measurements',
 'purpose group 1',
 'the WC',
 'BS 2782 - 0 Method 508A',
 'sides walls fourth',
 'world heritage sites',
 'mple']

In [21]:
# simply adding the extracted spans
domain_terms = custom_cleaning_rules(domain_terms)
for span in domain_terms:
    span_uid = ua.assign_UID(span, SPANS)
    
    irec_graph = add_tuples(irec_graph, irec_span(span_uid, span))
    irec_graph = add_tuples(irec_graph, skos_in_scheme(span_uid, 'schemeUID', SPANS, SPANS))
    irec_graph = add_tuples(irec_graph, provenance(span_uid, merged_approved_documents_IRI, SPANS))
    # add agent for spans generated by SPaR.txt
    irec_graph = add_tuples(irec_graph, prov_agent(span_uid, spart_txt_IRI, SPANS))

### Add Acronyms that were grabbed from the text

These can help:
* remove terms where the boundary detection is off
* avoid suggesting similar acronyms, e.g., suggest that EPC and EPS are similar 

In [22]:
acronyms = {'PAS': ['ecification', 'Specification'],  'GSIUR': ['Regulations 1998'],  'HSE': ['Regulations 2000',   'water systems',   'Safety Executive',   'Health and Safety Executive'],  'PE': ['Polyethylene', 'polyethylene'],  'DN': ['pipe'],  'DCLG': ['land', 'Local Government', 'England', 'ment'],  'PP': ['Polypropylene'],  'BCB': ['Control Body',   'the building control body',   'Building control body',   'building control body',   'Building Control Body'],  'SRHRV': ['ventilator',   'single room heat recovery ventilator',   'a single room heat recovery ventilator'],  'MVHR': ['blocks', 'heat recovery'],  'WC': ['sets'],  'TFA': ['the total floor area'],  'LRV': ['Light reflectance value'],  'BER': ['Building CO2 Emission Rate', 'CO2 Emission Rate'],  'TER': ['CO2 Emission Rate',   'the Target CO2 Emission Rate',   'Target CO2 Emission Rate'],  'DER': ['CO2 Emission Rate', 'the Dwelling CO2 Emission Rate'],  'EPC': ['energy performance certificate'],  'TFEE': ['Target Fabric Energy Efficiency',   'Fixed building services',   'Energy Efficiency'],  'DHF': ['the Door and Hardware Federation', 'Door and Hardware Federation'],  'REI': ['fire resistance', 'bility'],  'PHE': ['horizontal evacuation'],  'W': ['the final exit', 'final exit'],  'DWELLINGS': ['RESIDENTIAL'],  'OTHER': ['RESIDENTIAL'],  'TSO': ['Office', 'The Stationery Office'],  'FPA': ['the Fire Protection Association', 'Association'],  'A': ['absorption area'],  'AT': ['absorption area'],  'DECC': ['Climate Change'],  'NCM': ['the National Calculation Methodology'],  'ADCAS': ['Allied Services'],  'DFEE': ['Energy Efficiency'],  'LPA': ['the local planning authority', 'planning authority'],  'UKAS': ['the United Kingdom Accreditation Service'],  'BSI': ['the British Standards Institution'],  'EA': ['Accreditation'],  'BGS': ['British Geological Survey'],  'HBN': ['Notes'],  'GGF': ['Glazing Federation'],  'E': ['terms of integrity'],  'TRADA': ['the Timber Research and Development Association', 'Association'],  'ACOP': ['Code of Practice'],  'ATTMA': ['Association'],  'RVA': ['Association', 'the Residential Ventilation Association'],  'TEHVA': ['Association'],  'DSA': ['Association'],  'CIRIA': ['Association'],  'MCRMA': ['Association'],  'DSMA': ['Association'],  'OFTEC': ['Association'],  'WHO': ['Organisation'],  'GAI': ['Architectural Ironmongers'],  'MEV': ['mechanical extract', 'extract ventilation'],  'VST': ['Vicat softening temperature'],  'SCI': ['Guild Steel Construction Institute'],  'FBE': ['the Built Environment', 'ment'],  'DSER': ['Rating'],  'WER': ['Rating'],  'CIWM': ['ment', 'Wastes Management'],  'EOTA': ['ment'],  'GQRA': ['ment'],  'BRE': ['ment', 'the Building Research Establishment'],  'PPS': ['ment'],  'PSV': ['Passive stack ventilation'],  'EST': ['the Energy Saving Trust'],  'CIBSE': ['Ventilation hygiene toolkit', 'Building Services Engineers'],  'AGS': ['Geoenvironmental Specialists'],  'SPAB': ['Ancient Buildings'],  'UF': ['urea formaldehyde'],  'ODPM': ['the Deputy Prime Minister']}

In [23]:
for acronym, spans in acronyms.items():
    
    acronym_uid = ua.assign_UID(acronym, SPANS)
   
    irec_graph = add_tuples(irec_graph, irec_span(acronym_uid, acronym))
    irec_graph = add_tuples(irec_graph, skos_in_scheme(acronym_uid, 'schemeUID', SPANS, SPANS))
    irec_graph = add_tuples(irec_graph, provenance(acronym_uid, merged_approved_documents_IRI, SPANS))
    
    for span in spans:
        span_uid = ua.assign_UID(span, SPANS) 
        irec_graph = add_tuples(irec_graph, irec_span(span_uid, span))
        irec_graph = add_tuples(irec_graph, skos_in_scheme(span_uid, 'schemeUID', SPANS, SPANS))
        irec_graph = add_tuples(irec_graph, provenance(span_uid, merged_approved_documents_IRI, SPANS))

        # These spans should have been 
        
        # todo; 
        #  could do some filtering here of the clearly erroneous span-acronym combinations
        #  or leave this until later, using the graph...
    
        irec_graph = add_tuples(irec_graph, irec_has_acronym(acronym_uid, span_uid))
    

### Add CONCEPTS: defined terms from the Approved Documents

In [24]:
# read data from csv file
definitions = pd.read_excel(graph_data_fp.joinpath("Approved Documents and derived terms.xlsx"), sheet_name="Definitions", keep_default_na=False)

In [25]:
definitions[:3]

Unnamed: 0,Term,Definition,Alternative labels,Note
0,Absorption,"Conversion of sound energy to heat, often by t...",,
1,Absorption coefficient,A quantity characterising the effectiveness of...,,See BS EN 20354:1993.
2,Absorptive material,Material that absorbs sound energy.,,


In [26]:
concepts_definitions_dict = {} # keep track of definitions for parsing later

# create graph from definitions first
for i, row in definitions.iloc[1:].iterrows():
    term = row['Term'] if row['Term'].isupper() else row['Term'].lower()
    alternative_labels = row['Alternative labels']
    definition = row['Definition']
    note = row['Note']

    # add the term as a CONCEPT and as a SPAN
    concept_uid = ua.assign_UID(term, CONCEPTS)
    irec_graph = add_tuples(irec_graph, skos_node(concept_uid, term))
    irec_graph = add_tuples(irec_graph, skos_in_scheme(concept_uid, 'schemeUID', CONCEPTS, CONCEPTS))
    irec_graph = add_tuples(irec_graph, provenance(concept_uid, merged_approved_documents_IRI, CONCEPTS))
    
    span_uid = ua.assign_UID(term, SPANS)
    irec_graph = add_tuples(irec_graph, irec_span(span_uid, term))
    irec_graph = add_tuples(irec_graph, skos_in_scheme(span_uid, 'schemeUID', SPANS, SPANS))
    irec_graph = add_tuples(irec_graph, provenance(span_uid, merged_approved_documents_IRI, SPANS))
    
    # link the concept and the span # as a skos:exactMatch? or smt else?
    irec_graph = add_tuples(irec_graph, skos_exact_match(concept_uid, span_uid, CONCEPTS, SPANS))
    
    # always expecting a definition
    irec_graph = add_tuples(irec_graph, skos_definition(concept_uid, definition))
    
    if note: 
        irec_graph = add_tuples(irec_graph, skos_note(concept_uid, note))
    
    if alternative_labels:
        # lowercase if not an acronym
        alt_labels = [x.strip() if x.isupper() else x.lower().strip() for x in alternative_labels.split(", ")]  
        
        for alt_label in alt_labels:
            if not alt_label:
                continue
            # add the altlabel to the concept node
            alt_label_concept_uid = ua.assign_UID(alt_label, CONCEPTS)
            irec_graph = add_tuples(irec_graph, skos_node(alt_label_concept_uid, alt_label))
            irec_graph = add_tuples(irec_graph, skos_in_scheme(alt_label_concept_uid, 'schemeUID', CONCEPTS, CONCEPTS))
            irec_graph = add_tuples(irec_graph, provenance(alt_label_concept_uid, merged_approved_documents_IRI, CONCEPTS))

            irec_graph = add_tuples(irec_graph, skos_altLabel(concept_uid, alt_label_concept_uid))
            
            # also add as a span
            alt_label_span_uid = ua.assign_UID(alt_label, SPANS)
            irec_graph = add_tuples(irec_graph, irec_span(alt_label_span_uid, alt_label))
            irec_graph = add_tuples(irec_graph, skos_in_scheme(alt_label_span_uid, 'schemeUID', SPANS, SPANS))
            irec_graph = add_tuples(irec_graph, provenance(alt_label_span_uid, merged_approved_documents_IRI, SPANS))
            
            # link the altlabel concept and the altlabel span
            irec_graph = add_tuples(irec_graph, skos_exact_match(alt_label_concept_uid, alt_label_span_uid, CONCEPTS, SPANS))
    
    if concept_uid not in concepts_definitions_dict: 
        concepts_definitions_dict[concept_uid] = [{'prefLabel': term, 'definition': definition, 'note': note}]  
    else:
        concepts_definitions_dict[concept_uid].append({'prefLabel': term, 'definition': definition, 'note': note})  
                                                      
    
            

### Add SPANS: glossary/index terms from the Approved Documents

In [27]:
index_terms = pd.read_excel(graph_data_fp.joinpath("Approved Documents and derived terms.xlsx"), sheet_name="Index terms", keep_default_na=False)

In [28]:
index_terms[:3]

Unnamed: 0,Term,AltLabel(s),Related terms,Broader term
0,abbreviated eaves,,,eaves
1,Access floors,access floor,Platform floors,
2,Access for fire service,fire access,,Fire service facilities


* add triples from index terms / glossaries; we will treat these terms as SPANS
* some of these terms were added manually on top of the index terms found in the Mergeds Approved documents, so we'll avoid adding the provenance relation to these


In [29]:
total_num_index_terms = 0
for i, row in index_terms.iloc[1:].iterrows():
    term = row['Term'].strip() if row['Term'].isupper() else row['Term'].lower().strip()
    alternative_labels = row['AltLabel(s)']
    related_terms = row['Related terms']
    broader_term = row['Broader term']
    
    # add the term as a SPAN only
    span_uid = ua.assign_UID(term, SPANS)
    irec_graph = add_tuples(irec_graph, irec_span(span_uid, term))
    irec_graph = add_tuples(irec_graph, skos_in_scheme(span_uid, 'schemeUID', SPANS, SPANS))
#     irec_graph = add_tuples(irec_graph, provenance(span_uid, merged_approved_documents_IRI, SPANS))
    total_num_index_terms += 1
        
    if alternative_labels:
        # lowercase if not an acronym
        alt_labels = [x.strip() if x.isupper() else x.lower().strip() for x in alternative_labels.split(", ")]  
        
        for alt_label in alt_labels:
            if not alt_label:
                continue
            # add alt-label as a span only (as well)
            alt_label_uid = ua.assign_UID(alt_label, SPANS)
            irec_graph = add_tuples(irec_graph, irec_span(alt_label_uid, alt_label))
            irec_graph = add_tuples(irec_graph, skos_in_scheme(alt_label_uid, 'schemeUID', SPANS, SPANS))
#             irec_graph = add_tuples(irec_graph, provenance(alt_label_uid, merged_approved_documents_IRI, SPANS))
            total_num_index_terms += 1
            
            if alt_label.isupper():
                # there are acronyms among the alternative labels
                irec_graph = add_tuples(irec_graph, irec_has_acronym(span_uid, alt_label_uid))
            else:
                ### Should I use skos altlabels between spans? maybe create IREC alternative label?
                ### Should I use skos altlabels between spans? maybe create IREC alternative label?
                ### Should I use skos altlabels between spans? maybe create IREC alternative label?
                irec_graph = add_tuples(irec_graph, skos_altLabel(span_uid, alt_label_uid))
                

    if related_terms:
        rel_terms = [x.strip() if x.isupper() else x.lower().strip() for x in related_terms.split(", ")]
        for rel_term in rel_terms:
            if not rel_term:
                continue
            # add related terms as a span (as well)
            related_uid = ua.assign_UID(rel_term, SPANS)
            irec_graph = add_tuples(irec_graph, irec_span(related_uid, rel_term))
            irec_graph = add_tuples(irec_graph, skos_in_scheme(related_uid, 'schemeUID', SPANS, SPANS))
#             irec_graph = add_tuples(irec_graph, provenance(related_uid, merged_approved_documents_IRI, SPANS))
            total_num_index_terms += 1
            
            if rel_term.isupper():
                # there are acronyms among the related labels as well
                irec_graph = add_tuples(irec_graph, irec_has_acronym(span_uid, related_uid))
            else:
                irec_graph = add_tuples(irec_graph, irec_related(span_uid, related_uid)) 
    
    if broader_term:
        # We do not expect that the broader term is necessarily a concept.
        # Currently, it is simply a feature for future reference.
        # We expect 1 broader term at most, assuming the final conceptualisation would
        # be structured like a tree (Directed Acyclic Graph with 1 parent at most).
        b_term = broader_term.strip().lower() if not broader_term.isupper() else broader_term.strip()
        # also broader term as a span
        b_term_uid = ua.assign_UID(b_term, SPANS)
        irec_graph = add_tuples(irec_graph, irec_span(b_term_uid, b_term)) 
        irec_graph = add_tuples(irec_graph, skos_in_scheme(b_term_uid, 'schemeUID', SPANS, SPANS))
#         irec_graph = add_tuples(irec_graph, provenance(b_term_uid, merged_approved_documents_IRI, SPANS))
        total_num_index_terms += 1
        
        ### Should I use skos broader between spans? maybe create an IREC broader?
        ### Should I use skos broader between spans? maybe create an IREC broader?
        irec_graph = add_tuples(irec_graph, skos_broader(span_uid, b_term_uid, SPANS, SPANS)) 

print("Total index terms found in spreadsheet: ", total_num_index_terms)

Total index terms found in spreadsheet:  2363


In [30]:
irec_graph.serialize(destination=graph_output_fp.joinpath("approved_doc_terms_only.ttl"))

<Graph identifier=Na14551e51b8c4ff2aca42c66817a746f (<class 'rdflib.graph.Graph'>)>

### Print some insight in the graph so far

In [31]:
ua.count_nodes_in_namespace(SPANS)

Number of nodes in 'https://spans.irec.org/': 12734


12734

In [32]:
ua.count_nodes_in_namespace(CONCEPTS)

Number of nodes in 'https://concepts.irec.org/': 352


352

In [33]:
ua.print_node_by_id(irec_graph, 273, CONCEPTS)

273 ; type ; Concept
273 ; prefLabel ; sanitary accommodation
273 ; inScheme ; schemeUID
273 ; hadPrimarySource ; https://www.gov.uk/government/collections/approved-documents
273 ; exactMatch ; sanitary%20accommodation
273 ; definition ; A space containing one or more water closets or urinals, whether or not it also contains other sanitary appliances. Sanitary accommodation containing one or  more cubicles counts as a single space if there is free circulation of air throughout the space.


In [34]:
ua.print_node_by_id(irec_graph, urllib.parse.quote('sanitary accommodation'), SPANS)

sanitary%20accommodation ; type ; CharacterSpan
sanitary%20accommodation ; label ; sanitary accommodation
sanitary%20accommodation ; inScheme ; schemeUID
sanitary%20accommodation ; hadPrimarySource ; https://www.gov.uk/government/collections/approved-documents
sanitary%20accommodation ; wasAttributedTo ; http://dx.doi.org/10.18653/v1/2021.nllp-1.14
sanitary%20accommodation ; related ; sanitary%20appliance


In [35]:
ua.print_node_by_id(irec_graph, urllib.parse.quote('wet room'), SPANS)

wet%20room ; type ; CharacterSpan
wet%20room ; label ; wet room
wet%20room ; inScheme ; schemeUID
wet%20room ; hadPrimarySource ; https://www.gov.uk/government/collections/approved-documents
wet%20room ; wasAttributedTo ; http://dx.doi.org/10.18653/v1/2021.nllp-1.14


In [36]:
ua.print_node_by_text(irec_graph, 'sanitary accommodation', SPANS)

sanitary%20accommodation ; type ; CharacterSpan
sanitary%20accommodation ; label ; sanitary accommodation
sanitary%20accommodation ; inScheme ; schemeUID
sanitary%20accommodation ; hadPrimarySource ; https://www.gov.uk/government/collections/approved-documents
sanitary%20accommodation ; wasAttributedTo ; http://dx.doi.org/10.18653/v1/2021.nllp-1.14
sanitary%20accommodation ; related ; sanitary%20appliance


As you can see in the examples above, the concept `wet room` and the span `sanitary accomdodation` are related:
* The concept `wet room` is provided with a note in the merged approved documents.
* The text inside this node describes how, for part F of the approved documents, `sanitary accomodation` is regarded as a `wet room`. 

Based on the above, we'd like to link the span `sanitary accomodation` to the concept `wet room`. While we could parse the note in more detail, and identify that a `skos:altLabel` relation should be added, we'll use a more generic approach:
* Any span that is found inside a definition or note of a concept will be linked through `irec:related`
* Based on the definitions above, potential spans related to the `wet room` concept then become: `sanitary accomdoation`, `airborn moisture`, `kitchen`, `utility room`, `bathroom`, `WC`, `tanking`, `drainage`, `gulley`, `shower`.


<div class="alert alert-block alert-info">
We believe that the types of relations described above can be valuable and would like to provide more definitions for more terms, to help interrelate more spans and concepts. To this end, we first try to find WikiData definitions for all concepts and spans. 
</div>

### Grab wikipedia definitions for Concept nodes, and store locally for re-use

* First, we try to grab all wiki definitions for all spans and concepts that are in the graph (so far)

In [37]:
# set up the SPARQL endpoint for wikidata
sparql_wrapper = SPARQLWrapper("https://query.wikidata.org/sparql")

In [38]:
def get_wiki_matches(graph_sparql_endpoint: SPARQLWrapper,
                     jargon_term_and_uids: List):

    all_wiki_definitions = {}
    # we want to grab the term (subject), any definition (subjectDescription) and the class (subjectClass)
    sparql_q = """
               SELECT ?subject ?subjectDescription ?classUID ?className WHERE {
                  ?subject rdfs:label "$QUERY"@en.
                  ?subject wdt:P31|wdt:P279 ?classUID.
                  SERVICE wikibase:label { bd:serviceParam wikibase:language "en".}
                  ?classUID  rdfs:label ?className  FILTER(LANG(?className) = "en").
                }
               """
    
    for term, uid in tqdm(jargon_term_and_uids):
        # make the call to 
        temp_q = sparql_q.replace("$QUERY", term)
        graph_sparql_endpoint.setQuery(temp_q)
        graph_sparql_endpoint.setReturnFormat(JSON)
        try:
            json_output = graph_sparql_endpoint.query().convert()
        except:
            # If no result, wait 2s; One client is allowed 30 error queries per minute
            print(f"Error for query, you should what's wrong with the term: {term}")
            time.sleep(2)
            continue
            
        # sometimes multiple Wiki UIDs for a single term, we grab them all here
        bindings = [v for v in json_output['results']['bindings']]
            

        for v in bindings:
            class_uid = v['classUID']['value'] if 'classUID' in v else ""
            class_label = v['className']['value'] if 'className' in v else ""
            
            if 'subjectDescription' in v:
                if uid not in all_wiki_definitions:
                    all_wiki_definitions[uid] = [{'prefLabel': term,
                                                  'class_uid': class_uid,
                                                  'class_label': class_label,
                                                  'WikiUID': v['subject']['value'],
                                                  'WikiDefinition': v['subjectDescription']['value']}]
                else:
                    all_wiki_definitions[uid].append({'prefLabel': term,
                                                      'class_uid': class_uid,
                                                      'class_label': class_label,
                                                      'WikiUID': v['subject']['value'],
                                                      'WikiDefinition': v['subjectDescription']['value']})
            elif 'subject' in v:
                # no description found, simply adding wiki UID if that exists
                if uid not in all_wiki_definitions:
                    all_wiki_definitions[uid] = [{'prefLabel': term,
                                                  'class_uid': class_uid,
                                                  'class_label': class_label,
                                                  'WikiUID': v['subject']['value']}]
                else:
                    all_wiki_definitions[uid].append({'prefLabel': term,
                                                      'class_uid': class_uid,
                                                      'class_label': class_label,
                                                      'WikiUID': v['subject']['value']})
    return all_wiki_definitions


In [39]:
concepts_and_uids = [(k, v) for k, v in ua.UIDs[CONCEPTS.placeholder.defrag().__reduce__()[1][0]].items()]
spans_and_uids = [(k, v) for k, v in ua.UIDs[SPANS.placeholder.defrag().__reduce__()[1][0]].items()]

In [40]:
# First run for the Concepts
concept_wiki_dict_fp = graph_output_fp.joinpath("concept_wiki_dict.json")
if not concept_wiki_dict_fp.exists():
    concept_wiki_dict = get_wiki_matches(sparql_wrapper, concepts_and_uids)#{'test': 1, 'conductor':2})
    with open(concept_wiki_dict_fp, 'w') as f:
        json.dump(concept_wiki_dict, f, indent=2)
else:
    with open(concept_wiki_dict_fp, 'r') as f:
        concept_wiki_dict = json.load(f)

In [41]:
# Now run for the spans
span_wiki_dict_fp = graph_output_fp.joinpath("span_wiki_dict.json")
if not span_wiki_dict_fp.exists():
    span_wiki_dict = get_wiki_matches(sparql_wrapper, spans_and_uids)#{'test': 1, 'conductor':2})
    with open(span_wiki_dict_fp, 'w') as f:
        json.dump(span_wiki_dict, f, indent=2)
else:
    with open(span_wiki_dict_fp, 'r') as f:
        span_wiki_dict = json.load(f)

In [42]:
print("Number of concepts with WikiData definitions: {} ({:.2f}%)".format(len(concept_wiki_dict), len(concept_wiki_dict)/ua.count_nodes_in_namespace(CONCEPTS)*100))
print("Number of spans with WikiData definitions: {} ({:.2f}%)".format(len(span_wiki_dict), len(span_wiki_dict)/ua.count_nodes_in_namespace(SPANS)*100))



Number of nodes in 'https://concepts.irec.org/': 352
Number of concepts with WikiData definitions: 244 (69.32%)
Number of nodes in 'https://spans.irec.org/': 12734
Number of spans with WikiData definitions: 655 (5.14%)


* Some examples of/insight in definitions from different sources

In [43]:
# List of definitions from approved documents
concepts_definitions_dict['1']

[{'prefLabel': 'absorption coefficient',
  'definition': 'A quantity characterising the effectiveness of a sound absorbing surface. The proportion of sound energy absorbed is given as a number between zero (for a fully reflective surface) and one (for a fully absorptive surface). Note that sound absorption coefficients determined from laboratory measurements may have values slightly larger than one.',
  'note': 'See BS EN 20354:1993.'}]

In [44]:
# List of definitions for the same concept, from WikiData
concept_wiki_dict['1']

[{'prefLabel': 'absorption coefficient',
  'class_uid': 'http://www.wikidata.org/entity/Q107715',
  'class_label': 'physical quantity',
  'WikiUID': 'http://www.wikidata.org/entity/Q97368968',
  'WikiDefinition': 'measure for the exponential reduction of a quantity along a path due to absorption',
  'Spans in definitions and notes': ['a path due',
   'a quantity',
   'the exponential reduction']}]

In [45]:
# List of definitions for a related span, from WikiData
print([k for k in span_wiki_dict.keys() if 'absor' in k])
span_wiki_dict['absorbent']

['absorbent']


[{'prefLabel': 'absorbent',
  'class_uid': 'http://www.wikidata.org/entity/Q3505845',
  'class_label': 'state',
  'WikiUID': 'http://www.wikidata.org/entity/Q110147344',
  'WikiDefinition': 'having the ability or tendency to absorb; able to soak up liquid easily; absorptive.',
  'Spans in definitions and notes': ['liquid easily', 'the ability tendency']}]

### Only keep WikiData definitions that belong to classes that we've annotated
* We have previously annotated the relevance of all WikiData classes returned for the defined terms and index terms in the Approved Documents.

In [46]:
annotated_wikidata_classes_df = pd.read_csv(graph_data_fp.joinpath("wiki_classes_annotated.csv"), index_col=1)

In [47]:
annotated_wikidata_classes_df[:3]

Unnamed: 0_level_0,WikiData class,Annotation,Example spans
WikiData UIDs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
['Q107715'],physical quantity,y,"['sound pressure level', 'density', 'area', 's..."
['Q82799'],name,n,['access point']
['Q180160'],metadata,n,['access point']


In [48]:
wikiclass_dict = {}
for row in annotated_wikidata_classes_df.iterrows():
    uid_list_string, class_annotations_examples = row
    uid_list = uid_list_string[2:-2].split(',')
    for uid in uid_list:
        wikiclass_dict[uid] = {
            'Class': class_annotations_examples['WikiData class'],
            'Annotation': class_annotations_examples['Annotation'],
            'Example spans': class_annotations_examples['Example spans']
        }
    

In [49]:
def filter_wikidata_classes(wiki_class_dict, term_dict):
    new_term_dict = {}
    removed_definitions = []
    for uid, definition_dict_list in term_dict.items():
        for definition_dict in definition_dict_list:
            class_uid = definition_dict['class_uid'].rsplit("/", 1)[1]
            if class_uid in wikiclass_dict:
                class_name = wikiclass_dict[class_uid]["Class"]
                if wikiclass_dict[class_uid]["Annotation"] == 'y':
                    if uid not in new_term_dict:
                        new_term_dict[uid] = [definition_dict]
                    else:
                        new_term_dict[uid].append(definition_dict)
                else:
                    removed_definitions.append(definition_dict)
                        
    return new_term_dict, removed_definitions

In [50]:
concept_wiki_dict, removed_definitions = filter_wikidata_classes(wikiclass_dict, concept_wiki_dict)

In [51]:
span_wiki_dict, removed_definitions = filter_wikidata_classes(wikiclass_dict, span_wiki_dict)

### Parse all definitions (including WikiData) to identify additional spans
* Add the spar objects found in definitions to respective dictionaries

In [52]:
def add_spar_labels(input_dict: Dict[str, str], term_extractor: TermExtractor):
    number_of_definitions = 0
    for uid, definition_dict_list in tqdm(input_dict.items()):
        for idx, definition_dict in enumerate(definition_dict_list):
            if 'Spans in definitions and notes' in definition_dict:
                # spans already computed for this definition_dict, continuing to check next
                continue
            
            spartxt_objects = []
            for k in definition_dict.keys():
                if k in ['WikiDefinition', 'definition', 'note']:
                    to_be_parsed = definition_dict[k]
                    number_of_definitions += 1
                    sentences = term_extractor.split_into_sentences(to_be_parsed)
                    # cleaning spans as well;
                    sentences = [remove_unicode_chars(s).encode("ascii", "ignore").decode() for s in sentences]
                    spartxt_objects += custom_cleaning_rules(term_extractor.process_sentences(sentences))
                    
            input_dict[uid][idx]['Spans in definitions and notes'] = spartxt_objects
    print(f"Processed {number_of_definitions} definitions")
    return input_dict

In [53]:
concept_definitions_dict_fp = graph_output_fp.joinpath("concepts_definitions_dict.json")
te = None
if not concept_definitions_dict_fp.exists():
    if not te:
        # instantiate a TermExtractor obj, with max_num_cpu_threads instances of SPaR.txt predictors 
        te = TermExtractor(max_num_cpu_threads=4)
    
    print("Computing SPaR.txt objects for concepts_definitions_dict")
    concepts_definitions_dict = add_spar_labels(concepts_definitions_dict, te)
    with open(concept_definitions_dict_fp, 'w') as f:
        json.dump(concepts_definitions_dict, f, indent=2)
else:
    print("Loading previously computed concepts_definitions_dict with SPaR.txt objects from file")
    with open(concept_definitions_dict_fp, 'r') as f:
        concepts_definitions_dict = json.load(f)

Loading previously computed concepts_definitions_dict with SPaR.txt objects from file


In [54]:
if not concept_wiki_dict_fp.exists():
    if not te:
        # instantiate a TermExtractor obj, with max_num_cpu_threads instances of SPaR.txt predictors 
        te = TermExtractor(max_num_cpu_threads=4)
    concept_wiki_dict = add_spar_labels(concept_wiki_dict, te)
    # Save the updated concept_wiki_dict, will be loaded in previous cells anyway
    with open(concept_wiki_dict_fp, 'w') as f:
        json.dump(concept_wiki_dict, f, indent=2)

In [55]:
if not span_wiki_dict_fp.exists():
    if not te:
        # instantiate a TermExtractor obj, with max_num_cpu_threads instances of SPaR.txt predictors 
        te = TermExtractor(max_num_cpu_threads=4)
    span_wiki_dict = add_spar_labels(span_wiki_dict, te)
    # Save the updated span_wiki_dict, which will be loaded in previous cells anyway
    with open(span_wiki_dict_fp, 'w') as f:
        json.dump(span_wiki_dict, f, indent=2)

### Add any new spans to the graph

* First, count the number of defined terms, separate definitions (a term may have multiple definitions), and the number of spans found in these definitions.

In [56]:
def count_spans(some_dict: Dict[str, str], primary_source: URIRef, per_term_definition_counts: Counter):
    primary_source_str = primary_source.__reduce__()[1][0]
    spartxt_objects_in_dict = {primary_source_str: {}}
    total_num_definitions = 0
    for definition_dict_list in some_dict.values():
        for definition_dict in definition_dict_list:
            per_term_definition_counts[definition_dict['prefLabel']] = len([d for d in definition_dict_list if any(d for d in d.keys() if d in ['WikiDefinition', 'definition', 'note'])])
        
            for k in definition_dict.keys():
                if k in ['WikiDefinition', 'definition', 'note']:
                    total_num_definitions += 1
            
            defined_term = definition_dict['prefLabel']
            spar_objects_in_dict = custom_cleaning_rules(definition_dict['Spans in definitions and notes'])
            
            if defined_term not in spartxt_objects_in_dict[primary_source_str]:
                spartxt_objects_in_dict[primary_source_str][defined_term] = spar_objects_in_dict
            else:
                spartxt_objects_in_dict[primary_source_str][defined_term] += spar_objects_in_dict
                
    return spartxt_objects_in_dict, per_term_definition_counts, total_num_definitions
    

In [57]:
all_spartxt_objects = {}
total_num_definitions = 0
per_term_definition_counts = Counter()
for some_dict, source in zip([concepts_definitions_dict, concept_wiki_dict, span_wiki_dict],
                             [merged_approved_documents_IRI, wikidata_IRI, wikidata_IRI]):
    spar_txt_objects_in_dict, term_def_counts, num_def = count_spans(some_dict, source, per_term_definition_counts)
    all_spartxt_objects.update(spar_txt_objects_in_dict)
    per_term_definition_counts.update(term_def_counts)
    total_num_definitions += num_def

In [58]:
defined_terms = []
spans_from_defined_terms = []
for source, defined_term_dict in all_spartxt_objects.items():
    defined_terms += [k for k in defined_term_dict.keys()]
    for spans in defined_term_dict.values():
        spans_from_defined_terms += spans
        

print("Number of defined terms: ", len(defined_terms))
print("Number of definitions/notes found:", total_num_definitions)
print("Number of new spans: ", len(list(set(spans_from_defined_terms))))
print("Top 10 defined terms with most definitions:")
per_term_definition_counts.most_common(10)

Number of defined terms:  951
Number of definitions/notes found: 2472
Number of new spans:  3230
Top 10 defined terms with most definitions:


[('house', 192),
 ('frequency', 32),
 ('lead', 32),
 ('Hotel', 28),
 ('density', 24),
 ('pier', 24),
 ('accessibility', 24),
 ('volume', 24),
 ('risk assessment', 20),
 ('aluminium', 20)]

In [59]:
# Random sample of spans found in the definitions (that weren't )
random.sample(list(set(spans_from_defined_terms)), 10)

['the spread',
 'section 67',
 'stonework',
 '19 years old',
 'a connected gulley',
 'sizes',
 'a porous material',
 'activity',
 'The work',
 'pressure increase']

* Add any new spans to the graph, with prov:hasPrimarySource *WikiData* and prov:wasAttributedTo *SPaR.txt*

In [60]:
# add a related label between the defined concept/span, and the span found in a definition
for i, (source, term_and_spans_dict) in enumerate(all_spartxt_objects.items()):
    for term, related_spans in term_and_spans_dict.items():

        # add the term (defined concept) as a span 
        term_uid = ua.assign_UID(term, SPANS)
        irec_graph = add_tuples(irec_graph, irec_span(term_uid, term))
        irec_graph = add_tuples(irec_graph, skos_in_scheme(term_uid, 'schemeUID', SPANS, SPANS))
        irec_graph = add_tuples(irec_graph, provenance(term_uid, URIRef(source), SPANS))

        # Check if the term is actually already identified as a concept
        concept_uid = None
        if term in ua.UIDs[CONCEPTS._.defrag().__reduce__()[1][0]]:
            concept_uid = ua.UIDs[CONCEPTS._.defrag().__reduce__()[1][0]][term]

        rel_spans = [x.strip() if x.isupper() else x.lower().strip() for x in related_spans]
        for rel_term in rel_spans:
            # Add the spans that were extracted from the definitions
            related_uid = ua.assign_UID(rel_term, SPANS)
            irec_graph = add_tuples(irec_graph, irec_span(related_uid, rel_term))
            irec_graph = add_tuples(irec_graph, skos_in_scheme(related_uid, 'schemeUID', SPANS, SPANS))
            irec_graph = add_tuples(irec_graph, provenance(related_uid, URIRef(source), SPANS))
            # add agent for spans generated by SPaR.txt
            irec_graph = add_tuples(irec_graph, prov_agent(span_uid, spart_txt_IRI, SPANS))

            # Add relation between spans
            irec_graph = add_tuples(irec_graph, irec_related(term_uid, related_uid)) 

            # Add relation between concept and span
            if concept_uid:
                irec_graph = add_tuples(irec_graph, irec_related(concept_uid, related_uid, CONCEPTS, SPANS)) 
    

In [61]:
concept_wiki_dict

{'1': [{'prefLabel': 'absorption coefficient',
   'class_uid': 'http://www.wikidata.org/entity/Q107715',
   'class_label': 'physical quantity',
   'WikiUID': 'http://www.wikidata.org/entity/Q97368968',
   'WikiDefinition': 'measure for the exponential reduction of a quantity along a path due to absorption',
   'Spans in definitions and notes': ['a path due',
    'a quantity',
    'the exponential reduction']}],
 '18': [{'prefLabel': 'atrium',
   'class_uid': 'http://www.wikidata.org/entity/Q180516',
   'class_label': 'room',
   'WikiUID': 'http://www.wikidata.org/entity/Q189265',
   'WikiDefinition': 'architectural feature: courtyard in a Roman domus',
   'Spans in definitions and notes': ['architectural feature : courtyard',
    'a Roman domus']},
  {'prefLabel': 'atrium',
   'class_uid': 'http://www.wikidata.org/entity/Q309250',
   'class_label': 'courtyard',
   'WikiUID': 'http://www.wikidata.org/entity/Q189265',
   'WikiDefinition': 'architectural feature: courtyard in a Roman domu

### Add WikiData definitions to graph

In [62]:
def add_wiki_definitions(irec_graph: Graph, wiki_dict: Dict[str, str], dict_namespace: Namespace):
    """
    """
    for i, (_, definition_dict_list) in enumerate(wiki_dict.items()):
        for definition_dict in definition_dict_list:
            wiki_term = definition_dict['prefLabel']
            wiki_class_label = definition_dict['class_label'] 
            wiki_class_uid = definition_dict['class_uid'] 
            wiki_uid = definition_dict['WikiUID'].rsplit('/', 1)[1]

            # keep track of uid in the Unique ID assigner obj as well
            _ = ua.keep_track_of_existing_UID(wiki_term, wiki_uid, WIKI)
            
            # add the WikiData concept to the graph, in WIKI namespace
            irec_graph = add_tuples(irec_graph, skos_node(wiki_uid, wiki_term, WIKI))
            # irec_graph = add_tuples(irec_graph, skos_in_scheme(wiki_uid, 'schemeUID', WIKI, WIKI))
            irec_graph = add_tuples(irec_graph, provenance(wiki_uid, wikidata_IRI, WIKI))
            
            # Add an exact match between the wiki node and our concept from the Merged Approved Documents
            term_uid = ua.retrieve_uid_by_text(wiki_term, CONCEPTS)
            if term_uid:
                irec_graph = add_tuples(irec_graph, skos_exact_match(term_uid, wiki_uid, CONCEPTS, WIKI))
            
            #### We will link the WikiData concept to a span, rather than a concept, as well as its definitions
            # and class labels
            # 1) Add a span and a link to the wiki concept
            span_uid = ua.retrieve_uid_by_text(wiki_term, SPANS)
            if not span_uid:
                raise Exception(f"Cannot find the span: {wiki_term}!")
            
            # 2) add wiki class label, as well as a str version of the UID for reference, in SPANS namespace
            wiki_class_label_and_uid = f"{wiki_class_label} [{wiki_class_uid}]"
            irec_graph = add_tuples(irec_graph, irec_wikidef(span_uid,  wiki_class_label_and_uid, SPANS))

            # 3)  Add the WIKI definition to the node if it exists, in SPANS namespace
            if 'WikiDefinition' in definition_dict:            
                definition = definition_dict['WikiDefinition']
                irec_graph = add_tuples(irec_graph, irec_wikidef(span_uid, definition, SPANS))
                
            # 4) Add an exact match between the span and wikidata concept as well
            irec_graph = add_tuples(irec_graph, skos_exact_match(span_uid, wiki_uid, SPANS, WIKI))

    return irec_graph

In [63]:
irec_graph = add_wiki_definitions(irec_graph, concept_wiki_dict, CONCEPTS)

In [64]:
irec_graph = add_wiki_definitions(irec_graph, span_wiki_dict, SPANS)

### We will also add the Uniclass terms that we found in the text to the graph

In [65]:
with open(graph_data_fp.joinpath("uniclass_terms_in_text.pkl"), 'rb') as f:
    uniclass_terms_in_text = pickle.load(f)

In [66]:
for uniclass_uid, definition_dict in uniclass_terms_in_text.items():
    # Add the Uniclass node to our graph
    uniclass_term = definition_dict['pref_label']
    # keep track of uid that is added to the graph
    _ = ua.keep_track_of_existing_UID(uniclass_term, uniclass_uid, UNICLASS)
    
    # add the concept to the graph, in UNICLASS namespace
    irec_graph = add_tuples(irec_graph, skos_node(uniclass_uid, uniclass_term, UNICLASS))
    # irec_graph = add_tuples(irec_graph, skos_in_scheme(uniclass_uid, 'schemeUID', UNICLASS, UNICLASS))
    irec_graph = add_tuples(irec_graph, provenance(uniclass_uid, uniclass_IRI, UNICLASS))
    
    # Determine or create the corresponding term_uid in SPANS and add a # skos:exactMatch?
    if ua.retrieve_uid_by_text(uniclass_term): # First as is (no lowercasing, despite Uniclass casing)
        # Add an exact match between the Uniclass node and the corresponding span
        span_uid = ua.retrieve_uid_by_text(uniclass_term)
        irec_graph = add_tuples(irec_graph, skos_exact_match(span_uid, uniclass_uid, SPANS, UNICLASS))
    elif ua.retrieve_uid_by_text(uniclass_term.lower()):
        # Add an exact match between the wiki node and the corresponding lowercased version in SPANS
        span_uid = ua.retrieve_uid_by_text(uniclass_term.lower())
        irec_graph = add_tuples(irec_graph, skos_exact_match(span_uid, uniclass_uid, SPANS, UNICLASS))
    else:
        # Although the uniclass term was found in the text, no exact matching span was extracted by SPaR.txt
        # add the term (defined concept) as a span 
        term_uid = ua.assign_UID(uniclass_term, SPANS)
        irec_graph = add_tuples(irec_graph, irec_span(term_uid, uniclass_term))
        irec_graph = add_tuples(irec_graph, skos_in_scheme(term_uid, 'schemeUID', SPANS, SPANS))
        irec_graph = add_tuples(irec_graph, provenance(term_uid, uniclass_IRI, SPANS))

        irec_graph = add_tuples(irec_graph, skos_exact_match(term_uid, uniclass_uid, SPANS, UNICLASS))
    

In [67]:
irec_graph.serialize(destination=graph_output_fp.joinpath("initial_graph.ttl"))

<Graph identifier=Na14551e51b8c4ff2aca42c66817a746f (<class 'rdflib.graph.Graph'>)>

### Compute properties between spans [work in progress, 15K spans is 225 Million combinations... that's a lot]
* link definitions to spans (if span occurs verbatim; linkwith irec:related)
* link spans to spans:
  * semantic similarity, x and y might be alternative labels or have the same superclass -> based on kNN
  * constitutes; x occurs in y, thus y might be an extended phrase for x and perhaps a subclass, or x may be a material property, and so on
  * morphological similarity, x may be an inflection of y or somehow related

**Semantic similarity**

In [68]:
bert_model_name = "bert-base-cased"
embedding_output_fp = Path.cwd().joinpath("data", "term_embedding")
IDF_path = embedding_output_fp.joinpath("IDF_weights.json")
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
bert_model = BertModel.from_pretrained(bert_model_name, output_hidden_states=True)
embedder = Embedder(tokenizer, bert_model, 
                      IDF_dict=json.load(open(IDF_path)), 
                      embedding_fp=embedding_output_fp,
                      layers_to_use = [12],         # we'll use the output of the last layer
                      layer_combination = "avg",    # how to combine layers if multiple are used
                      idf_threshold = 1.5,          # minimum IDF value for a token to contribute
                      idf_weight_factor = 1.0,      # modify how strong the influence of IDF weighting is
                      not_found_idf_value = 0.5)    # IDF value for tokens that weren't seen during IDF computation (doesn't apply here)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [69]:
# First we'll need to compute the embeddings for new_spans. 
# Same process as before, EXCEPT that we now normalise the spans directly as well.
unique_new_spans = list(set(spans_from_defined_terms))

### TODO: move this into a utility function
# Compute the embeddings, this is split into subsets so we don't overload your memory (adjust these values if needed)
max_num_cpu_threads = 4
subset_size = 1000

# Checks which of the embeddings for the clustering cluster_data already exist, so they can be re-used
term_subsets = split_list(unique_new_spans, subset_size)
embedding_files = [f for f in embedder.embedding_fp.glob('def_term_standardised_embeddings*.pkl')]
span_and_embedding_pairs = []
if len(embedding_files) == len(term_subsets):
    for e in embedding_files:
        span_and_embedding_pairs += pickle.load(open(e, 'rb'))
else:
    print(f"Preparing embeddings for {len(unique_new_spans)} spans, in groups of: {subset_size}")
    subset_idx = 0            # iterator index outside of tqdm 
    for subset in tqdm(term_subsets):
        subset_embeddings = []
        subset_file_name = embedder.embedding_fp.joinpath("def_term_standardised_embeddings_part_{}.pkl".format(subset_idx))
        subset_idx += 1
        if subset_file_name.exists():
            # already computed previously
            continue
        
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_num_cpu_threads) as executor:
            futures = [executor.submit(embedder.embed_and_normalise_span, subset[idx]) for idx in range(len(subset))]

        subset_embeddings += [f.result() for f in futures if f.result()]

        with open(subset_file_name, 'wb') as f:
            pickle.dump(subset_embeddings, f)

    # Once all embeddings are created; combine them in span_and_embedding_pairs
    embedding_files = [f for f in embedder.embedding_fp.glob('def_term_standardised_embeddings*.pkl')]
    for e in embedding_files:
        span_and_embedding_pairs += pickle.load(open(e, 'rb'))

Preparing embeddings for 3230 spans, in groups of: 1000


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:48<00:00, 12.18s/it]


In [70]:
# load the old span_and_embedding_pairs as well
old_embedding_files = [f for f in embedder.embedding_fp.glob('embeddings*.pkl')]
old_span_and_embedding_pairs = []
for e in old_embedding_files:
    old_span_and_embedding_pairs += pickle.load(open(e, 'rb'))

In [83]:
unique_spans = [s for (s, e) in old_span_and_embedding_pairs + span_and_embedding_pairs]
standardised_clustering_data = np.stack([np.mean(e, axis=0) if len(e.shape) > 1 else e for (s, e) in old_span_and_embedding_pairs + span_and_embedding_pairs])


In [79]:
# Compute the kNN graph for ALL spans now (old + new)
n_neighbors = 5 # the number of neighbours we compute for each term
knn_graph = kneighbors_graph(standardised_clustering_data, 
                             n_neighbors,    
                             metric="cosine", # <- note we're using cosine sim
                             n_jobs=8)

In [86]:
knn_sim_dict = {}
for span_idx, span in enumerate(unique_spans):
    knn_sim_dict[span] = [unique_spans[neighbour_idx] for neighbour_idx in knn_graph[span_idx].indices]

In [87]:
knn_sim_dict

{'vessel': ['a vessel', 'A vessel', 'vessel', 'closed vessel', 'entity'],
 'expansion water': ['hot water safety water efficiency',
  'foul water drainage',
  'foul water',
  'cooling air',
  'variations pressure'],
 'a hot water system': ['the hot water system',
  'a hot water storage system',
  'hot water system',
  'hot water systems',
  'a solar hot water system'],
 'PLY': ['PLY AND', 'UPPLY', 'PHE', 'PE', 'SAFETY'],
 'the outlet': ['The outlet',
  'the inlet',
  'the smoke outlet',
  'the outlet size',
  'the subsidiary'],
 'the hot water': ['the hot water system',
  'the hot tap',
  'a hot water system',
  'hot water',
  'domestic hot water'],
 'Water Fittings': ['Fittings',
  'the Water Supply Water Fittings',
  'Fixed ladders',
  'Fixings',
  'Fire Resisting Metal Doorsets Timber Research and Development Associations [ 2010 ]'],
 'the safety devices': ['the safety',
  'a safety device',
  'the device',
  'the coin',
  'rubber'],
 'the application': ['the application procedure',

**Constitutes & morphological similarity**

In [None]:
class CharacterSpan:
    def __init__(self, span:str, span_uid: str):
        
        if not span:
            raise Exception("Input is an empty string!")
        
        self.text = span
        self.uid = span_uid
        self.blob = TextBlob(span)
        self.words = [w for w in self.blob.words]
        self.stems = [w.stem() for w in self.words]
        
        self.morphologically_similar_uids = {}
        self.semantically_similar_uids = {}
        self.constitutes_uids = {}
        self.contains_antonym_uids = {}       

In [None]:

         
def span_constitutes_span(span_one: CharacterSpan, span_two: CharacterSpan):
    """ True if in a span, you can find all of the words comprising span_two (order doesn't matter) """
    word_overlap = list(set(span_one.words) & set(span_two.words))
    if len(word_overlap) in [len(span_one.words), len(span_two.words)]:
#             span_one.constitutes_uids.add(span_two.uid)
        return True
    return False
    
def morphologically_similar(span_one: CharacterSpan, span_two: CharacterSpan):
    """ True if this span has either a small Levenshtein distance, or many overlapping words/stems with another span """
    if levenshtein(span_one.text, span_two.text):
#         self.morphologically_similar_uids.add(span_two.uid)
        return True
    elif len(span_one.words) > 1 and len(span_two.words) > 1: # maybe both?
        word_overlap = list(set(span_one.words) & set(span_two.words))
        stem_overlap = list(set(span_one.stems) & set(span_two.stems))

        if len(word_overlap) >= ((len(span_one.words) + len(span_two.words) - 2) // 2) or \
            len(stem_overlap) >= ((len(span_one.stems) + len(span_two.stems) - 2) // 2):
#             span_one.morphologically_similar_uids.add(span_two.uid)
            return True
    return False

#     def span_with_antonym(self, span_one: CharacterSpan, span_two: CharacterSpan, wordnet_antonyms: Dict[str, str]=wordnet_antonyms):
#         if any([x for x in wordnet_antonyms.keys() if \
#             (TextBlob(x).words[0].stem() in self.text and \
#             any([d for d in wordnet_antonyms[x] if TextBlob(d).words[0].stem() in span_two.text]))]):
#             self.contains_antonym_uids.append(span_two.uid)

In [None]:
test_1 = "acoustic"
test_2 = "thermal"
cs1 = CharacterSpan(test_1, '1')
cs2= CharacterSpan(test_2, '2')
print("span_1 constitutes span_2: ", span_constitutes_span(cs1, cs2))
print("morphologically similar: ", morphologically_similar(cs1, cs2))
# print("semantically similar: ", semantically_similar(embedder, cs1, cs2))
# print("antonym present: ", antonym_present(cs1, cs2))


In [None]:
s1 = "photo-voltaic cell"
s2 = "photo-voltaic system"
cs1 = CharacterSpan(test_1, '1')
cs2= CharacterSpan(test_2, '2')
print("span_1 constitutes span_2: ", span_constitutes_span(cs1, cs2))
print("morphologically similar: ", morphologically_similar(cs1, cs2))
# print("semantically similar: ", semantically_similar(embedder, cs1, cs2))
# print("antonym present: ", antonym_present(cs1, cs2))



In [None]:
test_1 = "damp proof course"
test_2 = "damp proof membrane"
cs1 = CharacterSpan(test_1, '1')
cs2= CharacterSpan(test_2, '2')
print("span_1 constitutes span_2: ", span_constitutes_span(cs1, cs2))
print("morphologically similar: ", morphologically_similar(cs1, cs2))
# print("semantically similar: ", semantically_similar(embedder, cs1, cs2))
# print("antonym present: ", antonym_present(cs1, cs2))



In [None]:
test_1 = "hot water storage"
test_2 = "cold water system"
cs1 = CharacterSpan(test_1, '1')
cs2= CharacterSpan(test_2, '2')
print("span_1 constitutes span_2: ", span_constitutes_span(cs1, cs2))
print("morphologically similar: ", morphologically_similar(cs1, cs2))
# print("semantically similar: ", semantically_similar(embedder, cs1, cs2))
# print("antonym present: ", antonym_present(cs1, cs2))


In [None]:
test_1 = "hot water storage system"
test_2 = "hot water storage"
cs1 = CharacterSpan(test_1, '1')
cs2= CharacterSpan(test_2, '2')
print("span_1 constitutes span_2: ", span_constitutes_span(cs1, cs2))
print("morphologically similar: ", morphologically_similar(cs1, cs2))
# print("semantically similar: ", semantically_similar(embedder, cs1, cs2))
# print("antonym present: ", antonym_present(cs1, cs2))


In [None]:
def compute_features(argument_list: List[Any], pbar):
    span_one, span_two = argument_list
    feature_tuples = []
    
    if span_constitutes_span(span_one, span_two):
        feature_tuples.append(irec_constitutes(span_one_uid, span_two_uid))
        
    if morphologically_similar(span_one, span_two):
        feature_tuples.append(irec_morp_sim(span_one_uid, span_two_uid))
    
#     if antonym_present(span_one, span_two, wordnet_antonyms):
#         feature_tuples.append(irec_related(span_one_uid, span_two_uid))
            
#     if semantically_similar(embedder, span_one, span_two):
#         irec_graph = irec_sem_sim(irec_graph, span_one_uid, span_two_uid, n1, n2)
#     found_in_n_definitions = 0
#     for s, p, definition in graph.triples((n2[span_two_uid], SKOS.definition, None)):
#         # for each definition of span_two  # TEXTBLOB WORDS? 
#         if check_for_span_in_definition(span_one, definition):
#             found_in_n_definitions += 1
#             feature_tuples.append(irec_related(span_one_uid, span_two_uid, n1, n2))
    
    pbar.update(1)
    return feature_tuples

In [None]:
# ua.UIDs[SPANS.placeholder.defrag().__reduce__()[1][0]]

In [None]:
test_1 = [x for x in range(100)]

In [None]:
test_2 = [x for x in range(100)]

In [None]:
# - I only care about span-span relations 

## TODO; need to think of ways to speed up this process
## - semantic similarity; reuse embeddings of uniquespans
## - avoid occurence in definitions (could use this to filter, e.g. occurs in over 30% of definitions then remove)
## - would I want to remove spans that occur very often in constitutes
spans_namespace_uid = SPANS.placeholder.defrag().__reduce__()[1][0]
concepts_namespace_uid = CONCEPTS.placeholder.defrag().__reduce__()[1][0]

spans = [k for k in ua.UIDs[spans_namespace_uid].keys() if ua.UIDs[spans_namespace_uid][k] != 'schemeUID']
# concepts = [k for k in ua.UIDs[concepts_namespace_uid].keys() if ua.UIDs[concepts_namespace_uid][k] != 'schemeUID']

spans_c = [CharacterSpan(span, ua.UIDs[spans_namespace_uid][span]) for span in spans]
# concepts_c = [CharacterSpan(span, ua.UIDs[concepts_namespace_uid][concept]) for concept in concepts]

In [None]:
# print(f"Computing {len(spans)} x {len(concepts)} = {len(spans) * len(concepts)} combinations")
print(f"Computing {len(spans)} x {len(spans)} = {len(spans) * len(spans)} combinations")

max_num_cpu_threads = 128
tuples_to_add = []
with tqdm(total=len(spans_c) * len(spans_c)) as pbar:
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_num_cpu_threads) as executor: # ThreadPoolExecutor
        futures = [executor.submit(compute_features, pair, pbar) for pair in product(spans_c, spans_c)]
        for future in concurrent.futures.as_completed(futures):
            tuples_to_add += future.result()



In [None]:
tuples_to_add = [t[0] for t_list in lists_of_tuples for t in t_list]
irec_graph = add_tuples(irec_graph, tuples_to_add)

#### Save final graph

In [None]:
ua.print_node_by_text(irec_graph, 'atrium', SPANS)

In [None]:
ua.print_node_by_text(irec_graph, 'a continuous space', SPANS)

In [None]:
ua.print_node_by_id(irec_graph, 'Q189265', WIKI)

In [None]:
ua.print_node_by_text(irec_graph, 'atrium', CONCEPTS)

In [None]:
ua.print_node_by_text(irec_graph, 'wet room', CONCEPTS)

In [None]:
ua.print_node_by_text(irec_graph, 'sanitary accommodation', CONCEPTS)

In [None]:
ua.print_node_by_text(irec_graph, 'wet room', CONCEPTS)

In [None]:
ua.print_node_by_text(irec_graph, 'tundish', SPANS)

In [None]:
# todo; / done now?
# - some concepts missing
# - avoid empty span;
# - need to avoid duplicate wiki concepts, simply add 

In [None]:
ua.print_node_by_text(irec_graph, 'ferrite', SPANS)