In [None]:
import re
import time
import pickle
import glob, os
import requests, urllib
import json, random

import pandas as pd

from bs4 import BeautifulSoup as bs
from tqdm import tqdm
from typing import List, Any, List, Dict
from textblob import TextBlob
from itertools import combinations, combinations_with_replacement
from collections import Counter
from nltk.corpus import wordnet as wn
from SPARQLWrapper import SPARQLWrapper, JSON

from rdflib import URIRef, BNode, Literal, Namespace, Graph
from rdflib.namespace import XSD, RDF, RDFS, SKOS, NamespaceManager

import utils as util

We would like to express the following features/relations:
* Dictionary definition terms, which are always concepts
  * We'll use the source as namespace, and corresponding concept identifier if it exists
  * SKOS is used to establish a mapping (e.g., skos:exactMatch) and add the definition (skos:definition)
* Special properties that we want to capture between words, which may help identify concepts:
  * Word is part of MWE
  * Morphologically similar words; stemming & Levenshtein distance
  * Semantically similar words; distributed similarity (NNs)
  * Acronyms
  * Related; this is a generic relation, e.g., `ampere` is related to `electric current`
  * Domain-specificity; foreground or background term following our filtering procedure
 

### Prepare namespaces

* Note that UNICLASS is not a namespace (yet); it only has identifiers.

In [None]:
# Namespace of the artificial root node that concepts are linked to for visualisation.
ROOT = Namespace("http://example.org/top_concept_for_visulisation/#")
# Namespace used for nodes that refer to Wikidata entities.
# NOTE(review): Wikidata entity IRIs normally look like http://www.wikidata.org/entity/Q42 (no '#') — confirm the trailing '#' is intended.
WIKI = Namespace("http://www.wikidata.org/entity/#")

In [None]:
# Base IRIs for, respectively, the IREC schema, the span nodes and the concept nodes.
IREC_ontology_URL = "http://example.org/irec-schema/#"
IREC_instances_URL = "http://example.org/irec-spans/#"
IREC_concepts_URL = "http://example.org/irec-concepts/#"

In [None]:
# namespace of the IREC schema, i.e., the class and properties used to describe spans
IREC = Namespace(IREC_ontology_URL)

# separate namespaces for the span nodes and for the concept nodes
SPANS = Namespace(IREC_instances_URL)
CONCEPTS = Namespace(IREC_concepts_URL)

### graph creation utilities

In [None]:
class UID_assigner:
    """
    Assigns and remembers unique identifiers (UIDs) for texts, separately for each namespace.

    Attributes:
        UIDs: nested dict ``{namespace key: {text: uid}}`` holding every UID handed out so far.
        UID: integer counter behind the concept UIDs; incremented for every new concept text.
    """
    def __init__(self):
        self.UIDs = {}
        self.UID = 0

    @staticmethod
    def _namespace_key(namespace: Namespace):
        """ Returns the plain-string key under which the UIDs of `namespace` are stored in `self.UIDs`. """
        # Kept identical to the expression used elsewhere in the notebook to look up `ua.UIDs`,
        # so the keys stay compatible: the IRI of a placeholder term, after `defrag()`, as a str.
        return namespace.placeholder.defrag().__reduce__()[1][0]

    @staticmethod
    def _print_triples(graph, subject):
        """ Prints every triple in `graph` with the given subject, showing only the part after the last '#'. """
        for s, p, o in graph.triples((subject, None, None)):
            print(f"{s.split('#')[-1]} ; {p.split('#')[-1]} ; {o.split('#')[-1]}")

    def assign_UID(self, text, namespace: Namespace):
        """
        Determines which type of UID to assign, based on the namespace.

        :param text: the span text or concept label that needs a UID.
        :param namespace: either SPANS or CONCEPTS.
        :return: tuple `(uid, is_new)`; `is_new` is False when the text already had a UID in that namespace.
        :raises ValueError: if no UID scheme exists for the namespace. Previously a message was printed and
                            None returned, which made callers fail on tuple-unpacking with an unrelated TypeError.
        """
        if namespace == SPANS:
            return self.span_UID(text)
        if namespace == CONCEPTS:
            return self.concept_UID(text)
        raise ValueError(f"Currently no function implemented for assigning UIDs for this namespace: {namespace}")

    def span_UID(self, text):
        """
        NOTE: each text span is a unique identifier in and of itself. We'll simply convert the text span to
        a URL friendly representation.

        :return: tuple `(uid, is_new)`, where `uid` is the URL-quoted text.
        """
        uids = self.UIDs.setdefault(self._namespace_key(SPANS), {})
        if text in uids:
            return uids[text], False

        uids[text] = urllib.parse.quote(text)
        return uids[text], True

    def concept_UID(self, text):
        """
        For now concepts simply get integer-based UIDs (stored as strings), taken from a single running counter.

        :return: tuple `(uid, is_new)`.
        """
        uids = self.UIDs.setdefault(self._namespace_key(CONCEPTS), {})
        if text in uids:
            return uids[text], False

        self.UID += 1
        uids[text] = str(self.UID)
        return uids[text], True

    def count_nodes_in_namespace(self, namespace: Namespace = SPANS):
        """ Prints how many UIDs have been assigned in the namespace; prints 0 if none were assigned yet. """
        n_space = self._namespace_key(namespace)
        # .get() so that a namespace without any UIDs reports 0 instead of raising a KeyError
        print(f"Number of nodes in '{n_space}': {len(self.UIDs.get(n_space, {}))}")

    def print_node_by_id(self, graph, node_id, namespace: Namespace = SPANS):
        """ Prints all triples in `graph` that have `namespace[node_id]` as their subject. """
        self._print_triples(graph, namespace[str(node_id)])

    def print_node_by_text(self, graph, node_text, namespace: Namespace = SPANS):
        """
        Prints all triples in `graph` whose subject is the node that was assigned a UID for `node_text`.

        :raises KeyError: if no UID was assigned for `node_text` in this namespace.
        """
        node_id = self.UIDs[self._namespace_key(namespace)][node_text]
        # find all triples with subject
        self._print_triples(graph, namespace[node_id])

In [None]:
# These wrappers only exist to help me quickly and consistently add nodes to the graph


def add_top_concept(graph, node_uid, namespace: Namespace=CONCEPTS, root_uid=None):
    """
    In some cases we'd like the concept to be linked to the ROOT of the graph, for visualisation.

    :param graph: graph to add the triple to.
    :param node_uid: UID of the node that should be linked to the root.
    :param namespace: namespace of `node_uid`.
    :param root_uid: UID of the root node in the ROOT namespace. When omitted, the notebook-level
                     variable `top_concept_uid` is used, which must have been defined before the call.
    :return: the same graph, with the triple added.
    """
    if root_uid is None:
        # fall back on the global that is set in the cell where the graph is created
        root_uid = top_concept_uid
    # NOTE(review): the SKOS reference gives skos:hasTopConcept the domain skos:ConceptScheme; here the
    # subject is the concept node itself — confirm this is only meant as a visualisation shortcut.
    graph.add((namespace[node_uid], SKOS.hasTopConcept, ROOT[root_uid]))
    return graph

# IREC functions and REFERENCE
# The bare attribute accesses below do not touch any graph; they only list the IREC terms that the wrappers in this cell use.
IREC.Span # A span is a sequence of characters that occurs verbatim in a text, either contiguous or discontiguous, as extracted by SPaR.txt (Kruiper et al., 2021).
IREC.constitutes  # Indicates that a span constitutes another span, e.g., the Multi-Word Expression (MWE) Span `hot water storage system` constitutes the Span `storage`.
IREC.isMorphologicallySimilarTo # Indicates that a Span is morphologically similar to another Span, e.g., they may have the same stem or a small Levenshtein distance.
IREC.isSemanticallySimilarTo # Indicates that a Span is semantically similar to another Span, following a cosine similarity between their embeddings.
IREC.related # General way to indicate some relation between two spans, e.g., `ampere` is related to `electric current`
IREC.hasAcronym # A Span can have an acronym, e.g., `British Standards Institute` has the acronym `BSI`.
IREC.isAcronymOf # A Span can be the acronym of another Span, e.g., `BSI` is the acronym of `British Standards Institute`.
IREC.hasAntonym # Property that relates a Span to another Span, the two being each other's antonyms.

def irec_span(graph, node_uid, text, namespace: Namespace=SPANS):
    """
    Registers a span node: types `namespace[node_uid]` as IREC.Span and stores the span text
    as its English RDFS.label. Returns the graph that was passed in.
    """
    span_node = namespace[node_uid]
    graph.add((span_node, RDF.type, IREC.Span))
    span_label = Literal(text, lang='en')
    graph.add((span_node, RDFS.label, span_label))
    return graph

def irec_constitutes(graph, subject_node_uid, object_node_uid,
                     subject_namespace: Namespace=SPANS, object_namespace: Namespace=SPANS):
    """
    Adds `subject IREC.constitutes object`: the label of the object span can be found somewhere
    in the label of the subject span. Returns the graph that was passed in.
    """
    source = subject_namespace[subject_node_uid]
    target = object_namespace[object_node_uid]
    graph.add((source, IREC.constitutes, target))
    return graph

def irec_morp_sim(graph, subject_node_uid, object_node_uid,
                  subject_namespace: Namespace=SPANS, object_namespace: Namespace=SPANS):
    """
    Adds `subject IREC.isMorphologicallySimilarTo object`: the labels of the two span nodes are
    morphologically similar. Returns the graph that was passed in.
    """
    source = subject_namespace[subject_node_uid]
    target = object_namespace[object_node_uid]
    graph.add((source, IREC.isMorphologicallySimilarTo, target))
    return graph

def irec_sem_sim(graph, subject_node_uid, object_node_uid,
                 subject_namespace: Namespace=SPANS, object_namespace: Namespace=SPANS):
    """
    Adds `subject IREC.isSemanticallySimilarTo object`: the labels of the two span nodes are
    semantically similar, following the distributed semantics hypothesis. Returns the graph that was passed in.
    """
    source = subject_namespace[subject_node_uid]
    target = object_namespace[object_node_uid]
    graph.add((source, IREC.isSemanticallySimilarTo, target))
    return graph

def irec_related(graph, subject_node_uid, object_node_uid,
                 subject_namespace: Namespace=SPANS, object_namespace: Namespace=SPANS):
    """
    Adds `subject IREC.related object`: a generic, otherwise unspecified relation between two nodes.
    Returns the graph that was passed in.
    """
    source = subject_namespace[subject_node_uid]
    target = object_namespace[object_node_uid]
    graph.add((source, IREC.related, target))
    return graph

def irec_has_acronym(graph, subject_node_uid, object_node_uid,
                     subject_namespace: Namespace=SPANS, object_namespace: Namespace=SPANS):
    """
    Adds `subject IREC.hasAcronym object`: the label of the object node is an acronym for the
    label of the subject node. Returns the graph that was passed in.
    """
    source = subject_namespace[subject_node_uid]
    target = object_namespace[object_node_uid]
    graph.add((source, IREC.hasAcronym, target))
    return graph

def irec_is_acronym_of(graph, subject_node_uid, object_node_uid,
                       subject_namespace: Namespace=SPANS, object_namespace: Namespace=SPANS):
    """
    Adds `subject IREC.isAcronymOf object`: the label of the subject node is an acronym of the
    label of the object node. Returns the graph that was passed in.
    """
    source = subject_namespace[subject_node_uid]
    target = object_namespace[object_node_uid]
    graph.add((source, IREC.isAcronymOf, target))
    return graph

def irec_antonym(graph, subject_node_uid, object_node_uid,
                 subject_namespace: Namespace=SPANS, object_namespace: Namespace=SPANS):
    """
    Adds `subject IREC.hasAntonym object`: the label of the object node is an antonym of the
    label of the subject node. Returns the graph that was passed in.
    """
    source = subject_namespace[subject_node_uid]
    target = object_namespace[object_node_uid]
    graph.add((source, IREC.hasAntonym, target))
    return graph


# SKOS 
def skos_node(graph, node_uid, text, namespace: Namespace=CONCEPTS):
    """
    Registers a concept: types `namespace[node_uid]` as SKOS.Concept and gives it `text` as its
    preferred label (through `skos_prefLabel`). Returns the graph.
    """
    concept_node = namespace[node_uid]
    graph.add((concept_node, RDF.type, SKOS.Concept))
    return skos_prefLabel(graph, node_uid, text, namespace)

def skos_prefLabel(graph, node_uid, text, namespace: Namespace=CONCEPTS):
    """ Sets `text` as the English SKOS.prefLabel of `namespace[node_uid]` and returns the graph. """
    labelled_node = namespace[node_uid]
    preferred_label = Literal(text, lang='en')
    graph.add((labelled_node, SKOS.prefLabel, preferred_label))
    return graph

def skos_altLabel(graph, node_uid, alt_label_uid, namespace: Namespace=CONCEPTS,
                  alt_label_namespace: Namespace=SPANS):
    """
    Add an alternative label for a concept node, where the alternative label is itself a node.

    The relation is added in both directions (concept -> label node and label node -> concept).

    :param graph: graph to add the triples to.
    :param node_uid: UID of the concept node, in `namespace`.
    :param alt_label_uid: UID of the node that holds the alternative label, in `alt_label_namespace`.
    :param namespace: namespace of the concept node.
    :param alt_label_namespace: namespace of the alternative-label node. Defaults to SPANS, because the
                                callers in this notebook assign the alternative-label UID in SPANS; resolving
                                it in the concept namespace (as was done before) pointed at a node that does
                                not exist instead of at the span node.
    :return: the same graph, with the two triples added.
    """
    concept_node = namespace[node_uid]
    alt_label_node = alt_label_namespace[alt_label_uid]
    graph.add((concept_node, SKOS.altLabel, alt_label_node))
    graph.add((alt_label_node, SKOS.altLabel, concept_node))
    return graph

def skos_related(graph, subject_node_uid, object_node_uid,
                 subject_namespace: Namespace=SPANS, object_namespace: Namespace=CONCEPTS):
    """
    Denotes a relation between two nodes, would expect the nodes to be in different vocabularies.

    `graph` is now the first parameter, like in every other wrapper in this cell; the function used to
    reference a global named `graph`, which does not exist in this notebook (NameError on every call).

    :param graph: graph to add the triple to.
    :param subject_node_uid: UID of the subject node, in `subject_namespace`.
    :param object_node_uid: UID of the object node, in `object_namespace`.
    :return: the same graph, with `subject SKOS.related object` added.
    """
    graph.add((subject_namespace[subject_node_uid], SKOS.related, object_namespace[object_node_uid]))
    return graph
    
def skos_broader(graph, narrower_node_uid, broader_node_uid,
                 narrower_namespace: Namespace=CONCEPTS, broader_namespace: Namespace=CONCEPTS):
    """
    Adds a hierarchical link between two nodes, in both directions (skos:narrower and skos:broader are
    each other's inverse):

        broader node  --skos:narrower-->  narrower node
        narrower node --skos:broader-->   broader node

    Each UID is resolved in its own namespace. Previously the UIDs and namespaces were cross-wired
    (e.g. `broader_namespace[narrower_node_uid]`) and the direction of both properties was inverted.

    :param graph: graph to add the triples to.
    :param narrower_node_uid: UID of the more specific node, in `narrower_namespace`.
    :param broader_node_uid: UID of the more general node, in `broader_namespace`.
    :return: the same graph, with the two triples added.
    """
    narrower_node = narrower_namespace[narrower_node_uid]
    broader_node = broader_namespace[broader_node_uid]
    graph.add((broader_node, SKOS.narrower, narrower_node))
    graph.add((narrower_node, SKOS.broader, broader_node))
    return graph
    
def skos_note(graph, node_uid, note_text, namespace: Namespace=CONCEPTS):
    """ Attaches `note_text` as an English SKOS.note to `namespace[node_uid]` and returns the graph. """
    annotated_node = namespace[node_uid]
    note_literal = Literal(note_text, lang='en')
    graph.add((annotated_node, SKOS.note, note_literal))
    return graph

def skos_definition(graph, node_uid, definition_text, namespace: Namespace=CONCEPTS):
    """
    Attaches `definition_text` as an English SKOS.definition to `namespace[node_uid]` and returns the graph.
    Open question carried over from the original: should the namespace indicate the source of the definition?
    """
    defined_node = namespace[node_uid]
    definition_literal = Literal(definition_text, lang='en')
    graph.add((defined_node, SKOS.definition, definition_literal))
    return graph

### Create the graph and bind namespaces

In [None]:
# the graph that the cells below add spans, concepts and relations to
irec_graph = Graph()

# register prefixes for the ROOT and WIKI namespaces (the WIKI namespace gets the prefix "wikipedia")
irec_graph.bind("root", ROOT)
irec_graph.bind("wikipedia", WIKI)

In [None]:
# import our vocabulary: parse the local file IREC.rdf into the graph
irec_graph.parse("IREC.rdf")

In [None]:
# one UID assigner, shared by all the cells below
ua = UID_assigner()
# UID of the artificial root node; read as a global by add_top_concept
top_concept_uid = 'ROOT'

# give the root node a preferred label in the ROOT namespace
irec_graph = skos_prefLabel(irec_graph, top_concept_uid, "NUU_graph_root", namespace = ROOT)

### Add base antonyms

In [None]:
# We want to capture antonyms: dichotomy in meaning of words.
# For this we'll use NLTK's version of WordNet, which mainly captures antonyms for adjectives and adverbs.
# Maps a lemma name to the list of names of its antonyms.
wordnet_antonyms = {}
for synset in wn.all_synsets():
    if synset.pos() not in ('a', 's'):    # only adjective and satellite-adjective synsets
        continue
    for lemma in synset.lemmas():
        antonym_names = [antonym.name() for antonym in lemma.antonyms()]
        if not antonym_names:
            continue
        # The same lemma name can occur in several synsets, each with its own antonyms. Accumulate them
        # (without duplicates) rather than overwriting, otherwise only the last synset's antonyms survive.
        known_antonyms = wordnet_antonyms.setdefault(str(lemma.name()).strip(), [])
        for antonym_name in antonym_names:
            if antonym_name not in known_antonyms:
                known_antonyms.append(antonym_name)

# Example of a useful antonym for us
wordnet_antonyms['hot']

In [None]:
wordnet_antonyms['cold']

In [None]:
# Add every WordNet lemma and each of its antonyms as spans, and link them with the antonym relation.
for span, antonyms in wordnet_antonyms.items():
    span_uid, span_is_new = ua.assign_UID(span, SPANS)
    if span_is_new:
        # first time we see this text, so the span node still has to be created
        irec_graph = irec_span(irec_graph, span_uid, span)

    for antonym in antonyms:
        antonym_uid, antonym_is_new = ua.assign_UID(antonym, SPANS)
        if antonym_is_new:
            irec_graph = irec_span(irec_graph, antonym_uid, antonym)

        # link the span to its antonym
        irec_graph = irec_antonym(irec_graph, span_uid, antonym_uid)
       

### Add domain terms extracted from the Approved documents as Spans

In [None]:
# Load the previously extracted domain terms; the `with` block makes sure the file handle is closed again.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that come from a trusted source.
with open('data/domain_terms.pkl', 'rb') as domain_terms_file:
    domain_terms = pickle.load(domain_terms_file)

In [None]:
domain_terms

In [None]:
# Every extracted domain term becomes a span node, unless a span with that text exists already.
for span in domain_terms:
    span_uid, span_is_new = ua.assign_UID(span, SPANS)
    if not span_is_new:
        continue
    irec_graph = irec_span(irec_graph, span_uid, span)

### Add Acronyms that were grabbed from the text

These can help:
* remove terms where the boundary detection is off
* avoid suggesting similar acronyms, e.g., suggest that EPC and EPS are similar 

In [None]:
acronyms = {'PAS': ['ecification', 'Specification'],  'GSIUR': ['Regulations 1998'],  'HSE': ['Regulations 2000',   'water systems',   'Safety Executive',   'Health and Safety Executive'],  'PE': ['Polyethylene', 'polyethylene'],  'DN': ['pipe'],  'DCLG': ['land', 'Local Government', 'England', 'ment'],  'PP': ['Polypropylene'],  'BCB': ['Control Body',   'the building control body',   'Building control body',   'building control body',   'Building Control Body'],  'SRHRV': ['ventilator',   'single room heat recovery ventilator',   'a single room heat recovery ventilator'],  'MVHR': ['blocks', 'heat recovery'],  'WC': ['sets'],  'TFA': ['the total floor area'],  'LRV': ['Light reflectance value'],  'BER': ['Building CO2 Emission Rate', 'CO2 Emission Rate'],  'TER': ['CO2 Emission Rate',   'the Target CO2 Emission Rate',   'Target CO2 Emission Rate'],  'DER': ['CO2 Emission Rate', 'the Dwelling CO2 Emission Rate'],  'EPC': ['energy performance certificate'],  'TFEE': ['Target Fabric Energy Efficiency',   'Fixed building services',   'Energy Efficiency'],  'DHF': ['the Door and Hardware Federation', 'Door and Hardware Federation'],  'REI': ['fire resistance', 'bility'],  'PHE': ['horizontal evacuation'],  'W': ['the final exit', 'final exit'],  'DWELLINGS': ['RESIDENTIAL'],  'OTHER': ['RESIDENTIAL'],  'TSO': ['Office', 'The Stationery Office'],  'FPA': ['the Fire Protection Association', 'Association'],  'A': ['absorption area'],  'AT': ['absorption area'],  'DECC': ['Climate Change'],  'NCM': ['the National Calculation Methodology'],  'ADCAS': ['Allied Services'],  'DFEE': ['Energy Efficiency'],  'LPA': ['the local planning authority', 'planning authority'],  'UKAS': ['the United Kingdom Accreditation Service'],  'BSI': ['the British Standards Institution'],  'EA': ['Accreditation'],  'BGS': ['British Geological Survey'],  'HBN': ['Notes'],  'GGF': ['Glazing Federation'],  'E': ['terms of integrity'],  'TRADA': ['the Timber Research and Development Association', 
'Association'],  'ACOP': ['Code of Practice'],  'ATTMA': ['Association'],  'RVA': ['Association', 'the Residential Ventilation Association'],  'TEHVA': ['Association'],  'DSA': ['Association'],  'CIRIA': ['Association'],  'MCRMA': ['Association'],  'DSMA': ['Association'],  'OFTEC': ['Association'],  'WHO': ['Organisation'],  'GAI': ['Architectural Ironmongers'],  'MEV': ['mechanical extract', 'extract ventilation'],  'VST': ['Vicat softening temperature'],  'SCI': ['Guild Steel Construction Institute'],  'FBE': ['the Built Environment', 'ment'],  'DSER': ['Rating'],  'WER': ['Rating'],  'CIWM': ['ment', 'Wastes Management'],  'EOTA': ['ment'],  'GQRA': ['ment'],  'BRE': ['ment', 'the Building Research Establishment'],  'PPS': ['ment'],  'PSV': ['Passive stack ventilation'],  'EST': ['the Energy Saving Trust'],  'CIBSE': ['Ventilation hygiene toolkit', 'Building Services Engineers'],  'AGS': ['Geoenvironmental Specialists'],  'SPAB': ['Ancient Buildings'],  'UF': ['urea formaldehyde'],  'ODPM': ['the Deputy Prime Minister']}

In [None]:
# Add every acronym and the spans it was found with, and link them in both directions.
for acronym, spans in acronyms.items():

    acronym_uid, new_uid_bool = ua.assign_UID(acronym, SPANS)
    if new_uid_bool:
        # many of these will be in the graph already
        irec_graph = irec_span(irec_graph, acronym_uid, acronym)

    for span in spans:
        # Most of these are expected to be part of the graph already, but that is not guaranteed.
        # A span that is new here must be added now: its UID is registered by assign_UID, so no later
        # cell would create the node (type and label) for it anymore.
        span_uid, new_uid_bool = ua.assign_UID(span, SPANS)
        if new_uid_bool:
            irec_graph = irec_span(irec_graph, span_uid, span)

        # todo;
        #  could do some filtering here of the clearly erroneous span-acronym combinations
        #  or leave this until later, using the graph...

        if (SPANS[acronym_uid], IREC.isAcronymOf, SPANS[span_uid]) not in irec_graph:
            irec_graph = irec_is_acronym_of(irec_graph, acronym_uid, span_uid)
            irec_graph = irec_has_acronym(irec_graph, span_uid, acronym_uid)
            
    

### Add CONCEPTS: defined terms from the Approved Documents

In [None]:
# read the "Definitions" sheet from the Excel file; keep_default_na=False is passed so empty cells are not turned into NaN
definitions = pd.read_excel("data/Approved Documents and derived terms.xlsx", sheet_name="Definitions", keep_default_na=False)

In [None]:
definitions[:3]

In [None]:
# read the "Index terms" sheet from the same Excel file, with the same handling of empty cells
index_terms = pd.read_excel("data/Approved Documents and derived terms.xlsx", sheet_name="Index terms", keep_default_na=False)

In [None]:
# todo; below, perhaps check the existence of relations to avoid duplication, e.g.;
# if (SPANS[acronym_uid], IREC.isAcronymOf, SPANS[span_uid]) not in irec_graph:

In [None]:
# create graph from definitions first
# NOTE(review): `.iloc[1:]` skips the first data row of the sheet (read_excel already consumed the header) —
# confirm that this row does not hold a real term.
for i, row in definitions.iloc[1:].iterrows():
    # lowercase if not an abbreviation; strip whitespace, like is done for the index terms
    raw_term = row['Term'].strip()
    term = raw_term if raw_term.isupper() else raw_term.lower()
    alternative_labels = row['Alternative labels']
    definition = row['Definition']
    note = row['Note']

    # add the term
    concept_uid, new_uid = ua.assign_UID(term, CONCEPTS)
    if new_uid:
        irec_graph = skos_node(irec_graph, concept_uid, term)

    if note:
        irec_graph = skos_note(irec_graph, concept_uid, note)

    # always expecting a definition, but empty cells are read as '' and should not become an empty literal
    if definition:
        irec_graph = skos_definition(irec_graph, concept_uid, definition)
    else:
        print(f"No definition found for term: {term}")

    if alternative_labels:
        # lowercase if not an abbreviation
        alt_labels = [x.strip() if x.isupper() else x.lower().strip() for x in alternative_labels.split(", ")]

        for alt_label in alt_labels:
            if not alt_label:
                # e.g. caused by a trailing separator; an empty text should not become a span
                continue
            # treated like a span, that refers to the same concept <-- this is a deviation from previously
            alt_label_uid, new_uid = ua.assign_UID(alt_label, SPANS)
            if new_uid:
                irec_graph = irec_span(irec_graph, alt_label_uid, alt_label)

            irec_graph = skos_altLabel(irec_graph, concept_uid, alt_label_uid)

In [None]:
index_terms[:3]

In [None]:
# add triples from index terms / glossaries
# we will treat glossary terms like concepts, relations between their labels occur between the spans
# NOTE(review): `.iloc[1:]` skips the first data row of the sheet (read_excel already consumed the header) —
# confirm that this row does not hold a real term.
for i, row in index_terms.iloc[1:].iterrows():
    # lowercase if not an abbreviation
    term = row['Term'].strip() if row['Term'].isupper() else row['Term'].lower().strip()
    alternative_labels = row['AltLabel(s)']
    related_terms = row['Related terms']
    broader_term = row['Broader term']

    # add the term
    concept_uid, new_uid = ua.assign_UID(term, CONCEPTS)
    if new_uid:
        irec_graph = skos_node(irec_graph, concept_uid, term)
        # hacky visualisation solution; connect all terms to the graph root -> need to find a better solution
        irec_graph = add_top_concept(irec_graph, concept_uid)

    if alternative_labels:
        # lowercase if not an abbreviation
        alt_labels = [x.strip() if x.isupper() else x.lower().strip() for x in alternative_labels.split(", ")]

        for alt_label in alt_labels:
            if not alt_label:
                # an empty text should not become a span
                continue
            # treated like a span, that refers to the same concept <-- this is a deviation from previously
            alt_label_uid, new_uid = ua.assign_UID(alt_label, SPANS)
            if new_uid:
                irec_graph = irec_span(irec_graph, alt_label_uid, alt_label)

            irec_graph = skos_altLabel(irec_graph, concept_uid, alt_label_uid)

    if related_terms:
        rel_terms = [x.strip() if x.isupper() else x.lower().strip() for x in related_terms.split(", ")]
        for rel_term in rel_terms:
            if not rel_term:
                continue
            related_uid, new_uid = ua.assign_UID(rel_term, SPANS)
            if new_uid:
                irec_graph = irec_span(irec_graph, related_uid, rel_term)

            # the subject is the concept, so its UID has to be resolved in CONCEPTS; with the default
            # (SPANS) the relation would start from a span node that does not exist
            irec_graph = irec_related(irec_graph, concept_uid, related_uid, subject_namespace=CONCEPTS)

    if broader_term:
        # We expect 1 broader term at most currently, assuming we'd like a tree structure (DAG with 1 parent at most)
        b_term = broader_term.strip().lower() if not broader_term.isupper() else broader_term.strip()
        if b_term:
            b_term_uid, new_uid = ua.assign_UID(b_term, SPANS)
            if new_uid:
                irec_graph = irec_span(irec_graph, b_term_uid, b_term)

            # We do not expect that the broader term is necessarily a concept, although this is a feature we may rely on later
            irec_graph = skos_broader(irec_graph, concept_uid, b_term_uid, CONCEPTS, SPANS)

In [None]:
# irec_graph.serialize(destination="graph/approved_doc_terms_only.ttl")

### Print some insight in the graph so far

In [None]:
ua.count_nodes_in_namespace(SPANS)

In [None]:
ua.count_nodes_in_namespace(CONCEPTS)

In [None]:
ua.print_node_by_id(irec_graph, 291, CONCEPTS)

In [None]:
ua.print_node_by_id(irec_graph, urllib.parse.quote('sanitary accommodation'), SPANS)

In [None]:
ua.print_node_by_text(irec_graph, 'sanitary accommodation', SPANS)

### Grab wikipedia definitions for Concept nodes, and store locally for re-use
* We have previously annotated the relevance of all WikiData classes returned for the defined terms and index terms in the Approved Documents.

In [None]:
annotated_wikidata_classes_df = pd.read_csv("data/wiki_classes_annotated.csv", index_col=1)

In [None]:
annotated_wikidata_classes_df[:3]

In [None]:
# Look-up table from a single WikiData class UID to its class name, our annotation and the example spans.
wikiclass_dict = {}
for uid_list_string, class_annotations_examples in annotated_wikidata_classes_df.iterrows():
    # The index value is a string; its first and last two characters are dropped and the rest is split on ','.
    # NOTE(review): if this string holds several quoted UIDs, the pieces keep stray quotes/spaces — verify against the csv.
    for uid in uid_list_string[2:-2].split(','):
        wikiclass_dict[uid] = {
            field: class_annotations_examples[column]
            for field, column in (('Class', 'WikiData class'),
                                  ('Annotation', 'Annotation'),
                                  ('Example spans', 'Example spans'))
        }
    

* First, we try to grab all wiki definitions for all spans and concepts that are in the graph (so far)

In [None]:
# set up the SPARQL endpoint for wikidata

sparql_wrapper = SPARQLWrapper("https://query.wikidata.org/sparql")

In [None]:
def _escape_sparql_literal(text: str) -> str:
    """ Escapes the characters that would end or corrupt a double-quoted SPARQL string literal. """
    return (text.replace('\\', '\\\\')
                .replace('"', '\\"')
                .replace('\n', '\\n')
                .replace('\r', '\\r')
                .replace('\t', '\\t'))


def get_wiki_matches(graph_sparql_endpoint: "SPARQLWrapper",
                     jargon_term_and_uids: List):
    """
    Queries the SPARQL endpoint for entities whose English label is exactly equal to a term.

    :param graph_sparql_endpoint: endpoint wrapper; `setQuery`, `setReturnFormat` and `query().convert()` are used.
    :param jargon_term_and_uids: iterable of `(term, uid)` pairs; `uid` is the key used in the output.
    :return: dict `{uid: [match, ...]}`. Each match holds 'prefLabel' (the term), 'class_uid', 'class_label'
             (both '' when absent), 'WikiUID' and, only when the endpoint returned a description,
             'WikiDefinition'. Terms without any match, or for which the query failed, have no entry.
    """
    all_wiki_definitions = {}
    # we want to grab the term (subject), any definition (subjectDescription) and the class (subjectClass)
    sparql_q = """
               SELECT ?subject ?subjectDescription ?classUID ?className WHERE {
                  ?subject rdfs:label "$QUERY"@en.
                  ?subject wdt:P31|wdt:P279 ?classUID.
                  SERVICE wikibase:label { bd:serviceParam wikibase:language "en".}
                  ?classUID  rdfs:label ?className  FILTER(LANG(?className) = "en").
                }
               """

    # the return format does not change between queries
    graph_sparql_endpoint.setReturnFormat(JSON)

    for term, uid in tqdm(jargon_term_and_uids):
        # escape the term, so that quotes or backslashes in it cannot break (or alter) the query
        graph_sparql_endpoint.setQuery(sparql_q.replace("$QUERY", _escape_sparql_literal(term)))
        try:
            json_output = graph_sparql_endpoint.query().convert()
        except Exception as err:
            # `Exception` rather than a bare except, so that the run can still be interrupted.
            # On an error, wait 2s; One client is allowed 30 error queries per minute
            print(f"Error for query, you should check what's wrong with the term: {term} ({err})")
            time.sleep(2)
            continue

        # sometimes multiple Wiki UIDs for a single term, we grab them all here
        for v in json_output['results']['bindings']:
            if 'subject' not in v:
                # nothing to link to
                continue

            match = {'prefLabel': term,
                     'class_uid': v['classUID']['value'] if 'classUID' in v else "",
                     'class_label': v['className']['value'] if 'className' in v else "",
                     'WikiUID': v['subject']['value']}
            if 'subjectDescription' in v:
                match['WikiDefinition'] = v['subjectDescription']['value']
            # without a description we simply keep the wiki UID

            all_wiki_definitions.setdefault(uid, []).append(match)
    return all_wiki_definitions


In [None]:
# (text, uid) pairs for everything that has been assigned a UID so far, per namespace
concept_namespace_key = CONCEPTS.placeholder.defrag().__reduce__()[1][0]
span_namespace_key = SPANS.placeholder.defrag().__reduce__()[1][0]
concepts_and_uids = list(ua.UIDs[concept_namespace_key].items())
spans_and_uids = list(ua.UIDs[span_namespace_key].items())

In [None]:
# First run for the Concepts; the results are cached locally so the endpoint is only queried once
if not os.path.exists("data/concept_wiki_dict.json"):
    concept_wiki_dict = get_wiki_matches(sparql_wrapper, concepts_and_uids)#{'test': 1, 'conductor':2})
    with open("data/concept_wiki_dict.json", 'w') as f:
        json.dump(concept_wiki_dict, f, indent=2)
else:
    with open("data/concept_wiki_dict.json", 'r') as f:
        concept_wiki_dict = json.load(f)

# `wiki_dict` is still bound for backward compatibility, but the cell that handles the spans rebinds it;
# the concept matches therefore also live under their own name, so they are not lost.
wiki_dict = concept_wiki_dict

In [None]:
# Same for the spans: re-use the local cache when it exists, otherwise query the endpoint and create the cache
if os.path.exists("data/span_wiki_dict.json"):
    with open("data/span_wiki_dict.json", 'r') as f:
        wiki_dict = json.load(f)
else:
    wiki_dict = get_wiki_matches(sparql_wrapper, spans_and_uids)#{'test': 1, 'conductor':2})
    with open("data/span_wiki_dict.json", 'w') as f:
        json.dump(wiki_dict, f, indent=2)

In [None]:
print(len(wiki_dict))


TODO
* parse WikiData definitions and add spans!
* link definitions to spans (if span occurs, link),
* create concepts for the 500ish spans that occur in Uniclass
* compute span properties 
  * constitutes; x occurs in y, thus y might be an extended phrase for x and perhaps a subclass, or x may be a material property, and so on
  * morphological similarity, x may be an inflection of y
  * semantic similarity, x and y might be alternative labels or have the same superclass


In [None]:
# - create a new node for the wikipedia term, with prefLabel the term
# NOTE(review): `expanded_dict`, `mygraph` and `add_wiki_exact_match` are not defined in any active cell that is
# visible here (the latter only exists as commented-out code) — this cell cannot run until they are.
for uid, wiki_def_dict_list in expanded_dict.items():
    if not wiki_def_dict_list:
        # nothing was found for this uid
        continue

    # we will only take into account the 1st definition for now!
    idx = 0
    d = wiki_def_dict_list[idx]

    term = d['prefLabel']
    wiki_uid = d['WikiUID'].rsplit('/',1)[1]
    if 'WikiDefinition' in d:
        # read the definition before testing it; the test used to reference an undefined `wiki_definition`
        definition = d['WikiDefinition']
        if definition == "Wikimedia disambiguation page":
            # skip disambiguation pages in general
            continue

        spar_objects = d['WikiDef_terms']
        mygraph = add_wiki_exact_match(mygraph, term, uid, wiki_uid, definition, spar_objects)
    else:
        print(f"Term with wiki exact match, but without definition: {term}")
        mygraph = add_wiki_exact_match(mygraph, term, uid, wiki_uid)

In [None]:
# def add_wiki_exact_match(graph, term, mygraph_uid, wiki_uid, wiki_definition=None, spar_objects=None):
#     graph.add((EX[mygraph_uid], SKOS.exactMatch, WIKI[wiki_uid]))
#     graph.add((WIKI[wiki_uid], SKOS.prefLabel, Literal(term, lang='en')))
#     if wiki_definition:
#         graph.add((WIKI[wiki_uid], SKOS.definition, Literal(wiki_definition, lang='en')))
#     if spar_objects:
#         for obj in spar_objects:
#             # UIDs for these terms are assigned within our EXAMPLE namespace
#             def_term_uid, new_uid_bool = ua.assign_UID(def_term)
#             if new_uid_bool:
#                 mygraph = add_prefLabel(mygraph, def_term_uid, def_term)
            
#             graph.add((WIKI[wiki_uid], SKOS.related, EX[def_term_uid]))
#             graph.add((EX[def_term_uid], SKOS.related, WIKI[wiki_uid]))
            
#     return graph

### Step 1: create a graph from the approved documents terms file

### Step 3: parse all definitions, in order to identify new nodes and links between nodes
* we won't re-define the new nodes (again), because we assume that this would cause too much drift?
* should check to make sure that that's the case

In [None]:
# todo; change to a local parser, see term extraction notebook

def parse_definition(full_definition):
    """
    Splits a definition into sentences, sends each sentence to the locally running object-prediction
    service and collects the cleaned objects that were predicted.

    :param full_definition: text of the definition; may consist of several sentences.
    :return: list with the cleaned objects of all sentences, in order of occurrence. Sentences for which the
             service response holds no usable prediction contribute nothing.
    """
    identified_objects = []
    for sentence in [str(sent) for sent in TextBlob(full_definition).sentences]:
        encoded_def = urllib.parse.quote(sentence)
        # NOTE(review): quote() leaves '/' unescaped, so a sentence containing '/' changes the request path — confirm the service copes.
        spar_output = requests.get(f"http://localhost:8000/predict_objects/{encoded_def}").json()
        try:
            spar_objects = spar_output["prediction"]['obj']
            identified_objects += util.custom_cleaning_rules(spar_objects)
        except Exception:
            # best effort: skip sentences without a usable prediction. `Exception` instead of a bare
            # except, so KeyboardInterrupt/SystemExit are no longer swallowed and a long run can be stopped.
            continue
    return identified_objects

In [None]:
# NOTE(review): `complete_dict` is not defined in any cell that is visible here — confirm where it comes from.
for k, v in complete_dict.items():
    # run spar.txt on every sentence of the definition and store the cleaned objects.
    # parse_definition does exactly that and adds the objects of a sentence once; the loop that used to be
    # here added the complete list of objects once for every object in it, duplicating all def_terms.
    complete_dict[k]['def_terms'] = parse_definition(v['definition'])

In [None]:
# approved_documents: the objects found in the definition of every defined term
# NOTE(review): all rows are used here, whereas the graph was built from `definitions.iloc[1:]` — confirm which is intended.
object_dict = {}
for idx, row in definitions.iterrows():
    # access the columns by name; unpacking the row positionally only works for exactly four columns,
    # and only when they are in the assumed order
    term = row['Term']
    definition = row['Definition']
    objects = parse_definition(definition)

    object_dict[term] = objects

In [None]:

#     term = term if term.isupper() else term
    
#     for obj in objects:
#         obj = obj if obj.isupper() else obj
#         # add the object to the graph 
#         obj_uid, new_uid = ua.assign_UID(obj)
#         if new_uid:
#             mygraph = add_prefLabel(mygraph, obj_uid, obj)
        
#         # add relation between term and mygraph
            
      # ====      
#     def add_related(graph, jargon_uid, other_term_uid, jargon_namespace: Namespace=JARGON, related_namespace: Namespace=JARGON):
#     graph.add((jargon_namespace[jargon_uid], SKOS.related, related_namespace[other_term_uid]))
#     graph.add((related_namespace[other_term_uid], SKOS.related, jargon_namespace[jargon_uid]))
#     return graph

### Step 4: Add relevant Wikipedia nodes to the graph

In [None]:
# TODO merge same definition, use rdfs:a 