In [1]:
from typing import Union, List, Any, Optional, Dict

import os
import re
import time
import json
import glob
import pickle
import random

from tqdm import tqdm
from pathlib import Path
from textblob import TextBlob

from utilities import cleaning_utils
from utilities.customdocument import CustomDocument

### Grab acronyms from text (naive SPaR based approach)

In [2]:
# load our domain terms and foreground corpus
file_names_foreground_corpus = ["merged_approved.json"]
corpus_fp = Path.cwd().joinpath("data", "converted_documents")
foreground_corpus = [CustomDocument.load_document(corpus_fp.joinpath(file_name)) for file_name in file_names_foreground_corpus]

In [3]:
graph_data_fp = Path("data/graph_data/")
domain_terms = pickle.load(open(graph_data_fp.joinpath('domain_terms.pkl'), 'rb'))

In [4]:
# Grab the potential acronyms for spans
span_acronym_dict = {}
for span in tqdm(domain_terms):
    span_acronym_dict[span] = []
    for content in foreground_corpus[0].all_contents:
        if span in content.text:
            # now we start looking for 
            text_splits = content.text.split(span)
            for subsequent_text in text_splits[1:]:
                potential_abbrev = re.match(r"^\s+\([A-Z]+\)", subsequent_text)
                if potential_abbrev:
                    potential_abbrev = potential_abbrev.group(0).split('(', 1)[1][:-1]
                    span_acronym_dict[span].append(potential_abbrev)

    span_acronym_dict[span] = list(set(span_acronym_dict[span]))

100%|██████████████████████████████████████| 5332/5332 [00:07<00:00, 683.76it/s]


In [5]:
# We'll filter out some of the lower quality acronyms
acronym_dict = {}
for span, acronym_list in span_acronym_dict.items():
    if acronym_list:
        acronym_list = list(set(acronym_list))
        for acronym in acronym_list:
            if acronym not in acronym_dict:
                acronym_dict[acronym] = [span]
            else:
                acronym_dict[acronym].append(span) 

In [6]:
print("Some examples of acronyms extracted automatically")
[print(f"{k}: {acronym_dict[k]}")  for k in random.sample(list(acronym_dict.keys()), 10)]
print("We'll disregard the low quality of acronyms for now")

Some examples of acronyms extracted automatically
BRE: ['Building Research Establishment']
LEV: ['Local Exhaust Ventilation', 'Ventilation']
BREL: ['Part L']
N: ['people']
DHW: ['domestic hot water', 'hot water']
REI: ['fire resistance', 'stability']
SEER: ['ratio', 'seasonal energy efficiency ratio']
VST: ['temperature']
FLA: ['Football Licensing Authority']
BRUKL: ['Part L']
We'll disregard the low quality of acronyms for now


* As we can see above, there are still many domain-specific patterns that SPaR.txt should be taught to recognise

In [7]:
def is_acronym(acronym: str, span: str):
    capitals = [c for c in acronym]
    words = [str(w) for w in TextBlob(span).words]
    
    for capital, word, in zip(capitals, words):
        if capital != word[0]:
            return False
    return True

In [8]:
# We'll filter out some of the lower quality acronyms
acronym_dict = {}
for span, acronym_list in span_acronym_dict.items():
    if acronym_list:
        acronym_list = list(set(acronym_list))
        for acronym in acronym_list:
            if is_acronym(acronym, span):
                if acronym not in acronym_dict:
                    acronym_dict[acronym] = [span]
                else:
                    acronym_dict[acronym].append(span)
                
print("Some examples of acronyms extracted automatically")
[print(f"{k}: {acronym_dict[k]}")  for k in random.sample(list(acronym_dict.keys()), 5)]            
print("Total number of cleaned acronyms found: ", len(acronym_dict))

Some examples of acronyms extracted automatically
WRAS: ['Water Regulations Advisory Scheme']
BRE: ['Building Research Establishment']
BCB: ['Body', 'Building Control Body']
FLA: ['Football Licensing Authority']
UKAS: ['United Kingdom Accreditation Service']
Total number of cleaned acronyms found:  11


In [9]:
graph_data_fp = Path.cwd().joinpath("data", "graph_data")
with open(graph_data_fp.joinpath("acronyms_found_in_text.pkl"), 'wb') as f:
    pickle.dump(acronym_dict, f)

### Prepare Uniclass terms that occur in the Merged Approved Documents

<div class="alert alert-block alert-info">
We read Uniclass terms from a .ttl file that we have previoulsy prepared. Could switch to grabbing them from .csv as well.
</div>

Source of Uniclass 2015 as .csv file: [https://buildig.com/uniclass-2015/](https://buildig.com/uniclass-2015/)

In [10]:
def group_ttl_lines(text):
    groups = []
    current_group = []
    for idx, line in enumerate(text.split("\n")):
        if line == '':
            if current_group:
                groups.append(current_group)
            current_group = []
        else:
            current_group.append(line)
            if idx+1 == len(text.split("\n")):
                groups.append(current_group)
    return groups

In [11]:
def grab_uids_and_labels_with_definition(groups):
    uid_dict = {}
    for g in groups:
        if any([line.startswith('  skos:prefLabel') for line in g]):
            # only use group if a prefLabel exists
            pref_label = ''
            alt_labels = []
            definition = ''
            for line in g:
                if line.startswith('  skos:prefLabel'):
                    pref_label = line.split('"')[1]
                elif line.startswith('  skos:altLabel'):
                    labels = line.split('"')[1::2]
                    alt_labels += labels
                elif line.startswith('  skos:definition'):
                    definition = line.split('"')[1]

            if pref_label:
                uid = g[0].split()[0].split(":")[1]
                uid_dict[uid] = {'pref_label': pref_label, 
                                 'alt_labels': alt_labels,
                                 'definition': definition
                                }
    return uid_dict


In [12]:
def grab_nodes(vocab_name):
    processed_file = Path(f"{vocab_name}.json")
    
    # check if file as processed before 
    NODES_LOADED = False
    if processed_file.exists():
        with open(processed_file) as f:
            graph_dict = json.load(f)
            
        NODES_LOADED = True
        print(f"Loaded nodes and neighbours for: {vocab_name}") 
    else: 
        print(f"Will have to grab nodes for: {vocab_name}")
    
        # compute the neighbours for each node
        graph_dict = {}
        
        print(f"Working on file: {vocab_name}")
        with open(vocab_name, 'r') as f:
            text =  f.read()
            
        groups = group_ttl_lines(text)
        print("Collecting nodes with definitions from dict")
        graph_dict = grab_uids_and_labels_with_definition(groups)
        
        # save the dictionary somewhere for reloading
        with open(processed_file, 'w') as f:
            json.dump(graph_dict, f)
    return graph_dict

In [13]:
uniclass_dict = grab_nodes(Path.cwd().joinpath("data", "term_extraction_input", "uniclass_2015.ttl"))

Loaded nodes and neighbours for: /Users/rubenkruiper/dev/irec/data/term_extraction_input/uniclass_2015.ttl


In [14]:
len(uniclass_dict)

15020

* Provide some insight in the number of uniclass terms found in the Merged Approved Documents
  * We will lowercase the terms, otherwise it is very unlikely that classes are found verbatim

In [15]:
uniclass_terms_in_text = {}
for doc in foreground_corpus:
    for content in tqdm(doc.all_contents):
        for uid, uiv in uniclass_dict.items():
            uterm = uiv['pref_label']
            if uterm.lower() in content.text.lower():  # lowercase everything to increase likelihood
                uniclass_terms_in_text[uid] = uiv


100%|███████████████████████████████████████| 1455/1455 [00:41<00:00, 35.13it/s]


In [16]:
percentage = (len(uniclass_terms_in_text)/len(uniclass_dict)) * 100
print("Number of Uniclass terms found in the Merged Approved Documents: {} ({:.2f}%)".format(len(uniclass_terms_in_text), percentage))
print("Examples of Uniclass terms found in the Merged Approved Documents:")
random.sample([x['pref_label'] for x in uniclass_terms_in_text.values()], 10)

Number of Uniclass terms found in the Merged Approved Documents: 598 (3.98%)
Examples of Uniclass terms found in the Merged Approved Documents:


['Standards',
 'Thin',
 'Stairs',
 'Pedestrian routes',
 'Water heaters',
 'Landings',
 'Garages',
 'Beds',
 'Systems engineer',
 'Escalators']

* store the Uniclass terms that we found in the foreground corpus, we want to add them to our graph 

In [17]:
graph_data_fp = Path.cwd().joinpath("data", "graph_data")
with open(graph_data_fp.joinpath("uniclass_terms_in_text.pkl"), 'wb') as f:
    pickle.dump(uniclass_terms_in_text, f)

In [18]:
uniclass_mwes = []
uniclass_single = [{}]
for uid, uiv in uniclass_dict.items():
    uterm = uiv['pref_label']
    if len(uterm.split(' ')) > 1:  # lowercase everything to increase likelihood
        uniclass_mwes.append(uterm)
    else:
        uniclass_single.append(uterm)
        

In [19]:
# some insight in number of MWEs in Uniclass
print("Number of MWEs: {} ({:.2f}%)".format(len(uniclass_mwes), (len(uniclass_mwes)/len(uniclass_mwes+uniclass_single))*100))
random.sample(uniclass_mwes, 10)

Number of MWEs: 14100 (93.87%)


['Gas pressure switches',
 'Ductwork installing',
 'Ceremonial worship activities',
 'Storm water gravity drainage systems',
 'Gas waste collection spaces',
 'Bollards and impact protectors',
 'Water skiing courses',
 'Glass-to-glass clips',
 'Solid waste disposal products',
 'Render stops']