In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime as dt
from datetime import time
from dateutil.relativedelta import relativedelta
from datetime import timedelta
import regex as re
import spacy
from spacy.matcher import Matcher
from spacy.util import filter_spans
from spacy.tokenizer import Tokenizer
from spacy.pipeline import EntityRuler
import re
from spacy.tokens import Span
from spacy.util import filter_spans
from spacy.training.example import Example
from spacy.language import Language
from sklearn.model_selection import train_test_split
from matplotlib import pyplot
import json
import random

In [33]:
from matplotlib import pyplot


In [34]:
# Load the small English pipeline; the Matcher, tokenizer handle and stop-word
# set below are all taken from this pipeline object.
# NOTE: `nlp` is rebound further down to the model returned by train_spacy(),
# so cells after that point no longer use en_core_web_sm.
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
tokenizer = nlp.tokenizer
all_stopwords = nlp.Defaults.stop_words




In [35]:
# df: records with a 'job_notes' text column (cleaned below, then split into train/test).
# equip: its first column is used as keyword patterns for the NEW EQUIPMENT entity ruler.
df = pd.read_csv('merged_cats.csv')
equip = pd.read_csv('Equipment.csv')

In [36]:
#cleaning up the dataframe

def clean_notes(data, col_name):
    """Expand shorthand used in a free-text notes column.

    An abbreviation is only expanded when it is not glued to another letter,
    so ordinary words that merely contain it are left alone ("welcome"
    contains "lc", "suggested" contains "sugg"). Digits may still touch the
    abbreviation, so "lc12" becomes "lamp column12".

    Args:
        data: DataFrame holding the notes. It is modified in place.
        col_name: Name of the string column to clean.

    Returns:
        The same DataFrame object, with ``col_name`` rewritten.
    """
    expansions = {
        'lc': 'lamp column',
        'l/c': 'lamp column',
        'sugg': 'suggested',
        'rreturn': 'return',
        'o/s': 'outside',
    }
    for abbreviation, expansion in expansions.items():
        # Letter-only lookarounds stop the match inside longer words.
        # regex=True is explicit because the pandas default for
        # Series.str.replace differs between major versions.
        standalone = r'(?<![A-Za-z])' + re.escape(abbreviation) + r'(?![A-Za-z])'
        data[col_name] = data[col_name].str.replace(standalone, expansion, regex=True)

    return data

In [37]:
# Expand note shorthand (lc, l/c, sugg, ...) in the job_notes column before any NLP runs.
df = clean_notes(df, 'job_notes')

In [38]:
#checks if a string matches a regex pattern

def regex_checker(text, pattern):
    """Print each non-overlapping match of ``pattern`` found in ``text``.

    Debugging aid: every match object goes to stdout on its own line and
    nothing is returned.
    """
    compiled = re.compile(pattern)
    for hit in compiled.finditer(text):
        print (hit)


In [39]:
#load json file
def load_data(file):
    """Read the UTF-8 encoded JSON document at ``file`` and return the decoded object."""
    with open(file, mode="r", encoding="utf-8") as handle:
        return json.loads(handle.read())



In [40]:
#save data to a json file
def save_data(file, data):
    """Write ``data`` to ``file`` as JSON with 4-space indentation (UTF-8, overwriting)."""
    with open(file, mode="w", encoding="utf-8") as out_file:
        json.dump(data, out_file, indent=4)

In [41]:
#transforms the data into a json structure
#The counter counts the number of unique entities in the data

def structure_data(text, doc_param, counter):
    """Convert a processed document into a ``[text, {"entities": [...]}]`` record.

    Entities whose label is in the skip list are left out. ``counter`` is
    updated in place with one increment per kept entity, keyed by label.

    Returns:
        ``(record, counter)`` where ``record`` is ``None`` when no entity was
        kept, otherwise ``[text, {"entities": [(start_char, end_char, label), ...]}]``.
    """
    skipped_labels = ['GPE', 'DATE', 'ADDRESS', 'ORG', 'PERSON', 'FAC', 'TIME', 'NORP', 'MONEY', 'LOC']
    kept = [ent for ent in doc_param.ents if ent.label_ not in skipped_labels]

    for ent in kept:
        counter[ent.label_] = counter.get(ent.label_, 0) + 1

    if not kept:
        return None, counter

    offsets = [(ent.start_char, ent.end_char, ent.label_) for ent in kept]
    return [text, {"entities": offsets}], counter
    


In [42]:
#to remove the QUANTITY entity from the list of default entities

@Language.component("entity_removal")
def entity_removal(doc):
    """Pipeline component that drops every QUANTITY entity from ``doc.ents``.

    Args:
        doc: The document being processed by the pipeline.

    Returns:
        The same document, with ``doc.ents`` reassigned to the remaining entities.
    """
    # Build a fresh tuple instead of removing from the list being iterated:
    # in-place removal skips the element after each removed one, so the second
    # of two adjacent QUANTITY entities used to survive. The per-entity debug
    # print was also dropped so the component is silent.
    doc.ents = tuple(ent for ent in doc.ents if ent.label_ != 'QUANTITY')
    return doc


# Redundant with the decorator above (which already registers the component);
# kept so the cell's behaviour and displayed value stay the same.
Language.component("entity_removal", func=entity_removal)


<function __main__.entity_removal(doc)>

In [43]:
#training the model

def train_spacy(data, iterations):
    """Train a blank English pipeline containing a single NER component.

    Args:
        data: Sequence of ``[text, {"entities": [[start, end, label], ...]}]``
            records. The sequence itself is not modified.
        iterations: Number of full passes over ``data``.

    Returns:
        The trained pipeline, with the "entity_removal" component placed
        before "ner".
    """
    # Shuffle a private copy so the caller's list keeps its order.
    train_data = list(data)

    nlp = spacy.blank("en")
    # A blank pipeline never has "ner" yet, so add it unconditionally; this
    # also guarantees `ner` is always bound.
    ner = nlp.add_pipe("ner", last=True)
    for _, annotations in train_data:
        for ent in annotations.get("entities", []):
            ner.add_label(ent[2])

    nlp.add_pipe("entity_removal", before="ner")

    # Only the NER component takes part in training; everything else is
    # re-enabled when the block exits.
    with nlp.select_pipes(enable="ner"):
        # initialize() is the spaCy v3 replacement for the deprecated begin_training().
        optimizer = nlp.initialize()
        for itn in range(iterations):
            print ("Starting iteration " + str(itn))
            random.shuffle(train_data)
            losses = {}
            for text, annotations in train_data:
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                # Update the model one example at a time, as before.
                nlp.update([example], losses=losses, drop=0.2, sgd=optimizer)
            print (losses)
    return nlp

In [44]:
#a csv of keywords needed to be labelled as the 'NEW EQUIPMENT' entity if found in the data

# Entity ruler runs ahead of the statistical NER; every row of the equipment
# sheet contributes its first-column value as a NEW EQUIPMENT pattern.
ruler = nlp.add_pipe("entity_ruler", before="ner")
equipment_patterns = [
    {"label": "NEW EQUIPMENT", "pattern": equip.iloc[row, 0]}
    for row in range(len(equip))
]
ruler.add_patterns(equipment_patterns)

In [45]:
#patterns to identify new entities
# Each group maps an entity label to the regexes that detect it in raw note text.
# All patterns are raw strings: in a normal string "\b" is a backspace
# character, which made the "replace <number> ..." pattern unmatchable, and
# "\d", "\s", "\w" and "\." are invalid escape sequences.
patterns = [
    {"label": "ADDRESS", "pattern": [
        r" ?outside \d+ ?",
        r" ?\d{1,3} [A-Za-z]+ road\.?",
        r" ?\d{1,3} [A-Za-z]+ rd\.?",
        r" ?\d{1,3} [A-Za-z]+ hill\.?",
        r" ?\d{1,3} [A-Za-z]+ Hill\.?",
        r" ?\d{1,3} st\.? [A-Za-z]+",
        r" ?\d{1,3} saint\.? [A-Za-z]+",
        r"\s\d{1}[A-Za-z]{2}\s|\s\d{1}[A-Za-z]{2}$|^\d{1}[A-Za-z]{2}\s",
    ]},
    {"label": "COLUMN MEASUREMENT", "pattern": [
        r"(^ ?\d+(\.)?\d*n?m (column)?)|(^ ?\d+(\.)?\d*N?M (column)?)",
        r"(^ ?\d+(\.)?\d* (column)? ?)|(^ ?\d+(\.)?\d* meter ?[A-Za-z]*)",
    ]},
    {"label": "PHONE NUMBER", "pattern": [r" ?\d{3} ?\d{7,8}"]},
    {"label": "DAY", "pattern": [
        r" ?(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday|monday|tuesday|wednesday|thursday|friday|saturday|sunday) ?",
    ]},
    {"label": "NEW EQUIPMENT", "pattern": [r"supply and install \w* columns? ?"]},
    {"label": "TEMPERATURE", "pattern": [r"( ?\d{1,4}k ?)|( ?\d{1,4}K ?)"]},
    {"label": "POWER", "pattern": [r"( ?\d{1,4}w)|( ?\d{1,4}W)"]},
    {"label": "LANTERN FEATURE", "pattern": [r" ?\d{1,3} led lantern ?"]},
    {"label": "REFERENCE", "pattern": [r"(lamp)? ?column \d{1,4} ?"]},
    {"label": "JOB NUMBER", "pattern": [
        r"( ?job( number)?| ?quote( number)?| ?quotation( number)?) ?\d+ ?",
    ]},
    {"label": "CUSTOM_QUANTITY", "pattern": [
        r" ?remove (one|two|three|four|five|six|seven|eight|nine) [a-z]+ ?[a-z]* ?",
        r" ?replace (one|two|three|four|five|six|seven|eight|nine) [a-z]+\b ?[a-z]*\b ?",
        r" ?[a-z]* ?[a-z]* damaged x\d{1,3} ?[a-z]* ?",
        r" ?x ?\d{1,4} ?[a-z]* ?",
        r" ?x?\d{1,4} illuminated (sign)? posts? ?",
        r" ?[a-z]* ?[a-z]* \d* x ?\d{1,3} ?[a-z]* ?",
        r" ?supply (one|two|three|four|five|six|seven|eight|nine) [a-z]+ ?[a-z]* ?",
        r" ?\d{1,4} ?[a-z]+ to upgrade$ ?",
    ]},
]
                                              

In [46]:
#splitting the dataset
# A fixed random_state makes the 70/30 partition identical on every
# Restart & Run All, so rows do not move between train and test across runs.
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)


In [47]:
# Inspect the first pattern group (ADDRESS) to check how the regex strings were stored.
patterns[0]

{'label': 'ADDRESS',
 'pattern': [' ?outside \\d+ ?',
  ' ?\\d{1,3} [A-Za-z]+ road\\.?',
  ' ?\\d{1,3} [A-Za-z]+ rd\\.?',
  ' ?\\d{1,3} [A-Za-z]+ hill\\.?',
  ' ?\\d{1,3} [A-Za-z]+ Hill\\.?',
  ' ?\\d{1,3} st\\.? [A-Za-z]+',
  ' ?\\d{1,3} saint\\.? [A-Za-z]+',
  '\\s\\d{1}[A-Za-z]{2}\\s|\\s\\d{1}[A-Za-z]{2}$|^\\d{1}[A-Za-z]{2}\\s']}

In [48]:
entity_text = []   # cumulative (entity text, label) pairs over all training notes
train = []         # [text, {"entities": [(start_char, end_char, label), ...]}] records
counter = {}       # label -> number of kept entities, maintained by structure_data

entity_json = []   # per-note [text, {"entities": [(entity text, label), ...]}] records

for i in range(len(train_df)):
    # Column 3 is assumed to hold the note text (job_notes) — TODO confirm.
    note = train_df.iloc[i, 3]
    doc = nlp(note)

    # Start from the pipeline's own entities minus QUANTITY. Filtering into a
    # new list avoids removing from a list while iterating over it, which
    # skipped the entity following each removed one.
    candidate_ents = [ent for ent in doc.ents if ent.label_ != 'QUANTITY']

    # Add regex-derived multi-word entities. Each span gets the label of the
    # group whose pattern produced it; previously matches were accumulated
    # across groups and re-added under every later group's label.
    for group in patterns:
        for pattern in group['pattern']:
            for match in re.finditer(pattern, doc.text):
                start, end = match.span()
                span = doc.char_span(start, end, label=group['label'],
                                     alignment_mode='expand')
                if span is not None:
                    candidate_ents.append(span)

    # Resolve overlaps (longest span wins, earlier candidates win ties).
    doc.ents = filter_spans(candidate_ents)

    # Per-note list: the old code appended to one shared list, so every saved
    # record pointed at the same ever-growing list of all entities.
    note_entity_text = [(ent.text, ent.label_) for ent in doc.ents]
    entity_text.extend(note_entity_text)
    if note_entity_text:
        entity_json.append([note, {'entities': note_entity_text}])

    results, counter = structure_data(note, doc, counter)
    if results is not None:
        train.append(results)
        


    

In [49]:
# Number of notes that yielded at least one trainable entity, then persist them
# as [text, {"entities": [(start_char, end_char, label), ...]}] records.
print (len(train))
save_data("training_data.json", train)


300


In [50]:
# Companion file that stores each entity's text and label instead of character offsets.
save_data("labels_training_data.json", entity_json)


In [51]:
def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Trimming never moves the start past the end. A span that contains only
    whitespace is dropped rather than emitted with ``start > end``, and an end
    offset beyond the text is clamped to the text length instead of raising
    ``IndexError``.

    Args:
        data (list): The data to be cleaned in spaCy JSON format, i.e.
            ``[text, {'entities': [[start, end, label], ...]}]`` records.

    Returns:
        list: The cleaned data, in the same format.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        valid_entities = []
        for start, end, label in annotations['entities']:
            valid_start = start
            valid_end = min(end, len(text))
            while valid_start < valid_end and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            while valid_end > valid_start and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            # Nothing but whitespace was annotated: there is no usable span.
            if valid_start < valid_end:
                valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])

    return cleaned_data

In [52]:
# Reload the saved annotations, trim whitespace from the span edges and train for 20 passes.
# NOTE: this rebinds the global `nlp` from the en_core_web_sm pipeline to the
# model returned by train_spacy, so every later cell that calls `nlp` uses the
# newly trained NER. train_spacy also shuffles the data on every iteration.
TRAIN_DATA = load_data("training_data.json")
TRAIN_DATA = trim_entity_spans(TRAIN_DATA)
random.shuffle(TRAIN_DATA)
nlp = train_spacy(TRAIN_DATA, 20)



Starting iteration 0
{'ner': 1647.1958266718552}
Starting iteration 1
{'ner': 676.78769595937}
Starting iteration 2
{'ner': 519.2003353200348}
Starting iteration 3
{'ner': 438.5287167724247}
Starting iteration 4
{'ner': 360.2200547630951}
Starting iteration 5
{'ner': 351.8670820593295}
Starting iteration 6
{'ner': 300.7903429463749}
Starting iteration 7
{'ner': 352.4327928516401}
Starting iteration 8
{'ner': 259.4880218898682}
Starting iteration 9
{'ner': 267.04299725110644}
Starting iteration 10
{'ner': 251.27171776720024}
Starting iteration 11
{'ner': 214.40712465301996}
Starting iteration 12
{'ner': 181.4726970639915}
Starting iteration 13
{'ner': 203.97288683493335}
Starting iteration 14
{'ner': 172.49316962409452}
Starting iteration 15
{'ner': 154.37017681258186}
Starting iteration 16
{'ner': 166.73938198865687}
Starting iteration 17
{'ner': 171.53055568468102}
Starting iteration 18
{'ner': 123.63836893908147}
Starting iteration 19
{'ner': 192.15249339831195}


In [53]:
#generating the test set from the trained model
# Run the current `nlp` over each held-out note and keep only the notes in
# which at least one entity was predicted.
small_entities = []
for row in range(len(test_df)):
    note = test_df.iloc[row, 3]
    predicted = [(ent.text, ent.label_) for ent in nlp(note).ents]
    if predicted:
        small_entities.append([note, {'entities': predicted}])

In [54]:
# Persist the predictions on the held-out notes as [text, {"entities": [(entity text, label), ...]}].
save_data("test_results.json", small_entities)

### Evaluation

In [55]:
def load_data(file):
    """Return the object decoded from the UTF-8 JSON file ``file``.

    Note: this repeats the ``load_data`` defined near the top of the notebook
    and replaces it from this cell onward.
    """
    with open(file, mode="r", encoding="utf-8") as source:
        parsed = json.load(source)
    return parsed

def write_data(file, data):
    """Dump ``data`` to ``file`` as 4-space-indented JSON (UTF-8, overwriting).

    Same behaviour as ``save_data`` defined earlier in the notebook.
    """
    with open(file, mode="w", encoding="utf-8") as sink:
        json.dump(data, sink, indent=4)

In [56]:
# Evaluation records in [text, {"entities": [[start, end, label], ...]}] form;
# they serve below as the reference annotations. Print one as a sanity check.
docs = load_data("real_test_data.json")
print (docs[10])

['suppy operative for filming on 4th march 2021 btwn 4pm, and midnight   confirmation of work completed', {'entities': [[31, 34, 'ORDINAL']]}]


In [57]:
from spacy.training import offsets_to_biluo_tags
def get_cleaned_label(label: str):
    """Strip the position prefix from a BILUO tag, leaving the entity label.

    Examples: ``"B-POWER"`` -> ``"POWER"``, ``"O"`` -> ``"O"``,
    ``"I-E-MAIL"`` -> ``"E-MAIL"``. A bare ``"-"`` tag becomes ``""``.

    Args:
        label: A tag such as ``"U-DAY"`` or ``"O"``.

    Returns:
        Everything after the first hyphen, or ``label`` unchanged when it
        contains no hyphen.
    """
    if "-" in label:
        # Split on the first hyphen only, so labels that themselves contain a
        # hyphen are no longer truncated to their first segment.
        return label.split("-", 1)[1]
    return label
    
def create_total_target_vector(docs):
    """Build one flat list of per-token reference labels for all of ``docs``.

    Each record is printed, tokenised with ``nlp.make_doc`` and its entity
    offsets converted to BILUO tags; the position prefix of every tag is then
    stripped with ``get_cleaned_label``.
    """
    target_vector = []
    for record in docs:
        print (record)
        tokenised = nlp.make_doc(record[0])
        reference_offsets = record[1]["entities"]
        tags = offsets_to_biluo_tags(tokenised, reference_offsets)
        target_vector.extend([get_cleaned_label(tag) for tag in tags])
    return target_vector

In [58]:
def create_prediction_vector(text):
    """Return the predicted per-token labels for ``text`` with BILUO prefixes stripped."""
    return list(map(get_cleaned_label, get_all_ner_predictions(text)))

def create_total_prediction_vector(docs: list):
    """Concatenate the predicted per-token labels of every record's text, in order."""
    return [
        label
        for record in docs
        for label in create_prediction_vector(record[0])
    ]

def get_all_ner_predictions(text):
    """Run ``nlp`` on ``text`` and return its entities as per-token BILUO tags."""
    parsed = nlp(text)
    predicted_offsets = []
    for ent in parsed.ents:
        predicted_offsets.append((ent.start_char, ent.end_char, ent.label_))
    return offsets_to_biluo_tags(parsed, predicted_offsets)


In [59]:
def get_model_labels():
    """Return the NER component's labels plus the outside tag "O", sorted."""
    return sorted([*nlp.get_pipe("ner").labels, "O"])
def get_dataset_labels():
    """Return the sorted distinct labels that occur in the reference annotations of ``docs``."""
    distinct_labels = set(create_total_target_vector(docs))
    return sorted(distinct_labels)

In [71]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
def generate_confusion_matrix(docs):
    """Print a classification report and return the token-level confusion matrix.

    The class list is the sorted set of labels present in the reference
    annotations. Rows and columns of the returned matrix follow that order,
    which is what ``get_dataset_labels()`` yields for the same ``docs``.

    Args:
        docs: Records of the form ``[text, {"entities": [...]}]``.

    Returns:
        The matrix produced by ``sklearn.metrics.confusion_matrix``.
    """
    # Build the reference vector once; it used to be computed twice.
    y_true = create_total_target_vector(docs)
    y_pred = create_total_prediction_vector(docs)
    classes = sorted(set(y_true))
    print('YTRUE')
    print (y_true)
    print('YPRED')
    print (y_pred)
    # Passing `labels` alongside `target_names` keeps the report aligned with
    # `classes` even when the model predicts a label absent from the reference.
    print(classification_report(y_true, y_pred, labels=classes, target_names=classes))
    # `labels` is keyword-only in current scikit-learn; the positional form fails.
    return confusion_matrix(y_true, y_pred, labels=classes)


In [72]:
import numpy

def plot_confusion_matrix(docs, classes, normalize=False, cmap=pyplot.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.

    Args:
        docs: Records passed on to ``generate_confusion_matrix``.
        classes: Tick labels for both axes; expected to be in the same order
            as the rows/columns of the matrix.
        normalize: When true, each row is divided by its sum.
        cmap: Matplotlib colormap used for the heat map.

    Returns:
        ``(cm, ax, pyplot)``: the (possibly normalised) matrix, the axes it
        was drawn on and the pyplot module.
    """

    title = 'Confusion Matrix, for SpaCy NER'

    # Compute confusion matrix
    cm = generate_confusion_matrix(docs)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, numpy.newaxis]

    # One figure, created at the intended size. The old code opened a second,
    # empty figure and rebound `fig` to it, so the size and tight_layout()
    # never reached the axes that hold the plot.
    fig, ax = pyplot.subplots(figsize=(14, 10))
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=numpy.arange(cm.shape[1]),
           yticks=numpy.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    pyplot.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return cm, ax, pyplot

In [None]:
# Draw the un-normalised confusion matrix, labelling both axes with the labels found in the reference annotations.
plot_confusion_matrix(docs,classes=get_dataset_labels(),normalize=False)
