In [1]:
import os

import numpy as np
import pandas as pd
import spacy
# nlp=spacy.load('en_core_web_sm')
nlp = spacy.load('en')
from spacy import displacy
from spacy.gold import GoldParse

In [2]:
# Load the token-level annotation table. Later cells read its
# 'sentence_ID', 'word' and 'label' columns.
df = pd.read_csv("esg_ner_labels_scheme2.csv")

In [3]:
# Largest sentence ID in the file as loaded (before any label filtering).
df['sentence_ID'].max()

3859

In [None]:
# Label name used below both to filter the annotation rows and as the
# entity label registered on the NER component.
domain_label = 'KPI'

In [None]:
# Keep only the rows whose 'label' value differs from `domain_label`.
# NOTE(review): this rebinds `df`, so the unfiltered frame is no longer
# available to later cells.
df = df.loc[df['label'].ne(domain_label)]

In [None]:
# df['label'].unique()

In [None]:
# Highest sentence ID remaining in `df`; the dataset-building loop uses it
# as its upper bound, so gaps in the numbering are simply visited and skipped.
no_sentences = df['sentence_ID'].max()
print("Number of Sentences",no_sentences)

In [None]:
# Labels to register on the NER component (currently just the domain label).
add_ents = [domain_label]

In [None]:
# Fetch the "ner" component of the loaded pipeline so labels can be added to it.
ner = nlp.get_pipe("ner")

In [None]:
# Snapshot the NER component's move names before touching its label set,
# so the effect of add_label() can be reported afterwards.
prev_ents = ner.move_names
print('[Existing Entities] = ', ner.move_names)

# Register every requested label on the NER component.
for label_name in add_ents:
    ner.add_label(label_name)

# Report only the move names that appeared as a result of the additions.
new_ents = ner.move_names
added_moves = set(new_ents) - set(prev_ents)
print('\n\n[New Entities] = ', list(added_moves))

### Create Dataset

In [None]:
# Build the training set: one (Doc, GoldParse) pair per annotated sentence.
X = []  # Docs produced by nlp.make_doc, one per sentence
Y = []  # GoldParse objects, index-aligned with X

# Split the frame by sentence once up front instead of re-scanning the whole
# frame for every sentence ID. Row order inside each group is preserved.
rows_by_sentence = {sid: rows for sid, rows in df.groupby('sentence_ID')}

for sent_id in np.arange(no_sentences):
    # Sentence IDs are visited as 1..no_sentences.
    df_temp = rows_by_sentence.get(sent_id + 1)
    if df_temp is None:
        # No rows carry this ID, so there is nothing to build for it.
        continue

    words = df_temp.word.values
    ents = df_temp.label.values
    text = ' '.join(words)
    doc = nlp.make_doc(text)
    # NOTE(review): `doc` is re-tokenized from the joined text, so its token
    # count may differ from len(ents) - confirm the labels stay aligned.
    try:
        g = GoldParse(doc, entities=ents)
    except Exception as exc:
        # Skip sentences spaCy cannot annotate, but report why. Catching
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        print("Exception: sentence_id = ", sent_id + 1, "-", repr(exc))
    else:
        X.append(doc)
        Y.append(g)

### Training

In [None]:
# Names of every pipeline component except the entity recognizer; these are
# the components switched off while the NER component is trained.
other_pipes = list(filter(lambda name: name != "ner", nlp.pipe_names))
print('[OtherPipes] = {} will be disabled'.format(other_pipes))

In [None]:
# `model` selects how the optimizer is obtained: None -> begin_training(),
# anything else -> resume_training().
# NOTE(review): `nlp` is loaded from the 'en' package in the first cell, yet
# model=None routes to begin_training(); confirm that this, rather than
# resume_training(), is what is intended here.
model = None
n_iter = 1000

# Only the NER component is updated; all other pipes stay disabled inside
# this block.
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training() if model is None else nlp.resume_training()

    for epoch in range(n_iter):
        # Fresh loss accumulator per iteration; each iteration passes the
        # full dataset to a single update call.
        losses = dict()
        nlp.update(X, Y, sgd=optimizer, drop=0.35, losses=losses)
        print("Losses", losses)

In [None]:
# Persist the trained pipeline to disk.
output_dir = "../models/model_scheme2"
# Create the target directory and any missing parents first, so that a
# missing "../models" folder does not abort the save after the long
# training run. exist_ok=True keeps re-running this cell harmless.
os.makedirs(output_dir, exist_ok=True)
nlp.to_disk(output_dir)
print("Saved model to", output_dir)

In [None]:
# # Load the saved model and predict
# print("Loading from", output_dir)
# nlp_updated = spacy.load(output_dir)
# doc = nlp_updated("Fridge can be ordered in FlipKart" )
# print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

In [None]:
def show_entities(text):
    """Run `text` through the `nlp` pipeline and render its entities inline.

    Parameters
    ----------
    text : str
        Raw sentence to analyse.
    """
    displacy.render(nlp(text), style='ent', jupyter=True)


# Sample sentences about household appliances used to eyeball the pipeline's
# predictions. They were previously spread over eight copy-pasted cells.
test_texts = [
    "The Winix 5500-2 is an exceptional performer on particulates, capturing 99.9% of the smoke in our test room in just 30 minutes on high and 97.2% on its medium-high setting, where it emits an easy-to-live-with 40 decibels.",
    "The Blueair Blue Pure 211+ is our choice among air purifiers for large spaces of up to 650 square feet, especially when the space involves open floor plans or high ceilings",
    "This is the matching electric dryer for the EFLS627U washer, though some lower-end Electrolux models will stack, too.",
    "The WTG86400UC dryer is a ventless condenser model, and the washer can piggyback off its power supply, so you need only one outlet for the pair.",
    "Although the Miele W1 holds only half as much laundry as most front-loaders, you can expect it to last at least twice as long.",
    "The LG WMXC100 is a competive washer dryer.",
    "The Café CTS70DP2NS1 stands out among wall ovens for its beautiful pro-style design and great reputation for cooking performance at about half the price of an upscale",
    "Within 30 minutes, the Coway AP1512HH Mighty reduced heavy smoke pollution in a 135-square-foot, 1,215-cubic-foot New York office by as much as 99.6 air purifier.",
]

# The loop variable keeps the name `test_text`, so it still holds the last
# sample afterwards, as it did when the cells were run one by one.
for test_text in test_texts:
    show_entities(test_text)