In [1]:
import pandas as pd
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

In [2]:
df = pd.read_csv('../data/combined-training.csv', usecols=[0, 1, 2, 3, 4])
df.head()

Unnamed: 0,Version,Rating,Date,Aspects,Body
0,19.12.0.12,1,1/31/20,"Customers Service, Quality",Not intuitive. Bad customer satisfaction servi...
1,20.0.1,1,1/31/20,"Customers Service, Price",I just want to upload an image as an attachmen...
2,20.0.1,5,1/31/20,General,Wow this is a fantastic program I am a new sma...
3,19.12.0.12,4,1/31/20,Dashboard,Good apart from the top of the dashboard on th...
4,19.12.0.12,5,1/31/20,General,Very good


In [3]:
nlp = spacy.load('en_core_web_sm')

df.review = df.Body.str.lower()

print(nlp.pipe_names)

['tagger', 'parser', 'ner']


  This is separate from the ipykernel package so we can avoid doing imports until


In [4]:
class EntityMatcher(object):
    name = "entity_matcher"

    def __init__(self, nlp, terms, label):
        patterns = [nlp.make_doc(text) for text in terms]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add(label, None, *patterns)

    def __call__(self, doc):
        matches = self.matcher(doc)
        for match_id, start, end in matches:
            span = Span(doc, start, end, label=match_id)
            doc.ents = list(doc.ents) + [span]
        return doc

In [5]:
aspect_terms = []
for review in nlp.pipe(df.Body):
    chunks = [(chunk.root.text) for chunk in review.noun_chunks if chunk.root.pos_ == 'NOUN']
    aspect_terms.append(', '.join(chunks))
df['aspect_terms'] = aspect_terms

In [6]:
df.sample(25)

Unnamed: 0,Version,Rating,Date,Aspects,Body,aspect_terms
152,19.12.0.11,3,1/12/20,"Quality, Performance","The interface is really cute. On my laptop, on...","interface, laptop, data, years, thing, app, da..."
353,19.11.2,3,11/21/19,"experience,data,invoicing,reports",I love the app and it has been great for us fo...,"app, years, update, range, reports, invoice, s..."
306,19.12.1,1,12/16/19,"data,quality,updates",The data on the app does not match the data on...,"data, app, data, computer, time, way, app, pho..."
219,20.01.3,1,1/25/20,"general,quality,experience",app stinks,
252,20.01.0,1,1/12/20,"integration,experience",The app isn’t syncing with online account data...,"app, data, app, entry, go"
42,,5,1/27/20,General,Very easy to use and providing all functionali...,functionality
201,20.01.5,3,1/30/20,"help,data,customers service,experience,quality...",The majority of the time this app is wonderful...,"majority, time, app, basis, transaction, middl..."
3,19.12.0.12,4,1/31/20,Dashboard,Good apart from the top of the dashboard on th...,"top, dashboard, page"
50,19.12.0.12,5,1/26/20,General,This app helps me out alot I really like it.,"app, alot"
369,19.11.2,4,11/19/19,"general,experience,quality",Overall this is a fantastic app that allows me...,"app, books, thing, feature, images, tool, imag..."


In [7]:
import spacy 
from spacy import displacy  
text1 = """Overall, it is a decent product they just need to make some tweaks""" 
nlp = spacy.load("en") 
doc = nlp(text1) 
displacy.render(doc, style="ent")

  "__main__", mod_spec)


In [55]:
import spacy
import random


# TRAIN_DATA = [{"content":"Doesn't work. After logged in, I can seem to review my accounts. This would be a great app if they could fix this issue.","entities":[[0,12,"Quality"]]},{"content":"Love QB, but this app is really glitchy. Asks for password, loops me back, won't let me activate fingerprint. I'm signed in using Google, when I click on my Google account it asks for my password, when I enter password, it puts me in a loop. The only fix is uninstalling and then reinstalling the app. Which I have to do daily. It's incredibly frustrating.","entities":[[18,21,"App"],[251,254,"Quality"]]},{"content":"Good apart from the top of the dashboard on the home page being greyed out.","entities":[[31,40,"Dashboard"]]},{"content":"Great app for small business owners!","entities":[[6,9,"App"]]},{"content":"Garbage, won't let you delete your account. It took hours to finally be able to delete email and account, save yourself the hassle and dont download it.","entities":[[0,7,"Quality"],[52,57,"Performance"]]},{"content":"Error Everytime I log in, and requires password so often. Edit: the log in is working, but when I try to add an expense it doesn't save, the same button doesn't work","entities":[[0,5,"Quality"],[112,119,"Expenses"]]}]
TRAIN_DATA = [{"content":"what is the price of polo?","entities":[[21,25,"PrdName"]]},{"content":"what is the price of ball?","entities":[[21,25,"PrdName"]]},{"content":"what is the price of jegging?","entities":[[21,28,"PrdName"]]},{"content":"what is the price of t-shirt?","entities":[[21,28,"PrdName"]]},{"content":"what is the price of jeans?","entities":[[21,26,"PrdName"]]},{"content":"what is the price of bat?","entities":[[21,24,"PrdName"]]},{"content":"what is the price of shirt?","entities":[[21,26,"PrdName"]]},{"content":"what is the price of bag?","entities":[[21,24,"PrdName"]]},{"content":"what is the price of cup?","entities":[[21,24,"PrdName"]]},{"content":"what is the price of jug?","entities":[[21,24,"PrdName"]]},{"content":"what is the price of plate?","entities":[[21,26,"PrdName"]]},{"content":"what is the price of glass?","entities":[[21,26,"PrdName"]]},{"content":"what is the price of moniter?","entities":[[21,28,"PrdName"]]},{"content":"what is the price of desktop?","entities":[[21,28,"PrdName"]]},{"content":"what is the price of bottle?","entities":[[21,27,"PrdName"]]},{"content":"what is the price of mouse?","entities":[[21,26,"PrdName"]]},{"content":"what is the price of keyboad?","entities":[[21,28,"PrdName"]]},{"content":"what is the price of chair?","entities":[[21,26,"PrdName"]]},{"content":"what is the price of table?","entities":[[21,26,"PrdName"]]},{"content":"what is the price of watch?","entities":[[21,26,"PrdName"]]},{"content":"","entities":[]}]


def train_spacy(data,n_iter):
    TRAIN_DATA = data
    nlp = spacy.blank('en')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
       

    # add labels
    for entry in TRAIN_DATA:
        for ent in entry.get('entities'):
            ner.add_label(ent[2])
#     print(TRAIN_DATA)
#     for _, annotations in TRAIN_DATA:
#          for ent in annotations.get('entities'):
#             ner.add_label(ent[2])

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            batches = minibatch(TRAIN_DATA,  size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch) 
                # Updating the weights
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
                print('Losses', losses)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
                print('Losses', losses)

    # get names of other pipes to disable them during training
#     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
#     with nlp.disable_pipes(*other_pipes):  # only train NER
#         optimizer = nlp.begin_training()
#         for itn in range(iterations):
#             print("Statring iteration " + str(itn))
#             random.shuffle(TRAIN_DATA)
#             losses = {}
#             for annotations in TRAIN_DATA:
#                 nlp.update(
#                     [annotations.get('content')],  # batch of texts
#                     [annotations.get('content')],  # batch of annotations
#                     drop=0.2,  # dropout - make it harder to memorise data
#                     sgd=optimizer,  # callable to update weights
#                     losses=losses)
# #             for text, annotations in TRAIN_DATA:
# #                 print(annotations)
# #                 nlp.update(
# #                     [text],  # batch of texts
# #                     [annotations],  # batch of annotations
# #                     drop=0.2,  # dropout - make it harder to memorise data
# #                     sgd=optimizer,  # callable to update weights
# #                     losses=losses)
#             print(losses)
    return nlp


prdnlp = train_spacy(TRAIN_DATA, 20)


ValueError: [E151] Trying to call nlp.update without required annotation types. Expected top-level keys: ('words', 'tags', 'heads', 'deps', 'entities', 'cats', 'links'). Got: ['e', 'n', 't', 'i', 't', 'i', 'e', 's'].

In [27]:
TRAIN_DATA = [{"content":"Doesn't work. After logged in, I can seem to review my accounts. This would be a great app if they could fix this issue.","entities":[[0,12,"Quality"]]},{"content":"Love QB, but this app is really glitchy. Asks for password, loops me back, won't let me activate fingerprint. I'm signed in using Google, when I click on my Google account it asks for my password, when I enter password, it puts me in a loop. The only fix is uninstalling and then reinstalling the app. Which I have to do daily. It's incredibly frustrating.","entities":[[18,21,"App"],[251,254,"Quality"]]},{"content":"Good apart from the top of the dashboard on the home page being greyed out.","entities":[[31,40,"Dashboard"]]},{"content":"Great app for small business owners!","entities":[[6,9,"App"]]},{"content":"Garbage, won't let you delete your account. It took hours to finally be able to delete email and account, save yourself the hassle and dont download it.","entities":[[0,7,"Quality"],[52,57,"Performance"]]},{"content":"Error Everytime I log in, and requires password so often. Edit: the log in is working, but when I try to add an expense it doesn't save, the same button doesn't work","entities":[[0,5,"Quality"],[112,119,"Expenses"]]}]


# 


# print(type(TRAIN_DATA))
# for content, annotations in TRAIN_DATA:
#     print(type(annotations))
#     for ent in annotations.get('entities'):
#         ner.add_label(ent[2])

[0, 12, 'Quality']
Quality
[18, 21, 'App']
App
[251, 254, 'Quality']
Quality
[31, 40, 'Dashboard']
Dashboard
[6, 9, 'App']
App
[0, 7, 'Quality']
Quality
[52, 57, 'Performance']
Performance
[0, 5, 'Quality']
Quality
[112, 119, 'Expenses']
Expenses


In [None]:

# Save our trained Model
modelfile = input("Enter your Model Name: ")
prdnlp.to_disk(modelfile)


In [None]:
#Test your text
test_text = input("Enter your testing text: ")
doc = prdnlp(test_text)
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

In [43]:
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding


# training data
TRAIN_DATA = [
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
    ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
]

# TRAIN_DATA = [
#     {"content":"Doesn't work. After logged in, I can seem to review my accounts. This would be a great app if they could fix this issue.","entities":[[0,12,"Quality"]]},{"content":"Love QB, but this app is really glitchy. Asks for password, loops me back, won't let me activate fingerprint. I'm signed in using Google, when I click on my Google account it asks for my password, when I enter password, it puts me in a loop. The only fix is uninstalling and then reinstalling the app. Which I have to do daily. It's incredibly frustrating.","entities":[[18,21,"App"],[251,254,"Quality"]]},{"content":"Good apart from the top of the dashboard on the home page being greyed out.","entities":[[31,40,"Dashboard"]]},{"content":"Great app for small business owners!","entities":[[6,9,"App"]]},{"content":"Garbage, won't let you delete your account. It took hours to finally be able to delete email and account, save yourself the hassle and dont download it.","entities":[[0,7,"Quality"],[52,57,"Performance"]]},{"content":"Error Everytime I log in, and requires password so often. Edit: the log in is working, but when I try to add an expense it doesn't save, the same button doesn't work","entities":[[0,5,"Quality"],[112,119,"Expenses"]]}]



@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int),
)
def main(model=None, output_dir=None, n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                print(texts)
                print(annotations)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in TRAIN_DATA:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])


prdnlp = main()

Created blank 'en' model
('I like London and Berlin.', 'Who is Shaka Khan?')
({'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]}, {'entities': [(7, 17, 'PERSON')]})
Losses {'ner': 9.899998903274536}
('Who is Shaka Khan?', 'I like London and Berlin.')
({'entities': [(7, 17, 'PERSON')]}, {'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]})
Losses {'ner': 9.679013013839722}
('Who is Shaka Khan?', 'I like London and Berlin.')
({'entities': [(7, 17, 'PERSON')]}, {'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]})
Losses {'ner': 9.480179786682129}
('I like London and Berlin.', 'Who is Shaka Khan?')
({'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]}, {'entities': [(7, 17, 'PERSON')]})
Losses {'ner': 9.099778056144714}
('Who is Shaka Khan?', 'I like London and Berlin.')
({'entities': [(7, 17, 'PERSON')]}, {'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]})
Losses {'ner': 8.961228489875793}
('I like London and Berlin.', 'Who is Shaka Khan?')
({'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]}, {'entities': [(7, 

Losses {'ner': 1.8203126382431947}
('Who is Shaka Khan?', 'I like London and Berlin.')
({'entities': [(7, 17, 'PERSON')]}, {'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]})
Losses {'ner': 0.8164988591161091}
('Who is Shaka Khan?', 'I like London and Berlin.')
({'entities': [(7, 17, 'PERSON')]}, {'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]})
Losses {'ner': 1.1951607942464761}
('Who is Shaka Khan?', 'I like London and Berlin.')
({'entities': [(7, 17, 'PERSON')]}, {'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]})
Losses {'ner': 2.414423618640285}
('I like London and Berlin.', 'Who is Shaka Khan?')
({'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]}, {'entities': [(7, 17, 'PERSON')]})
Losses {'ner': 3.037676514737541}
('I like London and Berlin.', 'Who is Shaka Khan?')
({'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]}, {'entities': [(7, 17, 'PERSON')]})
Losses {'ner': 2.5304495019372553}
('I like London and Berlin.', 'Who is Shaka Khan?')
({'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]}, {'ent