In [165]:
import pandas as pd
import spacy

In [166]:
df = pd.read_excel(r'./DatabaseTableUpload/Appendix_A_Capstone_DataSharingProposal.xlsx', sheet_name='A.25_ServiceNow_Incidents') # Provide path for a single file.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Columns: 118 entries, parent to category
dtypes: float64(72), object(46)
memory usage: 17.6+ KB



### Named Entity Recognition (NER)


Sizes: ['sm', 'md', 'lg']<br>
Your pipeline must be compatable with your current version of SpaCy.

Can download the following on Conda (base) environment:
`pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_[SIZE]-[VERSION]/en_core_web_md-[VERSION].tar.gz`

Alternatively, in Python:
`python -m spacy download en_core_web_sm`

In [167]:
from spacy.lang.en import English
nlp = English()

# Instantiate a Tokenizer with the default settings for English, including punctuation rules and exceptions.
tokenizer = nlp.tokenizer

In [168]:
nlp = spacy.load('en_core_web_trf') # Load transfomer model.
print(nlp.pipe_names)

['transformer', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']



#### Enhance the transformer NER model with added examples.
#### https://www.machinelearningplus.com/nlp/training-custom-ner-model-in-spacy/


In [169]:
text_test = "Microsoft Excel throws error 0xC0000142"
phrase_len = len("Microsoft Outlook")
text_test[0:15]
print(phrase_len)

17


In [170]:
# Training examples - do this only for *individual* NERs, e.g. single-word NERs.
train_data = [
    ("Windows Defender blocked my internet access this morning, can you take a look?", {"entities": [(0, 16, "SOFTWARE")]}),
    ("BitLocker malfunctioned upon scanning this morning", {"entities": [(0,9, "SOFTWARE")]}),
    ("Please check to see if Windows Firewall is working", {"entities": [(23,39, "SOFTWARE")]}),
    ("Baird TrustDesk Migration to OneDrive - Error migrating", {"entities": [(0, 15, "SECURITY"), (29, 37, "SOFTWARE")]}),
    ("Hello. My name is spelled incorrectly on the DocuSign application. It is Veronica Fitzpatrick, some letters are all flipped around  backwards and I was hoping to have that fixed!", {"entities": [(45, 53, "SOFTWARE"), (73, 93, "PERSON")]}),
    ("I use Microsoft Office for my daily work", {"entities": [(6, 22, "SOFTWARE")]}),
    ("Microsoft Teams is crashing", {"entities": [(0, 15, "SOFTWARE")]}),
    ("Microsoft Outlook is crashing", {"entities": [(0, 17, "SOFTWARE")]}),
    ("Microsoft Excel throws error 0xC0000142", {"entities": [(0, 15, "SOFTWARE")]})
]

In [171]:
# Adding labels to the `ner`
# for _, annotations in train_data:
#     for ent in annotations.get("entities"):
#         ner.add_label(ent[2])

In [172]:
from spacy.training.example import Example
import random

ner = nlp.get_pipe("ner")

# TRAINING THE MODEL
# Disable pipeline components you don't need to change.

# METHOD 1.
# other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
# with nlp.select_pipes(disable=["tagger", "parser", "attribute_ruler", "lemmatizer"]):
#     batches = spacy.util.minibatch(train_data, size=2)
#     for batch in batches:
#         for text, annotations in batch:
#             # Create Example.
#             doc = nlp.make_doc(text)
#             example = Example.from_dict(doc, annotations)
#             # Update the model.
#             nlp.update([example], drop=0.3)

# METHOD 2.
# pipe_exceptions = ["transformer", "ner"]
# unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
# with nlp.select_pipes(disable=unaffected_pipes):
#     for itn in range(20): # Set iterations.
#         random.shuffle(train_data)
#         losses = {}
#         for text, annotations in train_data:
#             doc = nlp.make_doc(text)
#             example = Example.from_dict(doc, annotations)
#             nlp.update([example], losses=losses)
#         print("Iteration:", itn + 1, "Loss:", losses)
#
#
# METHOD 3.
# if "SOFTWARE" not in ner.labels:
#     ner.add_label("SOFTWARE")
#
# pipe_exceptions = ["transformer", "ner", "trf_wordpiecer", "trf_tok2vec"]
# unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
#
# with nlp.select_pipes(disable=unaffected_pipes):
#     optimizer = nlp.resume_training()
#     for iteration in range(10):  # Adjust the number of iterations as needed
#         losses = {}
#         for text, annotations in train_data:
#             doc = nlp.make_doc(text)
#             example = Example.from_dict(doc, annotations)
#             nlp.update([example], losses=losses, sgd=optimizer)
#         print("Iteration:", iteration + 1, "Loss:", losses)

# METHOD 4.
from spacy.util import minibatch
pipe_exceptions = ["transformer", "ner"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
with nlp.select_pipes(disable=unaffected_pipes):
    for itn in range(20): # Set iterations.
        random.shuffle(train_data)
        batches = minibatch(train_data, size=2)
        losses = {}
        for batch in batches:
            examples = []
            for text, annotations in train_data:
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                examples.append(example)
            nlp.update(examples=examples, losses=losses, drop=0.4)
        print("Iteration:", itn + 1, "Loss:", losses)

nlp.to_disk("./en_web_core_trf_updated")

KeyboardInterrupt: 

In [173]:
nlp = spacy.load("./en_web_core_trf_updated")
# cust_nlp = spacy.load("en_web_core_trf_updated")
# cust_nlp.replace_listeners(tok2vec_name="transformer", pipe_name="ner", listeners=["model.tok2vec"])
# nlp.add_pipe(factory_name="ner", name="ner_custom", source=cust_nlp, before="ner")

In [174]:
print(nlp.pipe_names)
print(nlp.get_pipe("ner").labels)

['transformer', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
('CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'SECURITY', 'SOFTWARE', 'TIME', 'WORK_OF_ART')


In [175]:
for text, _ in train_data:
    doc = nlp(text)
    print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
    print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

Entities []
Tokens [('Baird', '', 2), ('TrustDesk', '', 2), ('Migration', '', 2), ('to', '', 2), ('OneDrive', '', 2), ('-', '', 2), ('Error', '', 2), ('migrating', '', 2)]
Entities [('Microsoft', 'ORG'), ('Outlook', 'PRODUCT')]
Tokens [('Microsoft', 'ORG', 3), ('Outlook', 'PRODUCT', 3), ('is', '', 2), ('crashing', '', 2)]
Entities []
Tokens [('Microsoft', '', 2), ('Excel', '', 2), ('throws', '', 2), ('error', '', 2), ('0xC0000142', '', 2)]
Entities [('Windows Defender', 'PRODUCT'), ('this morning', 'TIME')]
Tokens [('Windows', 'PRODUCT', 3), ('Defender', 'PRODUCT', 1), ('blocked', '', 2), ('my', '', 2), ('internet', '', 2), ('access', '', 2), ('this', 'TIME', 3), ('morning', 'TIME', 1), (',', '', 2), ('can', '', 2), ('you', '', 2), ('take', '', 2), ('a', '', 2), ('look', '', 2), ('?', '', 2)]
Entities [('Microsoft Teams', 'ORG')]
Tokens [('Microsoft', 'ORG', 3), ('Teams', 'ORG', 1), ('is', '', 2), ('crashing', '', 2)]
Entities [('BitLocker', 'ORG')]
Tokens [('BitLocker', 'ORG', 3), ('m


## Using EntityRuler() to directly classify new entities based on a set of rules.


In [None]:
# config = {"overwrite_ents": True}
# nlp.add_pipe("entity_ruler", config=config)

In [None]:
from spacy.pipeline import EntityRuler
rulerSoftwares = nlp.add_pipe("entity_ruler", before="ner")
patterns = [
    {"label": "SOFTWARE", "pattern": "Windows"},
    {"label": "SOFTWARE", "pattern": "Windows Defender"},
    {"label": "SOFTWARE", "pattern": "Windows OneDrive"},
    {"label": "SOFTWARE", "pattern": "BitLocker"},
    {"label": "SOFTWARE", "pattern": "BitDefender"},
    {"label": "SOFTWARE", "pattern": "OneDrive"},
    {"label": "SOFTWARE", "pattern": "Windows Firewall"},
    {"label": "SOFTWARE", "pattern": "Windows Server"},
    {"label": "SOFTWARE", "pattern": "Microsoft Teams"},
    {"label": "SOFTWARE", "pattern": "Microsoft"},
    {"label": "SOFTWARE", "pattern": "DocuSign"}
]
rulerSoftwares.add_patterns(patterns)
print(nlp.pipe_names)

In [176]:
# nlp = spacy.load("en_web_core_trf_updated")
doc = nlp("DocuSigns is doing its job.")
for ent in doc.ents:
    print(ent.text, ent.label_)

DocuSigns ORG


In [177]:
# Example of token generation for the first body of text.
doc = nlp(df['short_description'][0])
print(doc.text)
for token in doc:
    print(token.text, token.pos_, token.dep_) # Print: token, POS, syntactic dependency.

Pitney Bowes P2000 Series isn't able to connect with network
Pitney PROPN compound
Bowes PROPN compound
P2000 PROPN compound
Series PROPN nsubj
is AUX ROOT
n't PART neg
able ADJ acomp
to PART aux
connect VERB xcomp
with ADP prep
network NOUN pobj


In [178]:
# Create function to add an article's tokens to `doc_list`.
# Tokenize one time, then use that object for the subsequent accumulators.
# Returns None many times.
doc_list = []
def to_doc_list(text):
    doc_list.append(nlp(text))

In [179]:
# Takes time to generate tokens from each cell's fulltext.
df['short_description'].apply(to_doc_list)

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
Name: short_description, dtype: object

In [180]:
# Assign `doc_list` to `doc_series` as a Series object.
doc_series = pd.Series(doc_list)
doc_series

0     (Pitney, Bowes, P2000, Series, is, n't, able, ...
1       (Can, not, connect, to, in, -, office, Desktop)
2             (Brittany, Tyler, -, Fixed, Income, Desk)
3     (Break, /, fix, Docking, station, not, providi...
4     (Baird, TrustDesk, Migration, to, OneDrive, -,...
5                      (Deposit, Edge, Account, Unlock)
6     (MFA, Authentication, Loop, -, Unable, to, acc...
7     (Beta, report, ran, out, of, paper, ., Needs, ...
8         (Laptop, not, charging, on, Docking, Station)
9               (Mac, not, connecting, to, Guest, wifi)
10    (RJ, Edgerly, needs, his, Outlook, inbox, rest...
11    (Hello, ., My, name, is, spelled, incorrectly,...
12    (Custom, example, :, Bryce, Townsend, tried, t...
13    (Custom, example, :, Aditya, Patel, needs, a, ...
14    (Custom, example, :, Stanislav, Oliynyk, needs...
15    (Custom, example, :, Parasol, needs, to, have,...
16    (Please, call, +49, 30, 901820, to, resolve, t...
17    (Please, call, +1, -, 568, -, 445, -, 8792

In [181]:
for doc in doc_series:
    print(doc)
    filtered_string = ""
    for token in doc:
        if token.ent_type_ in ['PERSON', 'PRODUCT', 'MONEY', 'CARDINAL', 'QUANTITY', 'PERCENT', 'SOFTWARE', 'SECURITY']:
            new_token = " <{}>".format(token.ent_type_)
        # elif token.pos_ in ['PROPN']:
        #     new_token = " <PROPN>"
        # elif token.pos_ in ['PROPN', 'NUM']:
        #     new_token = " <{}>".format(token.ent_type_)
        elif token.pos_ == "PUNCT":
            new_token = token.text
        else:
            new_token = " {}".format(token.text)
        filtered_string += new_token
    filtered_string = filtered_string[1:]
    print(filtered_string, '\n')

Pitney Bowes P2000 Series isn't able to connect with network
Pitney Bowes P2000 Series is n't able to connect with network 

Cannot connect to in-office Desktop
Can not connect to in- office Desktop 

Brittany Tyler - Fixed Income Desk
<PERSON> <PERSON>- Fixed Income Desk 

Break/fix Docking station not providing monitor display or network
Break / fix Docking station not providing monitor display or network 

Baird TrustDesk Migration to OneDrive - Error migrating 
Baird TrustDesk Migration to OneDrive- Error migrating 

Deposit Edge Account Unlock
Deposit Edge Account Unlock 

MFA Authentication Loop - Unable to access email
MFA Authentication Loop- Unable to access email 

Beta report ran out of paper. Needs a restart.
Beta report ran out of paper. Needs a restart. 

Laptop not charging on Docking Station
Laptop not charging on Docking Station 

Mac not connecting to Guest wifi
Mac not connecting to Guest wifi 

RJ Edgerly needs his Outlook inbox restored
<PERSON> <PERSON> needs his 

In [185]:
# Remove phone numbers.
import re
def remove_phone_numbers(text):
    pattern = r"\b(?:\+?\d{1,3}[-.])?\(?\d{1,3}\)?[-.\s]?\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,9}\b"
    return re.sub(pattern, "<PHONE NUMBER>", text)

In [186]:
df['short_description'].apply(remove_phone_numbers)

0     Pitney Bowes P2000 Series isn't able to connec...
1                   Cannot connect to in-office Desktop
2                    Brittany Tyler - Fixed Income Desk
3     Break/fix Docking station not providing monito...
4     Baird TrustDesk Migration to OneDrive - Error ...
5                           Deposit Edge Account Unlock
6      MFA Authentication Loop - Unable to access email
7        Beta report ran out of paper. Needs a restart.
8                Laptop not charging on Docking Station
9                      Mac not connecting to Guest wifi
10          RJ Edgerly needs his Outlook inbox restored
11    Hello. My name is spelled incorrectly on the D...
12    Custom example: Bryce Townsend tried to run th...
13    Custom example: Aditya Patel needs a new machi...
14    Custom example: Stanislav Oliynyk needs a new ...
15    Custom example: Parasol needs to have her new ...
16    Please call +<PHONE NUMBER> to resolve the iss...
17    Please call +<PHONE NUMBER> to reach our h

In [None]:
# Remove email addresses.
def remove_email_addresses(text):
    pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
    return re.sub(pattern, "<EMAIL>", text)


# Check NER results.



Dictionary accumulator for entities based on entity type


In [None]:
ent_dict = {}
def count_ent(doc):
    for ent in doc.ents:
        if ent.label_ not in ent_dict:
            ent_dict[ent.label_] = 1
        else:
            ent_dict[ent.label_]+=1
doc_series.apply(count_ent)
ent_dict

In [164]:
print(nlp.get_pipe("ner").labels)

('CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'SECURITY', 'SOFTWARE', 'TIME', 'WORK_OF_ART')



#### PERSON.


In [None]:
ent_dict = {}
def count_person(doc):
    for ent in doc.ents:
        if ent.label_ == 'PERSON':
            if ent.text not in ent_dict:
                ent_dict[ent.text]=1
            else:
                ent_dict[ent.text]+=1
doc_series.apply(count_person)
sorted(ent_dict.items(), key=lambda x: x[1], reverse=True)[0:10]


&nbsp;
#### Save out above entity list + counts as a pd.Sereies() object.


In [None]:
output_dir = r'./PII'

In [None]:
ner_obj = sorted(ent_dict.items(), key=lambda x: x[1], reverse=True) # specify which dict to save
ner_obj = pd.Series(ner_obj)
ner_obj.to_csv(output_dir + 'ner_obj.csv', sep=';', encoding='utf-8-sig')