In [15]:
import os

import pandas as pd
import spacy

In [16]:
# Read the ServiceNow incidents sheet from the data-sharing workbook (path to a single file).
workbook_path = r'./DatabaseTableUpload/Appendix_A_Capstone_DataSharingProposal.xlsx'
incident_sheet = 'A.25_ServiceNow_Incidents'
df = pd.read_excel(workbook_path, sheet_name=incident_sheet)
# Show a compact summary (entries, columns, dtypes, memory) rather than the frame itself.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Columns: 118 entries, parent to category
dtypes: float64(72), object(46)
memory usage: 14.9+ KB



### Named Entity Recognition (NER)


Sizes: ['sm', 'md', 'lg', 'trf']<br>
Your pipeline must be compatible with your installed version of spaCy.

A pipeline can be installed into the Conda (base) environment with pip:
`pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_[SIZE]-[VERSION]/en_core_web_[SIZE]-[VERSION].tar.gz`

Alternatively, from the command line (replace `sm` with the size you need):
`python -m spacy download en_core_web_sm`

In [17]:
from spacy.lang.en import English

# Blank English pipeline, kept under its own name. Binding it to `nlp` would let
# later cells silently run against a pipeline without NER (no entities found,
# nothing masked) if the trained model has not been loaded into `nlp`.
blank_nlp = English()

# Tokenizer with the default settings for English, including punctuation rules and exceptions.
tokenizer = blank_nlp.tokenizer

In [18]:
# Load the `en_core_web_trf` pipeline as `nlp`, then keep a handle on its NER component.
model_name = 'en_core_web_trf'
nlp = spacy.load(model_name)
ner = nlp.get_pipe('ner')

In [19]:
# Demo: parse the first short description and show the annotations of each token.
first_description = df.loc[0, 'short_description']
doc = nlp(first_description)
print(doc.text)
for token in doc:
    # Printed columns: token text, part-of-speech tag, syntactic dependency label.
    print(token.text, token.pos_, token.dep_)

Pitney Bowes P2000 Series isn't able to connect with network
Pitney PROPN compound
Bowes PROPN compound
P2000 PROPN compound
Series PROPN nsubj
is AUX ROOT
n't PART neg
able ADJ acomp
to PART aux
connect VERB xcomp
with ADP prep
network NOUN pobj


In [20]:
# Module-level accumulator holding one parsed document per processed text.
# Every text is parsed a single time so later cells can reuse the parsed objects.
doc_list = []


def to_doc_list(text):
    """Parse `text` with `nlp` and append the result to `doc_list`.

    Called only for its side effect: the function itself returns None, so using
    it with `Series.apply` yields a Series of None values.
    """
    parsed = nlp(text)
    doc_list.append(parsed)

In [21]:
# Parsing every short description takes a while.
# Empty the accumulator first: without this, re-running the cell would append a
# second copy of every parsed document to `doc_list`.
doc_list.clear()
df['short_description'].apply(to_doc_list)

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
Name: short_description, dtype: object

In [22]:
# Wrap the accumulated parsed documents in a pandas Series and display it.
doc_series = pd.Series(data=doc_list)
doc_series

0     (Pitney, Bowes, P2000, Series, is, n't, able, ...
1       (Can, not, connect, to, in, -, office, Desktop)
2             (Brittany, Tyler, -, Fixed, Income, Desk)
3     (Break, /, fix, Docking, station, not, providi...
4     (Baird, TrustDesk, Migration, to, OneDrive, -,...
5                      (Deposit, Edge, Account, Unlock)
6     (MFA, Authentication, Loop, -, Unable, to, acc...
7     (Beta, report, ran, out, of, paper, ., Needs, ...
8         (Laptop, not, charging, on, Docking, Station)
9               (Mac, not, connecting, to, Guest, wifi)
10    (RJ, Edgerly, needs, his, Outlook, inbox, rest...
11    (Hello, ., My, name, is, spelled, incorrectly,...
12    (Custom, example, :, Bryce, Townsend, tried, t...
13    (Custom, example, :, Aditya, Patel, needs, a, ...
14    (Custom, example, :, Stanislav, Oliynyk, needs...
15    (Custom, example, :, Parasol, needs, to, have,...
dtype: object

In [23]:
# Entity labels whose tokens are replaced by a "<LABEL>" placeholder.
PII_LABELS = ('PERSON', 'MONEY', 'CARDINAL', 'QUANTITY', 'PERCENT')


def mask_pii(doc, labels=PII_LABELS):
    """Return the text of `doc` with tokens of the given entity types masked.

    Args:
        doc: iterable of tokens exposing `ent_type_`, `pos_` and `text`.
        labels: entity labels to mask; each matching token becomes "<LABEL>".

    Returns:
        The rebuilt string. Masked and ordinary tokens are preceded by a single
        space; punctuation tokens are appended directly to the preceding text.
    """
    pieces = []
    for token in doc:
        if token.ent_type_ in labels:
            pieces.append(" <{}>".format(token.ent_type_))
        elif token.pos_ == "PUNCT":
            pieces.append(token.text)
        else:
            pieces.append(" {}".format(token.text))
    masked = "".join(pieces)
    # Remove only the separator space added before the first token. A blind
    # `[1:]` would cut off a real character whenever the text starts with
    # punctuation, because punctuation is appended without that space.
    if masked.startswith(" "):
        masked = masked[1:]
    return masked


for doc in doc_series:
    print(doc)
    filtered_string = mask_pii(doc)
    print(filtered_string, '\n')

Pitney Bowes P2000 Series isn't able to connect with network
Pitney Bowes P2000 Series is n't able to connect with network 

Cannot connect to in-office Desktop
Can not connect to in- office Desktop 

Brittany Tyler - Fixed Income Desk
<PERSON> <PERSON>- Fixed Income Desk 

Break/fix Docking station not providing monitor display or network
Break / fix Docking station not providing monitor display or network 

Baird TrustDesk Migration to OneDrive - Error migrating 
Baird TrustDesk Migration to OneDrive- Error migrating 

Deposit Edge Account Unlock
Deposit Edge Account Unlock 

MFA Authentication Loop - Unable to access email
MFA Authentication Loop- Unable to access email 

Beta report ran out of paper. Needs a restart.
Beta report ran out of paper. Needs a restart. 

Laptop not charging on Docking Station
Laptop not charging on Docking Station 

Mac not connecting to Guest wifi
Mac not connecting to Guest wifi 

RJ Edgerly needs his Outlook inbox restored
<PERSON> <PERSON> needs his 


# Check NER results.



Dictionary accumulator for entities based on entity type


In [24]:
# Running tally: entity label -> number of occurrences seen so far.
ent_dict = {}


def count_ent(doc):
    """Add the label of every entity in `doc.ents` to the `ent_dict` tally."""
    for entity in doc.ents:
        label = entity.label_
        ent_dict[label] = ent_dict.get(label, 0) + 1
# Tally the entity labels of every parsed document, then show the totals.
for parsed_doc in doc_series:
    count_ent(parsed_doc)
ent_dict

{'ORG': 2, 'PERSON': 9, 'PRODUCT': 1, 'CARDINAL': 2, 'MONEY': 1, 'PERCENT': 1}

In [25]:
# List every label exposed by the pipeline's "ner" component.
ner_labels = nlp.get_pipe("ner").labels
print(ner_labels)

('CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART')



#### PERSON.


In [26]:
# Running tally: PERSON entity text -> number of occurrences (rebinds `ent_dict`).
ent_dict = {}


def count_person(doc):
    """Count the text of every PERSON entity in `doc.ents` into `ent_dict`."""
    for entity in doc.ents:
        if entity.label_ != 'PERSON':
            continue
        name = entity.text
        ent_dict[name] = ent_dict.get(name, 0) + 1
# Count PERSON mentions in every parsed document, then list the ten most frequent.
for parsed_doc in doc_series:
    count_person(parsed_doc)
person_counts = list(ent_dict.items())
person_counts.sort(key=lambda pair: pair[1], reverse=True)
person_counts[:10]

[('Brittany Tyler', 1),
 ('RJ Edgerly', 1),
 ('Veronica Fitzpatrick', 1),
 ('Bryce Townsend', 1),
 ('Aditya Patel', 1),
 ('Emilio Hernandez', 1),
 ('Stanislav Oliynyk', 1),
 ('Takeshi Ueda', 1),
 ('Parasol', 1)]


&nbsp;
#### Save the above entity list + counts as a `pd.Series` object.


In [27]:
# Folder for exported entity lists. The value has no trailing path separator,
# so file paths must be built with a path-aware join rather than by appending a name.
output_dir = r'./PII'

In [28]:
# Rank the (entity, count) pairs of the chosen tally, highest count first.
ner_obj = sorted(ent_dict.items(), key=lambda x: x[1], reverse=True) # specify which dict to save
ner_obj = pd.Series(ner_obj)
# Build the target with os.path.join: plain concatenation of `output_dir` and the
# file name drops the separator and writes "./PIIner_obj.csv" beside the folder.
os.makedirs(output_dir, exist_ok=True)  # make sure the export folder exists
ner_obj.to_csv(os.path.join(output_dir, 'ner_obj.csv'), sep=';', encoding='utf-8-sig')