In [1]:
import pandas as pd

import spacy
from spacy import displacy

In [2]:
# Load the small English pipeline by its installed package name instead of a
# machine-specific absolute path into one user's site-packages directory, so
# the notebook runs in any environment where `en_core_web_sm` is installed.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Show the names of the processing components in the loaded pipeline.
nlp.pipe_names

['tagger', 'parser', 'ner']

In [4]:
# Read the call-transcript table from the project's data folder.
conv_csv = 'data/Text files/text.csv'
conv = pd.read_csv(conv_csv)

In [5]:
# Preview the first rows of the loaded transcript table.
conv.head()

Unnamed: 0,Sentence
0,"Hello, you have called Virtual bank, this is N..."
1,"Hello, you have called Virtual bank, this is L..."
2,"Hello, you have called Virtual bank, this is M..."
3,"Hello, you have called Virtual bank, this is H..."
4,"Hello, you have called Virtual bank, this is L..."


In [8]:
# NOTE(review): the recorded output of this cell is a FileNotFoundError - the
# absolute D:/ path below did not exist when the notebook was last run, so
# `gen_pii` was never created. No later cell visible here references
# `gen_pii`; confirm whether this load is still needed and, if it is, point
# it at a path relative to the project instead of a machine-specific drive.
gen_pii = pd.read_csv('D:/DSBA/Project/Final-Project-2/pii_detect-master/piidetect/train_text_with_pii_2020_10_18_16_36_29_513505.csv')

FileNotFoundError: [Errno 2] File D:/DSBA/Project/Final-Project-2/pii_detect-master/piidetect/train_text_with_pii_2020_10_18_16_36_29_513505.csv does not exist: 'D:/DSBA/Project/Final-Project-2/pii_detect-master/piidetect/train_text_with_pii_2020_10_18_16_36_29_513505.csv'

In [12]:
# Pick the transcript stored under index label 1 and echo it as the cell output.
sent = conv.loc[1, 'Sentence']
sent

'Hello, you have called Virtual bank, this is Linda speaking. How may I help you?\nHi Linda. I was just at your Ville branch and I think I left my Debit card in the ATM machine.\nOkay. Do you have your Debit card number?\nActually, I do not have.\nOkay, well do you have the checking account number associated with the Debit\ncard? \nThat I do have. Are you ready? I will give you what I have got. 765456789. \nOkay. That’s 765456789.\nCorrect.\nWhat is your identification number?\n7745896589665.\nOkay, I have 7745896589665 and what is your name sir? \nIt is Robert Applebaum. That is A-P-P-L-E-B-A-U-M. \nOkay. I have Robert Applebaum. A-P-P-L-E-B-A-U-M.\nYes.\nAnd what is your date of birth Mr. Applebaum?\nJuly 7th, 1974. \nOkay. July 7th, 1974.\nYes.\nAnd your phone number?\nIt is 6102651715. \nOkay. I have 6102651715.\nYes.\nOkay Mr. Applebaum. I have just temporarily suspended your card. If it is in the machine, we will contact you and lift the suspension. \nOh, thank you.\nSure. Thank 

### Sentence Segmentation

In [14]:
# Run the loaded spaCy pipeline over the transcript text; the result is what
# the following cells read sentences (`doc.sents`) and entities (`doc.ents`) from.
doc = nlp(sent)

In [15]:
# Print each sentence the parser segmented out of the transcript.
# The loop variable is deliberately NOT called `sent`: notebook loop variables
# are globals, so reusing that name would overwrite the transcript string with
# the last sentence span and break a re-run of `doc = nlp(sent)`.
for sentence in doc.sents:
    print(sentence)

Hello, you have called Virtual bank, this is Linda speaking.
How may I help you?

Hi Linda.
I was just at your Ville branch
and I think I left my Debit card in the ATM machine.

Okay.
Do you have your Debit card number?

Actually, I do not have.

Okay, well do you have the checking account number associated with the Debit
card? 

That I do have.
Are you ready?
I will give you what I have got.
765456789. 

Okay.
That’s 765456789.

Correct.

What is your identification number?

7745896589665.

Okay, I have 7745896589665 and what is your name sir? 

It is Robert Applebaum.
That is A-P-P-L-E-B-A-U-M. 

Okay.
I have Robert Applebaum.
A-P-P-L-E-B-A-U-M.

Yes.

And what is your date of birth Mr. Applebaum?

July 7th, 1974. 

Okay.
July 7th, 1974.

Yes.

And your phone number?

It is 6102651715. 

Okay.
I have 6102651715.

Yes.

Okay
Mr. Applebaum.
I have just temporarily suspended your card.
If it is in the machine, we will contact you and lift the suspension. 

Oh, thank you.

Sure.
Thank yo

### Predefined named entities

In [16]:
# List every entity found by the stock model: the upper-cased label padded to
# 10 columns, a dash, then the entity text.
for entity in doc.ents:
    label = entity.label_.upper()
    print(f'{label:10} - {entity.text}')

PERSON     - Linda
PERSON     - Linda
GPE        - Ville
ORG        - Debit
ORG        - ATM
ORG        - Debit
ORG        - Debit
DATE       - 765456789
PERSON     - 7745896589665
PERSON     - Robert Applebaum
PERSON     - Robert Applebaum
PERSON     - Applebaum
DATE       - July 7th, 1974
DATE       - July 7th, 1974
DATE       - 6102651715
CARDINAL   - 6102651715
PERSON     - Applebaum


In [17]:
# Print spaCy's built-in description for a handful of entity labels.
# Each pair is (text printed before the description, label to look up).
label_rows = [
    ('PERSON - ', 'PERSON'),
    ('GPE    - ', 'GPE'),
    ('DATE   - ', 'DATE'),
    ('MONEY  - ', 'MONEY'),
    ('ORG  - ', 'ORG'),
    ('CARDINAL  - ', 'CARDINAL'),
]
for prefix, label in label_rows:
    print(f'{prefix}{spacy.explain(label)}')

PERSON - People, including fictional
GPE    - Countries, cities, states
DATE   - Absolute or relative dates or periods
MONEY  - Monetary values, including unit
ORG  - Companies, agencies, institutions, etc.
CARDINAL  - Numerals that do not fall under another type


More predefined named entities available in spaCy: https://spacy.io/api/annotation#named-entities

In [18]:
# Show only the entities tagged PERSON, in the same padded "LABEL - text" layout.
person_spans = (span for span in doc.ents if span.label_.upper() == 'PERSON')
for span in person_spans:
    print(f"{span.label_.upper():10} - {span.text}")

PERSON     - Linda
PERSON     - Linda
PERSON     - 7745896589665
PERSON     - Robert Applebaum
PERSON     - Robert Applebaum
PERSON     - Applebaum
PERSON     - Applebaum


In [19]:
# Show only the entities tagged ORG, in the same padded "LABEL - text" layout.
for span in doc.ents:
    if span.label_.upper() != 'ORG':
        continue  # skip everything that is not an organisation
    print(f"{span.label_.upper():10} - {span.text}")

ORG        - Debit
ORG        - ATM
ORG        - Debit
ORG        - Debit


ORG named entities are also labelled incorrectly: in the output above, "Debit" and "ATM" are tagged as organisations.

In [20]:
# Render the transcript with displaCy's entity style ("ent") so the
# recognised entities can be inspected visually.
displacy.render(doc, style = "ent")

***

### Creating custom NER

In [21]:
# Fetch the pipeline's existing "ner" component (listed in `nlp.pipe_names`).
# NOTE(review): `ner` is not used by any later cell visible here - presumably
# it is meant for the custom-NER training step; confirm.
ner = nlp.get_pipe("ner")

In [None]:
# Training data for the custom NER step.
# Each item is (text, {"entities": [(start, end, label), ...]}) where start/end
# are 0-based character offsets into `text`, end exclusive, so that
# text[start:end] is exactly the annotated entity. Offsets that do not land on
# the entity's boundaries teach the model the wrong span (or are dropped as
# misaligned), so every span below has been checked against its text.
Train_data = [
    ("It is 6102651715", {"entities": [(6, 16, "CARDINAL")]}),
    # Spelled-out surnames: the span covers the whole spelling including the
    # closing period, and both examples use the same PERSON label.
    ("That is A-P-P-L-E-B-A-U-M.", {"entities": [(8, 26, "PERSON")]}),
    ("C-L-A-R-K-S-O-N.", {"entities": [(0, 16, "PERSON")]}),
    ("I ordered this from ShopClues", {"entities": [(20, 29, "ORG")]}),
    # "Amazon" is annotated too; leaving a retailer unlabelled here would
    # contradict the other ORG examples.
    ("Fridge can be ordered in Amazon ", {"entities": [(0, 6, "PRODUCT"), (25, 31, "ORG")]}),
    ("I bought a new Washer", {"entities": [(15, 21, "PRODUCT")]}),
    ("I bought a old table", {"entities": [(15, 20, "PRODUCT")]}),
    ("I bought a fancy dress", {"entities": [(17, 22, "PRODUCT")]}),
    ("I rented a camera", {"entities": [(11, 17, "PRODUCT")]}),
    ("I rented a tent for our trip", {"entities": [(11, 15, "PRODUCT")]}),
    ("I rented a screwdriver from our neighbour", {"entities": [(11, 22, "PRODUCT")]}),
    ("I repaired my computer", {"entities": [(14, 22, "PRODUCT")]}),
    ("I got my clock fixed", {"entities": [(9, 14, "PRODUCT")]}),
    ("I got my truck fixed", {"entities": [(9, 14, "PRODUCT")]}),
    ("Flipkart started it's journey from zero", {"entities": [(0, 8, "ORG")]}),
    ("I recently ordered from Max", {"entities": [(24, 27, "ORG")]}),
    ("Flipkart is recognized as leader in market", {"entities": [(0, 8, "ORG")]}),
    ("I recently ordered from Swiggy", {"entities": [(24, 30, "ORG")]}),
]

***

In [None]:
# Print the token list with every token that belongs to an entity replaced by
# its entity label; tokens outside any entity keep their own text.
masked_tokens = [tok.ent_type_ or tok.text for tok in doc]
print(masked_tokens)