In [1]:
import re

import pandas as pd

import spacy
from spacy import displacy

In [2]:
# Load the small English pipeline by its installed package name instead of a
# machine-specific site-packages path, so the notebook runs on any machine
# where the `en_core_web_sm` package is installed.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Show the names of the processing components in the loaded pipeline.
nlp.pipe_names

['tagger', 'parser', 'ner']

In [4]:
# Load the conversation transcripts (one conversation per row in `Sentence`).
# NOTE(review): absolute, machine-specific path — it will not resolve on
# another machine; consider a configurable data directory.
conv = pd.read_csv('D:/DSBA/Project/Final-Project-2/data/Text files/text.csv')

In [5]:
# Preview the first rows of the conversation table.
conv.head()

Unnamed: 0,Sentence
0,"Hello, you have called Virtual bank, this is N..."
1,"Hello, you have called Virtual bank, this is L..."
2,"Hello, you have called Virtual bank, this is M..."
3,"Hello, you have called Virtual bank, this is H..."
4,"Hello, you have called Virtual bank, this is L..."


In [6]:
# Load the text samples that come with PII annotations.
# NOTE(review): absolute, machine-specific path — it will not resolve on
# another machine; consider a configurable data directory.
gen_pii = pd.read_csv('D:/DSBA/Project/Final-Project-2/data/Text files/train_text_with_pii_2020_10_18_16_36_29_513505.csv')

In [7]:
# Preview the first rows of the PII sample table.
gen_pii.head()

Unnamed: 0,Text,Labels,PII
0,Suite 426 Produce education hand statement. St...,Address,Suite 426
1,252 Kelly Camp Imagine food analysis so. Reall...,Address,252 Kelly Camp
2,Education poor interview society on nice simpl...,Address,"9202 Jennifer Valleys Suite 890 Port Sara, ID ..."
3,Practice enough Apt. 480 learn instead read ro...,Address,Apt. 480
4,Cause example so serious mention. Reflect Amer...,Address,Suite 072


In [6]:
# Data cleaning
def clean_text(text):
    """Normalise a piece of text before it is passed to the NLP pipeline.

    Every "." is replaced by a single space, every "," is removed, and the
    result is lower-cased. Line breaks and all other characters are kept.

    Args:
        text (str): Raw text to clean.

    Returns:
        str: The cleaned, lower-cased text.
    """
    # Periods become spaces (so "a.b" turns into "a b" rather than fusing into
    # "ab"); commas are simply dropped.
    # No separate end-of-line period stripping is needed: every period has
    # already been replaced at this point, so such a pass could never match.
    return text.replace(".", " ").replace(",", "").lower()

In [29]:
# Try the cleaner on the first conversation and inspect the result.
clean_text(conv['Sentence'][0])

'hello you have called virtual bank this is nancy speaking  how may i help you?\noh i just had withdrawn some cash from the atm machine and atm transaction failed but money got debited  can you fix this problem?\nsure  what is your account number?\nit is 111236669 \njust a moment …  okay and what is your name ma’am?\nmy name is sandra reed \nokay miss reed  can i have your identify number?\nokay  5589766523663 \nokay  i have 5589766523663 \ncorrect \nwhere is the atm machine that you had withdrawn the cash?\ni do not know where exactly it is but it is in the pattaya beach \nthat is fine we will check your withdrawal transaction and we will refund the money to your account  do you want to receive the message when we refunding the money?\nyes please \nokay what is your phone number ma’am?\n8779526987 \nokay i have 8779526987  we will send the message when we refund the money to your account \nthanks nancy \nhave a good day ma’am  thank you '

In [8]:
# Run the pipeline on one PII sample and print each detected entity as
# "LABEL      - text" (label upper-cased and padded to 10 characters).
for ent in nlp(gen_pii['Text'][4]).ents:
    print(ent.label_.upper().ljust(10), '-', ent.text)

NORP       - American
CARDINAL   - 072


In [30]:
# Clean the first conversation and run it through the pipeline.
inp_text = clean_text(conv['Sentence'][0])
doc = nlp(inp_text)

# Keep tokens that are not part of any entity verbatim; replace every token
# that carries an entity type with the fixed placeholder 'xxxx'.
new_tokens = ['xxxx' if token.ent_type_ else token.text for token in doc]

# Re-assemble the masked tokens into a single space-separated string.
new_text = ' '.join(new_tokens)
print("new Text->", new_text)

new Text-> hello you have called virtual bank this is nancy speaking   how may i help you ? 
 oh i just had withdrawn some cash from the xxxx xxxx and xxxx transaction failed but money got debited   can you fix this problem ? 
 sure   what is your account number ? 
 it is xxxx 
 just a moment …   okay and what is your name ma’am ? 
 my name is xxxx xxxx 
 okay miss reed   can i have your identify number ? 
 okay   xxxx 
 okay   i have xxxx 
 correct 
 where is the xxxx machine that you had withdrawn the cash ? 
 i do not know where exactly it is but it is in the xxxx xxxx 
 that is fine we will check your withdrawal transaction and we will refund the money to your account   do you want to receive the message when we refunding the money ? 
 yes please 
 okay what is your phone number ma’am ? 
 xxxx 
 okay i have xxxx   we will send the message when we refund the money to your account 
 thanks nancy 
 have xxxx xxxx xxxx ma’am   thank you


In [None]:
# Pick the second conversation as the working example and display it.
sent = conv['Sentence'][1]
sent

In [36]:
from tqdm import tqdm
import numpy as np
from gensim.models import Word2Vec
from gensim.models import FastText
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

### Sentence Segmentation

In [None]:
# Run the pipeline on the selected conversation.
doc = nlp(sent)

In [None]:
# Print each sentence that spaCy segments out of the conversation.
# The loop variable is deliberately not called `sent`: that name holds the raw
# conversation string passed to nlp(), and rebinding it to a Span here would
# break a re-run of the cell that builds `doc`.
for sentence in doc.sents:
    print(sentence)

### Predefined named entities

In [None]:
# List every entity found in the conversation as "LABEL      - text"
# (label upper-cased and padded to 10 characters).
for ent in doc.ents:
    print(ent.label_.upper().ljust(10), '-', ent.text)

In [None]:
# Print spaCy's own description of each entity label of interest.
# The left-hand prefixes are emitted exactly as written, padding included.
for prefix, label in [
    ('PERSON - ', 'PERSON'),
    ('GPE    - ', 'GPE'),
    ('DATE   - ', 'DATE'),
    ('MONEY  - ', 'MONEY'),
    ('ORG  - ', 'ORG'),
    ('CARDINAL  - ', 'CARDINAL'),
]:
    print(f'{prefix}{spacy.explain(label)}')

More predefined named entities are available in spaCy: https://spacy.io/api/annotation#named-entities

In [None]:
# Show only the entities whose label is PERSON.

for ent in doc.ents:
    if ent.label_.upper() != 'PERSON':
        continue
    print(f'{ent.label_.upper():<10} - {ent.text}')

In [None]:
# Show only the entities whose label is ORG.
for ent in doc.ents:
    if ent.label_.upper() != 'ORG':
        continue
    print(f'{ent.label_.upper():<10} - {ent.text}')

ORG named entities are also labelled incorrectly.

In [None]:
# Visualise the detected entities inline using displaCy's "ent" style.
displacy.render(doc, style = "ent")

***

### Creating custom NER

In [None]:
# Fetch the named-entity-recognition component from the loaded pipeline.
ner = nlp.get_pipe("ner")

In [None]:
# Training data for the custom NER step.
# Each item is (text, {"entities": [(start_char, end_char, label), ...]}).
# Offsets are 0-based character positions with an exclusive end, so
# text[start_char:end_char] must be exactly the entity's surface string
# (no leading/trailing spaces or punctuation, never past the end of the text).
Train_data = [
    ("It is 6102651715", {"entities": [(6, 16, "CARDINAL")]}),
    # Spelled-out surnames: the span covers all letters and hyphens but not
    # the final period.
    ("That is A-P-P-L-E-B-A-U-M.", {"entities": [(8, 25, "PERSON")]}),
    # NOTE(review): labelled PERSON to match the spelled-out name above; the
    # previous PRODUCT label looked like a copy-paste slip — confirm.
    ("C-L-A-R-K-S-O-N.", {"entities": [(0, 15, "PERSON")]}),
    ("I ordered this from ShopClues", {"entities": [(20, 29, "ORG")]}),
    ("Fridge can be ordered in Amazon ", {"entities": [(0, 6, "PRODUCT")]}),
    ("I bought a new Washer", {"entities": [(15, 21, "PRODUCT")]}),
    ("I bought a old table", {"entities": [(15, 20, "PRODUCT")]}),
    ("I bought a fancy dress", {"entities": [(17, 22, "PRODUCT")]}),
    ("I rented a camera", {"entities": [(11, 17, "PRODUCT")]}),
    ("I rented a tent for our trip", {"entities": [(11, 15, "PRODUCT")]}),
    ("I rented a screwdriver from our neighbour", {"entities": [(11, 22, "PRODUCT")]}),
    ("I repaired my computer", {"entities": [(14, 22, "PRODUCT")]}),
    ("I got my clock fixed", {"entities": [(9, 14, "PRODUCT")]}),
    ("I got my truck fixed", {"entities": [(9, 14, "PRODUCT")]}),
    ("Flipkart started it's journey from zero", {"entities": [(0, 8, "ORG")]}),
    ("I recently ordered from Max", {"entities": [(24, 27, "ORG")]}),
    ("Flipkart is recognized as leader in market", {"entities": [(0, 8, "ORG")]}),
    ("I recently ordered from Swiggy", {"entities": [(24, 30, "ORG")]}),
]

***

In [7]:
# Clean the first conversation and display the result.
text = clean_text(conv['Sentence'][0])
text

'hello you have called virtual bank this is nancy speaking  how may i help you?\noh i just had withdrawn some cash from the atm machine and atm transaction failed but money got debited  can you fix this problem?\nsure  what is your account number?\nit is 111236669 \njust a moment …  okay and what is your name ma’am?\nmy name is sandra reed \nokay miss reed  can i have your identify number?\nokay  5589766523663 \nokay  i have 5589766523663 \ncorrect \nwhere is the atm machine that you had withdrawn the cash?\ni do not know where exactly it is but it is in the pattaya beach \nthat is fine we will check your withdrawal transaction and we will refund the money to your account  do you want to receive the message when we refunding the money?\nyes please \nokay what is your phone number ma’am?\n8779526987 \nokay i have 8779526987  we will send the message when we refund the money to your account \nthanks nancy \nhave a good day ma’am  thank you '

In [10]:
import spacy
from pprint import pprint
from spacy import displacy
from collections import Counter
import en_core_web_sm
# (Re)load the small English pipeline through its package; this rebinds `nlp`.
nlp = en_core_web_sm.load()

In [11]:
# Run the pipeline on the cleaned conversation and pretty-print each detected
# entity as an (entity text, label) pair.
doc = nlp(text)
pprint([(ent.text, ent.label_) for ent in doc.ents])

[('atm machine', 'ORG'),
 ('atm', 'ORG'),
 ('111236669', 'DATE'),
 ('sandra reed', 'PERSON'),
 ('5589766523663', 'DATE'),
 ('5589766523663', 'DATE'),
 ('atm', 'ORG'),
 ('pattaya beach', 'GPE'),
 ('8779526987', 'CARDINAL'),
 ('8779526987', 'CARDINAL'),
 ('a good day', 'DATE')]


In [13]:
# Token-level view: (token, IOB tag, entity type) for every token in the doc.
pprint([(tok, tok.ent_iob_, tok.ent_type_) for tok in doc])

[(hello, 'O', ''),
 (you, 'O', ''),
 (have, 'O', ''),
 (called, 'O', ''),
 (virtual, 'O', ''),
 (bank, 'O', ''),
 (this, 'O', ''),
 (is, 'O', ''),
 (nancy, 'O', ''),
 (speaking, 'O', ''),
 ( , 'O', ''),
 (how, 'O', ''),
 (may, 'O', ''),
 (i, 'O', ''),
 (help, 'O', ''),
 (you, 'O', ''),
 (?, 'O', ''),
 (
, 'O', ''),
 (oh, 'O', ''),
 (i, 'O', ''),
 (just, 'O', ''),
 (had, 'O', ''),
 (withdrawn, 'O', ''),
 (some, 'O', ''),
 (cash, 'O', ''),
 (from, 'O', ''),
 (the, 'O', ''),
 (atm, 'B', 'ORG'),
 (machine, 'I', 'ORG'),
 (and, 'O', ''),
 (atm, 'B', 'ORG'),
 (transaction, 'O', ''),
 (failed, 'O', ''),
 (but, 'O', ''),
 (money, 'O', ''),
 (got, 'O', ''),
 (debited, 'O', ''),
 ( , 'O', ''),
 (can, 'O', ''),
 (you, 'O', ''),
 (fix, 'O', ''),
 (this, 'O', ''),
 (problem, 'O', ''),
 (?, 'O', ''),
 (
, 'O', ''),
 (sure, 'O', ''),
 ( , 'O', ''),
 (what, 'O', ''),
 (is, 'O', ''),
 (your, 'O', ''),
 (account, 'O', ''),
 (number, 'O', ''),
 (?, 'O', ''),
 (
, 'O', ''),
 (it, 'O', ''),
 (is, 'O', ''),


In [14]:
# Collect the surface text of every detected entity, then show the three
# most frequent ones with their counts.
items = [ent.text for ent in doc.ents]
Counter(items).most_common(3)

[('atm', 2), ('5589766523663', 2), ('8779526987', 2)]

In [24]:
# Skip stop words and punctuation, then show each remaining token's verbatim
# text, part-of-speech tag and lemma.
[
    (tok.orth_, tok.pos_, tok.lemma_)
    for tok in nlp(text)
    if not tok.is_stop and tok.pos_ != 'PUNCT'
]

[('hello', 'INTJ', 'hello'),
 ('called', 'VERB', 'call'),
 ('virtual', 'ADJ', 'virtual'),
 ('bank', 'NOUN', 'bank'),
 ('nancy', 'NOUN', 'nancy'),
 ('speaking', 'VERB', 'speak'),
 (' ', 'SPACE', ' '),
 ('help', 'VERB', 'help'),
 ('\n', 'SPACE', '\n'),
 ('oh', 'INTJ', 'oh'),
 ('withdrawn', 'VERB', 'withdraw'),
 ('cash', 'NOUN', 'cash'),
 ('atm', 'NOUN', 'atm'),
 ('machine', 'NOUN', 'machine'),
 ('atm', 'NOUN', 'atm'),
 ('transaction', 'NOUN', 'transaction'),
 ('failed', 'VERB', 'fail'),
 ('money', 'NOUN', 'money'),
 ('got', 'VERB', 'get'),
 ('debited', 'ADJ', 'debited'),
 (' ', 'SPACE', ' '),
 ('fix', 'VERB', 'fix'),
 ('problem', 'NOUN', 'problem'),
 ('\n', 'SPACE', '\n'),
 ('sure', 'INTJ', 'sure'),
 (' ', 'SPACE', ' '),
 ('account', 'NOUN', 'account'),
 ('number', 'NOUN', 'number'),
 ('\n', 'SPACE', '\n'),
 ('111236669', 'NUM', '111236669'),
 ('\n', 'SPACE', '\n'),
 ('moment', 'NOUN', 'moment'),
 (' ', 'SPACE', ' '),
 ('okay', 'INTJ', 'okay'),
 ('ma’am', 'PROPN', 'madam'),
 ('\n', 'SPAC

In [20]:
# Map each distinct entity string to its label (a repeated entity string keeps
# the label of its last occurrence).
{str(ent): ent.label_ for ent in nlp(text).ents}

{'atm machine': 'ORG',
 'atm': 'ORG',
 '111236669': 'DATE',
 'sandra reed': 'PERSON',
 '5589766523663': 'DATE',
 'pattaya beach': 'GPE',
 '8779526987': 'CARDINAL',
 'a good day': 'DATE'}

In [23]:
# Pretty-print (token, IOB tag, entity type) for every token in the doc.
pprint([(tok, tok.ent_iob_, tok.ent_type_) for tok in doc])

[(hello, 'O', ''),
 (you, 'O', ''),
 (have, 'O', ''),
 (called, 'O', ''),
 (virtual, 'O', ''),
 (bank, 'O', ''),
 (this, 'O', ''),
 (is, 'O', ''),
 (nancy, 'O', ''),
 (speaking, 'O', ''),
 ( , 'O', ''),
 (how, 'O', ''),
 (may, 'O', ''),
 (i, 'O', ''),
 (help, 'O', ''),
 (you, 'O', ''),
 (?, 'O', ''),
 (
, 'O', ''),
 (oh, 'O', ''),
 (i, 'O', ''),
 (just, 'O', ''),
 (had, 'O', ''),
 (withdrawn, 'O', ''),
 (some, 'O', ''),
 (cash, 'O', ''),
 (from, 'O', ''),
 (the, 'O', ''),
 (atm, 'B', 'ORG'),
 (machine, 'I', 'ORG'),
 (and, 'O', ''),
 (atm, 'B', 'ORG'),
 (transaction, 'O', ''),
 (failed, 'O', ''),
 (but, 'O', ''),
 (money, 'O', ''),
 (got, 'O', ''),
 (debited, 'O', ''),
 ( , 'O', ''),
 (can, 'O', ''),
 (you, 'O', ''),
 (fix, 'O', ''),
 (this, 'O', ''),
 (problem, 'O', ''),
 (?, 'O', ''),
 (
, 'O', ''),
 (sure, 'O', ''),
 ( , 'O', ''),
 (what, 'O', ''),
 (is, 'O', ''),
 (your, 'O', ''),
 (account, 'O', ''),
 (number, 'O', ''),
 (?, 'O', ''),
 (
, 'O', ''),
 (it, 'O', ''),
 (is, 'O', ''),
