# Detecting PII (Personally Identifiable Information)

**Using NLTK Library**

Ref: https://www.datacamp.com/community/tutorials/text-analytics-beginners-nltk

In [1]:
import json

import pandas as pd

import nltk, re
import nltk.corpus
from nltk.corpus import brown
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords, state_union
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag.stanford import StanfordNERTagger
from nltk import ne_chunk, pos_tag
from nltk.tree import Tree
from nltk import RegexpParser
from nltk.chunk.api import ChunkParserI

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(palette='Set2', )
%matplotlib inline

from sklearn.model_selection import train_test_split

# from faker import Faker

In [None]:
# with open('D:/DSBA/Year3 Term2/Project/Datasets/conversation.json', 'r') as json_file:
#     f = json_file.read()
# data = json.loads(f)
# data

In [None]:
# for i in data['conversations']:
#    print(i)

***

In [2]:
# Sample call-centre transcript used as the input text for the NLP steps in this notebook.
# Despite the variable name it holds many sentences, one speaker turn per line, and it
# contains person names, account / ID / phone numbers and a location.
sentence = """Hello, you have called Virtual bank, this is Nancy speaking. How may I help you?
Oh, I just had withdrawn some cash from the ATM machine and ATM transaction failed but money got debited. Can you fix this problem?
Sure. What is your account number?
It is 111236669.
Just a moment …. Okay and what is your name ma’am?
My name is Sandra Reed.
Okay, Miss Reed. Can I have your identify number?
Okay. 5589766523663.
Okay. I have 5589766523663.
Correct.
Where is the ATM machine that you had withdraw the cash?
I do not know where exactly it is, but it is in the Pattaya beach.
That is fine, we will check your withdrawal transaction and we will refund the money to your account. Do you want to receive the message when we refunding the money?
Yes, please.
Okay, what is your phone number ma’am?
8779526987.
Okay, I have 8779526987. We will send the message when we refunding the money to your account.
Thanks, Nancy.
Have a good day ma’am. Thank you.
"""

In [3]:
# Echo the raw transcript; the repr makes the embedded "\n" line breaks visible.
sentence

'Hello, you have called Virtual bank, this is Nancy speaking. How may I help you?\nOh, I just had withdrawn some cash from the ATM machine and ATM transaction failed but money got debited. Can you fix this problem?\nSure. What is your account number?\nIt is 111236669.\nJust a moment …. Okay and what is your name ma’am?\nMy name is Sandra Reed.\nOkay, Miss Reed. Can I have your identify number?\nOkay. 5589766523663.\nOkay. I have 5589766523663.\nCorrect.\nWhere is the ATM machine that you had withdraw the cash?\nI do not know where exactly it is, but it is in the Pattaya beach.\nThat is fine, we will check your withdrawal transaction and we will refund the money to your account. Do you want to receive the message when we refunding the money?\nYes, please.\nOkay, what is your phone number ma’am?\n8779526987.\nOkay, I have 8779526987. We will send the message when we refunding the money to your account.\nThanks, Nancy.\nHave a good day ma’am. Thank you.\n'

In [4]:
# Confirm the transcript is a plain Python string before tokenizing it.
type(sentence)

str

***

## Tokenization

    Tokenization is the first step in text analytics. It is the process of breaking a text paragraph down into smaller chunks, such as words or
    sentences. A token is a single entity that serves as a building block of a sentence or paragraph.

**Sentence tokenization:** the process of splitting up strings into “sentences”

In [5]:
# Split the transcript into a list of sentence strings.
# NOTE(review): sent_tokenize needs NLTK's sentence-tokenizer data to be installed;
# no nltk.download(...) call is visible in this notebook — confirm the setup step.
tokenized_sent = sent_tokenize(sentence)

In [6]:
# Inspect the resulting list of sentence strings.
tokenized_sent

['Hello, you have called Virtual bank, this is Nancy speaking.',
 'How may I help you?',
 'Oh, I just had withdrawn some cash from the ATM machine and ATM transaction failed but money got debited.',
 'Can you fix this problem?',
 'Sure.',
 'What is your account number?',
 'It is 111236669.',
 'Just a moment ….',
 'Okay and what is your name ma’am?',
 'My name is Sandra Reed.',
 'Okay, Miss Reed.',
 'Can I have your identify number?',
 'Okay.',
 '5589766523663.',
 'Okay.',
 'I have 5589766523663.',
 'Correct.',
 'Where is the ATM machine that you had withdraw the cash?',
 'I do not know where exactly it is, but it is in the Pattaya beach.',
 'That is fine, we will check your withdrawal transaction and we will refund the money to your account.',
 'Do you want to receive the message when we refunding the money?',
 'Yes, please.',
 'Okay, what is your phone number ma’am?',
 '8779526987.',
 'Okay, I have 8779526987.',
 'We will send the message when we refunding the money to your account.',

**Word Tokenization:** the process of splitting up “sentences” into “words”

In [7]:
# Tokenize each sentence into its own list of word and punctuation tokens,
# keeping one inner list per sentence.
tokenized_word = [word_tokenize(sentence_text) for sentence_text in tokenized_sent]
print(tokenized_word)

[['Hello', ',', 'you', 'have', 'called', 'Virtual', 'bank', ',', 'this', 'is', 'Nancy', 'speaking', '.'], ['How', 'may', 'I', 'help', 'you', '?'], ['Oh', ',', 'I', 'just', 'had', 'withdrawn', 'some', 'cash', 'from', 'the', 'ATM', 'machine', 'and', 'ATM', 'transaction', 'failed', 'but', 'money', 'got', 'debited', '.'], ['Can', 'you', 'fix', 'this', 'problem', '?'], ['Sure', '.'], ['What', 'is', 'your', 'account', 'number', '?'], ['It', 'is', '111236669', '.'], ['Just', 'a', 'moment', '…', '.'], ['Okay', 'and', 'what', 'is', 'your', 'name', 'ma', '’', 'am', '?'], ['My', 'name', 'is', 'Sandra', 'Reed', '.'], ['Okay', ',', 'Miss', 'Reed', '.'], ['Can', 'I', 'have', 'your', 'identify', 'number', '?'], ['Okay', '.'], ['5589766523663', '.'], ['Okay', '.'], ['I', 'have', '5589766523663', '.'], ['Correct', '.'], ['Where', 'is', 'the', 'ATM', 'machine', 'that', 'you', 'had', 'withdraw', 'the', 'cash', '?'], ['I', 'do', 'not', 'know', 'where', 'exactly', 'it', 'is', ',', 'but', 'it', 'is', 'in', 

***

## Lowercasing

In [15]:
# Re-tokenize every sentence and lower-case each token, keeping one list per sentence.
tokenized_word_lower = [
    list(map(str.lower, word_tokenize(sentence_text)))
    for sentence_text in tokenized_sent
]
print(tokenized_word_lower)

[['hello', ',', 'you', 'have', 'called', 'virtual', 'bank', ',', 'this', 'is', 'nancy', 'speaking', '.'], ['how', 'may', 'i', 'help', 'you', '?'], ['oh', ',', 'i', 'just', 'had', 'withdrawn', 'some', 'cash', 'from', 'the', 'atm', 'machine', 'and', 'atm', 'transaction', 'failed', 'but', 'money', 'got', 'debited', '.'], ['can', 'you', 'fix', 'this', 'problem', '?'], ['sure', '.'], ['what', 'is', 'your', 'account', 'number', '?'], ['it', 'is', '111236669', '.'], ['just', 'a', 'moment', '…', '.'], ['okay', 'and', 'what', 'is', 'your', 'name', 'ma', '’', 'am', '?'], ['my', 'name', 'is', 'sandra', 'reed', '.'], ['okay', ',', 'miss', 'reed', '.'], ['can', 'i', 'have', 'your', 'identify', 'number', '?'], ['okay', '.'], ['5589766523663', '.'], ['okay', '.'], ['i', 'have', '5589766523663', '.'], ['correct', '.'], ['where', 'is', 'the', 'atm', 'machine', 'that', 'you', 'had', 'withdraw', 'the', 'cash', '?'], ['i', 'do', 'not', 'know', 'where', 'exactly', 'it', 'is', ',', 'but', 'it', 'is', 'in', 

## Frequency Distribution

In [10]:
# Distinct words
# FreqDist counts hashable samples, so feed it a flat stream of tokens rather than the
# list of per-sentence lists (lists are unhashable and raise TypeError).
fdist = FreqDist(
    token for sent_tokens in tokenized_word_lower for token in sent_tokens
)

In [None]:
# Print the string form of the frequency distribution.
print(fdist)

In [None]:
# Build the distribution from scratch so re-running the cell does not double-count.
# `tokenized_word` is a list of per-sentence token lists, so count the tokens inside
# each sentence, not the sentence lists themselves (a list has no .lower()).
fdist = FreqDist()
for sent_tokens in tokenized_word:
    fdist.update(token.lower() for token in sent_tokens)
fdist

In [None]:
# Number of distinct tokens (keys) held in the frequency distribution.
len(fdist)

In [None]:
# Every token with its count, ordered from most to least frequent.
fdist.most_common()

In [None]:
# Print the full (token, count) ranking as plain text.
print(fdist.most_common())

In [None]:
# Frequency Distribution Plot
# Open a wide 20x7 figure for the plot.
plt.figure(figsize = (20,7))
# Plot the 30 most frequent tokens as plain (non-cumulative) counts.
fdist.plot(30, cumulative = False)
plt.show()

***

## Stop words

    Stop words are considered noise in the text. Text may contain stop words such as is, am, are, this, a, an, the, etc.
    
    To remove stop words with NLTK, you create a list of stop words and filter those words out of your list of tokens.

In [None]:
# Load NLTK's English stop-word list into a set, which is used for membership tests
# when filtering tokens.
# NOTE(review): requires the NLTK "stopwords" corpus to be installed; no
# nltk.download(...) call is visible in this notebook — confirm the setup step.
stop_words = set(stopwords.words("english"))
print(stop_words)

**Removing Stopwords**

In [None]:
# Flatten the per-sentence token lists into one token stream for the whole transcript.
tokens_text = [token for sent_tokens in tokenized_word for token in sent_tokens]

# NLTK's English stop words are lower-case, so compare case-insensitively; otherwise
# capitalised stop words such as "It" or "What" would slip through the filter.
filtered_sent = [w for w in tokens_text if w.lower() not in stop_words]

print("Tokenized Sentence: ", tokens_text)
print("")
print("Filterd Sentence: ", filtered_sent)

***

## Lexicon Normalization

    Lexicon normalization handles another type of noise in the text. For example, the words connection, connected, and connecting all reduce to the common word "connect". It reduces derivationally related forms of a word to a common root word.

**Lemmatization:** reduces words to their base form, which is a linguistically correct lemma. It finds the root word with the use of vocabulary and morphological analysis. Lemmatization is usually more sophisticated than stemming: a stemmer works on an individual word without knowledge of the context. For example, the word "better" has "good" as its lemma. Stemming misses this because finding it requires a dictionary look-up.

In [None]:
lem = WordNetLemmatizer()

# Lower-case each remaining token and lemmatize it as a verb ("v").
lemma_words = [lem.lemmatize(token.lower(), "v") for token in filtered_sent]

print("Filtered Sentence: ", filtered_sent)
print("")
print("Lemmatized Sentence: ", lemma_words)

In [None]:
# Lexicon normalization
# Compare lemmatization and stemming on a single example word.
lem = WordNetLemmatizer()
stem = PorterStemmer()

word = "spoken"
# "v" tells the lemmatizer to treat the word as a verb.
print("Lemmatized Word: ", lem.lemmatize(word, "v"))
print("Stemmed Word: ", stem.stem(word))

***

## POS Tagging

    The primary goal of Part-of-Speech (POS) tagging is to identify the grammatical group of a given word (NOUN, PRONOUN, ADJECTIVE, VERB,
    ADVERB, etc.) based on its context. POS tagging looks for relationships within the sentence and assigns a corresponding tag
    to each word.
    
See the list of POS tags at https://pythonprogramming.net/natural-language-toolkit-nltk-part-speech-tagging/

In [None]:
# Tag each lemma with its part of speech.
# NOTE(review): lemma_words is lower-cased and has stop words removed, so the tagger
# sees neither capitalisation nor the original sentence context — confirm this is the
# intended input rather than the original sentence tokens.
nltk.pos_tag(lemma_words)

***

In [None]:
def get_continuous_chunks(text):
    """Return the distinct named entities found in ``text``.

    The text is word-tokenized, POS-tagged and passed through ``ne_chunk``.
    Named-entity subtrees that sit directly next to each other are merged into
    one entity string (their tokens joined by single spaces).

    Parameters
    ----------
    text : str
        Raw text to scan.

    Returns
    -------
    list of str
        Entity strings in order of first appearance, without duplicates.
    """
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []

    # The trailing None is a sentinel: it is not a Tree, so an entity that ends the
    # text is flushed by the same branch that handles ordinary non-entity tokens.
    for node in list(chunked) + [None]:
        if isinstance(node, Tree):
            current_chunk.append(" ".join(token for token, _pos in node.leaves()))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            # Always start a fresh run, even when the entity was already recorded;
            # otherwise a repeated entity would be glued onto the next one.
            current_chunk = []

    return continuous_chunk

# Extract and print the named entities found in the whole call transcript.
print(get_continuous_chunks(sentence))

GPE stands for geo-political entity (for example, a city, state, or country).

In [None]:
# Print every named entity with its label (e.g. PERSON, GPE), sentence by sentence.
for sent in nltk.sent_tokenize(sentence):
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sent))
    for chunk in nltk.ne_chunk(tagged_tokens):
        # Plain (token, tag) tuples have no label; only entity subtrees do.
        if not hasattr(chunk, 'label'):
            continue
        entity_text = ' '.join(c[0] for c in chunk)
        print(chunk.label(), entity_text)

In [None]:
def ie_preprocess(document):
    """Segment, tokenize and POS-tag a document.

    Parameters
    ----------
    document : str
        Raw text to preprocess.

    Returns
    -------
    list
        One entry per sentence, each being the result of ``nltk.pos_tag`` on
        that sentence's word tokens.
    """
    return [
        nltk.pos_tag(nltk.word_tokenize(sentence_text))  # tokenize, then tag
        for sentence_text in nltk.sent_tokenize(document)  # sentence segmenter
    ]

***
##### Test Kaggle