# Detecting PII (Personally Identifiable Information)

**Using NLTK Library**

In [1]:
import json

import pandas as pd

from string import punctuation

import nltk, re
import nltk.corpus
from nltk.corpus import brown
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords, state_union
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag.stanford import StanfordNERTagger
from nltk import ne_chunk, pos_tag
from nltk.tree import Tree
from nltk import RegexpParser
from nltk.chunk.api import ChunkParserI

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(palette='Set2', )
%matplotlib inline

from sklearn.model_selection import train_test_split

# from faker import Faker

In [3]:
text = """Hello, you have called Virtual bank, this is Helen speaking. How may I help you?
Hi Helen. I want to report a lost credit card.
Okay. Do you have your Debit card number?
Oh yes, that is 8574562111234522.
Okay. That is 8574562111234522.
Yes.
What is your identification number?
1145824598874.
Okay, I have 1145824598874. And what is your name sir?
My name is Sakai Jinn.
Okay. Do you want me to permanent suspend your card sir?
Yes, please. 
Okay, and your ledger balance in the account is 56,000 dollars, is that correct?
Yes.
Okay, I just permanent suspended your card. Thank you for using our service. Have a good day sir.
Thank you."""

In [4]:
text

'Hello, you have called Virtual bank, this is Helen speaking. How may I help you?\nHi Helen. I want to report a lost credit card.\nOkay. Do you have your Debit card number?\nOh yes, that is 8574562111234522.\nOkay. That is 8574562111234522.\nYes.\nWhat is your identification number?\n1145824598874.\nOkay, I have 1145824598874. And what is your name sir?\nMy name is Sakai Jinn.\nOkay. Do you want me to permanent suspend your card sir?\nYes, please. \nOkay, and your ledger balance in the account is 56,000 dollars, is that correct?\nYes.\nOkay, I just permanent suspended your card. Thank you for using our service. Have a good day sir.\nThank you.'

In [8]:
# Stop words lists from nltk
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [6]:
stop_words = set(stopwords.words("english")) # Set checking is faster in Python than list.
print(stop_words)

{"don't", "you're", 'again', "she's", 'no', 'can', "needn't", 'couldn', 'only', 'him', 'down', 'in', 'am', 'nor', 'aren', 'against', 'herself', 'didn', 'who', 'their', 'is', 'there', 'y', 'ma', 'a', "aren't", 'and', 'how', 'under', "didn't", 'yours', 'doing', "weren't", 'doesn', "mightn't", "hadn't", 'if', 'himself', 'do', "mustn't", 'theirs', 'm', 'on', 'you', 'itself', 'the', 'during', 'where', 'she', 'such', 'it', 'he', 'by', 've', "doesn't", 'weren', 'them', 'off', 'ourselves', 'me', 'why', 'of', 'before', 'until', 'each', 'hers', 'after', 'its', 'now', 'this', 'needn', "you'll", 'when', 'here', 'for', 's', 'more', 'an', 'are', 'both', 'themselves', 'up', "it's", 'but', 'other', 'll', 'don', 'been', 'yourselves', 'myself', 'through', "should've", 'further', 'does', 'hadn', 'very', 'did', "couldn't", 'because', 'wasn', 'were', 'our', "that'll", 'just', 'mustn', 'about', 'out', 'her', 'hasn', 'from', 'ours', 'then', 'most', 'not', 'any', "you've", "haven't", 'have', 'whom', 'we', 'in

In [9]:
# Combine stop words from nltk and json and puntuation together
# Stopwords from stopwords-json
stopwords_json = {"en": ["a","a's","able","about","above","according","accordingly","across","actually","after",
                         "afterwards","again","against","ain't","all","allow","allows","almost","alone","along",
                         "already","also","although","always","am","among","amongst","an","and","another","any",
                         "anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear",
                         "appreciate","appropriate","are","aren't","around","as","aside","ask","asking","associated",
                         "at","available","away","awfully","b","be","became","because","become","becomes","becoming",
                         "been","before","beforehand","behind","being","believe","below","beside","besides","best",
                         "better","between","beyond","both","brief","but","by","c","c'mon","c's","came","can","can't",
                         "cannot","cant","cause","causes","certain","certainly","changes","clearly","co","com","come",
                         "comes","concerning","consequently","consider","considering","contain","containing","contains",
                         "corresponding","could","couldn't","course","currently","d","definitely","described","despite",
                         "did","didn't","different","do","does","doesn't","doing","don't","done","down","downwards",
                         "during","e","each","edu","eg","eight","either","else","elsewhere","enough","entirely","especially",
                         "et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly",
                         "example","except","f","far","few","fifth","first","five","followed","following","follows","for",
                         "former","formerly","forth","four","from","further","furthermore","g","get","gets","getting","given",
                         "gives","go","goes","going","gone","got","gotten","greetings","h","had","hadn't","happens","hardly",
                         "has","hasn't","have","haven't","having","he","he's","hello","help","hence","her","here","here's",
                         "hereafter","hereby","herein","hereupon","hers","herself","hi","him","himself","his","hither",
                         "hopefully","how","howbeit","however","i","i'd","i'll","i'm","i've","ie","if","ignored","immediate",
                         "in","inasmuch","inc","indeed","indicate","indicated","indicates","inner","insofar","instead","into",
                         "inward","is","isn't","it","it'd","it'll","it's","its","itself","j","just","k","keep","keeps","kept",
                         "know","known","knows","l","last","lately","later","latter","latterly","least","less","lest","let",
                         "let's","like","liked","likely","little","look","looking","looks","ltd","m","mainly","many","may",
                         "maybe","me","mean","meanwhile","merely","might","more","moreover","most","mostly","much","must","my",
                         "myself","n","name","namely","nd","near","nearly","necessary","need","needs","neither","never",
                         "nevertheless","new","next","nine","no","nobody","non","none","noone","nor","normally","not","nothing",
                         "novel","now","nowhere","o","obviously","of","off","often","oh","ok","okay","old","on","once","one",
                         "ones","only","onto","or","other","others","otherwise","ought","our","ours","ourselves","out",
                         "outside","over","overall","own","p","particular","particularly","per","perhaps","placed","please",
                         "plus","possible","presumably","probably","provides","q","que","quite","qv","r","rather","rd","re",
                         "really","reasonably","regarding","regardless","regards","relatively","respectively","right","s",
                         "said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming",
                         "seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","she",
                         "should","shouldn't","since","six","so","some","somebody","somehow","someone","something","sometime",
                         "sometimes","somewhat","somewhere","soon","sorry","specified","specify","specifying","still","sub",
                         "such","sup","sure","t","t's","take","taken","tell","tends","th","than","thank","thanks","thanx",
                         "that","that's","thats","the","their","theirs","them","themselves","then","thence","there","there's",
                         "thereafter","thereby","therefore","therein","theres","thereupon","these","they","they'd","they'll",
                         "they're","they've","think","third","this","thorough","thoroughly","those","though","three","through",
                         "throughout","thru","thus","to","together","too","took","toward","towards","tried","tries","truly",
                         "try","trying","twice","two","u","un","under","unfortunately","unless","unlikely","until","unto","up",
                         "upon","us","use","used","useful","uses","using","usually","uucp","v","value","various","very","via",
                         "viz","vs","w","want","wants","was","wasn't","way","we","we'd","we'll","we're","we've","welcome",
                         "well","went","were","weren't","what","what's","whatever","when","whence","whenever","where","where's",
                         "whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither",
                         "who","who's","whoever","whole","whom","whose","why","will","willing","wish","with","within","without",
                         "won't","wonder","would","wouldn't","x","y","yes","yet","you","you'd","you'll","you're","you've","your",
                         "yours","yourself","yourselves","z","zero"]}

stopwords_json_en = set(stopwords_json['en'])
stopwords_nltk_en = set(stopwords.words('english'))
stopwords_punct = set(punctuation)

# Combine the stopwords.
stoplist_combined = set.union(stopwords_json_en, stopwords_nltk_en, stopwords_punct)

### Creating functions

In [12]:
wnl = WordNetLemmatizer()

# Create new lemmatization function
def penn2morphy(penntag):
    """ Converts Penn Treebank tags to WordNet. """
    morphy_tag = {'NN': 'n', 'JJ': 'a',
                  'VB': 'v', 'RB': 'r'}
    try:
        return morphy_tag[penntag[:2]]
    except:
        return 'n' # if mapping isn't found, fall back to Noun.

In [13]:
def lemmatize_sent(text): 
    # Text input is string, returns lowercased strings.
    return [wnl.lemmatize(word.lower(), pos = penn2morphy(tag)) 
            for word, tag in pos_tag(word_tokenize(text))]

In [18]:
# Creating removing stop words and lemmatize function
def preprocess_text(text):
    return [word for word in lemmatize_sent(text) 
            if word not in stoplist_combined]

In [19]:
# Try lemmatize_sent() and remove stopwords
print('Original Sentence:')
print(list(map(str.lower, word_tokenize(text))), '\n')
print('Lemmatized and removed stopwords:')
print(preprocess_text(text))

Original Sentence:
['hello', ',', 'you', 'have', 'called', 'virtual', 'bank', ',', 'this', 'is', 'helen', 'speaking', '.', 'how', 'may', 'i', 'help', 'you', '?', 'hi', 'helen', '.', 'i', 'want', 'to', 'report', 'a', 'lost', 'credit', 'card', '.', 'okay', '.', 'do', 'you', 'have', 'your', 'debit', 'card', 'number', '?', 'oh', 'yes', ',', 'that', 'is', '8574562111234522', '.', 'okay', '.', 'that', 'is', '8574562111234522', '.', 'yes', '.', 'what', 'is', 'your', 'identification', 'number', '?', '1145824598874', '.', 'okay', ',', 'i', 'have', '1145824598874', '.', 'and', 'what', 'is', 'your', 'name', 'sir', '?', 'my', 'name', 'is', 'sakai', 'jinn', '.', 'okay', '.', 'do', 'you', 'want', 'me', 'to', 'permanent', 'suspend', 'your', 'card', 'sir', '?', 'yes', ',', 'please', '.', 'okay', ',', 'and', 'your', 'ledger', 'balance', 'in', 'the', 'account', 'is', '56,000', 'dollars', ',', 'is', 'that', 'correct', '?', 'yes', '.', 'okay', ',', 'i', 'just', 'permanent', 'suspended', 'your', 'card', '.

In [21]:
# Creating POS tagging function
def postagging_word(text):
    return nltk.pos_tag([word for word in lemmatize_sent(text) 
            if word not in stoplist_combined])

In [23]:
print(postagging_word(text))

[('call', 'VB'), ('virtual', 'JJ'), ('bank', 'NN'), ('helen', 'VBN'), ('speak', 'JJ'), ('helen', 'JJ'), ('report', 'NN'), ('lose', 'VB'), ('credit', 'NN'), ('card', 'NN'), ('debit', 'VBZ'), ('card', 'JJ'), ('number', 'NN'), ('8574562111234522', 'CD'), ('8574562111234522', 'CD'), ('identification', 'NN'), ('number', 'NN'), ('1145824598874', 'CD'), ('1145824598874', 'CD'), ('sir', 'NN'), ('sakai', 'NN'), ('jinni', 'NN'), ('permanent', 'JJ'), ('suspend', 'NN'), ('card', 'NN'), ('sir', 'NN'), ('ledger', 'JJR'), ('balance', 'NN'), ('account', 'NN'), ('56,000', 'CD'), ('dollar', 'NN'), ('correct', 'JJ'), ('permanent', 'JJ'), ('suspend', 'VB'), ('card', 'NN'), ('service', 'NN'), ('good', 'JJ'), ('day', 'NN'), ('sir', 'VB')]


In [40]:
# getting the entity names (NER)
def get_chunks(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    prev = None 
    continuous_chunk = []
    current_chunk = []
    chunk_label = []

    for i in chunked:
        if type(i) == Tree:
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
                current_chunk = []
        else:
            continue

    return continuous_chunk

In [41]:
print(get_chunks(text))

['Hello', 'Virtual', 'Helen', 'Okay', 'Okay Okay', 'Sakai Jinn', 'Okay Okay Okay']
