In [16]:
import re
import nltk
import json
import cPickle as pickle

In [5]:
def find_emails(text):
    """Return the e-mail addresses found in ``text`` [tag: EMAIL].

    Args:
        text: The string to scan.

    Returns:
        A list of ``("EMAIL", address)`` tuples, in order of appearance.
        The list is empty when nothing matches.
    """
    # Raw string so that ``\.`` reaches the regex engine as an escaped dot.
    # The top-level domain is "two or more letters": a fixed upper bound would
    # silently cut long TLDs short (e.g. "x@y.museum" -> "x@y.muse").
    emails_re = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
    return [("EMAIL", email) for email in emails_re.findall(text)]

In [6]:
def find_phoneNumber(text, regex=None):
    """Return the phone numbers found in ``text`` [tag: PHONE].

    Args:
        text: The string to scan.
        regex: Optional pattern to use instead of the default. It is compiled
            with ``re.VERBOSE`` and applied with ``findall``, so a pattern
            with exactly one capturing group yields plain strings. A falsy
            value (``None`` or ``""``) selects the default US pattern.

    Returns:
        A list of ``("PHONE", match)`` tuples, in order of appearance.
    """
    if not regex:
        # Default: 10-digit US number. Separators are limited to a few
        # whitespace/punctuation characters; allowing *any* run of non-digits
        # would let the pattern stretch across ordinary prose such as
        # "123 apples, 456 pears and 7890 plums".
        regex = r'''(\b
                        \d{3}             # area code is 3 digits (e.g. '800')
                        [\s.()\-]{0,3}    # optional short separator
                        \d{3}             # trunk is 3 digits (e.g. '555')
                        [\s.\-]{0,3}      # optional short separator
                        \d{4}\b           # rest of number is 4 digits (e.g. '1212')
                        )'''

    phone_re = re.compile(regex, re.VERBOSE)
    return [("PHONE", phone) for phone in phone_re.findall(text)]

In [7]:
def find_urls(text):
    """Return the URLs found in ``text`` [tag: URL].

    A URL is recognised when it starts with a ``scheme:`` prefix, with
    ``www.`` (optionally ``www1.`` .. ``www999.``), or with a domain-like
    name followed by a slash. Trailing sentence punctuation is left out of
    the match; balanced parentheses (up to two levels) are kept.

    Args:
        text: The string to scan.

    Returns:
        A list of ``("URL", url)`` tuples, in order of appearance.
    """
    # The repeated groups consume ONE character (or one parenthesised group)
    # per iteration. Repeating a "+"-quantified class inside another "+"/"*"
    # gives the engine exponentially many ways to split the same text, which
    # hangs on inputs such as "www." followed by a long run of "!".
    url_regex = r'''
        \b
        (                                       # group 1: the whole URL
          (?:
            [a-z][\w-]+:                        # scheme and colon
            (?:
              /{1,3}                            # 1-3 slashes
              |
              [a-z0-9%]                         # or a letter, digit or %
            )
            |
            www\d{0,3}[.]                       # www., www1., www2. ...
            |
            [a-z0-9.\-]+[.][a-z]{2,4}/          # domain-like name plus slash
          )
          (?:                                   # one or more of:
            [^\s()<>]                           # a non-space, non-bracket char
            |
            \((?:[^\s()<>]|\([^\s()<>]+\))*\)   # balanced parentheses
          )+
          (?:                                   # must end with:
            \((?:[^\s()<>]|\([^\s()<>]+\))*\)   # balanced parentheses
            |
            [^\s`!()\[\]{};:'".,<>?«»“”‘’]
          )
        )'''

    # Flags are passed explicitly rather than inline, because inline global
    # flags are only accepted at the very start of a pattern.
    url_re = re.compile(url_regex, re.VERBOSE | re.IGNORECASE)
    return [("URL", url) for url in url_re.findall(text)]

In [8]:
def find_singaporeID(text):
    """Return tokens that look like a Singapore national ID.

    A token qualifies when it is at least 8 characters long, starts with
    'G' or 'S', ends with 'N', 'S' or 'R', and contains at least one digit.
    Tokens come from NLTK sentence and word tokenisation.

    Returns:
        A list of ``("Singapore ID", token)`` tuples, in order of appearance.
    """
    id_prefixes = ('G', 'S')
    id_suffixes = ('N', 'S', 'R')
    matches = []
    for sentence in nltk.sent_tokenize(text):
        for token in nltk.word_tokenize(sentence):
            if len(token) < 8:
                continue
            if not token.startswith(id_prefixes):
                continue
            if not token.endswith(id_suffixes):
                continue
            if any(ch.isdigit() for ch in token):
                matches.append(("Singapore ID", token))
    return matches

In [9]:
def find_ids(text):
    """Return tokens that might be IDs, judged only by length and digits.

    A token qualifies when it is 4 to 8 characters long and contains at
    least one digit. This is a rough heuristic and needs work to be accurate.

    Returns:
        A list of ``("ID", token)`` tuples, in order of appearance.
    """
    return [
        ("ID", token)
        for sentence in nltk.sent_tokenize(text)
        for token in nltk.word_tokenize(sentence)
        if 4 <= len(token) <= 8 and any(ch.isdigit() for ch in token)
    ]

In [10]:
def find_twitterID(text):
    """Return the Twitter usernames found in ``text`` [tag: TWITTER].

    A username is '@' followed by 1-15 word characters, not directly
    preceded by '@' or a word character (so the '@domain' part of an e-mail
    address is not reported).

    Args:
        text: The string to scan.

    Returns:
        A list of ``("TWITTER", "@name")`` tuples, in order of appearance.
    """
    # A look-behind replaces the alternation "^|[^@\w](...)". That alternation
    # parsed as "start of text" OR "the rest", so a handle at the very start
    # of the text only ever produced an empty match and was dropped.
    twitter_re = re.compile(r'(?<![@\w])(@\w{1,15})\b')
    return [("TWITTER", handle) for handle in twitter_re.findall(text)]

In [11]:
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk import Tree

def get_chunks(text, label):
    """Return the distinct named entities in ``text`` that carry ``label``.

    The text is tokenised, POS-tagged and passed through ``nltk.ne_chunk``.
    Adjacent subtrees with the requested label are joined with a space into
    a single entity.

    Args:
        text: The string to analyse.
        label: The chunk label to collect (the callers in this file pass 'GPE').

    Returns:
        A list of entity strings in order of first appearance, without
        duplicates.
    """
    chunked = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)))
    entities = []
    pending = []  # pieces of the entity currently being assembled

    for subtree in chunked:
        if isinstance(subtree, nltk.Tree) and subtree.label() == label:
            pending.append(" ".join(token for token, _pos in subtree.leaves()))
            continue
        if pending:
            entity = " ".join(pending)
            if entity not in entities:
                entities.append(entity)
            # Always start afresh, including when the entity was a duplicate;
            # otherwise its words would be glued onto the next entity.
            pending = []

    # An entity that runs up to the end of the text has no following token
    # to trigger the flush above, so emit it here.
    if pending:
        entity = " ".join(pending)
        if entity not in entities:
            entities.append(entity)

    return entities




In [12]:
def find_location(text,label='GPE'):
    """Return the named entities with ``label`` found in ``text``.

    Entities come from ``get_chunks``; each one is returned as a
    ``('Location', entity)`` tuple.
    """
    tagged = []
    for place in get_chunks(text, label):
        tagged.append(('Location', place))
    return tagged

In [13]:
def find_creditCard(text):
    """Return candidate credit-card numbers found in ``text``.

    A candidate is a run of 13 to 16 digits in which single digits may be
    separated by spaces or dashes. No checksum validation is performed.

    Args:
        text: The string to scan.

    Returns:
        A list of ``(tag, matched_text)`` tuples, in order of appearance.
        ``tag`` is "CreditCard-AMEX" for numbers with exactly 15 digits and
        "CreditCard" otherwise; ``matched_text`` keeps its separators.
    """
    cc_re = re.compile(r'''(\b(?:\d[ -]*?){13,16}\b)''', re.VERBOSE)
    cc_list = []
    for cc_info in cc_re.findall(text):
        # Count digits only: the matched text may contain spaces and dashes,
        # so its raw length says nothing about how long the number is.
        digit_count = sum(1 for ch in cc_info if ch.isdigit())
        tag = "CreditCard-AMEX" if digit_count == 15 else "CreditCard"
        cc_list.append((tag, cc_info))
    return cc_list

In [14]:
def find_sensitive_data(text, **kwargs):
    """Run every detector on ``text`` and return the combined findings.

    Results are concatenated in a fixed order: Singapore IDs, Twitter
    handles, e-mail addresses, URLs, phone numbers, locations and
    credit-card numbers. Extra keyword arguments are accepted but not used.
    """
    detectors = (
        find_singaporeID,
        find_twitterID,
        find_emails,
        find_urls,
        find_phoneNumber,
        find_location,
        find_creditCard,
    )
    findings = []
    for detect in detectors:
        findings = findings + detect(text)
    return findings




In [24]:
# List of files to be processed (each name is looked up under 'files/')
arr_file_list = ["output_10_oliver_twist.txt",
                 "output_03_hamlet.txt",
                 "output_02_romeo_juliet.txt",
                ]

v_doc_count = 0


def _record(found, seen):
    """Add ``str(found)`` to the set ``seen`` and return 1 if ``found`` is non-empty, else 0."""
    if len(found) > 0:
        seen.add(str(found))
        return 1
    return 0


# For each input file:
# 1. read it line by line
# 2. run every detector ONCE per line (find_location runs the NLTK chunker,
#    so a second call per line is pure overhead)
# 3. append a per-file summary to the run log
#
# The `with` blocks close the log and the input file even if a detector or
# an open() call raises.
with open('run_log.txt','w') as f_run_log:
    for _file in arr_file_list:

        v_doc_count += 1

        v_file_in = 'files/'+_file
        v_file_out = 'files/output_'+_file  # computed but not used in this cell

        # Per-file collectors. Each set stores the string form of one line's
        # result list; each counter is the number of LINES with at least one
        # match (not the number of individual matches).
        s_singaporeID = set([])
        s_twitterID = set([])
        s_emails = set([])
        s_urls = set([])
        s_phoneNumber = set([])
        s_location = set([])
        s_creditCard = set([])
        v_line_count = 0
        v_singaporeID_count = 0
        v_twitterID_count = 0
        v_emails_count = 0
        v_urls_count = 0
        v_phoneNumber_count = 0
        v_location_count = 0
        v_creditCard_count = 0

        with open(v_file_in,'r') as f_sens:
            for line_raw in f_sens:
                v_line_count += 1
                # NOTE(review): kept as in the original. On Python 2 (implied by
                # the cPickle import in this notebook) .encode() on a byte string
                # first decodes it as ASCII and fails on non-ASCII input; on
                # Python 3 it yields bytes, which the str patterns reject.
                line = line_raw.encode("utf-8")
                v_singaporeID_count += _record(find_singaporeID(line), s_singaporeID)
                v_twitterID_count += _record(find_twitterID(line), s_twitterID)
                v_emails_count += _record(find_emails(line), s_emails)
                v_urls_count += _record(find_urls(line), s_urls)
                v_phoneNumber_count += _record(find_phoneNumber(line), s_phoneNumber)
                v_location_count += _record(find_location(line), s_location)
                v_creditCard_count += _record(find_creditCard(line), s_creditCard)

        f_run_log.write("Document number being processed : " + str(v_doc_count))
        f_run_log.write("\n\nTotal number of lines in the file : " + str(v_line_count))
        f_run_log.write("\n\nTotal number occurrences of Singaporean IDs in the file : " + str(v_singaporeID_count))
        f_run_log.write(("====> The Singapore IDs are " + str(s_singaporeID)).encode("utf-8"))
        f_run_log.write("\n\nTotal number of occurrences Twitter IDs in the file : " + str(v_twitterID_count))
        f_run_log.write(("====> The Twitter IDs are " + str(s_twitterID)).encode("utf-8"))
        f_run_log.write("\n\nTotal number of occurrences Email IDs in the file : " + str(v_emails_count))
        f_run_log.write(("====> The Email IDs are " + str(s_emails)).encode("utf-8"))
        f_run_log.write("\n\nTotal number of occurrences URLs in the file : " + str(v_urls_count))
        f_run_log.write(("====> The URLs are " + str(s_urls)).encode("utf-8"))
        f_run_log.write("\n\nTotal number of occurrences Ph Numbers in the file : " + str(v_phoneNumber_count))
        f_run_log.write(("====> The Phone numbers are " + str(s_phoneNumber)).encode("utf-8"))
        f_run_log.write("\n\nTotal number of occurrences Addresses in the file : " + str(v_location_count))
        f_run_log.write(("====> The Addresses are " + str(s_location)).encode("utf-8"))
        f_run_log.write("\n\nTotal number of occurrences Credit Cards in the file : " + str(v_creditCard_count))
        f_run_log.write(("====> The credit-card numbers are " + str(s_creditCard)).encode("utf-8"))
        f_run_log.write("\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n")
v_doc_count = 0

<h4>Demo Code below</h4>
Each cell below exercises one detector function and shows its output.

<b>1.</b> Test for sensitive email content

In [154]:
# A single address embedded in a sentence comes back as one EMAIL tuple.
find_emails("my email id is ipd2@illinois.edu")

[('EMAIL', 'ipd2@illinois.edu')]

<b>2.</b> Test for sensitive phone number

In [159]:
# regex=None selects the built-in US pattern; the match starts at the first
# digit, so the leading "+" is not part of the result.
find_phoneNumber("You can contact my manager at +7237724501", regex=None)

[('PHONE', '7237724501')]

<b>3.</b> Test for web URLs

In [160]:
# The URL is returned with its trailing slash but without the space after it.
find_urls("If you think you're a muggle - You ought to visit http://www.mugglenet.com/ ")

[('URL', 'http://www.mugglenet.com/')]

<b>4.</b> Test for Singaporean national ID numbers

In [163]:
    # A lone token that starts with 'G', ends with 'N' and contains digits.
    find_singaporeID('G9443425N')

[('Singapore ID', 'G9443425N')]

<b>5.</b> Test for Twitter handles

In [25]:
# Double quotes keep the apostrophe. Written as 'If you''re ...', the text was
# two adjacent string literals, which Python joins into "If youre ...".
find_twitterID("If you're caught by an obscurian, tweet me @gandalf")

[('TWITTER', '@gandalf')]

<b>6.</b> Test for address locations

In [167]:
# NOTE: locations come from NLTK's named-entity chunker (label 'GPE' by default);
# in the recorded output below only 'St' and 'North' are picked up from this address.
find_location("I need to fix my sleigh. I live at Santa Claus 325,St. Santa Claus Lane, North Pole")

[('Location', 'St'), ('Location', 'North')]

<b>7.</b> Test for credit-card numbers

In [169]:
find_creditCard("My card with number 5510399013453413 seems to be blocked")

[('CreditCard', '5510399013453413')]

<b>8.</b> Combined test

In [181]:
# One text containing every kind of item. Results are grouped by detector, in
# the order find_sensitive_data calls them, not in order of appearance.
find_sensitive_data("I am Gandalf the Wizard, I live at Pinewood, North Mirkwood forest. My social realms are @GreyWizard. \
                    I pay my credit card bills to 5520-3277-9345-9498 with my glorious staff. If you need my help, call me at \
                    +7237724000 or email me at gandalf_the_grey@wizards.com. I shall come to thy help, unless I am busy with busy with my social media profile at https://wizard-world.com \
                    My exclusive wizrd-id is G1413425N")

[('Singapore ID', 'G1413425N'),
 ('TWITTER', '@GreyWizard'),
 ('EMAIL', 'gandalf_the_grey@wizards.com'),
 ('URL', 'https://wizard-world.com'),
 ('PHONE', '7237724000'),
 ('Location', 'North Mirkwood'),
 ('CreditCard', '5520-3277-9345-9498')]