# 1. Import Packages

In [0]:
import re
import pandas as pd
import os
import string
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import sys

In [0]:
# Fetch the NLTK resources used later in this notebook (downloads are skipped
# when the package is already up to date).
# Stopword lists read by stopwords.words('english') in clean_stopwords.
nltk.download('stopwords')
# Tokenizer models used by nltk.word_tokenize in dolemma.
nltk.download('punkt')
# Tagger model used by nltk.pos_tag in get_wordnet_pos.
nltk.download('averaged_perceptron_tagger')
# WordNet data used by WordNetLemmatizer in dolemma.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
# Work from the project data directory so the relative CSV paths used in this
# notebook resolve.
os.chdir("D:\\Sensitive Data Detection")

# Number of rows lemmatized so far; dolemma increments it for the progress display.
ctr = 0

# Denominator for the progress display. This counts physical lines in the file
# (the header line included, plus any line breaks inside quoted fields), so it
# can be larger than the number of rows pandas will load.
# The `with` block guarantees the handle is closed, and the generator counts
# lines without holding the whole file in memory.
with open(r"mergeddata1.csv", encoding = "ISO-8859-1") as file:
    total = sum(1 for _ in file)
print(total)

10637


# 2. Data Cleaning

## a. Remove all non-ascii characters

In [0]:
def clean_nonascii(text):
    """Return ``text`` with every character that is not ASCII removed.

    Characters that cannot be encoded as ASCII are silently dropped (not
    replaced), so the result may be shorter than the input.
    """
    ascii_bytes = text.encode("ascii", errors="ignore")
    return ascii_bytes.decode()

## b. Remove HTML (if any)

In [0]:
def clean_html(text):
    """Replace every ``<...>`` tag-like span in ``text`` with a single space.

    The match is non-greedy, so each tag is replaced separately. ``.`` does not
    match newlines here, so a ``<`` and ``>`` on different lines are left alone.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub(' ', text)

## c. Replace different punctuation with whitespace

In [0]:
def clean_punc(text):
    """Return ``text`` with each ``string.punctuation`` character turned into a space.

    The replacement is one-for-one, so the length of the string is unchanged.
    """
    punct_to_space = {ord(mark): ' ' for mark in string.punctuation}
    return text.translate(punct_to_space)

## d. Case folding

In [0]:
def clean_case(text):
    """Return a lowercased copy of ``text``."""
    lowered = text.lower()
    return lowered

## e. Assuming digits don't provide any valuable insight in the dyncorp data - cleaning them
*  For Banking Data, this should not be done. A better way of handling digits should be explored.

In [0]:
def clean_dig(text):
    """Replace every digit character in ``text`` with a single space.

    Each digit is replaced individually, so a run of N digits becomes N spaces
    and the length of the string is unchanged.
    """
    # Raw string: in a normal string literal '\d' is an invalid escape sequence,
    # which triggers a warning on current Python versions.
    digit_pattern = re.compile(r'\d')
    return digit_pattern.sub(' ', text)

## f. Cleaning extra whitespaces

In [0]:
def clean_ws(text):
    """Collapse every run of whitespace in ``text`` to one space and trim both ends."""
    tokens = text.split()
    return ' '.join(tokens)

## g. Remove stopwords 

In [0]:
def clean_stopwords(text):
    """Return ``text`` with NLTK's English stopwords removed.

    ``text`` is split on whitespace and the surviving tokens are re-joined with
    single spaces. The comparison is case-sensitive, so callers are expected to
    pass text that has already been lowercased (as cleandf does).
    """
    # Load the stopword list once and reuse it: this function is called once
    # per row, and rebuilding the set from the corpus on every call is wasted work.
    english_stopwords = getattr(clean_stopwords, "_english_stopwords", None)
    if english_stopwords is None:
        english_stopwords = frozenset(stopwords.words('english'))
        clean_stopwords._english_stopwords = english_stopwords
    return " ".join(word for word in text.split() if word not in english_stopwords)

## h. PoS Tagging 
* helper function for lemmatization to ensure verb and noun forms are treated differently

In [0]:
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single ``word``.

    The word is tagged on its own (no sentence context) with ``nltk.pos_tag``;
    the first letter of the resulting tag selects adjective, noun, verb or
    adverb. Any other tag falls back to noun.
    """
    _, tagger_tag = nltk.pos_tag([word])[0]
    initial = tagger_tag[0].upper()
    pos_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    # Noun is the default for tags outside the four mapped classes.
    fallback = wordnet.NOUN
    if initial in pos_by_initial:
        return pos_by_initial[initial]
    return fallback

## i. Lemmatize all sentences based on POS tag

In [0]:
def dolemma(text):
    """Lemmatize every token of ``text`` and return them joined by single spaces.

    Side effects: increments the notebook-global ``ctr`` and rewrites the
    current stdout line with a ``Processing ctr/total`` progress message.
    """
    global ctr
    global total
    ctr += 1
    # The carriage return moves back to the start of the line so that each
    # progress message overwrites the previous one.
    sys.stdout.write('\r')
    sys.stdout.write('Processing %d/%d' % (ctr, total))

    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in nltk.word_tokenize(text):
        # Each token is lemmatized using the part of speech guessed for it.
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(token)))
    return " ".join(lemmas)

## j. Combined clean routines

In [0]:
def cleandf(df):
    """Run the full text-cleaning pipeline over the ``Text`` column of ``df``.

    Steps, in order: lowercase, drop non-ASCII characters, strip HTML tags,
    replace punctuation with spaces, replace digits with spaces, collapse
    whitespace, remove stopwords, lemmatize.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Text`` column. The column is overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``Text`` replaced by the cleaned strings.

    Raises
    ------
    KeyError
        If ``df`` has no ``Text`` column.
    """
    # Coerce to str BEFORE the first string operation: missing or non-string
    # cells would otherwise fail on .lower(). Missing values become empty
    # strings so they do not turn into the literal token "nan".
    text = df['Text'].fillna('').astype(str)

    # Each step is applied to the whole column before the next one starts, so
    # dolemma (and its progress counter) still runs last, once per row.
    steps = (
        clean_case,
        clean_nonascii,
        clean_html,
        clean_punc,
        clean_dig,
        clean_ws,
        clean_stopwords,
        dolemma,
    )
    for step in steps:
        text = text.map(step)

    df['Text'] = text
    return df

## k. Main Routine

In [0]:
# Step 1: Load the merged CSV into a DataFrame. The path is relative, so it
# resolves against the current working directory.
df = pd.read_csv(r"mergeddata1.csv",encoding = "ISO-8859-1")
# Step 2: Clean the 'Text' column. cleandf modifies the frame it is given and
# returns it, so `df` now holds the cleaned text.
df = cleandf(df)
# Step 3: Save the cleaned data; index=False keeps the row index out of the file.
df.to_csv('cleaneddata.csv', index=False)

Processing 10492/10637