# How well can DOTCE learn to classify your emails?

#### <i>Tip: put your cursor in each code snippet box and press shift-enter to execute. Pay attention to the output of each step, checking for errors.</i>

## Step 1: Go to correct directory and activate the DOTCE virtual env

In [1]:
#Replace the directory string on the %cd line below
#with your own directory path where you stored your email files.
%cd c:\Users\pviechnicki\Desktop\pviechnicki_home\sandbox\state\dotce
#%pwd returns the current working directory after the %cd above
%pwd
#NOTE(review): "!" hands the command to a separate shell process, so this
#presumably does not change the environment of the already-running kernel --
#confirm the notebook kernel itself is the dotce environment.
! activate dotce

c:\Users\pviechnicki\Desktop\pviechnicki_home\sandbox\state\dotce


## Step 2: Load various machine learning libraries to extract features from text document corpus and build classification models

In [2]:
import pandas as pd
import os
import numpy as np
import sys
sys.path.append('C:\\Users\\pviechnicki\\AppData\\Local\\Continuum\\Anaconda3\\envs\\dotce\\lib\\site-packages\\pyLDAvis')
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
#train_test_split lives in sklearn.model_selection since scikit-learn 0.18 and
#the old sklearn.cross_validation module was removed in 0.20, so try the new
#location first and only fall back to the legacy one on old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
import pickle



## Step 3: Read csv file of emails into df and split into training and test sets

#### <i>Tip: make sure you've exported 5-7 email folders from Outlook as CSV files, then edit stack_email_files.py to reflect the names of the folders you've exported, then execute stack_email_files.py to create combined_emails.csv before running step 3. The path of your combined emails file needs to match the path passed to pd.read_csv in step 3 below.</i>

In [3]:
#Load the pipe-delimited file of combined emails
email_df = pd.read_csv('C:\\Users\\pviechnicki\\Desktop\\pviechnicki_home\\sandbox\\state\\data\\pv_email\\combined_emails.csv', sep='|')
#Add rowid so sampled rows can still be traced back to their position in the full frame
email_df['rownum'] = range(0, len(email_df))
# Filter out rows with an empty body, then keep a random 20% of what is left.
#The sample method chooses a random sample of the original frame:
#https://stackoverflow.com/questions/29576430/shuffle-dataframe-rows
#random_state pins the draw so that re-running the notebook reproduces the same
#sample; 44 is the same seed this notebook passes to train_test_split.
non_empty_df = email_df[email_df['body'].notnull()].sample(frac=.2, random_state=44)
print("Original dataframe contains {} messages\nNon-empty datafram contains {} messages\n".format(
len(email_df), len(non_empty_df)))

Original dataframe contains 4629 messages
Non-empty datafram contains 925 messages



#### <i>Tip: change 'about_fraud' and 'fraud_waste_abuse' in the code snippet below to match the folders and categories you've chosen and exported.</i>

In [4]:
#Flag every message whose cat value is fraud_waste_abuse in a new boolean column.
#Edit the column name and the category string to match your data.
non_empty_df['about_fraud'] = non_empty_df['cat'].eq('fraud_waste_abuse')
#Series of boolean class labels, one per message
class_labels = non_empty_df['about_fraud']
#Show how many messages fall into each class
class_labels.value_counts()


False    657
True     268
Name: about_fraud, dtype: int64

In [5]:
#Split the sampled messages: 80% for training, the remaining 20% for testing
train_df, test_df = train_test_split(
    non_empty_df,
    train_size=0.8,
    random_state=44,
)
#Class labels of each split as plain lists
class_labels_training = [label for label in train_df['about_fraud']]
class_labels_test = [label for label in test_df['about_fraud']]
#Frequency of each label in the training split
value_counts = nltk.FreqDist(class_labels_training)


## Step 4: Let's make sure we can tokenize these properly and remove stop words
<p>from <a href="http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html">CS Duke.edu</a></p>
<p>Tip: Make sure you've installed the english punctuation and stop words list 
following <a href="http://www.nltk.org/data.html">these instructions.</a> If you've done it right, you'll see a list of ten common words from your first email when you run step 4.
</p>

In [6]:
#Instantiate the English Snowball stemmer that myTokenize (defined below) applies to each token.
#NOTE(review): per the NLTK docs, ignore_stopwords=True leaves stop words unstemmed -- confirm for your NLTK version.

snowballStemmer = SnowballStemmer("english", ignore_stopwords=True)
#Translation table that deletes every character in string.punctuation.
#Built once here instead of being rebuilt on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

def preprocess(text):
    ''' Lower-case text and strip ASCII punctuation from it.

        text: the raw email body. Any value that is not a str (for example
              None or a float NaN) produces the empty string.
        Returns the cleaned string.
    '''
    #isinstance also accepts str subclasses, which an exact type() comparison rejects
    if not isinstance(text, str):
        return ''
    return text.lower().translate(_PUNCTUATION_TABLE)

#English stop words, loaded once into a set. Calling stopwords.words('english')
#inside the filter would fetch the list again for every single token and test
#membership with a linear scan.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

def myTokenize(text):
    ''' Turn one email body into a list of stemmed, stop-word-free tokens.

        text: the raw email body; it is cleaned with preprocess() first.
        Returns a list of stems in their original order.
    '''
    cleaned = preprocess(text)
    tokens = nltk.word_tokenize(cleaned)
    filtered = [w for w in tokens if w not in _ENGLISH_STOPWORDS]
    #snowballStemmer is the module-level stemmer; reading it needs no global statement
    return [snowballStemmer.stem(w) for w in filtered]
    
#Sanity check: tokenize the first training email and print its ten most common tokens
first_body = train_df['body'].iloc[0]
tokens = myTokenize(first_body)
count = Counter(tokens)
print(count.most_common(10))

[('us', 8), ('mahesh', 5), ('studi', 5), ('kelkar', 4), ('govern', 4), ('thank', 3), ('let', 3), ('know', 3), ('manag', 3), ('sector', 3)]


## Step 5: Convert train and test dfs to document × term representation matrices (_X)

#### <i>Tip: make sure the circle at top right labeled with the name of your kernel ('dotce' in my case) turns white, showing that each step has completed, before you move on to the next step.</i>

In [7]:
##Instantiate a TFidf vectorizer
vectorizer = TfidfVectorizer(
    tokenizer=myTokenize,   #our own cleaning / stop-word / stemming pipeline
    encoding='utf-8',
    sublinear_tf=True,      #scale term frequency as 1 + log(tf)
    max_df=0.5,             #ignore terms found in more than half of the documents
)

In [8]:
#This step can take a while: every training body is run through myTokenize.
#fit_transform() fits the vectorizer on the training bodies and returns their tf-idf matrix.
train_X = vectorizer.fit_transform(train_df['body'])


In [9]:
##Save feature names in a separate list
#get_feature_names() was removed in scikit-learn 1.2 in favour of
#get_feature_names_out(); use whichever one this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    #get_feature_names_out() returns an array; convert so feature_names stays a plain list
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

In [10]:
#From http://fastml.com/classifying-text-with-bag-of-words-a-tutorial/
#Create another matrix of tfidf scores for the documents in the test set.
#transform() (rather than fit_transform()) is called here, so the test bodies
#are scored with the vectorizer that was already fitted on the training bodies.
test_X = vectorizer.transform(test_df['body'])

## Step 5b: Instantiate and Train a Naive Bayes Classifier

In [11]:
#alpha is the additive (Laplace/Lidstone) smoothing parameter of MultinomialNB.
#We train the classifier by feeding it the tf-idf matrix of the training emails
#together with the training labels created when the data was split above.
model_nb = MultinomialNB(alpha=0.05)
model_nb.fit(train_X, class_labels_training)

MultinomialNB(alpha=0.05, class_prior=None, fit_prior=True)

## Step 6: Test classifier on test_X matrix

### Test NB Classifier results

In [12]:
#Predict a label for every email in the test set
predictions = model_nb.predict( test_X )
#Compute the accuracy once and reuse it; each extra score() call would
#classify the whole test matrix again.
accuracy = model_nb.score( test_X, class_labels_test)
print("Accuracy score for your classifier: {:.3f}\n".format(accuracy))
print("Error rate for your classifier: {:.3f}\n".format(1 - accuracy))
#Headline statistics, keyed by name
classifierStats = dict()
classifierStats['accuracy'] = accuracy
classifierStats['errorRate'] = 1 - accuracy

Accuracy score for your classifier: 0.973

Error rate for your classifier: 0.027



In [13]:
def truth_value(myRow):
    ''' Name the confusion-matrix cell that a result row belongs to.

        myRow: a mapping with 'ground_truth' and 'predicted_value' entries.
        Returns 'truePositive', 'falseNegative', 'falsePositive' or
        'trueNegative', or None when the pair matches none of the four cases.
    '''
    #(ground truth, prediction, outcome name), checked in this order
    outcomes = (
        (True, True, 'truePositive'),
        (True, False, 'falseNegative'),
        (False, True, 'falsePositive'),
        (False, False, 'trueNegative'),
    )
    for actual, guess, outcome in outcomes:
        if myRow['ground_truth'] == actual and myRow['predicted_value'] == guess:
            return outcome
    return None
    
#Pair each test email's true label with the label the classifier predicted
results = list(zip(class_labels_test, predictions))
#Build a frame of the pairs, indexed by each test email's rownum
enrichedResults = pd.DataFrame.from_records(results, index=test_df['rownum'].tolist(),
    columns = ['ground_truth', 'predicted_value'])
#Classify every row as truePositive / falsePositive / trueNegative / falseNegative
enrichedResults['truthValue'] = enrichedResults.apply(truth_value, axis=1)
#Add in the email subject and body
enrichedResults['subject'] = test_df['subject'].tolist()
enrichedResults['body'] = test_df['body'].tolist()
#Record how many test emails fell into each outcome.
#Walk the label/count pairs directly: looking a count up with counts[i] on a
#label-indexed Series relies on a positional fallback that newer pandas deprecates.
counts = enrichedResults['truthValue'].value_counts()
for outcome, n_emails in zip(counts.index, counts.values):
    classifierStats[outcome] = n_emails
#Write both objects to disk; each with-block closes its own file
with open('classifierStats.pyc', 'wb') as f:
    pickle.dump(classifierStats, f)
with open('classifierTestResults.pyc', 'wb') as f1:
    pickle.dump(enrichedResults, f1)

## Step 7: Evaluate Results
<p>Nice summary of different formulas for accuracy, precision, recall, etc 
<a href="http://www.damienfrancois.be/blog/files/modelperfcheatsheet.pdf">here</a>.</p>
We're using the sklearn metrics module to evaluate our results.

In [14]:
from sklearn import metrics
#Print the per-class precision / recall / f1-score / support table for the test-set predictions
print(metrics.classification_report(class_labels_test, predictions))
#TODO: Need to write this out to persistent object for use by visualizer

             precision    recall  f1-score   support

      False       0.99      0.98      0.98       140
       True       0.93      0.96      0.95        45

avg / total       0.97      0.97      0.97       185



## Step 8: Let's look more closely at how the classifier uses tf-idf scores to categorize emails
<p>Tip of the hat to this blog: <a href="https://buhrmann.github.io/tfidf-analysis.html">blog post from Thomas Buhrmann</a></p>

In [15]:
#Returns top n tfidf features as df, but takes dense format vector as input
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.

        row:      1-D dense array of tfidf scores, one per feature.
        features: sequence of feature names, indexed in parallel with row.
        top_n:    maximum number of features to return.
        Returns a DataFrame with columns 'feature' and 'tfidf', highest score first.
    '''
    #argsort is ascending, so reverse it before taking the first top_n positions
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    #Naming the columns in the constructor also works when top_feats is empty;
    #assigning df.columns afterwards raises on a frame that has no columns.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

#Densify one row of the matrix and rank its features
def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Top tfidf features of a single document, i.e. of matrix row row_id. '''
    dense_row = Xtr[row_id].toarray()
    #Drop the length-1 leading axis so the ranking helper receives a flat vector
    flat_row = np.squeeze(dense_row)
    return top_tfidf_feats(flat_row, features, top_n)

In [16]:
#This is showing us the top five tfidf scores for the document at row index 3 of train_X
#This is a sanity check and should show you some informative words from within an email
top_feats_in_doc(train_X, feature_names, 3, 5)

Unnamed: 0,feature,tfidf
0,panzarella,0.274297
1,edward,0.223563
2,ill,0.184952
3,urbanczyk,0.18037
4,rosslyn,0.17455


In [17]:
#Helper function to calculate top n features that are on average most important among
#the documents selected by grp_ids
def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important amongst documents in rows
        identified by indices in grp_ids.

        Xtr:       sparse tfidf matrix, one row per document.
        features:  feature names, indexed in parallel with the matrix columns.
        grp_ids:   row selector applied as Xtr[grp_ids]; None means every row.
        min_tfidf: scores below this value are zeroed before averaging.
        top_n:     number of features to return.
    '''
    #Compare against None explicitly: truth-testing a NumPy index array raises
    #ValueError, and an empty list would silently select every document.
    if grp_ids is not None:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    #Zero out scores under the threshold before averaging
    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

#### <i>Tip: change 'about_fraud' in the code snippet below to match the label column you created when you labeled your emails (the cell that sets non_empty_df['about_fraud']) above.</i>

In [18]:
def top_feats_by_class(Xtr, y, features, min_tfidf=0.1, top_n=25,
                       label_col='about_fraud', label_value=True,
                       label_name='fraud_waste_abuse'):
    ''' Return one df holding the top_n features and their mean tfidf value,
        calculated across the documents whose class label matches.

        Xtr:         sparse tfidf matrix, one row per document.
        y:           DataFrame aligned row-for-row with Xtr that holds the label column.
        features:    feature names, indexed in parallel with the matrix columns.
        min_tfidf:   passed through to top_mean_feats.
        top_n:       number of features to return.
        label_col:   name of the column in y that holds the class label.
        label_value: rows where y[label_col] equals this value are selected.
        label_name:  value stored on the returned frame's .label attribute.
    '''
    #np.where returns a tuple holding the array of matching row positions
    ids = np.where(y[label_col] == label_value)
    feats_df = top_mean_feats(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
    feats_df.label = label_name
    return feats_df

In [19]:
informativeTerms = top_feats_by_class(train_X, train_df, feature_names, top_n=100)
#Save the table to disk; the with-block closes the file
with open('informativeTerms.pyc', 'wb') as f:
    pickle.dump(informativeTerms, f)
#Should show the 10 most informative features for you. This has to be the last
#expression in the cell: the notebook only displays a cell's final expression.
informativeTerms.head(10)

## Step 9: Write out most informative features and counts for training set and test set to power dashboard

In [40]:
#Count up occurrences of the top 100 most informative features
from collections import defaultdict

informativeTermsSet = set(informativeTerms['feature'])

def _count_informative_tokens(bodies, vocabulary):
    ''' Tokenize every body with myTokenize and tally the tokens.

        Returns (counts, total): counts maps each token that appears in
        vocabulary to its number of occurrences, and total is the number of
        tokens seen overall, whether or not they are in vocabulary.
    '''
    counts = defaultdict(int)
    total = 0
    for body in bodies:
        for token in myTokenize(body):
            total += 1
            if token in vocabulary:
                counts[token] += 1
    return counts, total

feature_counts_training, trainTokensTotal = _count_informative_tokens(train_df['body'], informativeTermsSet)
feature_counts_test, testTokensTotal = _count_informative_tokens(test_df['body'], informativeTermsSet)

#Write to disk; each with-block closes its own file
with open('feature_counts_training.pyc', 'wb') as f:
    pickle.dump(feature_counts_training, f)

#The test-set counts go to their own file (the training counts used to be
#written a second time here, so the test counts were never saved).
with open('feature_counts_test.pyc', 'wb') as f:
    pickle.dump(feature_counts_test, f)


In [37]:
#Show the set of informative terms (a set is unordered, so the printed order is arbitrary)
print(informativeTermsSet)

{'junko', 'séan', 'fight', 'william', 'iphon', 'sean', 'boston', 'pulkit', '844', 'snap', 'daniel', 'advisori', 'websit', 'eggersdirector', 'pukapoordeloittecomsubject', 'check', 'addit', 'program', 'file', 'decemb', '25', '5718826585fax', 'brien', 'researchdeloitt', 'book', 'lp', 'outlin', '2015', 'articl', 'consult', 'ok', 'thankspulkit', 'kaji', 'payment', 'hartford', 'gov2020', 'denver', 'thai', 'improp', 'fraud', 'steven', 'mailtostthaideloittecom', 'slide', 'final', 'april', 'httpwwwsolutionrevolutionbookcom', 'jake', 'error', 'wast', 'comment', 'revis', 'dan', 'novemb', 'tel', 'murphi', 'foglift', 'draft', 'wwwwilliameggerscom', 'tiffani', '12022469684', 'section', 'kapoor', 'wfe', 'tbishopdeloittecom', 'lorenz', 'japunzenbergerdeloittecom', 'httpgovernment2020dupresscom', 'graphic', 'februari', 'danolsondeloittecom', 'bishop', 'hood', 'new', 'crystal', 'miss', '7033981771mobil', 'abus', 'fwa', 'stthaideloittecom', 'kaleigh', 'edit', 'read', 'text', 'minneapoli', 'mailtoweggersd

In [42]:
#Bundle the overall token totals of both splits and pickle them
totalTermCounts = dict(
    trainTokensTotal=trainTokensTotal,
    testTokensTotal=testTokensTotal,
)
with open('totalTermCounts.pyc', 'wb') as f:
    pickle.dump(totalTermCounts, f)