# Test Project: Import emails exported from Outlook via CSV into DOTCE and classify them

## Step 1: Go to correct directory and activate the DOTCE virtual env

In [1]:
# Switch the kernel's working directory to the DOTCE project folder and confirm it.
%cd c:\Users\pviechnicki\Desktop\pviechnicki_home\sandbox\state\dotce
%pwd
# NOTE(review): each `!` command runs in its own subshell, so `activate dotce`
# presumably does not change the environment the kernel itself runs in -- the
# `conda info --envs` listing is only a check of which environments exist.
! activate dotce
! conda info --envs

c:\Users\pviechnicki\Desktop\pviechnicki_home\sandbox\state\dotce
# conda environments:
#
dotce                    C:\Users\pviechnicki\AppData\Local\Continuum\Anaconda3\envs\dotce
root                  *  C:\Users\pviechnicki\AppData\Local\Continuum\Anaconda3



## Step 2: Load various machine learning libraries to extract features from text document corpus and build classification models

In [2]:
# Standard library
import os
import string
import sys
from collections import Counter

# Make the pyLDAvis package directory of the dotce environment importable.
sys.path.append('C:\\Users\\pviechnicki\\AppData\\Local\\Continuum\\Anaconda3\\envs\\dotce\\lib\\site-packages\\pyLDAvis')

# Third-party
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression as LR

# train_test_split lives in sklearn.model_selection in current scikit-learn;
# fall back to the legacy sklearn.cross_validation module on old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split



## Step 3: Read csv file of emails into df and split into training and test sets

In [3]:
#Make sure we're in the right directory
#(NOTE: only the last expression of a cell is displayed, so this value is not shown)
os.getcwd()
# Pipe-delimited export of the Outlook messages
email_df = pd.read_csv('C:\\Users\\pviechnicki\\Desktop\\pviechnicki_home\\sandbox\\state\\data\\pv_email\\combined_emails.csv', sep='|')
#Add rowid so individual messages can be traced back after sampling/splitting
email_df['rownum'] = range(0, len(email_df))
email_df.groupby('cat').count()
# Filter out rows with an empty body, then keep a random 20% sample.
# The sample method chooses a random sample of the original frame
# https://stackoverflow.com/questions/29576430/shuffle-dataframe-rows
# random_state is fixed so that re-running the notebook draws the same sample.
non_empty_df = email_df[email_df['body'].notnull()].sample(frac=.2, random_state=44)
print("Original dataframe contains {} messages\nNon-empty datafram contains {} messages\n".format(
len(email_df), len(non_empty_df)))


Original dataframe contains 4629 messages
Non-empty datafram contains 925 messages



In [4]:
# Boolean target column: True when the message category is 'fraud_waste_abuse'.
# Change the column name and category value here to match your own data.
non_empty_df['about_fraud'] = non_empty_df['cat'].eq('fraud_waste_abuse')
# Keep the labels as a standalone Series
class_labels = non_empty_df['about_fraud']
# Class balance, via Series.value_counts()
class_labels.value_counts()


False    646
True     279
Name: about_fraud, dtype: int64

In [6]:
# Split the sampled messages 80% / 20% into training and test frames
# (seeded so the split is repeatable).
split_frames = train_test_split(non_empty_df, train_size=0.8, random_state=44)
train_df, test_df = split_frames
# Labels for the training portion, plus their class balance
class_labels_training = train_df['about_fraud']
class_labels_training.value_counts()


False    519
True     221
Name: about_fraud, dtype: int64

## Step 4: Let's make sure we can tokenize these properly and remove stop words
<p>from <a href="http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html">CS Duke.edu</a></p>
<p>Make sure you've installed the english punctuation and stop words list 
following <a href="http://www.nltk.org/data.html">these instructions.</a>
</p>

In [15]:
#Instantiate a stemmer and a tokenizer to preprocess the email text
# The stemmer is created once at module level and shared by myTokenize().

snowballStemmer = SnowballStemmer("english", ignore_stopwords=True)
# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

def preprocess(text):
    """Lower-case ``text`` and strip ASCII punctuation.

    Parameters
    ----------
    text : object
        The raw message body. Anything that is not a ``str`` (for example a
        NaN from an empty CSV cell) is treated as an empty document.

    Returns
    -------
    str
        The cleaned text, or ``''`` when ``text`` is not a string.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses, which would otherwise be silently turned into ''.
    if not isinstance(text, str):
        return ''
    return text.lower().translate(_PUNCTUATION_TABLE)

# Fetch the English stop-word list once and keep it as a set. Calling
# stopwords.words('english') inside the filter re-evaluated it for every
# token and then searched the resulting list linearly.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

def myTokenize(text):
    """Turn one document into a list of stemmed, stop-word-free tokens.

    The text is cleaned with ``preprocess``, split with
    ``nltk.word_tokenize``, filtered against the English stop-word list and
    finally stemmed with the module-level ``snowballStemmer``.

    Parameters
    ----------
    text : object
        Raw document; non-string input yields an empty token list because
        ``preprocess`` maps it to ``''``.

    Returns
    -------
    list of str
        Stemmed tokens in their original order.
    """
    cleaned = preprocess(text)
    tokens = nltk.word_tokenize(cleaned)
    filtered = [w for w in tokens if w not in _ENGLISH_STOPWORDS]
    return [snowballStemmer.stem(w) for w in filtered]
    
# Smoke test: tokenize the first training message and show its ten most
# frequent stems.
tokens = myTokenize(train_df.iloc[0]['body'])
count = Counter(tokens)
print(count.most_common(10))

[('model', 11), ('variabl', 7), ('peter', 7), ('differ', 6), ('linear', 6), ('hour', 5), ('make', 5), ('see', 5), ('depend', 4), ('data', 4)]


## Step 5: convert train and test dfs to term X document representation matrices (_X)

In [18]:
## TF-IDF vectorizer that delegates tokenization to myTokenize.
## max_df=0.5 drops terms that occur in more than half of the documents.
vectorizer = TfidfVectorizer(
    tokenizer=myTokenize,
    encoding='utf-8',
    sublinear_tf=True,
    max_df=0.5,
)

In [19]:
#This step can take a while
# Learn the vocabulary/IDF weights from the training bodies and build the
# document-by-term tf-idf matrix for the training set.
train_X = vectorizer.fit_transform(train_df['body'])


In [20]:
##Save feature names in a separate list
# Newer scikit-learn releases expose get_feature_names_out() (returning an
# array) and no longer provide get_feature_names(); support both so that
# feature_names is always a plain list indexed by matrix column.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

In [21]:
#From http://fastml.com/classifying-text-with-bag-of-words-a-tutorial/
#Create another matrix of tfidf scores for the documents in the test set
# transform() (not fit_transform) reuses the vectorizer already fitted on the
# training bodies.
test_X = vectorizer.transform(test_df['body'])

## Step 5b: Instantiate and Train a Logistic Regression Classifier

In [22]:
#Create a logistic regression model
#From same tutorial https://github.com/zygmuntz/classifying-text/blob/master/bow_predict.py
# Default hyper-parameters; fitted on the training tf-idf matrix against the
# boolean about_fraud labels of the training frame.
model = LR()
model.fit(train_X, class_labels_training)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

## Step 6: Test classifier on test_X matrix

In [23]:
#Predict probability of class membership
# Keep only column 1 of the probability matrix -- presumably the probability
# of the True (about_fraud) class; confirm against model.classes_.
p = model.predict_proba( test_X )[:,1]

## Step 7: Evaluate Results
<p>Nice summary of different formulas for accuracy, precision, recall, etc 
<a href="http://www.damienfrancois.be/blog/files/modelperfcheatsheet.pdf">here</a></p>

In [25]:
#Write out results
# One row per test message: its original row id, the true label and the
# predicted probability from the logistic model. The two Series carry
# test_df's index; p is a positional array of the same length.
output_df = pd.DataFrame( data = {'rownum': test_df['rownum'], 
                                  'about_fraud': test_df['about_fraud'], 
                                  'logistic': p })


In [28]:
print("Output dataframe length: {}\n".format(len(output_df)))
# Probability cut-off at or above which a message is flagged as fraud-related.
# NOTE(review): 0.23 looks hand-tuned -- confirm how it was chosen.
THRESHOLD = .23
output_df['prediction'] = output_df['logistic'].ge(THRESHOLD)
print("DF columns:{}\n".format(", ".join(output_df.columns)))

## The classifier is right on a row when 'prediction' equals 'about_fraud'
output_df.head(10)

Output dataframe length: 185

DF columns:about_fraud, logistic, rownum, prediction



Unnamed: 0,about_fraud,logistic,rownum,prediction
1605,False,0.06056,1605,False
1707,False,0.153765,1707,False
745,False,0.154392,745,False
2263,False,0.179762,2263,False
3977,True,0.706092,3977,True
1396,False,0.11108,1396,False
60,False,0.353156,60,True
4283,True,0.398459,4283,True
3621,True,0.895062,3621,True
3086,False,0.121713,3086,False


In [42]:
#Now let's see how well this model did, true positives, false positives, etc
def accuracy(tp, tn, fp, fn):
    """Fraction of all predictions that were correct.

    tp, tn, fp, fn are the counts of true positives, true negatives,
    false positives and false negatives respectively.
    """
    correct = tp + tn
    total = tp + tn + fp + fn
    return correct / total

def error_rate(tp, tn, fp, fn):
    """Fraction of all predictions that were wrong.

    tp, tn, fp, fn are the counts of true positives, true negatives,
    false positives and false negatives respectively.
    """
    wrong = fp + fn
    total = tp + tn + fp + fn
    return wrong / total

# Confusion-matrix cells: count the test rows for each combination of true
# label ('about_fraud') and thresholded prediction ('prediction').
true_positives = len(output_df.loc[(output_df['about_fraud'] == True) & (output_df['prediction'] == True)])
false_positives = len(output_df.loc[(output_df['about_fraud'] == False) & (output_df['prediction'] == True)])
true_negatives = len(output_df.loc[(output_df['about_fraud'] == False) & (output_df['prediction'] == False)])
false_negatives = len(output_df.loc[(output_df['about_fraud'] == True) & (output_df['prediction'] == False)])
# Header columns are tab-separated so they line up with the counts printed
# below (the last separator used to be a stray backslash instead of a tab).
print("Results\nTrue Positives\tTrue_Negatives\tFalse_Positives\tFalse_Negatives\n")
print("\t".join(map(str, [true_positives, true_negatives, false_positives, false_negatives])))

print("Classifier Accuracy: {}\n".format(accuracy(true_positives, true_negatives, 
                                                  false_positives, false_negatives)))
print("Classifier Error Rate: {}\n".format(error_rate(true_positives, true_negatives,
                                                      false_positives, false_negatives)))


Results
True Positives	True_Negatives	False_Positives\False_Negatives

58	99	28	0
Classifier Accuracy: 0.8486486486486486

Classifier Error Rate: 0.15135135135135136



## Step 8: Optional Exploratory Data Analysis -- Check counts for specific terms
<p><i>Not worth doing this on larger corpora -- too slow!</i></p>

In [13]:
## Accumulate token frequencies over every sampled message body
c = Counter()
lineNo = 0
for lineNo, text in enumerate(non_empty_df['body'].tolist(), start=1):
    tokens = myTokenize(text)
    c.update(tokens)
    # Progress message on stderr after every 100 messages
    if lineNo % 100 == 0:
        sys.stderr.write("Processing email # {}\n".format(lineNo))
print(c.most_common(10))

Processing email # 100
Processing email # 200
Processing email # 300
Processing email # 400


[('us', 5312), ('arlington', 2462), ('peter', 1768), ('sent', 1126), ('viechnicki', 1055), ('mumbai', 800), ('1', 797), ('subject', 677), ('data', 610), ('mailtopviechnickideloittecom', 604)]


## Step 9: More EDA -- A better way of compiling the vocabulary
<a href="http://nlpforhackers.io/tf-idf/">Source http://nlpforhackers.io/tf-idf/</a>

In [44]:
# build the vocabulary in one pass over the training bodies
vocabulary = set()
for text in train_df['body']:
    words = myTokenize(text)
    vocabulary.update(words)

# Sort before indexing: iterating a set of strings can yield a different order
# on every kernel start, which would make word_index differ from run to run.
vocabulary = sorted(vocabulary)
word_index = {w: idx for idx, w in enumerate(vocabulary)}

VOCABULARY_SIZE = len(vocabulary)
DOCUMENTS_COUNT = len(train_df.index)

print("Vocabulary size: {}\nDocuments Count: {}\n".format(
    VOCABULARY_SIZE, DOCUMENTS_COUNT))

Vocabulary size: 14212
Documents Count: 740



## Step 10: Let's look more closely at the features that have high tf-idf scores
<p>Tip of the hat to this blog: <a href="https://buhrmann.github.io/tfidf-analysis.html">blog post from Thomas Buhrmann</a></p>

In [45]:


#Returns top n tfidf features as df, but takes dense format vector as input
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.

    row      -- dense 1-D array of tf-idf scores, one entry per feature
    features -- sequence of feature names, indexed like ``row``
    top_n    -- maximum number of (feature, score) pairs to return

    Returns a DataFrame with columns ['feature', 'tfidf'], ordered from the
    highest score to the lowest. The frame is empty (but still has both
    columns) when no rows are selected, e.g. top_n=0.
    '''
    # argsort is ascending, so reverse it before taking the first top_n ids
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Passing the column names to the constructor (instead of assigning
    # df.columns afterwards) also works when top_feats is empty.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

#Densify one row of the sparse matrix and rank its features
def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Top tfidf features of a single document, i.e. of matrix row ``row_id``.

    Xtr must support ``Xtr[row_id].toarray()``; the result is squeezed into a
    flat vector and handed to top_tfidf_feats.
    '''
    dense_row = Xtr[row_id].toarray()
    return top_tfidf_feats(np.squeeze(dense_row), features, top_n)

In [18]:
#This is showing us the top ten tfidf scores for the training document in matrix row 3
# train_X is the tf-idf matrix that feature_names was derived from; the
# previously referenced `tfidf_matrix` is not defined anywhere in this notebook.
top_feats_in_doc(train_X, feature_names, 3, 10)

Unnamed: 0,feature,tfidf
0,mailtoweggers38gmailcom,0.475205
1,839,0.424112
2,tn,0.394224
3,27,0.283355
4,weggersdeloittecomsubject,0.278087
5,octob,0.270831
6,egger,0.239569
7,william,0.237561
8,2015,0.233639
9,tuesday,0.153122


In [48]:
#Helper function to calculate the top n features that are on average most
#important among the documents selected by grp_ids
def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important amongst documents in rows
        identified by indices in grp_ids.

    Xtr       -- sparse document-by-term tf-idf matrix (must support .toarray())
    features  -- sequence of feature names, one per matrix column
    grp_ids   -- row selector passed to Xtr[...]; None means "all rows"
    min_tfidf -- scores below this value are zeroed before averaging
    top_n     -- number of features to return

    Returns the DataFrame produced by top_tfidf_feats for the column means.
    '''
    # Test against None explicitly: truth-testing a multi-element numpy index
    # array raises ValueError, and an empty selection must not silently fall
    # back to the whole corpus.
    if grp_ids is not None:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    # Ignore weak scores so they do not contribute to the mean
    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

#Top mean-tfidf features over every training document (no row filter)
top_mean_feats(train_X, feature_names, grp_ids=None)

Unnamed: 0,feature,tfidf
0,junko,0.01114
1,mission,0.008808
2,seattl,0.007536
3,httpgovernment2020dupresscom,0.00752
4,slide,0.007152
5,ai,0.00715
6,httpwwwsolutionrevolutionbookcom,0.006952
7,cathryn,0.006602
8,kaji,0.006592
9,iphon,0.006308


In [49]:
def top_feats_by_class(Xtr, y, features, min_tfidf=0.1, top_n=25):
    ''' Return a one-element list holding the df of top_n features (with their mean
        tfidf value) computed over the documents whose ``y.about_fraud`` is True.
        The df is tagged with a ``label`` attribute of 'fraud_waste_abuse'. '''
    # np.where yields a tuple of index arrays; it is passed on unchanged
    fraud_rows = np.where(y.about_fraud == True)
    fraud_feats = top_mean_feats(Xtr, features, fraud_rows,
                                 min_tfidf=min_tfidf, top_n=top_n)
    fraud_feats.label = 'fraud_waste_abuse'
    return [fraud_feats]

In [51]:
# Top mean-tfidf features among the test-set messages labelled about_fraud.
top_feats_by_class(test_X, test_df, feature_names)

[                         feature     tfidf
 0                          fraud  0.032673
 1                           wast  0.021011
 2                           2015  0.019074
 3                        program  0.019062
 4                          junko  0.018963
 5                         pulkit  0.018369
 6                           kaji  0.016954
 7                       interact  0.016675
 8     pukapoordeloittecomsubject  0.015777
 9                          brien  0.015573
 10                          nudg  0.015497
 11  httpgovernment2020dupresscom  0.014873
 12                       graphic  0.014441
 13                      strategi  0.013924
 14                          abus  0.013748
 15                        steven  0.013071
 16                          séan  0.012421
 17                         error  0.012381
 18                          thai  0.012314
 19                       michael  0.011989
 20                         reduc  0.011981
 21                            1