# Test Project: Import emails exported from outlook via csv into DOTCE, classify

## Step 1: Go to correct directory and activate the DOTCE virtual env

%cd c:\Users\embicks\dotce
%pwd
! activate dotce
! conda info --envs

## Step 2: Load various machine learning libraries to extract features from text document corpus and build classification models

In [2]:
import pandas as pd
import os
import numpy as np
import sys
sys.path.append('C:\\Users\\ComputerB\\AppData\\Local\\Continuum\\Anaconda3\\envs\\dotce\\lib\\site-packages\\pyLDAvis')
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression as LR
from sklearn.naive_bayes import MultinomialNB



## Step 3: Read csv file of emails into df and split into training and test sets

In [3]:
#Make sure we're in the right directory
os.getcwd()
email_df = pd.read_csv('C:\\Users\\ComputerB\\EmailTest\\Deloitte_Test\\combined_emails.csv', sep='|')
#Add rowid
email_df['rownum'] = range(0, len(email_df))
email_df.groupby('cat').count()
# Filter out empty rows
non_empty_df = email_df[email_df['body'].isnull() == False].sample(frac=.2)
#sample method chooses a random sample of the origina frame
#https://stackoverflow.com/questions/29576430/shuffle-dataframe-rows
print("Original dataframe contains {} messages\nNon-empty datafram contains {} messages\n".format(
len(email_df), len(non_empty_df)))


Original dataframe contains 324 messages
Non-empty datafram contains 65 messages



In [4]:
#Add new column to dataframe with True if cat == fraud_waste_abuse
#Edit the column names and truth conditions to match your data
non_empty_df['about_USAID'] = (non_empty_df['cat'] == 'USAID')
#Create a vector of class labels
class_labels = non_empty_df['about_USAID']
#use value_counts() method of series
class_labels.value_counts()


False    41
True     24
Name: about_USAID, dtype: int64

In [5]:
#Create a training set and test set, 80% 20%
train_df, test_df = train_test_split(non_empty_df, train_size = 0.8, random_state=44)
class_labels_training = list(train_df['about_USAID'])
class_labels_test = list(test_df['about_USAID'])
value_counts = nltk.FreqDist(class_labels_training)


## Step 4: Let's make sure we can tokenize these properly and remove stop words
<p>from <a href="http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html">CS Duke.edu</a></p>
<p>Make sure you've installed the english punctuation and stop words list 
following <a href="http://www.nltk.org/data.html">these instructions.</a>
</p>

In [6]:
#Instantiate a stemmer and a tokenizer to preprocess the email text

snowballStemmer = SnowballStemmer("english", ignore_stopwords=True)
def preprocess(text):
    no_punctuation_text = ''
    if (type(text)== str):
        lower_text = text.lower()
        no_punctuation_text = lower_text.translate({ord(c):'' for c in string.punctuation})
    return no_punctuation_text

def myTokenize(text):
    global snowballStemmer
    tokens = []
    cleaned = preprocess(text)
    tokens = nltk.word_tokenize(cleaned)
    filtered = [w for w in tokens if not w in stopwords.words('english')]
    stemmed = [w for w in map(snowballStemmer.stem, filtered)]
    return stemmed
    
tokens = myTokenize(train_df['body'].iloc[0])
count = Counter(tokens)
print(count.most_common(10))

[('nanci', 1), ('ostrowskidirector', 1), ('offic', 1), ('dc', 1), ('pensionsoffic', 1), ('2026222214', 1)]


## Step 5: convert train and test dfs to term X document representation matrices (_X)

In [7]:
##Instantiate a TFidf vectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, encoding='utf-8', 
                             max_df=0.5, tokenizer=myTokenize)

In [8]:
#This step can take a while
train_X = vectorizer.fit_transform(train_df['body'])


In [9]:
##Save feature names in a separate list
feature_names = vectorizer.get_feature_names()

In [10]:
#From http://fastml.com/classifying-text-with-bag-of-words-a-tutorial/
#Create another matrix of tfidf scores for the documents in the test set
test_X = vectorizer.transform(test_df['body'])

## Step 5: Instantiate and Train a Logistic Regression Classifier

In [11]:
#Create a logistic regression model
#From same tutorial https://github.com/zygmuntz/classifying-text/blob/master/bow_predict.py
model_lr = LR()
model_lr.fit(train_X, class_labels_training)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Step 5B: Try a Naive Bayes Classifier

In [12]:
#Try a naive bayes classifier instead
model_nb = MultinomialNB(alpha=0.05)
model_nb.fit(train_X, class_labels_training)

MultinomialNB(alpha=0.05, class_prior=None, fit_prior=True)

## Step 6: Test classifier on test_X matrix

In [13]:
#Predict probability of class membership
p = model_lr.predict_proba( test_X )[:,1]
model_lr.score( test_X, class_labels_test)


0.84615384615384615

### Test NB Classifier results

In [14]:
p2 = model_nb.predict( test_X )
model_nb.score( test_X, class_labels_test)

0.84615384615384615

In [15]:
results = [(class_labels_test[i], p2[i]) for i in range(0,len(p2))]
len(results)
results[0:20]

[(False, False),
 (False, False),
 (True, False),
 (True, True),
 (False, True),
 (True, True),
 (True, True),
 (True, True),
 (False, False),
 (False, False),
 (False, False),
 (True, True),
 (True, True)]

## Step 7: Evaluate Results
<p>Nice summary of different formulas for accuracy, precision, recall, etc 
<a href="http://www.damienfrancois.be/blog/files/modelperfcheatsheet.pdf">here</a></p>

In [16]:
#Write out results
output_df = pd.DataFrame( data = {'rownum': test_df['rownum'], 
                                  'about_USAID': test_df['about_USAID'], 
                                  'logistic': p })


In [17]:
print("Output dataframe length: {}\n".format(len(output_df)))
THRESHOLD = .23
output_df['prediction'] = (output_df['logistic'] >= THRESHOLD)
print("DF columns:{}\n".format(", ".join(output_df.columns)))

##If 'prediction' matches 'about_fraud' then the classifier got it right
output_df.head(10)

Output dataframe length: 13

DF columns:about_USAID, logistic, rownum, prediction



Unnamed: 0,about_USAID,logistic,rownum,prediction
81,False,0.093433,81,False
13,False,0.350024,13,True
199,True,0.429728,199,True
268,True,0.750199,268,True
155,False,0.537789,155,True
213,True,0.556474,213,True
233,True,0.603493,233,True
252,True,0.74834,252,True
146,False,0.093433,146,False
157,False,0.258898,157,True


In [18]:
#Now let's see how well this model did, true positives, false positives, etc
def accuracy(tp, tn, fp, fn):
    return ((tp + tn)/(tp + tn + fp + fn))

def error_rate(tp, tn, fp, fn):
    return ((fp + fn)/ (tp + tn + fp + fn))

true_positives = len(output_df.loc[(output_df['about_USAID'] == True) & (output_df['prediction'] == True)])
false_positives = len(output_df.loc[(output_df['about_USAID'] == False) & (output_df['prediction'] == True)])
true_negatives = len(output_df.loc[(output_df['about_USAID'] == False) & (output_df['prediction'] == False)])
false_negatives = len(output_df.loc[(output_df['about_USAID'] == True) & (output_df['prediction'] == False)])
print("Results\nTrue Positives\tTrue_Negatives\tFalse_Positives\False_Negatives\n")
print("\t".join(map(str, [true_positives, true_negatives, false_positives, false_negatives])))

print("Classifier Accuracy: {}\n".format(accuracy(true_positives, true_negatives, 
                                                  false_positives, false_negatives)))
print("Classifier Error Rate: {}\n".format(error_rate(true_positives, true_negatives,
                                                      false_positives, false_negatives)))


Results
True Positives	True_Negatives	False_Positives\False_Negatives

7	2	4	0
Classifier Accuracy: 0.6923076923076923

Classifier Error Rate: 0.3076923076923077



### Evaluate NB using metrics module

In [19]:
from sklearn import metrics
print(metrics.classification_report(class_labels_test, p2))

             precision    recall  f1-score   support

      False       0.83      0.83      0.83         6
       True       0.86      0.86      0.86         7

avg / total       0.85      0.85      0.85        13



### Which features are most informative?
<p>Stolen/lifted from <a href="https://stackoverflow.com/questions/11116697/how-to-get-most-informative-features-for-scikit-learn-classifiers">stack overflow</a></p>

In [20]:
#What are the most informative features in this test?
def show_most_informative_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print ("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))

show_most_informative_features(vectorizer, model_nb, 25)

	-8.7830	01             		-5.5106	arlington      
	-8.7830	03             		-5.5155	us             
	-8.7830	07             		-5.8404	1              
	-8.7830	131            		-5.8475	pleas          
	-8.7830	13320          		-5.9482	mark           
	-8.7830	1470           		-5.9516	consult        
	-8.7830	1470sbrownleedeloittecom		-5.9606	question       
	-8.7830	15             		-5.9681	roopa          
	-8.7830	1denialsparti  		-6.0056	571            
	-8.7830	1pdf           		-6.0753	work           
	-8.7830	2016shoutout   		-6.0914	·              
	-8.7830	2026222214     		-6.0991	2016           
	-8.7830	21             		-6.1028	thank          
	-8.7830	21denialsparti 		-6.1205	attach         
	-8.7830	22091742       		-6.1363	final          
	-8.7830	22209teldirect 		-6.1428	categori       
	-8.7830	2327           		-6.1988	mobil          
	-8.7830	235            		-6.2039	wwwdeloittecom 
	-8.7830	240            		-6.2073	              
	-8.7830	266            		-6.2200	data   

## Step 8: Optional Exploratory Data Analysis -- Check counts for specific terms
<p><i>Not worth doing this on larger corpora -- too slow!</i></p>

In [21]:
##Instantiate a new counter and count frequencies of the tokens
c = Counter()
lineNo = 0
for text in non_empty_df['body'].tolist():
    lineNo += 1
    tokens = myTokenize(text)
    c.update(tokens)
    if (lineNo % 100 == 0):
        sys.stderr.write("Processing email # {}\n".format(lineNo))
print(c.most_common(10))

[('us', 757), ('arlington', 692), ('file', 321), ('new', 320), ('item', 317), ('goldstein', 313), ('marle', 313), ('·', 122), ('chang', 114), ('roopa', 103)]


## Step 9: More EDA -- A better way of compiling the vocabulary
<a href="http://nlpforhackers.io/tf-idf/">Source http://nlpforhackers.io/tf-idf/</a>

In [22]:
# build the vocabulary in one pass
vocabulary = set()
for text in train_df['body']:
    words = myTokenize(text)
    vocabulary.update(words)
 
vocabulary = list(vocabulary)
word_index = {w: idx for idx, w in enumerate(vocabulary)}
 
VOCABULARY_SIZE = len(vocabulary)
DOCUMENTS_COUNT = len(train_df.index)
 
print("Vocabulary size: {}\nDocuments Count: {}\n".format(
    VOCABULARY_SIZE, DOCUMENTS_COUNT))   

Vocabulary size: 2635
Documents Count: 52



## Step 10: Let's look more closely at the features that have high tf-idf scores
<p>Tip of the hat to this blog: <a href="https://buhrmann.github.io/tfidf-analysis.html">blog post from Thomas Buhrmann</a></p>

In [23]:


#Returns top n tfidf features as df, but takes dense format vector as input
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df

#convert single row into dense format
def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Top tfidf features in specific document (matrix row) '''
    row = np.squeeze(Xtr[row_id].toarray())
    return top_tfidf_feats(row, features, top_n)

In [24]:
#This is showing us the top ten tfidf scores for document 3
top_feats_in_doc(tfidf_matrix, feature_names, 3, 10)

NameError: name 'tfidf_matrix' is not defined

In [25]:
#Helper function to calculate top n features that are on average most important among
#documents with grp_ids = ?
def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important amongst documents in rows
        indentified by indices in grp_ids. '''
    if grp_ids:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

#Top features for entire corpus
top_mean_feats(train_X, feature_names, None)

Unnamed: 0,feature,tfidf
0,2026222214,0.185499
1,pensionsoffic,0.185499
2,ostrowskidirector,0.185499
3,dc,0.176911
4,nanci,0.16901
5,arlington,0.034434
6,us,0.031487
7,file,0.025007
8,new,0.024464
9,item,0.024286


In [26]:
def top_feats_by_class(Xtr, y, features, min_tfidf=0.1, top_n=25):
    ''' Return a list of dfs, where each df holds top_n features and their mean tfidf value
        calculated across documents with the same class label. '''
    dfs = []
    ids = np.where(y.about_fraud==True)
    feats_df = top_mean_feats(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
    feats_df.label = 'fraud_waste_abuse'
    dfs.append(feats_df)
    return dfs

In [27]:
top_feats_by_class(test_X, test_df, feature_names)

AttributeError: 'DataFrame' object has no attribute 'about_fraud'