# Test Project: Import emails exported from outlook via csv into DOTCE, classify

## Step 1: Go to correct directory and activate the DOTCE virtual env

In [1]:
# Change into the DOTCE project directory and report the working directory.
%cd c:\Users\pviechnicki\Desktop\pviechnicki_home\sandbox\state\dotce
%pwd
# NOTE(review): each `!` line runs in its own subshell, so this activation most likely
# does not carry over to the notebook kernel -- the environment listing printed by the
# next command still marks `root` (not `dotce`) as the active environment.
! activate dotce
# List the available conda environments; the active one is flagged with `*`.
! conda info --envs

c:\Users\pviechnicki\Desktop\pviechnicki_home\sandbox\state\dotce
# conda environments:
#
dotce                    C:\Users\pviechnicki\AppData\Local\Continuum\Anaconda3\envs\dotce
root                  *  C:\Users\pviechnicki\AppData\Local\Continuum\Anaconda3



## Step 2: Read csv file of emails into df

In [2]:
import pandas as pd
import os
import numpy as np

#Make sure we're in the right directory
# (not the last expression of the cell, so the value is computed but not displayed)
os.getcwd()
# The export is pipe-delimited rather than comma-delimited.
email_df = pd.read_csv('C:\\Users\\pviechnicki\\Desktop\\pviechnicki_home\\sandbox\\state\\data\\pv_email\\combined_emails.csv', sep='|')
#Add rowid
email_df['rownum'] = range(len(email_df))
# Per-category counts (also not displayed, because it is not the last expression)
email_df.groupby('cat').count()
# Filter out rows with a missing body, then keep a random 10% sample.
# random_state pins the sample so that every downstream cell (class counts,
# tf-idf matrix, top features) gives the same result on Restart & Run All.
#https://stackoverflow.com/questions/29576430/shuffle-dataframe-rows
non_empty_df = email_df[email_df['body'].notnull()].sample(frac=.1, random_state=42)
print("Original dataframe contains {} messages\nNon-empty datafram contains {} messages\n".format(
len(email_df), len(non_empty_df)))


Original dataframe contains 4629 messages
Non-empty datafram contains 463messages



## Step 3: Extract features from text document corpus

In [3]:
import sys
# Add the pyLDAvis package directory of the dotce environment to the module search path.
# NOTE(review): nothing from pyLDAvis is imported in the cells visible here -- confirm
# this is still needed; the hard-coded absolute path only exists on the author's machine.
sys.path.append('C:\\Users\\pviechnicki\\AppData\\Local\\Continuum\\Anaconda3\\envs\\dotce\\lib\\site-packages\\pyLDAvis')
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
##From https://stackoverflow.com/questions/20078816/replace-non-ascii-characters-with-a-single-space
###Purpose: some messages in the email archive had empty bodies, which pandas
###read as float NaN values; those threw errors in code that expected strings
from unidecode import unidecode
def remove_non_ascii(text, rowid):
    """Return ``text`` with every non-ASCII character removed.

    Parameters
    ----------
    text : object
        Normally a ``str``. Any other value is reported on stderr together
        with ``rowid``. Missing values (``None`` or float NaN, which is how
        pandas represents an empty cell) become the empty string; every other
        non-string is converted with ``str()`` before filtering.
    rowid : object
        Row identifier, used only in the diagnostic message.

    Returns
    -------
    str
        The characters of the input whose code point is below 128.
    """
    if not isinstance(text, str):
        sys.stderr.write("Found a non-string at row {}, {}\n".format(rowid, text))
        # NaN is the only float that is not equal to itself. Returning "" here
        # keeps a missing body from turning into the literal token "nan"/"None".
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)
    return "".join(ch for ch in text if ord(ch) < 128)

## Step 4: Let's make sure we can tokenize these properly and remove stop words
<p>from <a href="http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html">CS Duke.edu</a></p>
<p>Make sure you've installed NLTK's English tokenizer (punkt) data and stop-word list
following <a href="http://www.nltk.org/data.html">these instructions</a>.
</p>

In [6]:
from collections import Counter
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Module-level English Snowball stemmer, shared by myTokenize below.
# NOTE(review): per the NLTK docs, ignore_stopwords=True leaves stop words unstemmed -- confirm.
snowballStemmer = SnowballStemmer("english", ignore_stopwords=True)
def preprocess(text):
    """Lower-case ``text`` and delete every ASCII punctuation character.

    Any value whose type is not exactly ``str`` (for example a NaN read from
    an empty cell) yields the empty string.
    """
    if type(text) != str:
        return ''
    # Translation table that maps each punctuation character to "deleted".
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return text.lower().translate(strip_punctuation)

_ENGLISH_STOPWORDS = None

def _english_stopwords():
    """Return the NLTK English stop-word list as a frozenset, loading it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS

def myTokenize(text):
    """Turn raw text into a list of stemmed, stop-word-free tokens.

    The text is cleaned with ``preprocess`` (lower-cased, punctuation removed;
    non-strings become ''), split with ``nltk.word_tokenize``, filtered against
    the English stop-word list, and stemmed with the module-level
    ``snowballStemmer``.

    Parameters
    ----------
    text : object
        The document to tokenize.

    Returns
    -------
    list of str
        Stemmed tokens in document order.
    """
    # Look the stop words up once per call, as a set. The previous version
    # called stopwords.words('english') again for every single token and then
    # searched the resulting list linearly, which dominated the run time.
    stop_words = _english_stopwords()
    cleaned = preprocess(text)
    words = nltk.word_tokenize(cleaned)
    return [snowballStemmer.stem(w) for w in words if w not in stop_words]
    
# Smoke test: tokenize the body stored under index label 0 of email_df and
# print the ten most frequent stems with their counts.
tokens = myTokenize(email_df['body'][0])
count = Counter(tokens)
print(count.most_common(10))

[('put', 2), ('mahesh', 1), ('registri', 1), ('research', 1), ('team', 1), ('peter', 1), ('may', 1), ('end', 1), ('drop', 1), ('nowrespons', 1)]


## Step 5: convert to term X document representation

In [7]:
##Instantiate a TFidf vectorizer
# Per the scikit-learn docs: sublinear_tf=True replaces tf with 1 + log(tf), and
# max_df=0.5 drops terms that occur in more than half of the documents.
# tokenizer=myTokenize makes the vectorizer use the stop-word-filtered, stemmed
# tokens produced by myTokenize.
vectorizer = TfidfVectorizer(sublinear_tf=True, encoding='utf-8', 
                             max_df=0.5, tokenizer=myTokenize)

In [10]:
#Create a vector of class labels
# One label per sampled email, taken from the 'cat' column of non_empty_df.
class_labels = non_empty_df['cat']
#use value_counts() method of series
# Last expression of the cell, so the per-class counts are displayed.
class_labels.value_counts()


future_of_work         190
fraud_waste_abuse      138
mission_analytics      117
workforce_analytics     18
Name: cat, dtype: int64

In [17]:
#This step can take a while
# Fit the vectorizer on the sampled email bodies and build the tf-idf
# document-term matrix in one call; every body is run through myTokenize.
tfidf_matrix = vectorizer.fit_transform(non_empty_df['body'])


In [12]:
##Save feature names in a separate list
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when it exists and fall back for older
# installations. list() keeps feature_names a plain list in both cases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

## Step 6: Check counts for specific terms
<p><i>Not worth doing this on larger corpora -- too slow!</i></p>

In [13]:
##Count how often each stemmed token occurs across all sampled email bodies
c = Counter()
lineNo = 0
# enumerate(..., start=1) keeps lineNo as the 1-based position of the email
for lineNo, text in enumerate(non_empty_df['body'].tolist(), start=1):
    tokens = myTokenize(text)
    c.update(tokens)
    # progress message on stderr after every 100th email
    if not lineNo % 100:
        sys.stderr.write("Processing email # {}\n".format(lineNo))
print(c.most_common(10))

Processing email # 100
Processing email # 200
Processing email # 300
Processing email # 400


[('us', 5312), ('arlington', 2462), ('peter', 1768), ('sent', 1126), ('viechnicki', 1055), ('mumbai', 800), ('1', 797), ('subject', 677), ('data', 610), ('mailtopviechnickideloittecom', 604)]


## Step 7: A better way of compiling the vocabulary
<a href="http://nlpforhackers.io/tf-idf/">Source http://nlpforhackers.io/tf-idf/</a>

In [14]:
# build the vocabulary in one pass
vocabulary = set()
for text in non_empty_df['body']:
    vocabulary.update(myTokenize(text))

# Sort before indexing: the iteration order of a set of strings can differ
# between kernel sessions, which would give each term a different index on
# every run. Sorting makes word_index reproducible.
vocabulary = sorted(vocabulary)
word_index = {w: idx for idx, w in enumerate(vocabulary)}

VOCABULARY_SIZE = len(vocabulary)
DOCUMENTS_COUNT = len(non_empty_df.index)

print("Vocabulary size: {}\nDocuments Count: {}\n".format(
    VOCABULARY_SIZE, DOCUMENTS_COUNT))

Vocabulary size: 11634
Documents Count: 463



## Step 8: Let's look more closely at the features that have high tf-idf scores
<p>Tip of the hat to this blog: <a href="https://buhrmann.github.io/tfidf-analysis.html">blog post from Thomas Buhrmann</a></p>

In [15]:
import numpy as np

#Returns top n tfidf features as df, but takes dense format vector as input
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.

    Parameters
    ----------
    row : 1-D numpy array
        Dense vector of tf-idf scores, one entry per feature.
    features : sequence of str
        Feature names, indexed in the same order as ``row``.
    top_n : int
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``tfidf``, sorted by descending score. The
        frame is empty (but still has both columns) when there is nothing to
        report.
    '''
    # argsort is ascending, so reverse it and keep the first top_n positions.
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Naming the columns in the constructor also works for an empty list;
    # assigning df.columns afterwards raised ValueError in that case.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

#convert single row into dense format
def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Report the highest-scoring tfidf features of one document.

    Row ``row_id`` of ``Xtr`` is converted to a dense array, flattened to one
    dimension and handed to top_tfidf_feats together with ``features``.
    '''
    dense_row = Xtr[row_id].toarray()
    flat_row = np.squeeze(dense_row)
    return top_tfidf_feats(flat_row, features, top_n)

In [18]:
#This is showing us the top ten tfidf scores for document 3
# (3 is the row position in tfidf_matrix; 10 is top_n)
top_feats_in_doc(tfidf_matrix, feature_names, 3, 10)

Unnamed: 0,feature,tfidf
0,mailtoweggers38gmailcom,0.475205
1,839,0.424112
2,tn,0.394224
3,27,0.283355
4,weggersdeloittecomsubject,0.278087
5,octob,0.270831
6,egger,0.239569
7,william,0.237561
8,2015,0.233639
9,tuesday,0.153122


In [20]:
#Helper function to calculate top n features that are on average most important among
#documents with grp_ids = ?
def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important amongst documents in rows
        identified by indices in grp_ids.

    Parameters
    ----------
    Xtr : matrix supporting row indexing and ``.toarray()``
        tf-idf document-term matrix (presumably a scipy sparse matrix, as
        returned by the vectorizer).
    features : sequence of str
        Feature names, one per column of ``Xtr``.
    grp_ids : row selector or None
        Rows to average over (list, numpy array, or the tuple returned by
        ``np.where``). ``None`` means every row.
    min_tfidf : float
        Scores below this threshold are set to 0 before averaging.
    top_n : int
        Number of features to return.

    Returns
    -------
    pandas.DataFrame
        Result of ``top_tfidf_feats`` applied to the column means.
    '''
    # Test against None explicitly: `if grp_ids:` raises ValueError for a
    # numpy array with more than one element, and silently falls back to the
    # whole corpus for an empty selection.
    if grp_ids is not None:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    # Ignore weak scores so they do not contribute to the mean.
    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

#Top features for entire corpus
# grp_ids=None selects every row; min_tfidf and top_n keep their defaults (0.1 and 25).
top_mean_feats(tfidf_matrix, feature_names, None)

Unnamed: 0,feature,tfidf
0,seattl,0.009854
1,91,0.009809
2,india,0.009113
3,junko,0.008851
4,7142,0.008495
5,jake,0.007699
6,teldirect,0.007575
7,mission,0.00751
8,join,0.007279
9,slide,0.007188


In [41]:
def top_feats_by_class(Xtr, y, features, min_tfidf=0.1, top_n=25, labels=None):
    ''' Return a list of dfs, where each df holds top_n features and their mean tfidf value
        calculated across documents with the same class label.

    Parameters
    ----------
    Xtr : matrix
        tf-idf document-term matrix; its rows must be in the same order as
        the rows of ``y``.
    y : pandas.DataFrame
        Frame with a ``cat`` column holding the class label of each document.
    features : sequence of str
        Feature names, one per column of ``Xtr``.
    min_tfidf : float
        Threshold passed through to ``top_mean_feats``.
    top_n : int
        Number of features per class.
    labels : iterable of str, optional
        Class labels to report on. Defaults to ``['fraud_waste_abuse']``,
        the single class this function was previously hard-coded to; pass
        e.g. ``y['cat'].unique()`` to cover every class.

    Returns
    -------
    list of pandas.DataFrame
        One frame per requested label, in the order given. Each frame has
        its label stored in a ``label`` attribute.
    '''
    if labels is None:
        labels = ['fraud_waste_abuse']
    dfs = []
    for label in labels:
        # Positional row numbers of the documents in this class. Item access
        # (y['cat']) avoids clashing with DataFrame attributes named `cat`.
        ids = np.where(y['cat'] == label)
        feats_df = top_mean_feats(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        feats_df.label = label
        dfs.append(feats_df)
    return dfs

In [42]:
# Mean-tfidf top features per class for the sampled emails; as defined above,
# top_feats_by_class reports on the 'fraud_waste_abuse' class only.
top_feats_by_class(tfidf_matrix, non_empty_df, feature_names)

[                             feature     tfidf
 0                               jake  0.025829
 1                              fraud  0.022808
 2                              brien  0.019223
 3                                dan  0.018991
 4                            graphic  0.016865
 5                               wast  0.016769
 6                             steven  0.015194
 7                               troy  0.015120
 8                               abus  0.014870
 9                              olson  0.014731
 10                        punzenberg  0.014698
 11                            pulkit  0.013977
 12      httpgovernment2020dupresscom  0.013881
 13                           program  0.013717
 14                               fwa  0.013497
 15  httpwwwsolutionrevolutionbookcom  0.013406
 16                           crystal  0.013276
 17                             iphon  0.012985
 18                          interact  0.012121
 19                        minneapoli  0