In [226]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import email

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from math import log, sqrt
from sklearn.model_selection import train_test_split
import re
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.naive_bayes import ComplementNB
from sklearn.feature_extraction.text import CountVectorizer
import random
%matplotlib inline

In [125]:
# we have the enron email data set and we are trying to automatically
# classify emails into sub-folders
# we have the mid-size data set where each email has given its text,
# subject, sender, CC, etc.
# let's look at the data first

In [272]:
mails = pd.read_csv('C:/Users/genaucka/Downloads/emails150MB/emails150MB.csv',header=None)

In [273]:
print(mails.head(10))
print(mails.tail(10))
print(' ')
print('size of the data set:')
print(mails.shape)

                                                   0  \
0  stokley-c/chris_stokley/projects/ees_/brenda_h...   
1     stokley-c/chris_stokley/projects/ees_/ercot/1.   
2    stokley-c/chris_stokley/projects/ees_/ercot/10.   
3    stokley-c/chris_stokley/projects/ees_/ercot/11.   
4     stokley-c/chris_stokley/projects/ees_/ercot/2.   
5     stokley-c/chris_stokley/projects/ees_/ercot/3.   
6     stokley-c/chris_stokley/projects/ees_/ercot/4.   
7     stokley-c/chris_stokley/projects/ees_/ercot/5.   
8     stokley-c/chris_stokley/projects/ees_/ercot/6.   
9     stokley-c/chris_stokley/projects/ees_/ercot/7.   

                                                   1  
0  Message-ID: <32295689.1075858519201.JavaMail.e...  
1  Message-ID: <11422706.1075858518802.JavaMail.e...  
2  Message-ID: <24333819.1075858519016.JavaMail.e...  
3  Message-ID: <17082846.1075858519041.JavaMail.e...  
4  Message-ID: <16514884.1075858518826.JavaMail.e...  
5  Message-ID: <3404255.1075858518849.JavaMail.ev... 

In [274]:
# so we have 59235 emails given
# give some column names to work with
mails.columns = ['file','message']

In [275]:
# so this data set is not a ready-made data-frame
# we will have to build it
# specifically, we will need the label (i.e. the target, here the folders in which to classify)
# and we will need predictors (text body, from )

In [276]:
# let's work at the target first: the classification into subfolders
# the information is in the folder structure:
mails.file.loc[[10,150,3000,3500,40000,40500]]

10       stokley-c/chris_stokley/projects/ees_/ercot/8.
150                   stokley-c/chris_stokley/sent/117.
3000                                   sturm-f/sent/41.
3500                           symes-k/_sent_mail/1070.
40000                            weldon-c/sent_items/4.
40500                      whalley-g/all_documents/287.
Name: file, dtype: object

In [277]:
# we can see that quite some folders are of the "sent" type
# these are not relevant for the task because we are to classify the incoming
# emails
mails = mails.loc[mails.file.str.find('sent') == -1]
mails.reset_index(inplace=True)

In [132]:
# now we have much less emails left
print('Number of only received emails:')
mails.shape[0]

Number of only received emails:


46383

In [133]:
# let's look at the folder structures again
mails.file.loc[[10,150,2000,2400,3000,3500,4000,4500,5000,8000,10000,12000,33000,36000,40000]]

10       stokley-c/chris_stokley/projects/ees_/ercot/8.
150           stokley-c/chris_stokley/volume_mang_/116.
2000                              swerzbin-m/inbox/100.
2400                        symes-k/all_documents/1139.
3000                        symes-k/all_documents/1785.
3500                        symes-k/all_documents/2304.
4000                        symes-k/all_documents/2840.
4500                        symes-k/all_documents/3387.
5000                         symes-k/all_documents/477.
8000                   symes-k/discussion_threads/2184.
10000                                   symes-k/it/105.
12000                      taylor-m/all_documents/2403.
33000                       whalley-l/all_documents/81.
36000                        white-s/deleted_items/692.
40000                williams-w3/bill_williams_iii/636.
Name: file, dtype: object

In [156]:
# we can see here that users (always the first string before first '/')
# have all their personal way of sorting emails into their personal subfolders
# there are two ways now to proceed:

# find a GENERAL rule for classifiying emails that fits all employees:
# for this we'd need to find the GENERAL subfolders (e.g. 'finance', 'projects', 'personal')
# for this we'd need some unsupervised learning to find clusters of emails
# name those clusters; use them as labels and from this build a classfier...
# I do not think this is a good idea, because it means "one-size-fits-all"
# the categories will be blurry and won't really help the single employee

# the second way is to find PERSONAL rules for classifying the emails
# for this we'd assume that every person has their own way of sorting emails
# the email client (me now) will automatically learn how they classify
# the labels are then given by their wubfolder structure
# I think this is more helpful for every single person
# but it only works if we have sufficient emails per subject
# also the suggestion to automatically classify in a subfolder should only
# be made if a certain threshold performance is reached
# let's check the emails of a single person
print(mails.file.loc[mails.file.str.find('taylor-m') != -1].head(30))
print(mails.file.loc[mails.file.str.find('taylor-m') != -1].tail(30))

10442       taylor-m/all_documents/1.
10443      taylor-m/all_documents/10.
10444     taylor-m/all_documents/100.
10445    taylor-m/all_documents/1000.
10446    taylor-m/all_documents/1001.
10447    taylor-m/all_documents/1002.
10448    taylor-m/all_documents/1003.
10449    taylor-m/all_documents/1004.
10450    taylor-m/all_documents/1005.
10451    taylor-m/all_documents/1006.
10452    taylor-m/all_documents/1007.
10453    taylor-m/all_documents/1008.
10454    taylor-m/all_documents/1009.
10455     taylor-m/all_documents/101.
10456    taylor-m/all_documents/1010.
10457    taylor-m/all_documents/1011.
10458    taylor-m/all_documents/1012.
10459    taylor-m/all_documents/1013.
10460    taylor-m/all_documents/1014.
10461    taylor-m/all_documents/1015.
10462    taylor-m/all_documents/1016.
10463    taylor-m/all_documents/1017.
10464    taylor-m/all_documents/1018.
10465    taylor-m/all_documents/1019.
10466     taylor-m/all_documents/102.
10467    taylor-m/all_documents/1020.
10468    tay

In [172]:
# ok, so to keep it simple we will assume that the label
# is the folder path after the user name and before the number
# I started to implement the parsing myself but given the time constraint
# and given that the ENRON email data set is very well known and widely used
# on KAGGLE I turned to a ready made parsing solution using the python email package:
# https://www.kaggle.com/jaykrishna/topic-modeling-enron-email-dataset
# it helps especially to get the message ID and the content separated

# we can see above already, that there are certain folders that are very repetitive and uninformative:
# "all_documents"; similarly some users have "inbox"
# this will probably lead to problems down the road; but we first try out a classifier

In [278]:
# this is from: https://www.kaggle.com/jaykrishna/topic-modeling-enron-email-dataset
## Helper functions
def get_text_from_email(msg):
    '''To get the content from email objects'''
    parts = []
    for part in msg.walk():
        if part.get_content_type() == 'text/plain':
            parts.append( part.get_payload() )
    return ''.join(parts)

def split_email_addresses(line):
    '''To separate multiple email addresses'''
    if line:
        addrs = line.split(',')
        addrs = frozenset(map(lambda x: x.strip(), addrs))
    else:
        addrs = None
    return addrs

In [279]:
# this is from: https://www.kaggle.com/jaykrishna/topic-modeling-enron-email-dataset

# Parse the emails into a list email objects
messages = list(map(email.message_from_string, mails['message']))
mails.drop('message', axis=1, inplace=True)
# Get fields from parsed email objects
keys = messages[0].keys()
for key in keys:
    mails[key] = [doc[key] for doc in messages]
# Parse content from emails
mails['content'] = list(map(get_text_from_email, messages))
# Split multiple email addresses
mails['From'] = mails['From'].map(split_email_addresses)
mails['To'] = mails['To'].map(split_email_addresses)

# Extract the root of 'file' as 'user'
mails['user'] = mails['file'].map(lambda x:x.split('/')[0])
del messages

mails.head()

Unnamed: 0,index,file,Message-ID,Date,From,To,Subject,Cc,Mime-Version,Content-Type,...,Bcc,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,content,user
0,0,stokley-c/chris_stokley/projects/ees_/brenda_h...,<32295689.1075858519201.JavaMail.evans@thyme>,"Sun, 22 Jul 2001 16:29:07 -0700 (PDT)",(f..herod@enron.com),"(chris.stokley@enron.com, kenny.ha@enron.com)",FW: Metered Usage,"vivian.hart@enron.com, patti.thompson@enron.com",1.0,text/plain; charset=us-ascii,...,"vivian.hart@enron.com, patti.thompson@enron.com","Herod, Brenda F. </O=ENRON/OU=NA/CN=RECIPIENTS...","Ha, Kenny </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Not...","Hart, Vivian </O=ENRON/OU=NA/CN=RECIPIENTS/CN=...",,"\Stokley, Chris (Non-Privileged)\Chris Stokley...",Stokley-C,"Stokley, Chris (Non-Privileged).pst","As part of Project Ranger, it is my goal to ha...",stokley-c
1,1,stokley-c/chris_stokley/projects/ees_/ercot/1.,<11422706.1075858518802.JavaMail.evans@thyme>,"Tue, 24 Jul 2001 08:19:57 -0700 (PDT)",(preston.ochsner@enron.com),"(george.phillips@enron.com, chris.stokley@enro...",Settlements - What we've done in CA and what w...,"gary.nelson@enron.com, jeffrey.miller@enron.co...",1.0,text/plain; charset=us-ascii,...,"gary.nelson@enron.com, jeffrey.miller@enron.co...","Ochsner, Preston </O=ENRON/OU=NA/CN=RECIPIENTS...","Stokley, Chris </O=ENRON/OU=NA/CN=RECIPIENTS/C...","Nelson, Gary </O=ENRON/OU=NA/CN=RECIPIENTS/CN=...",,"\Stokley, Chris (Non-Privileged)\Chris Stokley...",Stokley-C,"Stokley, Chris (Non-Privileged).pst",EB640\t,stokley-c
2,2,stokley-c/chris_stokley/projects/ees_/ercot/10.,<24333819.1075858519016.JavaMail.evans@thyme>,"Wed, 1 Aug 2001 06:07:35 -0700 (PDT)",(preston.ochsner@enron.com),"(p..o'neil@enron.com, george.phillips@enron.co...",FW: Meeting Thurs. 8/2 on Texas retail issues,,1.0,text/plain; charset=us-ascii,...,,"Ochsner, Preston </O=ENRON/OU=NA/CN=RECIPIENTS...","Phillips, George </O=ENRON/OU=NA/CN=RECIPIENTS...",,,"\Stokley, Chris (Non-Privileged)\Chris Stokley...",Stokley-C,"Stokley, Chris (Non-Privileged).pst",would like to have you there\n\n--------------...,stokley-c
3,3,stokley-c/chris_stokley/projects/ees_/ercot/11.,<17082846.1075858519041.JavaMail.evans@thyme>,"Mon, 30 Jul 2001 18:04:46 -0700 (PDT)",(preston.ochsner@enron.com),"(george.phillips@enron.com, michele.raque@enro...",ERCOT Physical Delivery Progress Report,"joseph.wagner@enron.com, gary.nelson@enron.com",1.0,text/plain; charset=us-ascii,...,"joseph.wagner@enron.com, gary.nelson@enron.com","Ochsner, Preston </O=ENRON/OU=NA/CN=RECIPIENTS...","O'Neil, Murray P. </O=ENRON/OU=NA/CN=RECIPIENT...","Wagner, Joseph </O=ENRON/OU=NA/CN=RECIPIENTS/C...",,"\Stokley, Chris (Non-Privileged)\Chris Stokley...",Stokley-C,"Stokley, Chris (Non-Privileged).pst",\nI need your help to make September 1 physica...,stokley-c
4,4,stokley-c/chris_stokley/projects/ees_/ercot/2.,<16514884.1075858518826.JavaMail.evans@thyme>,"Tue, 24 Jul 2001 18:01:37 -0700 (PDT)",(preston.ochsner@enron.com),"(george.phillips@enron.com, jeff.merola@enron....",ERCOT Physical Delivery Progress Report,"jeffrey.miller@enron.com, joseph.wagner@enron....",1.0,text/plain; charset=us-ascii,...,"jeffrey.miller@enron.com, joseph.wagner@enron....","Ochsner, Preston </O=ENRON/OU=NA/CN=RECIPIENTS...","O'Neil, Murray P. </O=ENRON/OU=NA/CN=RECIPIENT...","Miller, Jeffrey </O=ENRON/OU=NA/CN=RECIPIENTS/...",,"\Stokley, Chris (Non-Privileged)\Chris Stokley...",Stokley-C,"Stokley, Chris (Non-Privileged).pst","\nWhile cautious of the issues we face, I'm ve...",stokley-c


In [280]:
# we have the user variable, but we need the subfolder variable as well
def get_subfolder(cur_str):
    parts = cur_str.split('/')
    return('/'.join(parts[1:-1]))

In [281]:
mails['subfolder'] = mails.file.apply(get_subfolder)

In [282]:
mails.subfolder[0:10]

0    chris_stokley/projects/ees_/brenda_herod
1           chris_stokley/projects/ees_/ercot
2           chris_stokley/projects/ees_/ercot
3           chris_stokley/projects/ees_/ercot
4           chris_stokley/projects/ees_/ercot
5           chris_stokley/projects/ees_/ercot
6           chris_stokley/projects/ees_/ercot
7           chris_stokley/projects/ees_/ercot
8           chris_stokley/projects/ees_/ercot
9           chris_stokley/projects/ees_/ercot
Name: subfolder, dtype: object

In [283]:
# so let's check how many users we have got and how many emails per user
all_users = set(mails.user)
print('We have {:f} users.'.format(len(all_users)))
print(' ')
print('We have per user that much data:')
print(mails.user.value_counts())

We have 23.000000 users.
 
We have per user that much data:
taylor-m       11466
symes-k         8178
williams-w3     2921
whalley-l       2883
white-s         2815
watson-k        1979
whalley-g       1748
ward-k          1712
wolfe-j         1494
zipper-a        1213
tholt-j         1201
ybarbo-p        1161
thomas-p        1130
weldon-c        1110
williams-j      1084
storey-g         876
sturm-f          752
tycholiz-b       737
townsend-j       564
whitt-m          504
stokley-c        354
swerzbin-m       282
zufferli-j       219
Name: user, dtype: int64


In [284]:
# in some users it can be tricky (less than 500 messages), but we will see
# this will further lead to very few messages in deep subfolders which makes it
# hard for a classifier (too few examples)

# let's turn to the message
# we are given a message ID, most importantly per employee,
# we need to know whether any message has been placed into two different folders
# i.e. duplicates check
def no_duplicates(cur_df):
    duplicateRowsDF = cur_df[cur_df.duplicated(['Message-ID'])]
    if (duplicateRowsDF.shape[0]>0):
        return False
    else:
        return True

In [285]:
# run for all users
dup = []
for u in all_users:
    cur_df = mails.loc[mails.user == u]
    dup.append(discard_duplicates(cur_df))
    
if all(dup):
    print('There are no duplicate problems.')
else:
    print('There are duplicate problems!')
    

There are duplicate problems!


In [286]:
# for classification into folders two variables seem most important:
# From whom is the email?
# What is the content??
# we merge this into one information "soup"
def create_soup(x):
    return ' '.join(list(x['From'])) + ' ' + x['content']

mails['soup'] = mails.apply(create_soup, axis=1)

In [287]:
mails.soup[0:20]

0     f..herod@enron.com As part of Project Ranger, ...
1                     preston.ochsner@enron.com EB640\t
2     preston.ochsner@enron.com would like to have y...
3     preston.ochsner@enron.com \nI need your help t...
4     preston.ochsner@enron.com \nWhile cautious of ...
5     preston.ochsner@enron.com as promised. \n\nonc...
6     preston.ochsner@enron.com as promised, below a...
7     preston.ochsner@enron.com see you there\n-----...
8     preston.ochsner@enron.com fyi\n---------------...
9     preston.ochsner@enron.com can we meet Wednesda...
10    preston.ochsner@enron.com did you guys ever fi...
11    preston.ochsner@enron.com have you had a chanc...
12    george.phillips@enron.com Per our discussion l...
13    neil.bresnan@enron.com \n---------------------...
14    neil.bresnan@enron.com Is EPMI involved with t...
15    edith.cross@enron.com We also need to add a Ph...
16    edith.cross@enron.com Here is a tentative list...
17    edith.cross@enron.com Please plan to atten

In [288]:
# This function I have adapted from a email spam classification
# I could use it for stemming (cause it is not in sklearn by default)
# but first try without
def process_message(message, lower_case = True, stem = True, stop_words = True, gram = 1):
    '''function to pre-process a message'''
    if lower_case:
        message = message.lower()
    words = word_tokenize(message)
    words = [w for w in words if len(w) > 2]
    if gram > 1:
        w = []
        for i in range(len(words) - gram + 1):
            w += [' '.join(words[i:i + gram])]
            return w # early return
    if stop_words:
        sw = stopwords.words('english')
        words = [word for word in words if word not in sw]
    if stem:
        stemmer = PorterStemmer()
        words = [stemmer.stem(word) for word in words]
    return ' '.join(words)

In [289]:
# let's try the classification for a subject
cur_df = mails.loc[mails.user == 'taylor-m']

# predictors X and target y
X = cur_df.soup
y = cur_df.subfolder


# split in train test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)



In [290]:
# from here on we will use the sklearn
# which does not seem to have stemming, but for now we won't use it
# form my experience stemming does not always increase performance

# vectorizing
vectorizer = TfidfVectorizer(stop_words='english', max_features=4000,strip_accents='unicode',lowercase=True)

# Construct the required TF-IDF matrix by fitting the transformation model
# and then applying the transformation
vectorizer.fit(X_train)
vect_matrix_train = vectorizer.transform(X_train)
vect_matrix_test = vectorizer.transform(X_test)

# Output the shape of tfidf_matrix of test data set
print(vect_matrix_test.shape)

(2867, 4000)


In [198]:
# fit the classifier and predict the training and test data
# maybe using complement NB to deal with unbalancedness
#clf = MultinomialNB()
clf = ComplementNB()
#clf = GaussianNB()
clf.fit(vect_matrix_train, y_train)
preds_ts = clf.predict(vect_matrix_test)
preds_tr = clf.predict(vect_matrix_train)

In [199]:
# check if classifier makes varied predictions
set(preds_ts)

{'all_documents',
 'archive',
 'archive/11_99',
 'archive/1_00',
 'archive/2001_07',
 'archive/2001_08',
 'archive/2001_10',
 'archive/2001_11',
 'archive/5_00',
 'archive/6_00',
 'archive/7_00',
 'archive/8_00',
 'archive/9_00',
 'archive/november1999',
 'brazil_trading',
 'credit_watch_list',
 'deleted_items',
 'inbox',
 'inbox/compression',
 'inbox/entouch',
 'inbox/esource',
 'inbox/ev',
 'inbox/john_mas',
 'inbox/lisa_j',
 'inbox/marketing',
 'inbox/mary_schoen',
 'inbox/methanol_plant',
 'inbox/nox_model',
 'inbox/parking',
 'inbox/passwords',
 'inbox/recruiting',
 'inbox/siegal',
 'inbox/so2',
 'inbox/social',
 'inbox/tnrcc',
 'inbox/training',
 'inbox/trevor',
 'kelly',
 'kiodex',
 'monmouth',
 'notes_inbox',
 'online_trading',
 'online_trading/content',
 'online_trading/credit_derivatives',
 'online_trading/no_more_confirms',
 'online_trading/product_descriptions',
 'restricted_list',
 'stored_messages',
 'swap_group_project_lists',
 'time_off',
 'travel',
 'uk_trading',
 'yod

In [239]:
# for now only use accuracy to ball-park
# since it is a multinomial problem accuracy is a good first-look measure 
def report(y,y_hat):
    '''function to print some metrics'''
    acc = metrics.accuracy_score(y,y_hat)
    print('Accuracy: {:f}'.format(acc))
    
    # also give the accuracy under 0-hypothesis
    y_s = pd.Series(y_s)
    acc_0 = 0
    for i in range(100):
        acc_0 += metrics.accuracy_score(y_hat_s.sample(frac=1),y_hat_s)
    
    print('Accuracy under assumption of null-hypothesis: {:f}'.format(acc_0/100))

In [240]:
# evaluation train and test
print('Performance Train:')
report(y_train, preds_tr)
print(' ')
print('Performance Test:')
report(y_test, preds_ts)



Performance Train:
Accuracy: 0.294569
Accuracy under assumption of null-hypothesis: 0.337583
 
Performance Test:
Accuracy: 0.307290
Accuracy under assumption of null-hypothesis: 0.391751


In [None]:
# the classifier performes not at all
# probelmatic is that the labels are heavily multinnomial and hiearchical
# i.e. an email that is in 'inbox/compression' cannot be in 'archive/2001_11'
# perhaps a decision tree is more useful

In [215]:
# first try a different one
clf = LogisticRegression(multi_class = 'multinomial', solver='saga')
clf.fit(vect_matrix_train, y_train)
preds_ts = clf.predict(vect_matrix_test)
preds_tr = clf.predict(vect_matrix_train)



In [216]:
# evaluation train and test
print('Performance Train:')
report(y_train, preds_tr)
print(' ')
print('Performance Test:')
report(y_test, preds_ts)

Performance Train:
Accuracy: 0.565415
 
Performance Test:
Accuracy: 0.396931


In [241]:
# also very badly
# let's try the random forest
clf = RandomForestClassifier(n_estimators = 100,max_features = 0.1)
clf.fit(vect_matrix_train, y_train)
preds_ts = clf.predict(vect_matrix_test)
preds_tr = clf.predict(vect_matrix_train)

In [242]:
# evaluation train and test
print('Performance Train:')
report(y_train, preds_tr)
print(' ')
print('Performance Test:')
report(y_test, preds_ts)

Performance Train:
Accuracy: 0.697756
Accuracy under assumption of null-hypothesis: 0.319016
 
Performance Test:
Accuracy: 0.307290
Accuracy under assumption of null-hypothesis: 0.374437


In [None]:
# the random forest at least fits the training data, but overfits
# it does not generalize well to new data; it even over-learns;
# i.e. it learns things in the training data that are completely
# different in the test-data

In [None]:
# now I am turning online to find out more about this problem
# there is a paper
# let's see how these colleagues dealt with some of the issues:
# https://www.semanticscholar.org/paper/...
# ...Automatic-Categorization-of-Email-into-Folders%3A-on-Bekkerman/b52e782f894e9d0223618db6c01aa381627ff61b

# okay in this paper the people also complain about the deep, small folders and the repetitive, uninformative folders
# that were not created by the employees
# they disregard those; how exactly to do this is unclear to me: disregard completely those messages; or just
# remove the folder from path? "All_Documents/project_nox" --> here All_documents is uninformative but project nox is

# I would create a new subfolder variable: i.e. messages in very deep but poorly populated folders will be assgined
# the folder one level up: this way messages do not get lost

# further they argue that the train and test split should be time-dependent: i.e. the classifier should only
# learn form past emails and be only assessed on future emails

# They use the bag-of-word method like me (i.e. creating a word soup) and turning the
# each soup in to a vector that counts word occurences
# they prune some words

# They use maximum entropy with quasi newton optimization,
# Naive Bayes (as suggested by me already); support vector machine (I often end up
# not using because too slow in training) and Wide-Margin Winnow, an online 
# learning algo that can deal with high dimensionality data; do not know it yet but sounds interesting!

In [292]:
# let's implement the subfolder cleaning
# I keep the original one because the mapping back to original labels
# can help later on
mails['subf_clean'] = mails.subfolder
mails.subf_clean = mails.subf_clean.str.lower()
mails.subf_clean = mails.subf_clean.str.replace('all_documents/','')
mails.subf_clean = mails.subf_clean.str.replace('inbox/','')
mails.subf_clean = mails.subf_clean.str.replace('discussion_threads/','')

In [293]:
# now discard emails that are in folders with just 3 or less emails
def create_full_folder(x):
    return x['user']+'/' +x['subf_clean']

mails['full_folder'] = mails.apply(create_full_folder, axis=1)
freqs = mails['full_folder'].value_counts()

In [294]:
# From here I would continue the cleaning of the labels and then retry naive bayes and random forest; and then also the other
# ones suggested in the paper...

In [295]:
# continue after 2pm submission:
frqs = pd.DataFrame(mails.subf_clean.value_counts().reset_index())
frqs.columns = ['subf_clean', 'count']

In [296]:
mails = pd.merge(mails, frqs, on='subf_clean')

In [297]:
mails = mails.loc[mails['count'] > 5]

In [298]:
# let's try the classification for a subject
cur_df = mails.loc[mails.user == 'taylor-m']

# predictors X and target y
X = cur_df.soup
y = cur_df.subf_clean

# split in train test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [299]:
set(y)

{'all_documents',
 'analyst_prog',
 'archive',
 'archive/11_99',
 'archive/1_00',
 'archive/2001_05',
 'archive/2001_06',
 'archive/2001_07',
 'archive/2001_08',
 'archive/2001_09',
 'archive/2001_10',
 'archive/2001_11',
 'archive/2_00',
 'archive/5_00',
 'archive/6_00',
 'archive/7_00',
 'archive/8_00',
 'archive/9_00',
 'archive/november1999',
 'australia_trading',
 'contacts',
 'credit_watch_list',
 'deleted_items',
 'entouch',
 'esource',
 'ev',
 'inbox',
 'isda',
 'kelly',
 'kiodex',
 'lisa_j',
 'marketing',
 'mary_schoen',
 'methanol_plant',
 'monmouth',
 'notes_inbox',
 'nox_model',
 'nox_model/websites',
 'online_trading',
 'online_trading/content',
 'online_trading/credit_derivatives',
 'online_trading/eta_amendments',
 'online_trading/no_more_confirms',
 'online_trading/product_descriptions',
 'oslo',
 'passwords',
 'recruiting',
 's_a__trading',
 'so2',
 'social',
 'stored_messages',
 'swap_group_project_lists',
 'tasks',
 'time_off',
 'tnrcc',
 'to_do',
 'training',
 'trav

In [300]:
# from here on we will use the sklearn
# which does not seem to have stemming, but for now we won't use it
# form my experience stemming does not always increase performance

# vectorizing
vectorizer = TfidfVectorizer(stop_words='english', max_features=4000,strip_accents='unicode',lowercase=True)

# Construct the required TF-IDF matrix by fitting the transformation model
# and then applying the transformation
vectorizer.fit(X_train)
vect_matrix_train = vectorizer.transform(X_train)
vect_matrix_test = vectorizer.transform(X_test)

# Output the shape of tfidf_matrix of test data set
print(vect_matrix_test.shape)

(2852, 4000)


In [301]:
# fit the classifier and predict the training and test data
# maybe using complement NB to deal with unbalancedness
#clf = MultinomialNB()
clf = ComplementNB()
#clf = GaussianNB()
clf.fit(vect_matrix_train, y_train)
preds_ts = clf.predict(vect_matrix_test)
preds_tr = clf.predict(vect_matrix_train)

In [302]:
# evaluation train and test
print('Performance Train:')
report(y_train, preds_tr)
print(' ')
print('Performance Test:')
report(y_test, preds_ts)


Performance Train:
Accuracy: 0.527706
Accuracy under assumption of null-hypothesis: 0.263183
 
Performance Test:
Accuracy: 0.434081
Accuracy under assumption of null-hypothesis: 0.274148


In [308]:
a = list(range(0,20))

In [309]:
b = list(range(0,20))

In [311]:
acc_0 = 0
for i in range(0,100):
    random.shuffle(b)
    acc_0 += np.mean(a==b)

print(acc_0/100)

0.0


In [312]:
a

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [313]:
b

[16, 17, 0, 15, 12, 6, 7, 13, 10, 14, 4, 2, 18, 3, 19, 8, 9, 1, 5, 11]