In [None]:
# This script loads all emails from all directories into a list and then a data frame, and determines which email address
# belongs to the folder owner based on an exact string match with the first and second word in X-from of each email. This
# also allows us to determine which emails are inbox and which are outbox. Directory structure or names are not sufficient for
# a safe procedure.

# Further, for emails that contain forwarded emails in their body, we strip all non-body parts from every forward in the thread.
# Finally, we strip environmental and privacy warnings at the bottom of emails.

# We save the resulting data frame to disk.


In [None]:
import os
from collections import Counter

# we will use email package to extract (half) structured data from emails
from email.parser import Parser
rootdir = "/notebooks/LDA models and data/Data Frames and lists/Enron3/maildir/"

# helper function to parse email and produce a list with needed fields
def email_analyse(inputfile,  email_list):
    """Parse one email file and append its key fields to ``email_list``.

    Parameters
    ----------
    inputfile : str
        Path of the raw email file to read.
    email_list : list
        Accumulator, mutated in place. Exactly one record is appended:
        ``[inputfile, From header, X-from header, date header, payload]``.
        A header that is absent from the message is stored as ``None``.

    Returns
    -------
    None
    """
    with open(inputfile, "r") as f:
        data = f.read()

    message = Parser().parsestr(data)
    # Store the path this function was actually given. Rebuilding it from the
    # caller's loop variables (`directory`, `filename`) only worked by accident
    # while those globals happened to exist and match `inputfile`.
    email_list.append([
        inputfile,
        message['From'],
        message['X-from'],
        message['date'],
        message.get_payload(),
    ])


# accumulator that email_analyse fills with one record per email file
email_list = []

# visit every file below rootdir, whatever its depth, and parse it
for directory, subdirectory, filenames in os.walk(rootdir):
    for filename in filenames:
        email_path = os.path.join(directory, filename)
        email_analyse(email_path, email_list)

In [None]:
import pandas as pd

# turn the parsed records into a data frame and drop rows that are exact duplicates
df_emails = pd.DataFrame(
    email_list,
    columns=['dirpath', 'from', 'Xfrom', 'date', 'body'],
).drop_duplicates()

In [None]:
# extract folder name that contains surname and first letter of name of executive.
# The original subscript was left empty (`x.split('/')[]`), which does not even parse.
# We take the first path component below rootdir instead of a hard-coded index, so the
# result does not depend on how deep rootdir itself is.
# NOTE(review): assumes each executive's folder sits directly under rootdir - confirm.
df_emails['dirpath'] = df_emails['dirpath'].apply(
    lambda x: os.path.relpath(x, rootdir).split(os.sep)[0])

# replace X-from values that are not strings (None when the header is missing) by ''
# so the string processing below can run on every row
df_emails['Xfrom'] = df_emails['Xfrom'].apply(lambda x: x if isinstance(x, str) else '')

In [None]:
# helper function to extract
def f1(x):
    """Return the second space-separated word of ``x``, or '' if there is none.

    Double quotes are removed, surrounding whitespace is trimmed and the text
    is upper-cased before it is split on single spaces.
    """
    words = x.replace('"', '').strip().upper().split(' ')
    return words[1] if len(words) > 1 else ''

# helper functions to remove undesired characters in data frame columns
def f2(x):
    """Remove commas, trim and upper-case ``x``; return '' if at most one character is left."""
    cleaned = x.replace(',', '').strip().upper()
    if len(cleaned) <= 1:
        return ''
    return cleaned

def f3(x):
    """Remove semicolons, trim and upper-case ``x``; return '' if at most one character is left."""
    cleaned = x.replace(';', '').strip().upper()
    if len(cleaned) <= 1:
        return ''
    return cleaned

def f4(x):
    """Remove backslashes, trim and upper-case ``x``; return '' if at most one character is left."""
    cleaned = x.replace('\\', '').strip().upper()
    if len(cleaned) <= 1:
        return ''
    return cleaned

In [None]:
## process Xfrom0
# first word of X-from (mostly the name, sometimes the surname): quotes removed, trimmed,
# upper-cased, then commas (f2) and semicolons (f3) stripped for the string distance evaluation
df_emails['Xfrom0'] = (
    df_emails['Xfrom']
    .apply(lambda x: x.replace('"', '').strip().upper().split(' ')[0])
    .apply(f2)
    .apply(f3)
)

# second word of X-from where there is one (f1), cleaned with the same two helpers
df_emails['Xfrom1'] = (
    df_emails['Xfrom']
    .apply(f1)
    .apply(f2)
    .apply(f3)
)

# actual surname: the part of the folder name before the first '-', upper-cased
df_emails['dirpath_surname'] = df_emails['dirpath'].apply(
    lambda x: x.strip().upper().split('-')[0]
)

In [None]:
import editdistance as ed

# compute the Levenshtein distance between the directory surname and the second (dist1) and
# first (dist0) word in X-from, as the surname seems to appear in both positions
# on my machine with 256GB RAM and 24 cores processor (AZURE HPC) it takes a few minutes to complete
for dist_col, word_col in (('dist1', 'Xfrom1'), ('dist0', 'Xfrom0')):
    df_emails[dist_col] = df_emails[[word_col, 'dirpath_surname']].apply(
        lambda row: ed.eval(row[word_col], row['dirpath_surname']),
        axis=1,
    )

In [None]:
# minimal Levenshtein distance per directory surname, for both word positions
min_dists = df_emails.groupby(['dirpath_surname']).agg({'dist1': 'min', 'dist0': 'min'})
df_emails_grouped = min_dists.reset_index().rename(
    columns={'dist1': 'dist1_min', 'dist0': 'dist0_min'}
)

# attach the per-folder minima to every email of that folder
df_emails = pd.merge(df_emails, df_emails_grouped, how='left', on=['dirpath_surname'])

# keep the emails of folders where some email matches the surname exactly (minimal distance 0)
# it turns out that not all folders actually contain a from email with a matching name (out of 135 we matched 127) and we choose for
# our recommendation engine mentors and mentees only out of those executives whose name matches completely in the above way
has_exact_match = (df_emails['dist1_min'] == 0) | (df_emails['dist0_min'] == 0)
df_emails_mindist = df_emails[has_exact_match].reset_index()

In [None]:
# CHECK: number of distinct folder surnames in the matched subset vs. in all emails
# (single-argument print(...) gives the same output on Python 2 and also parses on Python 3)
print(df_emails_mindist['dirpath_surname'].drop_duplicates().shape)
print(df_emails['dirpath_surname'].drop_duplicates().shape)

In [None]:
# ANOTHER CHECK
# check how many mails are dropped this way; 2.5% emails dropped OK!
# (single-argument print(...) gives the same output on Python 2 and also parses on Python 3)
print(df_emails_mindist.shape)
print(df_emails.shape)

In [None]:
# In order to (attempt to) keep only the part of the email body written by the sender, which we later need for expertise and preference
# determination, we chop every email body at the first appearance of any of the typical marker strings in the functions below. Our method is far from exhaustive
# from a text-processing point of view, but due to lack of time we will proceed with it.

def h1(x):
    """Return the part of ``x`` before the first run of 32 asterisks (all of ``x`` if absent)."""
    head, _, _ = x.partition('********************************')
    return head

def h2(x):
    """Return the part of ``x`` before the first '-----Original Message-----' (all of ``x`` if absent)."""
    head, _, _ = x.partition('-----Original Message-----')
    return head

def h3(x):
    """Return the part of ``x`` before the first run of 26 underscores (all of ``x`` if absent)."""
    head, _, _ = x.partition('__________________________')
    return head

def h4(x):
    """Return the part of ``x`` before the first '---Forwarded by' (all of ``x`` if absent)."""
    head, _, _ = x.partition('---Forwarded by')
    return head

def h5(x):
    """Return the part of ``x`` before the first '---Forwarded By' (all of ``x`` if absent)."""
    head, _, _ = x.partition('---Forwarded By')
    return head

def h6(x):
    """Return the part of ``x`` before the first '--- Forwarded by' (all of ``x`` if absent)."""
    head, _, _ = x.partition('--- Forwarded by')
    return head

def h7(x):
    """Return the part of ``x`` before the first '--- Forwarded By' (all of ``x`` if absent)."""
    head, _, _ = x.partition('--- Forwarded By')
    return head

In [None]:
# take a real copy of df_emails to process further for preference/expertise determination.
# A plain assignment (df_emails0 = df_emails) only creates a second name for the SAME frame,
# so chopping the bodies below would also truncate df_emails, whose full threads are needed
# for the topic modeling part.
df_emails0 = df_emails.copy()

# chop email bodies at the first occurrence of each marker, in the order h1..h7
for chop in (h1, h2, h3, h4, h5, h6, h7):
    df_emails0['body'] = df_emails0['body'].apply(chop)

In [None]:
# Emails can have forwarded emails below or environmental cautions/privacy warnings at the bottom of them. Since this can be the case with
# many emails we need to strip these pieces of text to avoid noise in our topic modeling.
# We start by defining the regex objects that we need for this processing step.

import re

# any line (run of non-newline characters) that contains an '@'
email_pat = re.compile(".+@.+")

# header-like fragments: from the keyword up to and including the end of that line.
# None of these is anchored to the start of a line, so the keyword also matches mid-line.
to_pat = re.compile("To:.+\n")
cc_pat = re.compile("cc:.+\n")
subject_pat = re.compile("Subject:.+\n")
from_pat = re.compile("From:.+\n")
sent_pat = re.compile("Sent:.+\n")
received_pat = re.compile("Received:.+\n")
ctype_pat = re.compile("Content-Type:.+\n")
reply_pat = re.compile("Reply- Organization:.+\n")
date_pat = re.compile("Date:.+\n")
xmail_pat = re.compile("X-Mailer:.+\n")
mimver_pat = re.compile("MIME-Version:.+\n")

# delimiter-enclosed blocks. With re.DOTALL and a greedy '.+' each pattern spans from the
# FIRST delimiter run to the LAST one in the text, across newlines.
# NOTE(review): that also removes any text lying between two separate delimited blocks -
# confirm this is intended before switching to a non-greedy '.+?'.
dash_pat = re.compile("--+.+--+", re.DOTALL)
# raw string: '\*' is not a valid escape in a normal string literal and triggers a
# warning on recent Python versions; the compiled pattern is unchanged
star_pat = re.compile(r'\*\*+.+\*\*+', re.DOTALL)
uscore_pat = re.compile(" __+.+__+", re.DOTALL)
equals_pat = re.compile("==+.+==+", re.DOTALL)

In [None]:
# Next we define a function that takes the body of an email (possibly containing forwarded email threads and/or
# environmental warnings/privacy cautions) and returns only the email text (also from the emails of a full forward thread
# where applicable).

def clean_forward_email(email):
    """Strip header-like lines and delimiter-enclosed blocks from an email body.

    Parameters
    ----------
    email : str
        Email body text, possibly containing forwarded messages.

    Returns
    -------
    str
        The text after the first '.nsf' marker (or, if there is none, after the
        first '.pst' marker), with every match of the module-level patterns removed.
        If neither marker occurs, the whole text is cleaned.
    """
    # Skip everything up to and including the mailbox-file marker, when there is one.
    # NOTE(review): presumably the marker ends an embedded header block - confirm.
    # Without a marker we must start at 0: the previous code computed
    # find('') + 4 == 4 in that case and silently dropped the first four characters.
    start = 0
    for marker in ('.nsf', '.pst'):
        position = email.find(marker)
        if position != -1:
            start = position + len(marker)
            break
    email_new = email[start:]

    # The order matters: line-based header patterns run before the greedy,
    # multi-line delimiter patterns.
    patterns = (
        to_pat, cc_pat, subject_pat, from_pat, sent_pat, received_pat,
        email_pat, ctype_pat, reply_pat, date_pat, xmail_pat, mimver_pat,
        dash_pat, star_pat, uscore_pat, equals_pat,
    )
    for pattern in patterns:
        email_new = pattern.sub('', email_new)
    return email_new

In [None]:
# Process the email body column in the full data frame containing all emails. These will be used for topic modeling and it seems
# perfectly sound to consider a full thread as one document with common topic(s).
# The reduced table df_emails_mindist needs an additional step. Namely, there we need to determine preferences and for that we need
# to be more precise about what each executive writes and what he/she reads.
# clean the body column of df_emails first and then, just to be sure, of df_emails0 as well
for frame in (df_emails, df_emails0):
    frame['body'] = frame['body'].apply(clean_forward_email)

In [None]:
# add in vs out box id column to both frames
def inoutfunct(x):
    """Return 1 when ``x['dist1']`` or ``x['dist0']`` equals 0, otherwise 0."""
    if x['dist1'] != 0 and x['dist0'] != 0:
        return 0
    return 1
# the flag is derived from df_emails once and stored in both frames
inout_flags = df_emails.apply(inoutfunct, axis=1)
df_emails0['inout_id'] = inout_flags
df_emails['inout_id'] = inout_flags

In [None]:
import pickle
# persist both frames; the second call used the misspelled name `df_email0s`,
# which raised a NameError so df_emails0 was never written
df_emails.to_pickle('/notebooks/LDA models and data/Data Frames and lists/df_emails.pkl')
df_emails0.to_pickle('/notebooks/LDA models and data/Data Frames and lists/df_emails0.pkl')