# Script content
### This script loads all emails from all directories into a list and then a data frame and determines which email address belongs to the folder owner.
### We do that based on exact string match with first and second word in X-from of each email. This also allows us to determine which emails are inbox and which out. Directory structures or names are not sufficient for a safe procedure.
### Further, for the version of the data used by the recommendation engine, we strip the forwarded parts from the bodies of emails that contain forwarded emails.
### Finally we strip environmental and privacy warnings at the bottom of emails.
### Lastly, we save the resulting data frames to disk.

### We will use email package to extract (half) structured data from emails

In [21]:
import os
from collections import Counter


from email.parser import Parser
rootdir = "/notebooks/LDA models and data/Data Frames and lists/Enron3/maildir/"


### We first write a helper function to parse email and produce a list with needed fields

In [22]:
def email_analyse(inputfile,  email_list):
    """Parse one raw email file and append its key fields to ``email_list``.

    Parameters
    ----------
    inputfile : str
        Path of the email file to read and parse.
    email_list : list
        Accumulator that is mutated in place. One row is appended per call:
        ``[inputfile, From, X-from, date, payload]``.

    Headers that are absent from the message are stored as ``None``.
    """
    with open(inputfile, "r") as f:
        data = f.read()

    message = Parser().parsestr(data)
    # Store the path that was actually parsed; the function must not depend
    # on loop variables of whoever happens to call it.
    email_list.append([
        inputfile,
        message['From'],
        message['X-from'],
        message['date'],
        message.get_payload(),
    ])

### Then extract all emails from all folders to a list

In [None]:
# Accumulator for parsed emails; email_analyse appends one row per file.
email_list = []

# Walk the whole maildir tree recursively and parse every file found.
# `subdirectory` (the list of child directories) is not used.
for directory, subdirectory, filenames in  os.walk(rootdir):
    for filename in filenames:
        email_analyse(os.path.join(directory, filename), email_list)

### Make a data frame from emails list

In [None]:
import pandas as pd
# Build the frame from the parsed rows and discard exact duplicate rows.
df_emails = pd.DataFrame(
    email_list,
    columns=['dirpath', 'from', 'Xfrom', 'date', 'body'],
).drop_duplicates()

### Extract folder name that contains surname and first letter of name of executive

#### We replace X-from fields that are NoneType with an empty string so that they can be processed

In [None]:
# Reduce each full file path to the mailbox-owner folder, i.e. the first
# path component below rootdir. Computing it relative to rootdir avoids a
# hard-coded split index that would silently break if rootdir changes depth.
df_emails['dirpath'] = df_emails['dirpath'].apply(
    lambda x: os.path.relpath(x, rootdir).split(os.sep)[0])

# A missing X-from header is not a string; normalise anything that is not a
# string to '' so the string processing below can run.
df_emails['Xfrom'] = df_emails['Xfrom'].apply(lambda x: x if isinstance(x, str) else '')
# print df_emails[0:25]
# print df_emails[0:25]

### Create helper function to extract and helper functions to remove undesired characters in data frame columns



In [None]:

def f1(x):
    """Return the second space-separated word of ``x``, upper-cased.

    Double quotes are removed and surrounding whitespace is stripped before
    splitting on single spaces. Returns '' when there is no second word.
    """
    words = x.replace('"', '').strip().upper().split(' ')
    return words[1] if len(words) > 1 else ''


def f2(x):
    """Remove commas from ``x``, trim it and upper-case it.

    Results of at most one character are collapsed to ''.
    """
    cleaned = x.replace(',', '').strip().upper()
    if len(cleaned) <= 1:
        return ''
    return cleaned

def f3(x):
    """Remove semicolons from ``x``, trim it and upper-case it.

    Results of at most one character are collapsed to ''.
    """
    cleaned = x.replace(';', '').strip().upper()
    if len(cleaned) <= 1:
        return ''
    return cleaned

def f4(x):
    """Remove backslashes from ``x``, trim it and upper-case it.

    Results of at most one character are collapsed to ''.
    """
    cleaned = x.replace('\\', '').strip().upper()
    if len(cleaned) <= 1:
        return ''
    return cleaned

### Extract the first and second word (mostly the name, sometimes the surname) from X-from and process them

In [None]:
# First word of X-from: quotes removed, trimmed and upper-cased, then
# stripped of commas (f2) and semicolons (f3) for the string distance step.
df_emails['Xfrom0'] = (
    df_emails['Xfrom']
    .apply(lambda x: x.replace('"', '').strip().upper().split(' ')[0])
    .apply(f2)
    .apply(f3)
)

# Second word of X-from (empty where there is none), cleaned the same way.
df_emails['Xfrom1'] = df_emails['Xfrom'].apply(f1).apply(f2).apply(f3)

# Surname part of the folder name: everything before the first '-'.
df_emails['dirpath_surname'] = df_emails['dirpath'].apply(
    lambda folder: folder.strip().upper().partition('-')[0]
)

### Compute the Levenshtein distance between the directory surname and both the second and the first word in X-from, as the name and surname seem to appear in both orders
#### On my machine with 256GB RAM and 24 cores processor it takes a few minutes to complete

#### In the end it turned out that we could have done an exact string match directly, but we did not know that in advance. As it doesn’t take too long to process, we keep this part as it is



In [None]:
import editdistance as ed

# Row-wise edit distance between the folder surname and the second (dist1)
# and first (dist0) word of X-from; 0 means an exact match.
df_emails['dist1'] = df_emails[['Xfrom1', 'dirpath_surname']].apply(
    lambda row: ed.eval(row['Xfrom1'], row['dirpath_surname']),
    axis=1,
)
df_emails['dist0'] = df_emails[['Xfrom0', 'dirpath_surname']].apply(
    lambda row: ed.eval(row['Xfrom0'], row['dirpath_surname']),
    axis=1,
)

In [None]:
# Smallest dist1 / dist0 observed within each folder surname.
df_emails_grouped = df_emails.groupby(['dirpath_surname']).agg({'dist1': 'min', 'dist0': 'min'})
df_emails_grouped = df_emails_grouped.reset_index().rename(
    columns={'dist1': 'dist1_min', 'dist0': 'dist0_min'}
)

# Attach the per-folder minima to every email row.
df_emails = df_emails.merge(df_emails_grouped, how='left', on=['dirpath_surname'])

# Keep only emails from folders where some X-from word matches the surname exactly.
df_emails_mindist = df_emails[
    (df_emails['dist1_min'] == 0) | (df_emails['dist0_min'] == 0)
].reset_index()

### Select emails with minimal distance.
### It turns out that not all folders actually contain a "from" email with a matching name (out of 135 we matched 127), and for our recommendation engine we choose mentors and mentees only from those executives whose name matches completely in the way described above


In [None]:
# CHECK: number of distinct folder surnames that were matched vs. all of them.
# Single-argument print calls behave the same on Python 2 and Python 3.
print(df_emails_mindist['dirpath_surname'].drop_duplicates().shape)
print(df_emails['dirpath_surname'].drop_duplicates().shape)

### ANOTHER CHECK

#### check how many mails are dropped this way; 2.5% emails dropped OK!

In [None]:
# Compare row counts of the matched subset and the full frame.
# Single-argument print calls behave the same on Python 2 and Python 3.
print(df_emails_mindist.shape)
print(df_emails.shape)

### In order to keep only the part of the email body written by the sender in outgoing emails, which we need later on for determining expertise and preferences, we chop all email bodies at the first appearance of any of the typical strings used in the functions below.

### Our method is far from exhaustive from a text preprocessing point of view, but due to lack of time we will proceed with it.



In [None]:

def h1(x):
    """Return the part of ``x`` before the first long run of asterisks."""
    head, _sep, _rest = x.partition('********************************')
    return head

def h2(x):
    """Return the part of ``x`` before the first '-----Original Message-----'."""
    head, _sep, _rest = x.partition('-----Original Message-----')
    return head

def h3(x):
    """Return the part of ``x`` before the first long run of underscores."""
    head, _sep, _rest = x.partition('__________________________')
    return head

def h4(x):
    """Return the part of ``x`` before the first '---Forwarded by' (case-sensitive)."""
    head, _sep, _rest = x.partition('---Forwarded by')
    return head

def h5(x):
    """Return the part of ``x`` before the first '---Forwarded By' (case-sensitive)."""
    head, _sep, _rest = x.partition('---Forwarded By')
    return head

def h6(x):
    """Return the part of ``x`` before the first '--- Forwarded by' (case-sensitive)."""
    head, _sep, _rest = x.partition('--- Forwarded by')
    return head

def h7(x):
    """Return the part of ``x`` before the first '--- Forwarded By' (case-sensitive)."""
    head, _sep, _rest = x.partition('--- Forwarded By')
    return head

### Assign copy of df_emails to process further for preferences/expertise determination and chop email bodies in order to determine preferences/expertise. Notice that we keep these parts for the topics modeling part.

In [None]:
# Work on an independent copy. Plain assignment would only create a second
# name for the same frame, so truncating bodies here would also truncate
# them in df_emails, which must keep the full threads for topic modeling.
df_emails0 = df_emails.copy()

# Cut each body at the first occurrence of every quoted/forwarded-message
# marker, in the same order as before (h1 .. h7).
for _chop in (h1, h2, h3, h4, h5, h6, h7):
    df_emails0['body'] = df_emails0['body'].apply(_chop)

### Emails can have forwarded emails below or environment cautions/privacy warnings at the bottom of them. Since this can be the case with many emails we need to strip these pieces of text to avoid noise in our topic modeling. We start with defining regex objects that we need for this processing step.

In [None]:
import re

# Any line that contains an '@' (without DOTALL, '.' stops at the newline).
email_pat = re.compile(".+@.+")

# Header-style lines that appear inside forwarded/quoted messages; each
# pattern matches from the header name up to and including the newline.
to_pat = re.compile("To:.+\n")
cc_pat = re.compile("cc:.+\n")
subject_pat = re.compile("Subject:.+\n")
from_pat = re.compile("From:.+\n")
sent_pat = re.compile("Sent:.+\n")
received_pat = re.compile("Received:.+\n")
ctype_pat = re.compile("Content-Type:.+\n")
# NOTE(review): the space in "Reply- Organization" looks unusual - confirm
# that this is the header text that really occurs in the data.
reply_pat = re.compile("Reply- Organization:.+\n")
date_pat = re.compile("Date:.+\n")
xmail_pat = re.compile("X-Mailer:.+\n")
mimver_pat = re.compile("MIME-Version:.+\n")

# Delimiter-fenced blocks. These are greedy and use DOTALL, so one match
# spans from the first fence to the LAST fence of the same kind in the text.
dash_pat = re.compile("--+.+--+", re.DOTALL)
# Raw string: '\*' is not a valid Python string escape, so the non-raw form
# triggers an invalid-escape warning on Python 3. The regex is unchanged.
star_pat = re.compile(r'\*\*+.+\*\*+', re.DOTALL)
uscore_pat = re.compile(" __+.+__+", re.DOTALL)
equals_pat = re.compile("==+.+==+", re.DOTALL)

### Next we define a function that takes the body of an email (possibly containing forwarded email threads and/or environment warnings/privacy cautions) and returns only the email text (including the text of fully forwarded emails where applicable)


In [None]:
def clean_forward_email(email):
    """Strip header-style lines and delimiter-fenced blocks from an email body.

    Parameters
    ----------
    email : str
        Email body text, possibly containing forwarded messages.

    Returns
    -------
    str
        The text after the first mailbox-file marker ('.nsf' is checked
        before '.pst'), or the whole text when neither marker is present,
        with every match of the module-level patterns removed.
    """
    # Only skip past a marker when one is actually present. Without this
    # guard the search for an empty marker succeeds at position 0 and the
    # first four characters of every marker-free body are cut off.
    email_new = email
    for etype in ('.nsf', '.pst'):
        if etype in email:
            email_new = email[email.find(etype) + len(etype):]
            break

    # Apply the substitutions in a fixed order; the greedy fence patterns
    # run last, after the header-style lines have been removed.
    patterns = (
        to_pat, cc_pat, subject_pat, from_pat, sent_pat, received_pat,
        email_pat, ctype_pat, reply_pat, date_pat, xmail_pat, mimver_pat,
        dash_pat, star_pat, uscore_pat, equals_pat,
    )
    for pattern in patterns:
        email_new = pattern.sub('', email_new)
    return email_new

### Process email body column in the full data frame containing all emails. These will be used for topic modeling and it seems perfectly sound to consider full thread as one document with common topic(s)


In [None]:
# Clean the bodies of the full frame (input for topic modeling).
df_emails['body'] = df_emails['body'].apply(clean_forward_email)

# Apply the same cleaning to df_emails0, to be on the safe side.
df_emails0['body'] = df_emails0['body'].apply(clean_forward_email)

### Add in vs out box id column to both frames

In [None]:
def inoutfunct(x):
    """Return 1 when ``x['dist1']`` or ``x['dist0']`` equals 0, otherwise 0.

    ``x`` is any mapping-like row exposing the keys 'dist1' and 'dist0'.
    """
    if x['dist1'] != 0 and x['dist0'] != 0:
        return 0
    return 1
# Flag each email with inout_id = 1 when either distance to the folder
# surname is 0, else 0. Both columns are computed from the rows of df_emails.
# NOTE(review): the assignment into df_emails0 relies on both frames sharing
# the same index labels - confirm if df_emails0 is ever filtered or re-indexed.
df_emails0['inout_id']=df_emails.apply(inoutfunct, axis=1)
df_emails['inout_id']=df_emails.apply(inoutfunct, axis=1)

### Save result to the disk.

In [None]:
import pickle
# Persist both frames: the full-thread version and the truncated-body version.
df_emails.to_pickle('/notebooks/LDA models and data/Data Frames and lists/df_emails.pkl')
# Use the frame's actual name, df_emails0; a misspelt name raises NameError
# and the second file would never be written.
df_emails0.to_pickle('/notebooks/LDA models and data/Data Frames and lists/df_emails0.pkl')