In [None]:
import numpy as np
import pandas as pd
import email
import re
import os
import fnmatch
from unidecode import unidecode

In [None]:
pattern = '*.mbox'
emails_path = 'path_here'

# Walk emails_path recursively and keep one (bare filename, full path) pair
# for every file whose name matches `pattern`.
email_files = [
    (name, os.path.join(folder, name))
    for folder, _subdirs, names in os.walk(emails_path)
    for name in fnmatch.filter(names, pattern)
]

In [None]:
# Read every discovered mailbox file and collect one record per file.
# Records are gathered in a plain list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0, and appending row by row
# copied the whole frame on every iteration.
records = []
for _name, file_path in email_files:
    with open(file_path, 'r', encoding="ISO-8859-1") as f:
        # unidecode transliterates non-ASCII characters to ASCII.
        content = unidecode(f.read())
    records.append({'file': file_path, 'message': content})

# Passing `columns` keeps the expected schema even when no files were found.
emails_df = pd.DataFrame(records, columns=['file', 'message'])
emails_df.head()

In [None]:
## Helper functions
def get_text_from_email(msg):
    '''Return the concatenated payload of every text/plain part of msg.

    msg is walked with msg.walk(); parts with any other content type are
    ignored. Payloads are joined with no separator, in walk order, exactly
    as returned by get_payload() (no transfer-encoding decoding is done).
    '''
    return ''.join(
        piece.get_payload()
        for piece in msg.walk()
        if piece.get_content_type() == 'text/plain'
    )

def split_email_addresses(line):
    '''Split a comma-separated address header into a frozenset of entries.

    Each comma-delimited piece is stripped of surrounding whitespace.
    Returns None when line is falsy (None or an empty string).
    '''
    if not line:
        return None
    return frozenset(piece.strip() for piece in line.split(','))

In [None]:
# Parse each raw file body into an email.message.Message object.
messages = [email.message_from_string(raw_text) for raw_text in emails_df['message']]

# The header names of the first message define which header columns are added;
# every message is then looked up under those same names.
keys = messages[0].keys()
for header_name in keys:
    emails_df[header_name] = [parsed[header_name] for parsed in messages]

# Plain-text body of each message.
emails_df['content'] = [get_text_from_email(parsed) for parsed in messages]

# Turn the address headers into frozensets of individual entries.
for address_column in ('From', 'To'):
    emails_df[address_column] = emails_df[address_column].map(split_email_addresses)

# The "user" is whatever precedes the first '/' in the file path.
emails_df['user'] = emails_df['file'].map(lambda path: path.split('/', 1)[0])

# The parsed message objects are no longer needed once the columns exist.
del messages

emails_df.head()

In [None]:
# --- Line-level filters and rewrite rules used by the clean-up loop below ---

# A line containing "<name>:" for any of these names is dropped.
headers = ['Date', 'From', 'Sent', 'To', 'X-To', 'Subject', 'cc', 'Cc', 'CC', 'Attachments',
           'X-Mailer', 'MIME-Version', 'Content-Type', 'Precedence', 'Importance', 'FROM', 'TO', 'DATE', 'SENT']
# NOTE(review): despite the name, these are matched anywhere in the line
# (substring test), so e.g. every line containing '?' is dropped. Confirm
# whether a startswith() test was intended.
skiplinestarters = ["<!--", '========', '?', '________', '********', '-----']
# A line containing any of these tokens is dropped.
skiplinetokens = ['[IMAGE]']
# Seeing <TOKEN> or </TOKEN> for any of these stops processing of the current email.
htmltokens = ['HTML', 'HEAD', 'TITLE', 'A']

# Substrings replaced everywhere in a line ('=09' is the quoted-printable escape for a tab).
replacements = {
    '=09': ''
}
# Prefixes stripped from the start of a line; only the keys are used.
replacement_starters = {
    '> ': '',
    '>': ''
}

# A line starting with one of these is written as-is, and so is the line after it.
valedictions = ['Best', 'Sincerely', 'Regards', 'Kind regards', 'Thanks', 'Cheers']

# Characters that mark the end of a sentence.
eos_punc = ['.', '!', '?']
sent_accumulator = []
prev_line_was_val = False

# The context manager guarantees the output is flushed and closed even if
# processing raises part-way through (the handle was previously never closed).
with open('out_path', 'w+') as OUTFILE:
    for email_body in emails_df["content"]:
        # Emails containing a forwarded thread are skipped entirely.
        if 'Forwarded by' in email_body:
            continue

        # Per-email state: pending sentence fragments and the valediction flag
        # must not carry over from the previous email.
        sent_accumulator = []
        prev_line_was_val = False

        for line in email_body.splitlines():
            line = line.strip()

            # An HTML tag means the rest of this email is markup: stop here.
            if any(('<%s>' % htmltoken) in line for htmltoken in htmltokens):
                break
            if any(('</%s>' % htmltoken) in line for htmltoken in htmltokens):
                break

            if not line:
                continue
            if any(skiptoken in line for skiptoken in skiplinetokens):
                continue
            if any(("%s:" % header) in line for header in headers):
                continue
            if any(starter in line for starter in skiplinestarters):
                continue

            for key in replacements:
                line = line.replace(key, replacements[key])

            # Strip quote markers; '> ' is tried before '>' (dict order).
            for key in replacement_starters:
                if line.startswith(key):
                    line = line[len(key):]

            # The rewrites above may have emptied the line.
            if not line:
                continue

            if line.endswith('='):
                # Trailing '=' is treated as a soft line break: drop it and
                # do not end the output line.
                OUTFILE.write(line[:-1])
            elif any(line.startswith(val) for val in valedictions):
                OUTFILE.write('%s\n' % line)
                prev_line_was_val = True
            elif prev_line_was_val:
                # Line right after a valediction (presumably the sender's name).
                prev_line_was_val = False
                OUTFILE.write('%s\n' % line)
            elif len(line.split(" ")) <= 1:
                # Single-token lines are emitted on their own.
                OUTFILE.write('%s\n' % line)
            elif re.match(r"[0-9\-\(\)]", line) and len(line.split(" ")) <= 3:
                # Short line starting with a digit, hyphen or parenthesis
                # (presumably a phone number -- TODO confirm).
                OUTFILE.write('%s\n' % line)
            else:
                # A line ends a sentence if it finishes with sentence
                # punctuation or contains five consecutive digits
                # (presumably a ZIP code -- TODO confirm).
                ends_sentence = (line.endswith(tuple(eos_punc))
                                 or re.search(r"(\d{5})", line))
                sent_accumulator.append(line)
                if ends_sentence:
                    OUTFILE.write('%s\n' % ' '.join(sent_accumulator))
                    sent_accumulator = []

        # Flush text that never reached a sentence terminator, so it is
        # neither lost nor glued onto the next email's first sentence.
        if sent_accumulator:
            OUTFILE.write('%s\n' % ' '.join(sent_accumulator))
            sent_accumulator = []