# SMART EMAIL CLASSIFIER

# Ingest Data
In this notebook I am going to explore and ingest the ENRON email dataset.

## 1) Import libraries

In [1]:
import pandas as pd
import os
import codecs
from tqdm import tqdm
import email
import numpy as np
import multiprocessing as mp


## 2) Explore the ENRON data structure
- We check how the emails are distributed in folders etc.
- We see that there are 150 folders (each one corresponding to one person).
- We see that inside the folder 'stockley-c' there is a nested folder called 'Christian Stockley', so we manually move all of its subfolders into 'stockley-c'.
- We also see that the ingestion should not include the hidden ".DS_Store" files created by macOS.
- The folder (tree) structure is different for each person (i.e., different depths per user).
- We take into account the file encoding, which is 'ISO-8859-1'.


In [2]:
def read_email_by_path(path):
    """Print the raw content of the single email file stored at ``path``.

    The file is decoded as 'ISO-8859-1'. Nothing is returned; the text is
    only written to standard output.
    """
    with codecs.open(path, 'r', 'ISO-8859-1') as email_file:
        raw_text = email_file.read()
    # The whole file is already in memory, so printing can happen after the file is closed
    print(raw_text)



## 3) Ingest the data
We write the functions that will be called afterwards

In [28]:
# Root folder of the ENRON mail dump; the ingest functions strip this prefix from every email path
datadir = '../data/maildir'


def folder_to_df_tree(path):
    """Parse one email that lives inside a sub-folder of a person's mailbox.

    The path is expected to look like
    ``<datadir>/<person>/<directory>[/<more dirs>]/<email_num>.`` where
    ``datadir`` is the module-level root folder and '/' is the separator.

    Parameters
    ----------
    path : str
        Path of the email file.

    Returns
    -------
    pandas.DataFrame or None
        A one-row dataframe with the columns 'person', 'directory',
        'email_num', 'email_ID', 'from', 'to', 'cc', 'bcc', 'subject', 'date',
        'message' and 'attachment' (the 'X-FileName' header).
        ``None`` when ``path`` ends with "DS_Store".

    Raises
    ------
    IndexError
        When the file sits directly in the person's folder (no sub-folder),
        or when ``path`` does not contain ``datadir``. ``reading_files``
        relies on this error to fall back to ``folder_to_df_no_tree``.
    """
    if path.endswith("DS_Store"):
        # Hidden macOS bookkeeping file, not an email
        return None

    # Resolve the path components once, before touching the disk: a flat
    # (non-tree) path fails here with IndexError without reading the file.
    relative = path.split(datadir)[1].split('/', 1)[1]   # '<person>/<directory>/<email_num>.'
    person_and_rest = relative.split('/', 1)
    directory_and_file = person_and_rest[1].rsplit('/', 1)
    person = person_and_rest[0]
    directory = directory_and_file[0]
    email_num = directory_and_file[1].split('.')[0]

    with codecs.open(path, 'r', 'ISO-8859-1') as f:
        observation = f.read()

    # message_from_string() is the package-level equivalent of
    # email.parser.Parser().parsestr(); unlike `email.parser.Parser`, it does not
    # require the `email.parser` submodule to have been imported beforehand.
    msg = email.message_from_string(observation)

    return pd.DataFrame({'person': [person],
                         'directory': [directory],
                         'email_num': [email_num],
                         'email_ID': [msg['Message-ID']],
                         'from': [msg['from']],
                         'to': [msg['to']],
                         'cc': [msg['cc']],
                         'bcc': [msg['bcc']],
                         'subject': [msg['subject']],
                         'date': [pd.to_datetime(msg['Date'])],
                         'message': [msg.get_payload()],
                         'attachment': [msg['X-FileName']]})
def folder_to_df_no_tree(path):
    """Parse one email that sits directly in a person's folder (no sub-folder).

    Same output as ``folder_to_df_tree`` but for paths shaped like
    ``<datadir>/<person>/<email_num>.``; the 'directory' column is set to the
    placeholder 'parent'. ``datadir`` is the module-level root folder and '/'
    is the separator.

    Parameters
    ----------
    path : str
        Path of the email file.

    Returns
    -------
    pandas.DataFrame or None
        A one-row dataframe with the columns 'person', 'directory',
        'email_num', 'email_ID', 'from', 'to', 'cc', 'bcc', 'subject', 'date',
        'message' and 'attachment' (the 'X-FileName' header).
        ``None`` when ``path`` ends with "DS_Store".

    Raises
    ------
    IndexError
        When ``path`` does not contain ``datadir`` followed by
        ``/<person>/<file>``.
    """
    if path.endswith("DS_Store"):
        # Hidden macOS bookkeeping file, not an email
        return None

    # Resolve the path components once instead of re-splitting per column
    relative = path.split(datadir)[1].split('/', 1)[1]   # '<person>/<email_num>.'
    person_and_file = relative.split('/', 1)
    person = person_and_file[0]
    email_num = person_and_file[1].split('.')[0]

    with codecs.open(path, 'r', 'ISO-8859-1') as f:
        observation = f.read()

    # message_from_string() is the package-level equivalent of
    # email.parser.Parser().parsestr(); unlike `email.parser.Parser`, it does not
    # require the `email.parser` submodule to have been imported beforehand.
    msg = email.message_from_string(observation)

    return pd.DataFrame({'person': [person],
                         'directory': ['parent'],
                         'email_num': [email_num],
                         'email_ID': [msg['Message-ID']],
                         'from': [msg['from']],
                         'to': [msg['to']],
                         'cc': [msg['cc']],
                         'bcc': [msg['bcc']],
                         'subject': [msg['subject']],
                         'date': [pd.to_datetime(msg['Date'])],
                         'message': [msg.get_payload()],
                         'attachment': [msg['X-FileName']]})

def split_list(alist, wanted_parts=20):
    """Cut ``alist`` into ``wanted_parts`` consecutive slices.

    Slice boundaries are computed with integer division, so the slices cover
    the whole input in order and may differ in length (some may be empty when
    the input is shorter than ``wanted_parts``).

    Returns a list holding the ``wanted_parts`` slices.
    """
    total = len(alist)
    segments = []
    for part in range(wanted_parts):
        start = (part * total) // wanted_parts
        stop = ((part + 1) * total) // wanted_parts
        segments.append(alist[start:stop])
    return segments

def reading_files(i, paths):
    """Parse every email of one segment into a single dataframe and save it.

    Parameters
    ----------
    i : int
        Segment number; only used to name the output file and the log line.
    paths : iterable of str
        Paths of the email files belonging to this segment.

    Returns
    -------
    pandas.DataFrame
        One row per parsed email, columns in alphabetical order. The row
        index is not reset. When no email was parsed, an empty dataframe
        with the expected columns is returned.

    Side effects
    ------------
    Writes '../data/ingest/groups/out_<i>.csv' (creating the folders when
    needed) and prints a completion message.
    """
    columns = ['person', 'to', 'from', 'bcc', 'cc', 'subject', 'attachment', 'date',
               'directory', 'email_ID', 'email_num', 'message']

    # Collect the one-row frames and concatenate once: growing a dataframe
    # inside the loop copies it on every iteration, and DataFrame.append no
    # longer exists in pandas >= 2.0.
    frames = []
    for path in tqdm(paths):
        try:
            frame = folder_to_df_tree(path)
        except IndexError:
            # The file is not in a tree structure (it sits directly in the person's folder)
            frame = folder_to_df_no_tree(path)
        if frame is not None:   # None is returned for ".DS_Store" files
            frames.append(frame)

    if frames:
        df = pd.concat(frames, sort=True)
        # Keep the alphabetical column order regardless of the pandas version
        df = df.reindex(columns=sorted(set(columns).union(df.columns)))
    else:
        df = pd.DataFrame(columns=columns)

    # We save the dataframe of each segment in a separate csv file.
    # Please comment the following lines if you don't want the intermediate steps to be saved to disk.
    os.makedirs('../data/ingest/groups', exist_ok=True)
    df.to_csv('../data/ingest/groups/out_%s.csv' % i, index_label=False)
    print("iteration %s finished" % i)
    return df

def load_splitted_df(n_thread):
    """Rebuild the final csv from the per-segment csv files.

    Reads '../data/ingest/groups/out_<k>.csv' for k in ``range(n_thread)``,
    concatenates them in that order and writes the result to
    '../data/ingest/all/emails.csv' (the folder is created when needed).

    Parameters
    ----------
    n_thread : int
        Number of segment files to read.

    Returns
    -------
    None
    """
    columns = ['person', 'to', 'from', 'bcc', 'cc', 'subject', 'attachment', 'date',
               'directory', 'email_ID', 'email_num', 'message']

    # exist_ok avoids the swallowed-exception idiom and also creates missing parents
    os.makedirs('../data/ingest/all', exist_ok=True)

    # Read every segment first and concatenate once (DataFrame.append was removed in pandas 2.0)
    frames = [pd.read_csv('../data/ingest/groups/out_%s.csv' % i) for i in range(n_thread)]
    if frames:
        df = pd.concat(frames, sort=True)
        # Expected columns missing from the files are added (empty); order is alphabetical
        df = df.reindex(columns=sorted(set(columns).union(df.columns)))
    else:
        df = pd.DataFrame(columns=columns)

    df.to_csv('../data/ingest/all/emails.csv', index_label=False)

def load_ingested_df():
    """Load the final dataframe from '../data/ingest/all/emails.csv'.

    Returns
    -------
    pandas.DataFrame or None
        The content of the csv file, or ``None`` when it could not be read;
        in that case an error message and the cause are printed.
    """
    try:
        return pd.read_csv('../data/ingest/all/emails.csv')
    except Exception as error:
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate
        print('Error to read final csv file')
        print('Cause: %s: %s' % (type(error).__name__, error))
        return None


def ingest_data_to_csv():
    """Create '../data/ingest/all/emails.csv' with the emails of all the folders.

    Steps:
    1. Walk ``datadir`` and collect every file whose name ends with '.'
       (e.g. '20.'); the list is also saved to '<datadir>/paths.txt'.
    2. Split the paths into ``n_thread`` segments and parse them in parallel
       with ``reading_files`` (which also writes one csv per segment).
    3. Concatenate the segment dataframes and write the final csv.

    The row order follows the order in which ``os.walk`` reports the files.

    Returns
    -------
    None
    """
    columns = ['person', 'to', 'from', 'bcc', 'cc', 'subject', 'attachment', 'date',
               'directory', 'email_ID', 'email_num', 'message']

    # Pure-Python replacement of the shell `find`, so the function does not
    # depend on IPython `!` syntax or on a UNIX shell being available.
    paths = []
    for folder, _subfolders, filenames in os.walk(datadir):
        for filename in filenames:
            if filename.endswith('.'):
                # The parsing functions split on '/', so normalise the separator
                paths.append(os.path.join(folder, filename).replace(os.sep, '/'))

    # Keep the listing on disk, one path per line
    with open(datadir + '/paths.txt', 'w') as listing:
        listing.writelines(path + '\n' for path in paths)

    # To reduce the memory load, we divide the files in n_thread segments and join them at the end
    n_thread = 20
    paths_splitted = split_list(paths, n_thread)

    # We multiprocess to speed up the data ingest; the context manager
    # releases the worker processes once starmap has returned.
    with mp.Pool(processes=n_thread) as pool:
        results = pool.starmap(reading_files, list(enumerate(paths_splitted)))

    # We join all the segments, creating one final dataframe and csv file for all the emails
    non_empty = [result for result in results if not result.empty]
    if non_empty:
        df_final = pd.concat(non_empty, sort=True)
        df_final = df_final.reindex(columns=sorted(set(columns).union(df_final.columns)))
    else:
        df_final = pd.DataFrame(columns=columns)

    os.makedirs('../data/ingest/all', exist_ok=True)
    df_final.to_csv('../data/ingest/all/emails.csv', index_label=False)


In [29]:
if __name__ == '__main__':

    # Switches selecting what this cell does: print one email, rebuild the
    # final csv from the splitted csv files, run the full ingest, or load the result.
    check_first_email = True
    loading_splitted = False
    ingest_data = False
    check_result = False

    if check_first_email:
        example = '../data/maildir/hyatt-k/personal/20.'   # Change the folder
        read_email_by_path(example)

    if loading_splitted:
        load_splitted_df(20)

    if ingest_data:
        ingest_data_to_csv()

    if check_result:
        df = load_ingested_df()

Message-ID: <20368457.1075860850650.JavaMail.evans@thyme>
Date: Fri, 1 Jun 2001 12:33:59 -0700 (PDT)
From: julie.armstrong@enron.com
To: eric.gadd@enron.com, robert.hayes@enron.com, steven.harris@enron.com, 
	robert.kilmer@enron.com, kay.miller@enron.com, 
	dave.neubauer@enron.com, john.millar@enron.com, 
	kevin.hyatt@enron.com
Subject: Updated Contact list of Direct Reports for Danny McCarty
Cc: cindy.stark@enron.com, susan.wadle@enron.com, tammy.kovalcik@enron.com, 
	marian.salinas@enron.com, audrey.robertson@enron.com, 
	zelda.paschal@enron.com, sharon.solon@enron.com, 
	deborah.cappiello@enron.com, connie.hook@enron.com
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
Bcc: cindy.stark@enron.com, susan.wadle@enron.com, tammy.kovalcik@enron.com, 
	marian.salinas@enron.com, audrey.robertson@enron.com, 
	zelda.paschal@enron.com, sharon.solon@enron.com, 
	deborah.cappiello@enron.com, connie.hook@enron.com
X-From: Armstrong, 