# SMART EMAIL CLASSIFIER

## Preprocess Data
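The goal of this notebook is to preprocess the raw email data: it optionally restricts the dataset to 1-1 emails, cleans each message (strips forwarded content and HTML, keeps only lemmatized nouns, removes stop words), and writes the cleaned text to disk, one email per line.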

## 1) Import libraries

In [1]:
import pandas as pd
import os
import numpy as np
import gensim
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import nltk
import string
import re
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

# Uncomment the following line to install the nltk data
# nltk.download()

## 2) Functions to clean and preprocess the text

In [2]:
# POS tags kept by clean(): singular/plural common and proper nouns.
_NOUN_TAGS = frozenset({'NN', 'NNP', 'NNS', 'NNPS'})
# Tokens containing any of these are dropped (email addresses, webpages,
# dates written with "/", times and header-like tokens written with ":").
_SKIP_MARKERS = ('@', '.com', 'www', 'http', '/', ':')
# Translation table that deletes every ASCII digit from a token.
_DIGIT_TABLE = str.maketrans('', '', string.digits)
# Matches any single ASCII punctuation character.
_PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))


def clean(original):
    """Return the list of cleaned noun tokens extracted from one email body.

    The text is lowercased, truncated at the first '>from' or '---' marker
    (forwarded emails and notes), stripped of HTML, POS-tagged, and reduced
    to nouns. Each noun is lemmatized (default lemmatization, then as a
    verb), digits are removed, punctuation is replaced by spaces, and stop
    words and tokens shorter than 3 or longer than 14 characters are dropped.

    Relies on the module-level globals ``lemma`` (a lemmatizer exposing
    ``lemmatize``) and ``stop`` (a collection of stop words), which must be
    defined before this function is called.

    Args:
        original: The raw email text. Non-string values (e.g. the ``0``
            placeholder produced by ``fillna(value=0)`` for missing
            messages) are treated as empty.

    Returns:
        A list of token strings; empty if nothing survives the filters.
    """
    if not isinstance(original, str):
        return []

    lower = " ".join(original.lower().split())
    # Everything after the first '>from' or '---' is treated as forwarded
    # content or notes and discarded.
    no_fwd = lower.split('>from', 1)[0].split('---', 1)[0]
    cleantext = BeautifulSoup(no_fwd, "lxml").text

    tagged = nltk.pos_tag(nltk.word_tokenize(" ".join(cleantext.split())))
    nouns = [word for word, pos in tagged if pos in _NOUN_TAGS]

    # `nouns` is already a list of tokens, so iterate it directly.
    lemmat = " ".join(
        lemma.lemmatize(lemma.lemmatize(word), 'v').translate(_DIGIT_TABLE)
        for word in nouns
        if not any(marker in word for marker in _SKIP_MARKERS)
    )
    # Replacing punctuation with spaces may split one token into several.
    normalized = _PUNCT_RE.sub(' ', lemmat).split()
    return [s for s in normalized if s not in stop and 2 < len(s) < 15]




def _write_clean_emails(messages, path):
    """Clean every message and write the non-empty results to *path*, one per line."""
    emails_clean = [clean(message) for message in messages]
    # The clean text is saved without a dataframe structure, as it is much
    # faster to read/write.
    with open(path, 'w', newline='') as g:
        g.writelines(" ".join(email) + '\n' for email in emails_clean if email)


def preprocessing(infile,outfile,n_sample,restriction,restriction_by_area):
    """Read the emails CSV, optionally filter it, clean the messages and save them.

    Args:
        infile: Path of the CSV to read. The code uses its 'message' column,
            and additionally 'to', 'from' and 'subject' when `restriction`
            is set.
        outfile: Path of the output file (used when `restriction_by_area`
            is false).
        n_sample: Number of random emails to preprocess, or None to use all
            available emails. Values larger than the number of available
            emails are capped to that number.
        restriction: If true, keep only emails with exactly one '@' in
            'to', whose subject contains neither 'fwd' nor 're'
            (case-insensitive), and whose 'to' differs from 'from'. The
            intermediate frames are written to
            '../data/ingest/all/1-1.csv' and '../data/ingest/all/subset.csv'.
        restriction_by_area: If true, join the restricted emails against
            '../data/employees.csv' on the sender and on the recipient
            address and write one file per area to
            '../data/preprocessed/preprocessed_<area>.csv'. No sampling is
            applied in this mode. Requires `restriction`.

    Raises:
        ValueError: If `restriction_by_area` is set without `restriction`.
    """
    if restriction_by_area and not restriction:
        # The per-area join is built from the restricted emails only.
        raise ValueError(
            "restriction_by_area=True requires restriction=True")

    df = pd.read_csv(infile).fillna(value=0)

    if restriction:
        # Restrict to 1-1 emails sent to a different person that are not
        # fwd or reply emails.
        df['senders_num'] = df['to'].str.count('@').fillna(value=0)
        # NOTE(review): these are substring matches, so 're' also drops
        # subjects such as "report", and 'fwd' does not match "FW:" —
        # confirm whether this is intended.
        df_no_fwd = df.loc[~df['subject'].str.contains('fwd', case=False, na=False)]
        df_one_to_one = df_no_fwd.loc[df_no_fwd['senders_num'] == 1]
        df_one_to_one.to_csv('../data/ingest/all/1-1.csv', index_label=False)
        df_no_re = df_one_to_one.loc[~df_one_to_one['subject'].str.contains('re', case=False, na=False)]
        df_restricted = df_no_re.loc[df_no_re['to'] != df_no_re['from']].reset_index(drop=True)
        df_restricted.to_csv('../data/ingest/all/subset.csv', index_label=False)
        df_source = df_restricted
    else:
        df_source = df

    if restriction_by_area:
        df_enron = pd.read_csv('../data/employees.csv').fillna(value=0)
        # Rows whose area was missing were filled with 0 above; skip them.
        df_areas = df_enron.loc[df_enron['area'] != 0]
        # Match every known employee address against the sender first,
        # then against the recipient.
        merged = [
            df_restricted.merge(
                df_areas[[email_col, 'area']],
                how='inner',
                left_on=[side],
                right_on=[email_col],
            ).drop(columns=[email_col])
            for side in ('from', 'to')
            for email_col in ('email1', 'email2', 'email3', 'email4')
        ]
        # DataFrame.append was removed in pandas 2.0; concat is equivalent.
        df_comb = pd.concat(merged)

        for area in df_comb['area'].unique():
            area_messages = df_comb.loc[df_comb['area'] == area, 'message']
            print(area, len(area_messages))
            _write_clean_emails(
                area_messages.tolist(),
                '../data/preprocessed/preprocessed_%s.csv' % area,
            )
        return

    # Size the sample against the frame actually being sampled, so that
    # n_sample=None or an over-large n_sample does not make sample() raise.
    available = len(df_source)
    n_sample = available if n_sample is None else min(n_sample, available)
    df_sample = df_source.sample(n=n_sample)
    _write_clean_emails(df_sample['message'].tolist(), outfile)

## 3) Main function

In [3]:
if __name__ == '__main__':
    # Make sure the output directory exists. Unlike a bare try/except around
    # os.mkdir, this only tolerates "already exists" and surfaces real
    # failures (e.g. permissions) instead of hiding them.
    os.makedirs('../data/preprocessed', exist_ok=True)

    # Globals read by clean():
    # `stop` holds the NLTK English stop words plus a custom list of very
    # common words and artefacts found in the emails.
    stoplist = 'also use make people know many call include part find become like mean often different usually take with come give well get since type list say change see refer actually iii kinds ask would way something need things want every str =09 0909 image'.split(' ')
    stop = set(list(stopwords.words('english')) + stoplist)
    # Set of punctuation signs.
    exclude = set(string.punctuation)
    # `lemma` is the lemmatizer applied to each noun.
    lemma = WordNetLemmatizer()

    infile = '../data/ingest/all/emails.csv'
    outfile = "../data/preprocessed/preprocessed_pos.csv"
    # Number of random emails to preprocess; set n_sample = None to process
    # all the emails instead of a sample.
    n_sample = 70000
    # Set True to restrict the analysis to 1-1 emails that are not fwd or reply.
    restriction = True
    # Set True if the 1-1 emails are restricted to ENRON emails and
    # processed by area.
    restriction_by_area = False

    preprocessing(infile, outfile, n_sample, restriction, restriction_by_area)