### Notebook Imports

In [1]:
from os import walk
from os.path import join
import pandas as pd

### Constants

In [2]:
EXAMPLE_FILE = 'SpamData/01_Processing/practice_email.txt'

PATH_SPAM_1 = 'SpamData/01_Processing/spam_assassin_corpus/spam_1'
PATH_SPAM_2 = 'SpamData/01_Processing/spam_assassin_corpus/spam_2'
PATH_HAM_1 = 'SpamData/01_Processing/spam_assassin_corpus/easy_ham_1'
PATH_HAM_2 = 'SpamData/01_Processing/spam_assassin_corpus/easy_ham_2'

CAT_HAM = 0
CAT_SPAM = 1

# Phase: Data Gathering

### Reading Files

In [3]:
fileStream = open(EXAMPLE_FILE, encoding='latin-1')
message = fileStream.read()
fileStream.close()

print(message)

From exmh-workers-admin@redhat.com  Thu Aug 22 12:36:23 2002
Return-Path: <exmh-workers-admin@spamassassin.taint.org>
Delivered-To: zzzz@localhost.netnoteinc.com
Received: from localhost (localhost [127.0.0.1])
	by phobos.labs.netnoteinc.com (Postfix) with ESMTP id D03E543C36
	for <zzzz@localhost>; Thu, 22 Aug 2002 07:36:16 -0400 (EDT)
Received: from phobos [127.0.0.1]
	by localhost with IMAP (fetchmail-5.9.0)
	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 12:36:16 +0100 (IST)
Received: from listman.spamassassin.taint.org (listman.spamassassin.taint.org [66.187.233.211]) by
    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g7MBYrZ04811 for
    <zzzz-exmh@spamassassin.taint.org>; Thu, 22 Aug 2002 12:34:53 +0100
Received: from listman.spamassassin.taint.org (localhost.localdomain [127.0.0.1]) by
    listman.redhat.com (Postfix) with ESMTP id 8386540858; Thu, 22 Aug 2002
    07:35:02 -0400 (EDT)
Delivered-To: exmh-workers@listman.spamassassin.taint.org
Received: from int-mx1.corp

### Extracting only the 'Body' of the email

In [4]:
fileStream = open(EXAMPLE_FILE, encoding='latin-1')

is_body = False
lines = []

for line in fileStream:
    if is_body:
        lines.append(line)
    elif line == '\n':
        is_body = True
        
fileStream.close()

email_body = '\n'.join(lines)
print(email_body)



Dear Mr Still



Good tidings to you and all your staff for the festive season ahead (Christmas).

Now to the crux of the matter-in-hand: I am a fully qualified Santa Claus and am wondering whether you might consider me to run my own "Santa's Grotto" in your store.

But WAIT! You're probably thinking: "What makes him so special?"

Well, first of all, I have made several changes to the characterisation of Father Christmas. Rather than greeting the children with shouts of "Ho, ho, ho!" I prefer to whisper the phrase "Dependence is not unfathomable in this cruel world we live in". In addition, my gifts are ALL hand-made, ranging from felt hoops to vanilla-pod holders.

You will note also, from the enclosed sketch, that I have radically redesigned Santa's outfit and have renamed my character "Lord Buckles". Would you be interested in employing me? I promise NEVER to let you down.

I look forward to hearing from you.



Best wishes

Robin Cooper

[Excerpt from the book: The Timewaster Let

### Generator Function to extract emails body

In [5]:
def emailGenerator(filePath):
    
    fileStream = open(filePath, encoding='latin-1')

    is_body = False
    lines = []

    for line in fileStream:
        if is_body:
            lines.append(line)
        elif line == '\n':
            is_body = True

    fileStream.close()

    email_body = '\n'.join(lines)
    
    yield email_body

### Packing the extracted email bodies into a DataFrame

In [6]:
def loadDataFrame(folderPath, classification):
    
    rows = []
    rows_names = []
    
    for root, dirNames, fileNames in walk(folderPath):
        for fileName in fileNames:
        
            filePath = join(root, fileName)
        
            for message_body in emailGenerator(filePath):
                rows.append({'CATAGORY': classification, 'MESSAGE': message_body})
                rows_names.append(fileName)
                
            df = pd.DataFrame(rows, index=rows_names)
        
    return df
        

### Load the Spam and Ham email messages

In [7]:
df_spam_emails = loadDataFrame(PATH_SPAM_1, CAT_SPAM)
df_spam_emails = df_spam_emails.append(loadDataFrame(PATH_SPAM_2, CAT_SPAM))

df_ham_emails = loadDataFrame(PATH_HAM_1, CAT_HAM)
df_ham_emails = df_ham_emails.append(loadDataFrame(PATH_HAM_2, CAT_HAM))

# Putting both Spam and Ham emails into a single Data Frame.
data = pd.concat([df_spam_emails, df_ham_emails])

data.head()

Unnamed: 0,CATAGORY,MESSAGE
00001.7848dde101aa985090474a91ec93fcf0,1,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr..."
00002.d94f1b97e48ed3b553b3508d116e6a09,1,1) Fight The Risk of Cancer!\n\nhttp://www.adc...
00003.2ee33bc6eacdb11f38d052c44819ba6c,1,1) Fight The Risk of Cancer!\n\nhttp://www.adc...
00004.eac8de8d759b7e74154f142194282724,1,##############################################...
00005.57696a39d7d84318ce497886896bf90d,1,I thought you might like these:\n\n1) Slim Dow...


In [8]:
data.tail()

Unnamed: 0,CATAGORY,MESSAGE
01396.61983fbe6ec43f55fd44e30fce24ffa6,0,http://news.bbc.co.uk/1/hi/england/2515127.stm...
01397.9f9ef4c2a8dc012d80f2ce2d3473d3b7,0,"> >-- be careful when using this one.) Also, t..."
01398.169b51731fe569f42169ae8f948ec676,0,">>>>> ""SM"" == Skip Montanaro <skip@pobox.com> ..."
01399.ca6b00b7b341bbde9a9ea3dd6a7bf896,0,"So then, ""Mark Hammond"" <mhammond@skippinet.co..."
01400.f897f0931e461e7b2e964d28e927c35e,0,"Hi there,\n\n\n\nNow this is probably of no us..."


In [9]:
data.shape

(5799, 2)

In [10]:
data.info

<bound method DataFrame.info of                                         CATAGORY  \
00001.7848dde101aa985090474a91ec93fcf0         1   
00002.d94f1b97e48ed3b553b3508d116e6a09         1   
00003.2ee33bc6eacdb11f38d052c44819ba6c         1   
00004.eac8de8d759b7e74154f142194282724         1   
00005.57696a39d7d84318ce497886896bf90d         1   
...                                          ...   
01396.61983fbe6ec43f55fd44e30fce24ffa6         0   
01397.9f9ef4c2a8dc012d80f2ce2d3473d3b7         0   
01398.169b51731fe569f42169ae8f948ec676         0   
01399.ca6b00b7b341bbde9a9ea3dd6a7bf896         0   
01400.f897f0931e461e7b2e964d28e927c35e         0   

                                                                                  MESSAGE  
00001.7848dde101aa985090474a91ec93fcf0  <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Tr...  
00002.d94f1b97e48ed3b553b3508d116e6a09  1) Fight The Risk of Cancer!\n\nhttp://www.adc...  
00003.2ee33bc6eacdb11f38d052c44819ba6c  1) Fight The Risk of Cancer

In [11]:
df_ham_emails.info

<bound method DataFrame.info of                                         CATAGORY  \
00001.7c53336b37003a9286aba55d2945844c         0   
00002.9c4069e25e1ef370c078db7ee85ff9ac         0   
00003.860e3c3cee1b42ead714c5c874fe25f7         0   
00004.864220c5b6930b209cc287c361c99af1         0   
00005.bf27cdeaf0b8c4647ecd61b1d09da613         0   
...                                          ...   
01396.61983fbe6ec43f55fd44e30fce24ffa6         0   
01397.9f9ef4c2a8dc012d80f2ce2d3473d3b7         0   
01398.169b51731fe569f42169ae8f948ec676         0   
01399.ca6b00b7b341bbde9a9ea3dd6a7bf896         0   
01400.f897f0931e461e7b2e964d28e927c35e         0   

                                                                                  MESSAGE  
00001.7c53336b37003a9286aba55d2945844c      Date:        Wed, 21 Aug 2002 10:54:46 -05...  
00002.9c4069e25e1ef370c078db7ee85ff9ac  Martin A posted:\n\nTassos Papadopoulos, the G...  
00003.860e3c3cee1b42ead714c5c874fe25f7  Man Threatens Explosion In 

In [12]:
df_spam_emails.info

<bound method DataFrame.info of                                         CATAGORY  \
00001.7848dde101aa985090474a91ec93fcf0         1   
00002.d94f1b97e48ed3b553b3508d116e6a09         1   
00003.2ee33bc6eacdb11f38d052c44819ba6c         1   
00004.eac8de8d759b7e74154f142194282724         1   
00005.57696a39d7d84318ce497886896bf90d         1   
...                                          ...   
01397.f75f0dd0dd923faefa3e9cc5ecb8c906         1   
01398.8ca7045aae4184d56e8509dc5ad6d979         1   
01399.2319643317e2c5193d574e40a71809c2         1   
01400.b444b69845db2fa0a4693ca04e6ac5c5         1   
cmds                                           1   

                                                                                  MESSAGE  
00001.7848dde101aa985090474a91ec93fcf0  <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Tr...  
00002.d94f1b97e48ed3b553b3508d116e6a09  1) Fight The Risk of Cancer!\n\nhttp://www.adc...  
00003.2ee33bc6eacdb11f38d052c44819ba6c  1) Fight The Risk of Cancer

# Phase: Feature Engineering

### Checking for Emtpy Emails

In [13]:
# Checking if any of the column is empty
data.isnull().sum()

# data.isnull().values.any()         # Alternate way of checking empty value.

CATAGORY    0
MESSAGE     0
dtype: int64

In [14]:
# Checking if there are any empty emails.
(data['MESSAGE'].str.len() == 0).any()

True

In [15]:
# Checking how many are there in total.
(data['MESSAGE'].str.len() == 0).sum()

3

In [16]:
# List the rows with empty messages.
data[data['MESSAGE'].str.len() == 0]

Unnamed: 0,CATAGORY,MESSAGE
cmds,1,
cmds,1,
cmds,0,


In [17]:
# Removing rows that have empty messages.
data = data[data['MESSAGE'].str.len() != 0]

In [18]:
data.shape

(5796, 2)

### Manipulating the dataset to ease data analysis

In [19]:
# Creating DOCUMENT_ID column to hold the indexes.
data = data.assign(DOCUMENT_ID = lambda x: range(0, data['MESSAGE'].count()))

# Alternate method to do the same as above statement
# doc_ids = range(0, len(data.index))
# data['DOCUMENT_ID'] = doc_ids

In [20]:
# Renaming current index column to FILE_NAME and setting DOCUMENT_ID column as index
data['FILE_NAME'] = data.index

data.set_index('DOCUMENT_ID', inplace=True)

In [21]:
data.head()

Unnamed: 0_level_0,CATAGORY,MESSAGE,FILE_NAME
DOCUMENT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",00001.7848dde101aa985090474a91ec93fcf0
1,1,1) Fight The Risk of Cancer!\n\nhttp://www.adc...,00002.d94f1b97e48ed3b553b3508d116e6a09
2,1,1) Fight The Risk of Cancer!\n\nhttp://www.adc...,00003.2ee33bc6eacdb11f38d052c44819ba6c
3,1,##############################################...,00004.eac8de8d759b7e74154f142194282724
4,1,I thought you might like these:\n\n1) Slim Dow...,00005.57696a39d7d84318ce497886896bf90d


In [22]:
data.tail()

Unnamed: 0_level_0,CATAGORY,MESSAGE,FILE_NAME
DOCUMENT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5791,0,http://news.bbc.co.uk/1/hi/england/2515127.stm...,01396.61983fbe6ec43f55fd44e30fce24ffa6
5792,0,"> >-- be careful when using this one.) Also, t...",01397.9f9ef4c2a8dc012d80f2ce2d3473d3b7
5793,0,">>>>> ""SM"" == Skip Montanaro <skip@pobox.com> ...",01398.169b51731fe569f42169ae8f948ec676
5794,0,"So then, ""Mark Hammond"" <mhammond@skippinet.co...",01399.ca6b00b7b341bbde9a9ea3dd6a7bf896
5795,0,"Hi there,\n\n\n\nNow this is probably of no us...",01400.f897f0931e461e7b2e964d28e927c35e


# Saving the processed file into json

In [23]:
data.to_json('SpamHamEmails.json')