### Loading the dataset

Dataset used: Enron-Spam dataset

https://www.kaggle.com/datasets/wanderfj/enron-spam

In [63]:
import os
from glob import glob

In [5]:
def read_text_files(paths, classification):
    data = {
        'email': [],
        'class': []
    }
    for path in paths:
        try:
            with open(path, 'r', encoding='utf-8') as f:
                message = ' '.join(f.readlines())
                data['email'].append(message)
                data['class'].append(classification)
        except:
            continue
    return data

#### Spam

In [6]:
path = "static/dataset/spam/"
spamEmails = glob(os.path.join(path, '*.txt'))

spam = read_text_files(spamEmails, 'spam')

#### Not spam (ham)

In [7]:
path = "static/dataset/ham/"
nonSpamEmails = glob(os.path.join(path, '*.txt'))

nonSpam = read_text_files(nonSpamEmails, 'ham')

### Creating the dataframe

In [8]:
import pandas as pd

In [9]:
dfSpam = pd.DataFrame.from_dict(spam)

In [10]:
dfNonSpam = pd.DataFrame.from_dict(nonSpam)

In [11]:
df = pd.concat([dfSpam, dfNonSpam])

In [45]:
df = df.reset_index()

In [46]:
df.head()

Unnamed: 0,index,email,class
0,0,Subject: adv : space saving computer to replac...,spam
1,1,"Subject: advs\n greetings ,\n i am benedicta l...",spam
2,2,Subject: fw : account over due wfxu ppmfztdtet...,spam
3,3,Subject: spend too much on your phone bill ? 2...,spam
4,4,"Subject: \n h $ ello\n dea 54 r home owner ,\n...",spam


In [47]:
df.tail()

Unnamed: 0,index,email,class
32619,16539,"Subject: fw : abandoned pipe ownership\n fyi ,...",ham
32620,16540,Subject: start date : 2 / 7 / 02 ; hourahead h...,ham
32621,16541,"Subject: fw : tw question in amarillo\n fyi , ...",ham
32622,16542,Subject: start date : 2 / 6 / 02 ; hourahead h...,ham
32623,16543,"Subject: fw : re ivanhoe e . s . d\n fyi , kim...",ham


In [48]:
df.shape

(32624, 3)

### Data Pre-Processing

1. Word tokenization
2. Removing punctuations
3. Removing URLs
4. Removing stop-words
5. lower casing
6. Stemming or lematization
7. Vectorization

In [58]:
email_example = df['email'][0]

In [55]:
print(email_example)

Subject: adv : space saving computer to replace that big box on or under your desk ! !
 revolutionary ! ! ! full featured ! ! !
 space saving computer in a
 keyboard
 eliminate
 that big box computer forever !
 great
 forhome . . . . office . . . or students . . . any place where desk space is at a
 premium !
 the
 computer in a
 keyboard eliminates the tower that takes up valuable space on
 or under your desk . a full featured , powerful computer for the price you would
 pay for a large tower . comes standard with : 1 . 8 ghz intelt pentium 4
 processor ( upgradeable ) 40 gigabyte hard drive
 ( upgradeable ) 256 mb ramupgradeable to 2 gb cd - rw dvd combo drive
 64 bit hardware accelerated 3 d graphics soundmax integrated digital audio
 internal 56 k fax - modem serial , parallel , audio , 4 usb ports ( 2 side , and 2
 back ) 2 button ps / 2 scroll mouse microsoft xp home
 edition and a 15 " lcd flat screen
 monitor ( upgradeable )
 isalso included in the base configuration !
 click
 

### Step 1: Word Tokenization

In [None]:
from nltk.tokenize import word_tokenize

nltk.download('wordnet')

In [100]:
email_cleaned_1 = word_tokenize(example_email)

In [101]:
email_cleaned_1

['Subject',
 ':',
 'adv',
 ':',
 'space',
 'saving',
 'computer',
 'to',
 'replace',
 'that',
 'big',
 'box',
 'on',
 'or',
 'under',
 'your',
 'desk',
 '!',
 '!',
 'revolutionary',
 '!',
 '!',
 '!',
 'full',
 'featured',
 '!',
 '!',
 '!',
 'space',
 'saving',
 'computer',
 'in',
 'a',
 'keyboard',
 'eliminate',
 'that',
 'big',
 'box',
 'computer',
 'forever',
 '!',
 'great',
 'forhome',
 '.',
 '.',
 '.',
 '.',
 'office',
 '.',
 '.',
 '.',
 'or',
 'students',
 '.',
 '.',
 '.',
 'any',
 'place',
 'where',
 'desk',
 'space',
 'is',
 'at',
 'a',
 'premium',
 '!',
 'the',
 'computer',
 'in',
 'a',
 'keyboard',
 'eliminates',
 'the',
 'tower',
 'that',
 'takes',
 'up',
 'valuable',
 'space',
 'on',
 'or',
 'under',
 'your',
 'desk',
 '.',
 'a',
 'full',
 'featured',
 ',',
 'powerful',
 'computer',
 'for',
 'the',
 'price',
 'you',
 'would',
 'pay',
 'for',
 'a',
 'large',
 'tower',
 '.',
 'comes',
 'standard',
 'with',
 ':',
 '1',
 '.',
 '8',
 'ghz',
 'intelt',
 'pentium',
 '4',
 'processo

### Step 2: Removing anything other than words

In [102]:
import re

In [103]:
# Removing un-needed unicode characters

email_cleaned_1 = []
for word in email_tokenized:
    clean = re.sub(r'[^\w\s]', "", word)
    if clean != "":
        email_cleaned_1.append(clean)

In [107]:
email_cleaned_2[:10]

['subject',
 'adv',
 'space',
 'saving',
 'computer',
 'to',
 'replace',
 'that',
 'big',
 'box']

### Step 3: Removing any URLs

In [105]:
email_cleaned_3 = []
for word in email_cleaned_2:
    x = re.sub(r'http\S+', '', word)
    if x != '':
        email_cleaned_3.append(x)

In [108]:
email_cleaned_3[:10]

['subject',
 'adv',
 'space',
 'saving',
 'computer',
 'to',
 'replace',
 'that',
 'big',
 'box']

### Step 4: Stop word removal

(nltk provides stopwords but you can define yours too or find them from the internet)

In [None]:
from nltk.corpus import stopwords

nltk.download('stopwords')

In [109]:
stop_words = stopwords.words('english')

In [110]:
print(stop_words[:10])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [112]:
email_cleaned_4 = []
for word in email_cleaned_3:
    if word not in stop_words:
        email_cleaned_4.append(word)

In [131]:
email_cleaned_4[:10]

['subject',
 'adv',
 'space',
 'saving',
 'computer',
 'replace',
 'big',
 'box',
 'desk',
 'revolutionary']

### Step 5: Lower casing

In [113]:
email_cleaned_5 = []
for word in email_cleaned_4:
    email_cleaned_5.append(word.lower())

In [114]:
email_cleaned_5[0:10]

['subject',
 'adv',
 'space',
 'saving',
 'computer',
 'replace',
 'big',
 'box',
 'desk',
 'revolutionary']

### Step 6: Lemmatization

In [135]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\siddh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\siddh\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\siddh\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [136]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [137]:
lemmatizer = WordNetLemmatizer()

In [138]:
email_cleaned_6 = []
for word in email_cleaned_5:
    word = lemmatizer.lemmatize(word, get_wordnet_pos(word))
    email_cleaned_6.append(word)

In [141]:
email_cleaned_6[:10]

['subject',
 'adv',
 'space',
 'save',
 'computer',
 'replace',
 'big',
 'box',
 'desk',
 'revolutionary']

#### Defining a clean text function for all the emails now

In [145]:
def clean_text(data):
    data_cleaned = []
    for email in data:
        # tokenization
        email = word_tokenize(email)
        
        # removing unwanted characters like punctuations
        email = [re.sub(r'[^\w\s]', "", word) for word in email if word != '']
        
        # url removal
        email = [re.sub(r'http\S+', '', word) for word in email if word != '']
        
        # stop-word removal
        email = [word for word in email if (word not in stop_words)]
        
        # lower-casing
        email = [word.lower() for word in email]
        
        # lemmatization
        email = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in email]
        
        data_cleaned.append(email)
    
    return data_cleaned

In [146]:
emails = df['email']

In [147]:
emails_cleaned = clean_text(emails)

KeyboardInterrupt: 

### Feature Engineering

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
vectorizer = CountVectorizer()

In [15]:
counts = vectorizer.fit_transform(df['email'].values)

In [16]:
targets = df['class']

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(counts, targets, test_size=0.33, random_state=42)

### Training the model

In [19]:
from sklearn.naive_bayes import MultinomialNB

In [20]:
model = MultinomialNB()

In [21]:
model.fit(X_train, y_train)

In [22]:
from sklearn.metrics import accuracy_score

In [23]:
y_predict = model.predict(X_test)

### Testing the model

In [24]:
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)

0.9875534088798068


In [25]:
examples = ['''
We were unable to deliver your parcel as there was no one present to sign for the deliveryWe are here to inform you that we need an adress confirmation to reconfirm the parcel shipping.
''']

In [26]:
example_counts = vectorizer.transform(examples)

In [34]:
predictions = model.predict(example_counts)
print(predictions)

### Saving the model

In [28]:
from joblib import dump, load

In [29]:
dump((model, vectorizer), 'app/static/model.joblib')

['app/static/model.joblib']