### Loading the dataset

In [44]:
import os
from glob import glob

In [59]:
def read_text_files(paths, classification):
    data = {
        'email': [],
        'class': []
    }
    for path in paths:
        try:
            with open(path, 'r', encoding='utf-8') as f:
                message = ' '.join(f.readlines())
                data['email'].append(message)
                data['class'].append(classification)
        except:
            continue
    return data

#### Spam

In [61]:
path = "static/spam/"
spamEmails = glob(os.path.join(path, '*.txt'))

spam = read_text_files(spamEmails, 1)

#### Not spam

In [62]:
path = "static/ham/"
nonSpamEmails = glob(os.path.join(path, '*.txt'))

nonSpam = read_text_files(nonSpamEmails, 0)

### Creating the dataframe

In [63]:
import pandas as pd

In [64]:
dfSpam = pd.DataFrame.from_dict(spam)

In [66]:
dfNonSpam = pd.DataFrame.from_dict(nonSpam)

In [67]:
df = pd.concat([dfSpam, dfNonSpam])

In [68]:
df.head()

Unnamed: 0,email,class
0,Subject: adv : space saving computer to replac...,1
1,"Subject: advs\n greetings ,\n i am benedicta l...",1
2,Subject: fw : account over due wfxu ppmfztdtet...,1
3,Subject: spend too much on your phone bill ? 2...,1
4,"Subject: \n h $ ello\n dea 54 r home owner ,\n...",1


In [69]:
df.tail()

Unnamed: 0,email,class
16539,"Subject: fw : abandoned pipe ownership\n fyi ,...",0
16540,Subject: start date : 2 / 7 / 02 ; hourahead h...,0
16541,"Subject: fw : tw question in amarillo\n fyi , ...",0
16542,Subject: start date : 2 / 6 / 02 ; hourahead h...,0
16543,"Subject: fw : re ivanhoe e . s . d\n fyi , kim...",0


In [70]:
df.shape

(32624, 2)

In [71]:
df.isnull().sum()

email    0
class    0
dtype: int64

### Feature Engineering

In [72]:
from sklearn.feature_extraction.text import CountVectorizer

In [73]:
vectorizer = CountVectorizer()

In [75]:
counts = vectorizer.fit_transform(df['email'].values)

In [90]:
targets = df['class']

In [88]:
from sklearn.model_selection import train_test_split

In [98]:
X_train, X_test, y_train, y_test = train_test_split(counts, targets, test_size=0.33, random_state=42)

### Training the model

In [99]:
from sklearn.naive_bayes import MultinomialNB

In [100]:
model = MultinomialNB()

In [101]:
model.fit(X_train, y_train)

In [102]:
from sklearn.metrics import accuracy_score

In [103]:
y_predict = model.predict(X_test)

### Testing the model

In [106]:
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)

0.9875534088798068


In [145]:
examples = ['''
We were unable to deliver your parcel as there was no one present to sign for the deliveryWe are here to inform you that we need an adress confirmation to reconfirm the parcel shipping.
''']

In [146]:
example_counts = vectorizer.transform(examples)

In [147]:
predictions = model.predict(example_counts)
print(predictions)

[1]


### Saving the model

In [149]:
from joblib import dump, load

In [152]:
dump(model, 'static/model.joblib')

['static/model.joblib']

In [153]:
model = load('static/model.joblib')