In [1]:
import os  
#The OS module in Python provides a way of using operating system dependent functionality.

from collections import Counter
# A Counter is a dict subclass for counting hashable objects. 
# It is an unordered collection where elements are stored as dictionary keys and their counts are stored as dictionary values.

from sklearn.naive_bayes import MultinomialNB   
# MultinomialNB is the Naive Bayes classifier that is trained on the word counts below

from sklearn.model_selection import train_test_split as tts 
# break the whole dataset into 2 segments randomly as training and testing dataset.

from sklearn.metrics import accuracy_score   
# to calculate accuracy of the classifier

import pickle as c
# for turning an arbitrary Python object into a series of bytes

In [27]:
def make_dictionary(directory="emails/", vocab_size=1000):
    """Build the vocabulary of the most frequent alphabetic words in a folder of emails.

    Every file in ``directory`` is read as text (undecodable bytes are ignored)
    and split on single spaces. Only tokens for which ``str.isalpha()`` is true
    are counted, so tokens containing digits, punctuation or newlines are dropped.

    Args:
        directory: Folder that holds the email files. Defaults to ``"emails/"``.
        vocab_size: Maximum number of (word, count) pairs to return.
            Defaults to 1000.

    Returns:
        A list of ``(word, count)`` tuples ordered by decreasing count,
        e.g. ``[('the', 22003), ('to', 16118)]``.
    """
    word_counts = Counter()

    # os.listdir() returns names in arbitrary order; sorting keeps the tie-breaking
    # at the vocab_size cut-off identical from one run to the next.
    for file_name in sorted(os.listdir(directory)):
        # The context manager closes each file as soon as it has been read,
        # instead of leaking one open handle per email.
        with open(os.path.join(directory, file_name), errors='ignore') as f:
            blob = f.read()

        # Count per file so the full token list of the corpus is never held in memory.
        word_counts.update(word for word in blob.split(" ") if word.isalpha())

    return word_counts.most_common(vocab_size)


In [28]:
# Vocabulary: list of (word, count) pairs for the most frequent alphabetic words in emails/
d = make_dictionary()

In [50]:
def make_dataset(dictionary, directory="emails/"):
    """Turn every labelled email in a folder into a word-count feature vector.

    The label comes from the file's base name: names containing "spam" are
    labelled 1, names containing "ham" are labelled 0. "spam" is checked first
    so that a name such as ``graham.spam.txt`` is not mistaken for ham. Files
    whose name contains neither marker are skipped, which keeps the feature
    rows and the labels the same length.

    Args:
        dictionary: Iterable of ``(word, count)`` pairs as returned by
            ``make_dictionary``; only the word of each pair is used.
        directory: Folder that holds the email files. Defaults to ``"emails/"``.

    Returns:
        A ``(feature_set, labels)`` tuple. ``feature_set`` is a list with one
        row per labelled email, each row holding the number of occurrences of
        every dictionary word in that email (in dictionary order). ``labels``
        is the parallel list of 0 (ham) / 1 (spam) labels.
    """
    feature_set = []
    labels = []

    # Sorted listing makes the row order reproducible across runs and machines.
    for file_name in sorted(os.listdir(directory)):
        # Decide the label from the file name alone (not the full path, so the
        # directory name cannot influence it) and assign exactly one label.
        if "spam" in file_name:
            label = 1
        elif "ham" in file_name:
            label = 0
        else:
            continue  # unlabelled file: emitting a row without a label would misalign the lists

        # Close the file promptly rather than leaking a handle per email.
        with open(os.path.join(directory, file_name), errors='ignore') as f:
            # One pass over the tokens instead of a list.count() scan per dictionary word.
            counts = Counter(f.read().split(' '))

        # Counter returns 0 for words that do not occur in this email.
        feature_set.append([counts[entry[0]] for entry in dictionary])
        labels.append(label)

    return feature_set, labels

In [51]:
# One word-count row per email over the vocabulary `d`, plus its 0 (ham) / 1 (spam) label
features, labels = make_dataset(d)

In [87]:
# Hold out 20% of the emails for testing. A fixed random_state keeps the split,
# and therefore the reported accuracy, reproducible across kernel restarts.
features_train, features_test, labels_train, labels_test = tts(
    features, labels, test_size=0.20, random_state=42
)

In [88]:
# Fit a multinomial Naive Bayes classifier on the training split
clf = MultinomialNB()
clf.fit(features_train, labels_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [89]:
# Predict a label for every email in the held-out test split
preds = clf.predict(features_test)

In [90]:
# Accuracy of the classifier: predictions compared against the true test labels
print(accuracy_score(labels_test, preds))

0.9207729468599034


In [91]:
#saving the training of classifier
def save_classifier(clf, name):
    """Pickle ``clf`` into the file ``name`` and print a confirmation.

    Args:
        clf: The object to serialize (here, the trained classifier).
        name: Path of the output file; it is opened in binary write mode,
            so an existing file with that name is overwritten.
    """
    out_file = open(name, 'wb')
    try:
        c.dump(clf, out_file)
    finally:
        # Always release the handle, even if pickling fails part-way.
        out_file.close()
    print("Classifier Training Saved!!")

In [95]:
# Pickle the fitted classifier to spam-classifier.sav
save_classifier(clf, "spam-classifier.sav")

Classifier Training Saved!!
