In [1]:
# Importing the required modules
import numpy as np
import os

In [2]:
# Function for reading the emails and extracting the message
def read_file(path):
    """Yield the text of every file found under ``path``, one message per file.

    The directory tree is walked recursively with ``os.walk``. Each file is
    opened in text mode, read line by line, and its lines are joined with a
    newline character. Lines keep their own trailing newline, so every line
    break appears doubled in the yielded message; that is kept as-is so
    callers receive exactly the same text as before.

    Args:
        path: Root directory to search for mail files.

    Yields:
        str: The contents of one file.
    """
    for root, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            # Separate name so the `path` argument is not overwritten mid-walk
            file_path = os.path.join(root, filename)
            # The context manager closes the file even if reading raises
            with open(file_path, 'r') as mail_file:
                lines = list(mail_file)
            yield '\n'.join(lines)

In [3]:
# Collect every training mail as a [message, label] pair: spam first, then ham
train_mail = [[body, 'spam'] for body in read_file('Training_Set/spam')]
train_mail += [[body, 'ham'] for body in read_file('Training_Set/ham')]

# 2-column numpy array: column 0 is the message text, column 1 the label
train_data = np.asarray(train_mail)

# One row per mail, two columns (message, spam/ham)
print (train_data.shape)

(700, 2)


In [4]:
# Per-class word frequency tables and mail counts built from the training set
word_count_ham = {}     # word -> number of occurrences across ham mails
word_count_spam = {}    # word -> number of occurrences across spam mails
word_ignore = []        # tokens shorter than 2 characters, skipped when counting
spam_num = 0            # number of spam mails
ham_num = 0             # number of ham mails

for row in train_data:
    body, label = row[0], row[1]
    is_spam = label == 'spam'

    # Tally the mail itself; any label other than 'spam' is treated as ham
    if is_spam:
        spam_num += 1
        class_counts = word_count_spam
    else:
        ham_num += 1
        class_counts = word_count_ham

    # Tokens are produced by splitting on single spaces only
    for token in body.split(' '):

        # Very short tokens are remembered once and never counted
        if len(token) < 2:
            if token not in word_ignore:
                word_ignore.append(token)
            continue

        # Increment this word's count in the table for the mail's class
        class_counts[token] = class_counts.get(token, 0) + 1

In [5]:
# Class priors estimated from the training mail counts
total_mails = spam_num + ham_num
# probability of a mail being a spam
spam_prob = float(spam_num) / float(total_mails)
# probability of a mail being a ham (the complement of the spam prior)
ham_prob = 1.0 - spam_prob

In [6]:
# Collect every testing mail as a [message, label] pair: spam first, then ham
test_mail = [[body, 'spam'] for body in read_file('Testing_Set/spam')]
test_mail += [[body, 'ham'] for body in read_file('Testing_Set/ham')]

# 2-column numpy array: column 0 is the message text, column 1 the label
test_data = np.asarray(test_mail)

# One row per mail, two columns (message, spam/ham)
print (test_data.shape)

(260, 2)


In [7]:
# list to store predicted labels
test_predict=[]

# Score every testing mail and label it with whichever class scores higher
for j in range(test_data.shape[0]):

    # storing words from message body (split on single spaces, as in training)
    words = test_data[j][0].split(' ')

    # running totals of the per-word evidence for each class
    ham_contribution=0
    spam_contribution=0

    for i in words:

        # Only words present in BOTH the spam and the ham table are scored;
        # ignored tokens and words missing from either table are skipped.
        # NOTE(review): this cell used to contain two further branches that
        # added 1.0 to one class for words seen in only that class, but this
        # guard already skips such words, so those branches could never run.
        # They were removed without changing what the cell computes. If
        # one-class words are meant to count, the guard must skip only words
        # missing from both tables, and the evaluation must be re-run.
        if i not in word_count_spam or i not in word_count_ham or i in word_ignore:
            continue

        spam_hits=word_count_spam[i]
        ham_hits=word_count_ham[i]
        total_hits=spam_hits+ham_hits

        # Each class gains its prior weighted by the share of this word's
        # training occurrences that fell in that class
        spam_contribution+=spam_prob*spam_hits/total_hits
        ham_contribution+=ham_prob*ham_hits/total_hits

    # predicting labels; a tie is labelled 'ham'
    if spam_contribution>ham_contribution:
        test_predict.append('spam')
    else:
        test_predict.append('ham')

In [8]:
# Accuracy: percentage of testing mails whose predicted label equals the actual label
correct = sum(
    1
    for idx, row in enumerate(test_data)
    if test_predict[idx] == row[1]
)
print ("Accuracy: ", 100*correct/test_data.shape[0])

Accuracy:  92.3076923076923
