In [None]:
import pandas as pd 
import numpy as np 
from collections import defaultdict
from sklearn.model_selection import train_test_split
import glob
import re

In [None]:
# Load the labelled e-mail corpus. Point the path at the dataset you want to
# train on.
Data = pd.read_csv("../input/spam-or-not-spam-dataset/spam_or_not_spam.csv")

# A notebook cell only renders its final expression, so the shape is printed
# explicitly and the preview is kept as the last expression of the cell.
print(Data.shape)
Data.head()

In [None]:
#Functions

#function to clean the string (pre-processing)
def clean_string(str_arg):
    """Normalise raw text before it is tokenised.

    Every run of characters that is neither a letter a-z (matched
    case-insensitively) nor whitespace is replaced by a single space, every
    run of whitespace is then collapsed to a single space, and the result is
    lower-cased. Leading and trailing spaces are not stripped.

    Parameters
    ----------
    str_arg : str
        Text to clean.

    Returns
    -------
    str
        The cleaned, lower-case text.
    """
    # Raw string literals keep `\s` a regex escape instead of an invalid
    # Python string escape (which raises a warning on current interpreters).
    letters_only = re.sub(r'[^a-z\s]+', " ", str_arg, flags=re.IGNORECASE)
    single_spaced = re.sub(r'(\s+)', " ", letters_only)
    return single_spaced.lower()


#add words of email to bow_dicts
def add_To_Bag_of_Words(example,dict_index,bow_dicts):
    """Count the whitespace-separated tokens of one example into a class table.

    Parameters
    ----------
    example : str or numpy.ndarray
        The text to tokenise. When an ndarray is given, only its first
        element is used.
    dict_index
        Index into ``bow_dicts`` selecting the table to update.
    bow_dicts
        Indexable collection of word -> count mappings. Each mapping is
        incremented with ``+= 1``, so it must supply a default for unseen
        words (e.g. a ``defaultdict``).

    Returns
    -------
    None
        The selected mapping is updated in place.
    """
    text = example[0] if isinstance(example, np.ndarray) else example
    for word in text.split():
        table = bow_dicts[dict_index]
        table[word] += 1
            

#calculate posterior probability for each class
def getTestMailProbability(test_example, cats_info):
    """Return the unnormalised log-posterior of a cleaned e-mail for every class.

    For each class the score is
    ``log(prior) + sum(log((count(token) + 1) / denominator))`` over the
    whitespace-separated tokens of ``test_example``. Logs are summed instead
    of multiplying probabilities to prevent floating-point underflow.

    Parameters
    ----------
    test_example : str
        Already-cleaned text of the e-mail.
    cats_info : sequence
        One entry per class, each indexable as
        ``(word_counts, prior, denominator)`` where ``word_counts`` supports
        ``.get(token, 0)``.

    Returns
    -------
    numpy.ndarray
        Array of length ``len(cats_info)``; element ``i`` is the score of the
        class described by ``cats_info[i]``.
    """
    # Tokenise once; the token list is the same for every class.
    tokens = test_example.split()

    # Size and index by cats_info itself rather than by a notebook-level
    # `classes` array, so the function has no hidden global dependency and
    # does not require label values to double as array positions.
    n_classes = len(cats_info)
    post_prob = np.empty(n_classes)
    for idx in range(n_classes):
        word_counts = cats_info[idx][0]
        prior = cats_info[idx][1]
        denom = float(cats_info[idx][2])

        log_likelihood = 0.0
        for token in tokens:
            # +1 gives tokens unseen for this class a non-zero probability.
            log_likelihood += np.log((word_counts.get(token, 0) + 1) / denom)

        post_prob[idx] = log_likelihood + np.log(prior)
    return post_prob

#classify a single example as spam or ham
def get_prediction(example, predictions, cats_info):
    """Classify one e-mail and append the winning label to ``predictions``.

    The example is converted with ``str`` and cleaned with ``clean_string``
    (the same preprocessing applied to the training examples), scored with
    ``getTestMailProbability``, and the entry of the notebook-level
    ``classes`` array at the position of the highest score is appended.

    Parameters
    ----------
    example
        Raw e-mail content; anything ``str()`` accepts.
    predictions : list
        Output list, extended in place by one label.
    cats_info
        Per-class information forwarded to ``getTestMailProbability``.
    """
    normalised_text = clean_string(str(example))
    # One score per class for this e-mail.
    class_scores = getTestMailProbability(normalised_text, cats_info)
    best_position = np.argmax(class_scores)
    predictions.append(classes[best_position])


In [None]:
# Split the data into training and testing examples, holding out 20% of the
# rows for testing. A fixed random_state makes the split, and therefore the
# accuracy reported later, reproducible across Restart & Run All.
train, test= train_test_split(Data, test_size=0.2, random_state=42)

# Rename the two columns of each split: first the e-mail text, then its label.
train.columns =["Training Examples","Training Labels"]
test.columns =["Testing Examples","Testing Labels"]

# Separate the examples and the labels of both splits.
train_data=train["Training Examples"]
train_labels=train["Training Labels"]
test_data=test["Testing Examples"]
test_labels=test["Testing Labels"]

In [None]:
# Distinct labels present in the training split.
classes=np.unique(train_labels)
print("unique labels:", classes)

# One word -> count table per class. The tables are indexed with the label
# value itself, so this cell relies on the labels being usable as positions
# (0 .. number_of_classes-1).
bow_dicts=np.array([defaultdict(int) for _ in range(classes.shape[0])], dtype=object)

# Plain arrays allow boolean-mask selection by label below.
train_data=np.asarray(train_data)
train_labels=np.asarray(train_labels)

for cat in classes:
    all_cat_examples=train_data[train_labels==cat]
    # Feed each cleaned e-mail straight into the counter. A direct loop also
    # copes with a class that has no examples, which np.apply_along_axis
    # rejects.
    for cat_example in all_cat_examples:
        add_To_Bag_of_Words(clean_string(str(cat_example)),cat,bow_dicts)

In [None]:
n_classes=classes.shape[0]
prob_classes=np.zeros(n_classes)     # prior of each class, indexed by label value
cat_word_counts=np.zeros(n_classes)  # token total of each class (+1, see below)
all_words=[]

for cat in classes:
    # Prior: fraction of training examples carrying this label.
    prob_classes[cat]=np.sum(train_labels==cat)/float(train_labels.shape[0])
    # Total token occurrences counted for this class, plus 1; the vocabulary
    # size is added when the denominators are built below.
    cat_word_counts[cat]=sum(bow_dicts[cat].values())+1
    # Collect this class's words for the shared vocabulary.
    all_words+=bow_dicts[cat].keys()

# Report the totals once, after every slot has been filled.
print(cat_word_counts)

# Vocabulary = distinct words seen in any class.
vocab=np.unique(np.array(all_words))
vocab_length=vocab.shape[0]

# Smoothing denominator per class.
# NOTE(review): together with the +1 above this is token_total + |V| + 2,
# whereas textbook add-one smoothing uses token_total + |V|. Left unchanged
# so scores stay the same -- confirm whether the extra 2 is intended.
denoms=np.array([cat_word_counts[cat]+vocab_length+1 for cat in classes])

# Per-class bundle consumed by the prediction functions:
# (word-count table, prior, smoothing denominator).
cats_info=[(bow_dicts[cat],prob_classes[cat],denoms[cat]) for cat in classes]
cats_info=np.array(cats_info, dtype=object)

In [None]:
# Predicted label of each test e-mail, filled in by get_prediction.
predictions=[]
for example in test_data:
    get_prediction(example, predictions, cats_info)

# Accuracy = share of test e-mails whose predicted label equals the true one.
pclasses=np.array(predictions)
n_test=test_labels.shape[0]
n_correct=np.sum(pclasses==test_labels)
test_acc=n_correct/float(n_test)
print ("Test Set Examples: ",n_test)
print ("Test Set Accuracy: ",test_acc*100,"%")

In [None]:
def desiredFunction(path_of_test_email_folder, cats_info, output_dir="/kaggle/working/"):
    """Classify the e-mails in a folder and write the labels to ``output.txt``.

    The folder is expected to hold files named ``email1.txt`` ...
    ``emailN.txt``, where N is the number of ``*.txt`` files found in it.
    Each file is read, classified with ``get_prediction``, and one line per
    e-mail is both printed and written to ``<output_dir>/output.txt``.

    Parameters
    ----------
    path_of_test_email_folder : str
        Folder containing the numbered e-mail text files.
    cats_info
        Per-class information forwarded to ``get_prediction``.
    output_dir : str, optional
        Folder that receives ``output.txt``. Defaults to the location that
        was previously hard-coded.

    Raises
    ------
    OSError
        If one of the expected ``email<i>.txt`` files or the output file
        cannot be opened.
    """
    import os  # local import: the notebook's import cell does not provide `os`

    # glob.glob with an escaped folder replaces the undocumented glob.glob1.
    txt_pattern = os.path.join(glob.escape(path_of_test_email_folder), "*.txt")
    total_test_files = len(glob.glob(txt_pattern))

    predictions=[] #to store prediction of each test example
    for i in range(1,total_test_files+1):
        fpath = path_of_test_email_folder + "/email" +str(i) + ".txt"
        # `with` closes the file even if reading fails.
        with open(fpath, "r") as f:
            example = f.read()
        get_prediction(example, predictions, cats_info)
    pclass=np.array(predictions)

    path_of_output_file = os.path.join(output_dir, "output.txt")
    col_name="test_email"+ "       " + "spam(1)/ham(0)\n\n\n"

    # Closing the report via `with` guarantees that it is flushed to disk.
    with open(path_of_output_file, 'w') as output_file:
        output_file.write(col_name)
        print(col_name)
        for i in range(0,len(pclass)):
            email="  email"+str(i+1)+"                "+str(pclass[i])+"\n"
            output_file.write(email)
            print(email)

In [None]:
# Folder holding the e-mails to classify (email1.txt, email2.txt, ...).
# Change it to wherever your test e-mails live, e.g. a local "test" folder.
path = "../input/dataset"
desiredFunction(path_of_test_email_folder=path, cats_info=cats_info)