In [1]:
# Core Packages
from tkinter import *
from tkinter import ttk
from tkinter.scrolledtext import *
import tkinter.filedialog

In [2]:
# Spam filter (Python 3): text-processing, plotting and data-handling packages
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt 
from wordcloud import WordCloud 
from math import log,sqrt
import pandas as pd
import numpy as np 
import tkinter
from tkinter import messagebox

In [3]:
# Load the raw dataset from 'spam.csv' into the `mails` DataFrame.
# The file is read with latin-1 encoding.
mails=pd.read_csv('spam.csv',encoding='latin-1')

In [4]:
# Discard the three unnamed columns; the remaining columns are kept as they are.
mails = mails.drop(
    columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
)

In [5]:
# Give the two remaining columns descriptive names: v1 -> labels, v2 -> text.
mails = mails.rename(
    columns={
        'v1': 'labels',
        'v2': 'text',
    },
)

In [6]:
# Add a numeric `label` column derived from `labels`: 'ham' -> 0, 'spam' -> 1.
mails = mails.assign(
    label=mails['labels'].map({'ham': 0, 'spam': 1}),
)

In [7]:
# The textual `labels` column is no longer needed once `label` exists.
mails = mails.drop(columns=['labels'])

In [8]:
# Random train/test split of the row positions of `mails`.
#
# Each row is assigned to the training set with probability TRAIN_FRACTION and
# to the test set otherwise. The random draws come from a seeded generator so
# that re-running the notebook from a fresh kernel reproduces the same split
# (and therefore the same trained model).
SPLIT_SEED = 42        # arbitrary fixed seed; change it to obtain a different split
TRAIN_FRACTION = 0.75  # probability that a row lands in the training set

# Number of messages, taken from the loaded data rather than hard-coded.
totalMails = mails.shape[0]

_split_rng = np.random.RandomState(SPLIT_SEED)
_split_draws = _split_rng.uniform(0, 1, size=totalMails)

trainIndex, testIndex = list(), list()
for i in range(totalMails):
    if _split_draws[i] < TRAIN_FRACTION:
        trainIndex.append(i)
    else:
        testIndex.append(i)

In [9]:
# Select the training rows and renumber them from 0; the old index is discarded.
trainData = mails.loc[trainIndex].reset_index(drop=True)
# Select the test rows; their index is renumbered in the next cell.
testData = mails.loc[testIndex]

In [10]:
# Renumber the test rows from 0 and discard the old index.
testData = testData.reset_index(drop=True)

In [11]:
def process_message(text, lower_case = True, stem = True, stop_words = True, gram = 2):
    """Turn a raw message into the list of tokens used by the classifier.

    Parameters
    ----------
    text : str
        Raw message text.
    lower_case : bool
        Lower-case the text before tokenizing.
    stem : bool
        Apply Porter stemming (only reached when ``gram <= 1``).
    stop_words : bool
        Remove English stop words (only reached when ``gram <= 1``).
    gram : int
        When greater than 1, return space-joined n-grams of this size.

    Returns
    -------
    list of str
        Tokens, or n-grams when ``gram > 1``. The list is empty when the
        message has fewer than ``gram`` tokens longer than two characters.
    """
    if lower_case:
        text = text.lower()
    # Tokens of one or two characters are dropped.
    words = [w for w in word_tokenize(text) if len(w) > 2]
    if gram > 1:
        # n-gram mode returns here, so stop-word removal and stemming below
        # are not applied to n-grams.
        return [' '.join(words[i:i + gram]) for i in range(len(words) - gram + 1)]
    if stop_words:
        sw = set(stopwords.words('english'))
        words = [word for word in words if word not in sw]
    if stem:
        stemmer = PorterStemmer()
        words = [stemmer.stem(word) for word in words]
    return words


class SpamClassifier(object):
    """Naive-Bayes style spam classifier over tokens from process_message().

    ``method == 'tf-idf'`` weights each token by term frequency times
    log(total messages / messages containing the token); any other value
    uses plain term counts. Both variants apply add-one smoothing.
    """

    def __init__(self, trainData, method = 'tf-idf'):
        """Store the training messages and labels.

        trainData must provide a 'text' column and a 'label' column in which
        truthy values mark spam (1) and falsy values mark ham (0).
        """
        self.mails, self.labels = trainData['text'], trainData['label']
        self.method = method

    def train(self):
        """Count tokens, then derive per-token probabilities for the chosen method."""
        self.calc_TF_and_IDF()
        if self.method == 'tf-idf':
            self.calc_TF_IDF()
        else:
            self.calc_prob()

    def calc_prob(self):
        """Compute add-one-smoothed token probabilities from raw term counts."""
        # The denominators are loop-invariant, so they are computed once.
        spam_total = self.spam_words + len(self.tf_spam)
        ham_total = self.ham_words + len(self.tf_ham)
        self.prob_spam = {word: (count + 1) / spam_total
                          for word, count in self.tf_spam.items()}
        self.prob_ham = {word: (count + 1) / ham_total
                         for word, count in self.tf_ham.items()}
        self.prob_spam_mail = self.spam_mails / self.total_mails
        self.prob_ham_mail = self.ham_mails / self.total_mails

    def calc_TF_and_IDF(self):
        """Collect per-class term counts and per-class document frequencies.

        Raises
        ------
        ValueError
            If the training data lacks spam or lacks ham messages; the class
            priors would then be zero and could not be log-transformed.
        """
        label_counts = self.labels.value_counts()
        self.spam_mails = int(label_counts.get(1, 0))
        self.ham_mails = int(label_counts.get(0, 0))
        if self.spam_mails == 0 or self.ham_mails == 0:
            raise ValueError("training data must contain both spam and ham messages")
        self.total_mails = self.spam_mails + self.ham_mails
        self.spam_words = 0
        self.ham_words = 0
        self.tf_spam = dict()
        self.tf_ham = dict()
        self.idf_spam = dict()
        self.idf_ham = dict()
        # Iterate values directly so the result does not depend on the
        # Series index being 0..n-1.
        for message, is_spam in zip(self.mails, self.labels):
            message_processed = process_message(message)
            if is_spam:
                tf, idf = self.tf_spam, self.idf_spam
                self.spam_words += len(message_processed)
            else:
                tf, idf = self.tf_ham, self.idf_ham
                self.ham_words += len(message_processed)
            for word in message_processed:
                tf[word] = tf.get(word, 0) + 1
            # Document frequency counts each token once per message.
            for word in set(message_processed):
                idf[word] = idf.get(word, 0) + 1

    def calc_TF_IDF(self):
        """Compute add-one-smoothed token probabilities from TF-IDF weights."""
        total_messages = self.spam_mails + self.ham_mails

        self.prob_spam = dict()
        self.sum_tf_idf_spam = 0
        for word, term_count in self.tf_spam.items():
            doc_count = self.idf_spam[word] + self.idf_ham.get(word, 0)
            self.prob_spam[word] = term_count * log(total_messages / doc_count)
            self.sum_tf_idf_spam += self.prob_spam[word]
        # Normalise only after the full sum is known; the denominator is constant.
        spam_total = self.sum_tf_idf_spam + len(self.prob_spam)
        for word in self.prob_spam:
            self.prob_spam[word] = (self.prob_spam[word] + 1) / spam_total

        self.prob_ham = dict()
        self.sum_tf_idf_ham = 0
        for word, term_count in self.tf_ham.items():
            doc_count = self.idf_spam.get(word, 0) + self.idf_ham[word]
            self.prob_ham[word] = term_count * log(total_messages / doc_count)
            self.sum_tf_idf_ham += self.prob_ham[word]
        ham_total = self.sum_tf_idf_ham + len(self.prob_ham)
        for word in self.prob_ham:
            self.prob_ham[word] = (self.prob_ham[word] + 1) / ham_total

        self.prob_spam_mail = self.spam_mails / self.total_mails
        self.prob_ham_mail = self.ham_mails / self.total_mails

    def classify(self, processed_message):
        """Return True when the tokens score at least as high for spam as for ham.

        Each class score is the log prior plus the sum of per-token log
        probabilities. The prior is added exactly once, so a message that
        yields no tokens is decided by the priors alone instead of tying
        at 0 and being reported as spam.
        """
        if self.method == 'tf-idf':
            spam_total = self.sum_tf_idf_spam + len(self.prob_spam)
            ham_total = self.sum_tf_idf_ham + len(self.prob_ham)
        else:
            spam_total = self.spam_words + len(self.prob_spam)
            ham_total = self.ham_words + len(self.prob_ham)

        pSpam = log(self.prob_spam_mail)
        pHam = log(self.prob_ham_mail)
        for word in processed_message:
            if word in self.prob_spam:
                pSpam += log(self.prob_spam[word])
            else:
                # Unseen token: smoothed probability 1 / total.
                pSpam -= log(spam_total)
            if word in self.prob_ham:
                pHam += log(self.prob_ham[word])
            else:
                pHam -= log(ham_total)
        return pSpam >= pHam

    def predict(self, testData):
        """Classify every message in testData.

        Returns a dict mapping the position of each message to 1 (spam) or 0 (ham).
        """
        return {i: int(self.classify(process_message(message)))
                for (i, message) in enumerate(testData)}


# Classifier trained on `trainData`, built on first use and then reused.
# Re-running this cell resets it, so a new split is picked up after a re-run.
_trained_classifier = None


def _get_trained_classifier():
    """Return the TF-IDF classifier, training it on `trainData` the first time."""
    global _trained_classifier
    if _trained_classifier is None:
        classifier = SpamClassifier(trainData, 'tf-idf')
        classifier.train()
        _trained_classifier = classifier
    return _trained_classifier


def output():
    """Classify the text in the `entry` widget and append the verdict to `tab1_display`."""
    raw_text = str(entry.get('1.0', tkinter.END))
    tokens = process_message(raw_text)
    if _get_trained_classifier().classify(tokens):
        final_text = "It is SPAM EMAIL"
    else:
        final_text = "It is GOOD EMAIL (Not SPAM / HAM)"
    tab1_display.insert(tkinter.END, '\nChecking Details:{}'.format(final_text))

In [12]:
#sc_bow=SpamClassifier(trainData,'bow')
#sc_bow.train()
#preds_bow=sc_bow.predict(testData['text'])

In [13]:
# Clear entry widget
def clear_text():
    """Empty both the message entry box and the result display."""
    for widget in (entry, tab1_display):
        widget.delete('1.0', END)

def clear_display_result():
    """Erase everything currently shown in the result display."""
    first_char, last_char = '1.0', END
    tab1_display.delete(first_char, last_char)


In [14]:
# Top-level Tk window that hosts the whole application.
root1 = Tk()
root1.title("SPAM Prediction")
#root.configure(background='pink')

''

In [15]:
# NOTE: despite its name, `root` is a Canvas inside the Tk window `root1`,
# not the Tk root itself. All widgets created below use it as their parent.
root = Canvas(root1,width=1800,height=1800)
root.pack()
# Background image loaded from 'img1.png' and drawn with its top-left corner at (10, 10).
# `photo` is a module-level name, so the image object stays referenced.
photo = PhotoImage(file ='img1.png')
root.create_image(10,10,image=photo,anchor=NW)

1

In [16]:
# Heading: white-on-black title label placed in grid row 1, spanning 5 columns.
w2 = Label(root, justify=CENTER, text="SPAM prediction using machine learning",fg="white", bg="black")
w2.config(font=("Elephant", 16))
w2.grid(row=1, column=0, columnspan=5, padx=15)

In [17]:
# MAIN NLP TAB
# Prompt label shown above the message entry box (grid row 3, left-aligned).
l1=Label(root,text="Enter Message:",fg="yellow", bg="black")
l1.grid(row=3,column=0,pady=15, sticky=W)

In [18]:

# Text widget where the user types the message to check.
# output() reads its contents; clear_text() empties it.
entry=Text(root,height=10, width = 100, bg="black",fg="white")
entry.grid(row=5,column=0,columnspan=9,padx=15,pady=15)

In [19]:
# Functions 
def get_summary():
    """Callback for the "Check it!" button; delegates to output()."""
    output()


In [20]:
# BUTTONS
# All three buttons sit in grid row 13: "Check it!" (column 0), "Reset" (column 1),
# "Clear Result" (column 2).

# Reset: clears both the entry box and the result display.
button1=Button(root,text="Reset",command=clear_text, width=12,bg="red",fg="black")
button1.grid(row=13,column=1,padx=10,pady=10)

# Check it!: classifies the entered message via get_summary().
button2=Button(root,text="Check it!",command=get_summary, width=18,bg="white",fg="black")
button2.grid(row=13,column=0,padx=10,pady=10)

# Clear Result: clears only the result display.
button3=Button(root,text="Clear Result", command=clear_display_result,width=12,bg="cyan",fg="black")
button3.grid(row=13,column=2,padx=10,pady=10)

In [21]:
# Display Screen For Result
# Text widget that output() appends the verdict to. It is created after the buttons,
# but the button callbacks only look it up when they are invoked.
tab1_display = Text(root, height=2, width = 60, bg="black",fg="orange")
tab1_display.grid(row=15,column=0, columnspan=2,padx=15,pady=15)

In [22]:
# Start the Tk event loop on the top-level window.
root1.mainloop()