In [139]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [140]:
dataset = pd.read_csv('spam.csv',encoding='latin-1')
dataset = dataset.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1)   # Removing other columns.
dataset.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [141]:
dataset.shape

(5572, 2)

In [142]:
dataset.drop_duplicates(inplace=True)

In [143]:
dataset.shape


(5169, 2)

In [144]:
dataset.rename(columns = {'v1':'labels','v2':'message'},inplace = True)
dataset.head()

Unnamed: 0,labels,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [145]:
dataset.isnull().sum()

labels     0
message    0
dtype: int64

In [146]:
dataset['labels'] = dataset['labels'].replace(to_replace=['spam', 'ham'], value=[1,0]).values
#dataset['labels'] = dataset['labels'].astype(int)

In [147]:
'''
1. Remove punctuations like ,.!# we will do this by using regex.
2. Removing stopwords
3. Converting each string into a list of words sepereted by comma . '''

import string
#1.
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

def process_text(text):
  res = [char for char in text if char not in string.punctuation]
  
  # So basically we split words and delete punctuation after each word the ' ' space is there so in below line we just join words again to see print(res)
  res = ''.join(res)
  p = [w for w in res.split() if  w.lower()  not in stopwords.words('english')]
  return p
  # In this basically we loop through the list[w] and then search words which is in stop words and if words is not in stop words so appending 
  # it in the list and returing that list.


In [148]:
dataset['message'].apply(process_text)

0       [Go, jurong, point, crazy, Available, bugis, n...
1                          [Ok, lar, Joking, wif, u, oni]
2       [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3           [U, dun, say, early, hor, U, c, already, say]
4       [Nah, dont, think, goes, usf, lives, around, t...
                              ...                        
5567    [2nd, time, tried, 2, contact, u, U, å£750, Po...
5568                   [Ì, b, going, esplanade, fr, home]
5569                     [Pity, mood, Soany, suggestions]
5570    [guy, bitching, acted, like, id, interested, b...
5571                                   [Rofl, true, name]
Name: message, Length: 5169, dtype: object

In [149]:
# As we have messages on string format now we want to convert it down into matrix format.
# For this we are using library called as extraction text count vectorizer this library converts text into 
# In this also we have analyzer basically we are passing our whole set from the function called as process text so it will get process and then we will just fit this
# into our message text.

from sklearn.feature_extraction.text import CountVectorizer
X = dataset['message']
y = dataset['labels']

vectorizer = CountVectorizer(analyzer=process_text)  # We have to give analyzer.
X = vectorizer.fit_transform(X)


In [150]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size = 0.2, random_state = 42)

In [151]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

MultinomialNB()

In [152]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
print(confusion_matrix(y_pred,y_test)) 
print(accuracy_score(y_pred,y_test))    

[[861   8]
 [ 28 137]]
0.965183752417795


In [169]:
# Prediction thing
def predict():
    countrydata = T.get("1.0", "end-1c")
    # Vectorization.
    text = vectorizer.transform([countrydata])
    # Applying actual model.
    y_pred = classifier.predict(text)

    if y_pred == 1:
        return tmsg.showinfo("Notification","Spam mail  !!! Please be aware")
    else:
        return tmsg.showinfo("Notification","Ham mail  !!! Good to go")

In [170]:
from tkinter import *
import tkinter as tk
import tkinter.messagebox as tmsg

def clear():
   T.delete("1.0","end")

root = tk.Tk()

root.geometry('500x500+375+125')
root.title('Scam and ham classifier')
val = ""

head = Label(root,text = 'Scam and Ham Classifier',fg='White',bg='Black', font = ("3ds", 30, "bold"), relief = SUNKEN, bd=2,width=20)
head.pack(fill="x")

EntryLabel = Label(root,text = "<< Enter text below >>",fg = "Blue", font=("3ds",20,"bold"))
EntryLabel.place(x=100,y=100)


countrydata = StringVar()
bt = Button(root,text="Check",bg = "#b2ff59", font=("3ds",15,"bold"),relief = "sunken",activebackground="#ffd54f",activeforeground="blue"
                ,bd=2,width=10,command=predict)
bt.place(x=185,y=400)

T = Text(root, height = 13, width = 47, bg = 'black',fg = 'white', font = ("courier",12),cursor ="man",insertbackground='Blue' )
T.place(x = 13, y = 150)
T.insert(END, 'This is default ham mail !!!')

bt = Button(root,text="Clear",bg = "#b2ff59", font=("3ds",15,"bold"),relief = "sunken",activebackground="#ffd54f",activeforeground="blue"
                ,bd=2,width=5,command=clear)
bt.place(x=400,y=400)

root.mainloop()