In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import pickle
from win32com.client import Dispatch
import tkinter as tk

In [3]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\venug\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\venug\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
data = pd.read_csv('spam.csv', encoding="latin-1")

In [5]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
data.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [7]:
data = data.loc[:,~data.columns.str.contains('^Unnamed')]

In [8]:
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
data['v1'] = data['v1'].map({'ham':0,'spam':1})

In [10]:
data.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [12]:
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)  
    text = text.lower()  
    text = text.split()  
    text = [lemmatizer.lemmatize(word) for word in text if word not in stop_words]
    return ' '.join(text)

In [13]:
data['v2'] = data['v2'].apply(preprocess_text)

In [14]:
tfidf = TfidfVectorizer(max_features=3000)
x = tfidf.fit_transform(data['v2']).toarray()
y = data['v1']

In [15]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [16]:
model = MultinomialNB()
model.fit(x_train, y_train)

In [17]:
y_pred = model.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.979372197309417


In [18]:
pickle.dump(model, open('spam_model.pkl', 'wb'))
pickle.dump(tfidf, open('tfidf_vectorizer.pkl', 'wb'))

In [19]:
#def speak(text):
#   speaker = Dispatch("SAPI.SpVoice")
#   speaker.Speak(text)

In [20]:
from win32com.client import Dispatch

from win32com.client import Dispatch

def speak(text):
    speaker = Dispatch("SAPI.SpVoice")
    
    voices = speaker.GetVoices()
    
    for voice in voices:
        if "Zira" in voice.GetDescription():  
            speaker.Voice = voice
            break
    
    speaker.Speak(text)



In [21]:
def result():
    message = text.get()
    vect = tfidf.transform([preprocess_text(message)]).toarray()
    prediction = model.predict(vect)
    if prediction[0] == 1:
        output_label.config(text="This is a Spam mail", bg='#FF4040')
        speak("This is a Spam mail")
        print("Prediction: Spam mail")  
    else:
        output_label.config(text="This is not a Spam mail", bg='lightgreen')
        speak("This is not a Spam mail")
        print("Prediction: Not a Spam mail") 

In [22]:
import tkinter as tk
from tkinter import Label
from PIL import Image,ImageTk
import requests
root = tk.Tk()
root.geometry("500x500")
root.configure(bg='lightblue')
root.title("ZeroSpam AI")

frame = tk.Frame(root, bg='lightblue')
frame.pack(expand=True)

l2 = tk.Label(frame, text="Email Spam Classification Application", font=("Times New Roman", 16), bg='lightblue')
l2.pack(pady=5)

l1 = tk.Label(frame, text="Enter Your Message:", font=("Times New Roman", 14), bg='lightblue')
l1.pack(pady=5)

text = tk.Entry(frame, font=("Times New Roman", 14), bg='white')
text.pack(pady=5)

output_label = tk.Label(frame, text="", font=("Times New Roman", 16), bg='lightblue')
output_label.pack(pady=10)

B = tk.Button(frame, text="Check Spam", command=result, font=("Times New Roman", 12), bg='pink')
B.pack(pady=10)

root.mainloop()