In [185]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import joblib

In [186]:
# Load dataset 
# Sample Data taken from kaggle.com
data = pd.read_csv('C:\\Users\\sreej\\Desktop\\ds prjct\\spamcollection\\spamhamdata.csv', sep='\t', names=['label', 'message'])

In [187]:
data.describe()

Unnamed: 0,label,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [188]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [189]:
data.shape

(5572, 2)

In [190]:
data.head

<bound method NDFrame.head of      label                                            message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham               Will ü b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]>

In [191]:
data.isnull().count()

label      5572
message    5572
dtype: int64

In [192]:
data.isnull().sum()

label      0
message    0
dtype: int64

In [193]:
data[data.duplicated()].count()

label      403
message    403
dtype: int64

In [194]:
# Convert labels to binary (spam = 1, ham = 0)
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

In [195]:
# Feature extraction
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['message'])
y = data['label']

In [196]:
# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [197]:
# Train Naive Bayes Model
model = MultinomialNB()
model.fit(X_train, y_train)

In [198]:
# Evaluate Model
predictions = model.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, predictions)}')

Accuracy: 0.9856502242152466


In [199]:
# Save the model and vectorizer
joblib.dump(model, 'spam_classifier.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

# GUI using Tkinter

In [200]:
import tkinter as tk
from tkinter import messagebox
import joblib

In [201]:
# Load the pre-trained model and vectorizer
model = joblib.load('spam_classifier.pkl')
vectorizer = joblib.load('vectorizer.pkl')

def classify_email():
    email_text = email_input.get("1.0", tk.END).strip()
    if not email_text:
        messagebox.showerror("Input Error", "Please enter email text.")
        return

    # Vectorize the input text and classify
    email_vector = vectorizer.transform([email_text])
    prediction = model.predict(email_vector)[0]

    # Display the result
    if prediction == 1:
        result_label.config(text="Spam", fg="red")
    else:
        result_label.config(text="Ham", fg="green")



In [202]:
# Create the main Tkinter window
root = tk.Tk()
root.title("Email Spam Classifier")

''

In [203]:
# GUI Components
tk.Label(root, text="Enter Email Text:", font=("Arial", 14)).pack(pady=10)

email_input = tk.Text(root, height=10, width=50, font=("Arial", 12))
email_input.pack(pady=10)

classify_button = tk.Button(root, text="Classify", command=classify_email, font=("Arial", 12), bg="blue", fg="white")
classify_button.pack(pady=10)

result_label = tk.Label(root, text="", font=("Arial", 16))
result_label.pack(pady=10)

root.mainloop()