In [31]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import joblib
import tkinter as tk
from tkinter import messagebox

In [32]:
file_path = 'spambase.csv'
data = pd.read_csv(file_path, header=None)
print("Dataset structure:")
print(data.head())
print(f"Dataset shape: {data.shape}")


Dataset structure:
                                                   0
0  0,0.64,0.64,0,0.32,0,0,0,0,0,0,0.64,0,0,0,0.32...
1  0.21,0.28,0.5,0,0.14,0.28,0.21,0.07,0,0.94,0.2...
2  0.06,0,0.71,0,1.23,0.19,0.19,0.12,0.64,0.25,0....
3  0,0,0,0,0.63,0,0.31,0.63,0.31,0.63,0.31,0.31,0...
4  0,0,0,0,0.63,0,0.31,0.63,0.31,0.63,0.31,0.31,0...
Dataset shape: (4601, 1)


In [33]:
X = data.apply(lambda row: pd.Series([float(x) for x in row[0].split(',')]), axis=1)
y = X.iloc[:, -1] 
X = X.iloc[:, :-1]
num_features = X.shape[1]
print(f"The number of features is: {num_features}")

The number of features is: 57


In [34]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=40)

In [22]:
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)
pred_nb = nb_classifier.predict(X_test)
accuracy_nb = accuracy_score(y_test, pred_nb)
precision_nb = precision_score(y_test, pred_nb, average='binary', zero_division=1)
recall_nb = recall_score(y_test, pred_nb, average='binary', zero_division=1)
f1_nb = f1_score(y_test, pred_nb, average='binary', zero_division=1)
cm_nb = confusion_matrix(y_test, pred_nb)

print("Naive Bayes Classifier:")
print(f"Accuracy: {accuracy_nb:.3f}, Precision: {precision_nb:.3f}, Recall: {recall_nb:.3f}, F1-score: {f1_nb:.3f}")
print("Confusion Matrix:")
print(cm_nb)

Naive Bayes Classifier:
Accuracy: 0.813, Precision: 0.680, Recall: 0.952, F1-score: 0.793
Confusion Matrix:
[[629 233]
 [ 25 494]]


In [24]:
lr_classifier = LogisticRegression(max_iter=1000)
lr_classifier.fit(X_train, y_train)
pred_lr = lr_classifier.predict(X_test)
accuracy_lr = accuracy_score(y_test, pred_lr)
precision_lr = precision_score(y_test, pred_lr, average='macro', zero_division=1)
recall_lr = recall_score(y_test, pred_lr, average='macro', zero_division=1)
f1_lr = f1_score(y_test, pred_lr, average='macro', zero_division=1)
conf_matrix_lr = confusion_matrix(y_test, pred_lr)

print("Logistic Regression:")
print(f"Accuracy: {accuracy_lr:.3f}, Precision: {precision_lr:.3f}, Recall: {recall_lr:.3f}, F1-score: {f1_lr:.3f}")
print("Confusion Matrix:")
print(conf_matrix_lr)
print("\n")


Logistic Regression:
Accuracy: 0.928, Precision: 0.927, Recall: 0.920, F1-score: 0.923
Confusion Matrix:
[[822  40]
 [ 59 460]]




In [25]:
# 3. Support Vector Machines (SVM)
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)
pred_svm = svm_classifier.predict(X_test)
accuracy_svm = accuracy_score(y_test, pred_svm)
precision_svm = precision_score(y_test, pred_svm, average='macro', zero_division=1)
recall_svm = recall_score(y_test, pred_svm, average='macro', zero_division=1)
f1_svm = f1_score(y_test, pred_svm, average='macro', zero_division=1)
conf_matrix_svm = confusion_matrix(y_test, pred_svm)

print("Support Vector Machines (SVM):")
print(f"Accuracy: {accuracy_svm:.3f}, Precision: {precision_svm:.3f}, Recall: {recall_svm:.3f}, F1-score: {f1_svm:.3f}")
print("Confusion Matrix:")
print(conf_matrix_svm)
print("\n")



Support Vector Machines (SVM):
Accuracy: 0.928, Precision: 0.925, Recall: 0.921, F1-score: 0.923
Confusion Matrix:
[[818  44]
 [ 55 464]]




In [26]:
def predict_spam():
    try:
        input_text = entry_text.get()  #preprocess the input text
        input_data = np.array([float(x) for x in input_text.split(',')]) #split and converting to float
        
        
        if len(input_data) != X.shape[1]:
            raise ValueError("Invalid input length")
        
        
        input_data_scaled = scaler.transform([input_data])  #standardization
        
        
        prediction = svm_classifier.predict(input_data_scaled) #making prediction
        
        # Display prediction
        if prediction[0] == 1:
            result_label.config(text="Prediction: Spam")
        else:
            result_label.config(text="Prediction: Ham(not spam)")
    except ValueError as e:
        messagebox.showerror("Error", str(e))   

In [44]:
window = tk.Tk()
window.title("Email Spam Classification")

entry_label = tk.Label(window, text="Enter email content (57 comma-separated values):") #found it was 57 by finding number of features
entry_label.pack()

entry_text = tk.Entry(window, width=60)
entry_text.pack()

predict_button = tk.Button(window, text="Predict", command=predict_spam)
predict_button.pack()
result_label = tk.Label(window, text="")
result_label.pack()
window.mainloop()

In [56]:
pwd

'C:\\Users\\Anand'

In [46]:
#Building Naive Bayes Classifier without using any libraries

def standard_scaler(X_train, X_test):
    mean = np.mean(X_train, axis=0)
    std = np.std(X_train, axis=0)
    X_train_scaled = (X_train - mean) / std
    X_test_scaled = (X_test - mean) / std
    return X_train_scaled, X_test_scaled

#calculating Gaussian probability density function
def gaussian_probability(x, mean, std):
    exponent = np.exp(-((x - mean)**2) / (2 * std**2))
    return (1 / (np.sqrt(2 * np.pi) * std)) * exponent

#training the model
def train_naive_bayes(X_train, y_train):
    n_samples, n_features = X_train.shape
    classes = np.unique(y_train)
    n_classes = len(classes)
    
    # The class priors represent the probability of each class occurring in the dataset.
    class_priors = {}
    for c in classes:
        class_priors[c] = np.sum(y_train == c) / float(n_samples)
    
    # Calculating mean and standard deviation for each feature in each class
    parameters = {}
    for c in classes:
        X_c = X_train[y_train == c]
        class_params = []
        for feature in range(n_features):
            mean = np.mean(X_c[:, feature])
            std = np.std(X_c[:, feature])
            class_params.append((mean, std))
        parameters[c] = class_params
    
    return class_priors, parameters

def predict_naive_bayes(X_test, class_priors, parameters):
    predictions = []
    for sample in X_test:
        posteriors = {}
        for c, params in parameters.items():
            class_prior = class_priors[c]
            likelihood = 1.0
            for feature, (mean, std) in enumerate(params):
                likelihood *= gaussian_probability(sample[feature], mean, std)
            posterior = class_prior * likelihood
            posteriors[c] = posterior
        prediction = max(posteriors, key=posteriors.get)
        predictions.append(prediction)
    
    return predictions

def accuracy_score(y_true, y_pred):
    correct = np.sum(y_true == y_pred)
    total = len(y_true)
    return correct / float(total)


data = pd.read_csv(file_path, header=None)
X = data.apply(lambda row: pd.Series([float(x) for x in row[0].split(',')]), axis=1)
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)
X_train, X_test = standard_scaler(X_train, X_test)

class_priors, parameters = train_naive_bayes(X_train, y_train)


y_pred_nb = predict_naive_bayes(X_test, class_priors, parameters)


# calculating precision
def precision_score(y_true, y_pred, pos_label=1):
    true_positives = np.sum((y_true == pos_label) & (y_pred == pos_label))
    predicted_positives = np.sum(y_pred == pos_label)
    if predicted_positives == 0:
        return 0  
    return true_positives / float(predicted_positives)

# Function to calculate recall
def recall_score(y_true, y_pred, pos_label=1):
    true_positives = np.sum((y_true == pos_label) & (y_pred == pos_label))
    actual_positives = np.sum(y_true == pos_label)
    if actual_positives == 0:
        return 0 
    return true_positives / float(actual_positives)

# Function to calculate F1-score
def f1_score(y_true, y_pred, pos_label=1):
    precision = precision_score(y_true, y_pred, pos_label=pos_label)
    recall = recall_score(y_true, y_pred, pos_label=pos_label)
    if precision + recall == 0:
        return 0 
    return 2 * (precision * recall) / (precision + recall)

# Function to calculate confusion matrix
def confusion_matrix(y_true, y_pred):
    classes = np.unique(np.concatenate((y_true, y_pred)))
    cm = np.zeros((len(classes), len(classes)), dtype=int)
    for i in range(len(classes)):
        true_class = classes[i]
        for j in range(len(classes)):
            predicted_class = classes[j]
            cm[i, j] = np.sum((y_true == true_class) & (y_pred == predicted_class))
    return cm

accuracy_nb = accuracy_score(y_test, y_pred_nb)
precision_nb = precision_score(y_test, y_pred_nb, pos_label=1)
recall_nb = recall_score(y_test, y_pred_nb, pos_label=1)
f1_nb = f1_score(y_test, y_pred_nb, pos_label=1)
cm_nb = confusion_matrix(y_test, y_pred_nb)

print("Naive Bayes Classifier:")
print(f"Accuracy: {accuracy_nb:.3f}, Precision: {precision_nb:.3f}, Recall: {recall_nb:.3f}, F1-score: {f1_nb:.3f}")
print("Confusion Matrix:")
print(cm_nb)

Naive Bayes Classifier:
Accuracy: 0.017, Precision: 0.000, Recall: 0.000, F1-score: 0.000
Confusion Matrix:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [47]:
pwd

'C:\\Users\\Anand'