<a href="https://colab.research.google.com/github/rnf45/Email-Spam-Classification/blob/main/Email_Spam_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dataset

1. load csv file




In [8]:
# import NumPy library for numerical operations
import numpy as np

# import Pandas library for data manipulation
import pandas as pd

# import train-test split and K-Fold cross-validation from scikit-learn
from sklearn.model_selection import train_test_split, KFold

def load_data(file_path):

    # read CSV file using Pandas
    data = pd.read_csv(file_path)

    # extract feature matrix (excluding the last column)
    X = data.iloc[:, :-1].values

    # extract target variable (last column)
    y = data.iloc[:, -1].values

    # return feature matrix and target variable
    return X, y

2. split dataset

In [9]:
def split_data(X, y, test_size=0.2):

    # split the data using train_test_split from scikit-learn
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    # return the training and test sets
    return X_train, X_test, y_train, y_test

#Naive bayes


   

   

In [10]:
# Naive Bayes algorithm
def naive_bayes(X_train, y_train, X_test):

    # remove last four attributes from training set
    X_train = X_train[:, :-4]

    # remove last four attributes from test set
    X_test = X_test[:, :-4]

    # calculate prior probabilities of each class
    class_prob = np.bincount(y_train) / len(y_train)

    # initialize attribute probabilities matrix
    attr_prob = np.zeros((2, X_train.shape[1]))

    # iterate over classes (0 and 1)
    for c in range(2):

        # select instances belonging to class c
        X_c = X_train[y_train == c]

        # calculate attribute probabilities using Laplace smoothing
        attr_prob[c] = (X_c.sum(axis=0) + 1) / (X_c.shape[0] + 2)

    # initialize empty list to store predictions
    predictions = []

    # iterate over each instance in test set
    for x in X_test:

        # calculate log probabilities for each class
        probs = [np.log(class_prob[c]) + np.sum(np.log(attr_prob[c, x > 0])) for c in range(2)]

        # append predicted class with highest probability to predictions list
        predictions.append(np.argmax(probs))

    # return predictions as a NumPy array
    return np.array(predictions)

#KNN - k-Nearest Neighbor

In [11]:
# cosine similarity function
def cosine_similarity(x1, x2):

    # calculate cosine similarity between x1 and x2 using numpy.cos()
    cos_sim = np.dot(x1, x2) / (np.linalg.norm(x1) * np.linalg.norm(x2))

    return np.cos(np.arccos(cos_sim))



# K-Nearest Neighbor
def knn(X_train, y_train, X_test, k=5):

    # initialize empty list to store predictions
    predictions = []

    # iterate over each test instance
    for x in X_test:

        # calculate cosine similarities between x and each instance in X_train
        similarities = [cosine_similarity(x, x_train) for x_train in X_train]

        # find indices of k nearest neighbors
        nearest_indices = np.argsort(similarities)[-k:]

        # retrieve labels of k nearest neighbors
        nearest_labels = y_train[nearest_indices]

        # predict class based on majority vote of k nearest neighbors
        prediction = np.bincount(nearest_labels).argmax()

        # append predicted class to predictions list
        predictions.append(prediction)

    # return predictions as a NumPy array
    return np.array(predictions)

# LR - Logistic Regression



In [12]:
# sigmoid activation function
def sigmoid(z):

    # calculate the sigmoid of z
    return 1 / (1 + np.exp(-z))

# Logistic Regression algorithm
def logistic_regression(X_train, y_train, X_test, epochs=100, learning_rate=0.01):

    # add column of ones to training set for bias term
    X_train = np.c_[np.ones((len(X_train), 1)), X_train]

    # add column of ones to test set for bias term
    X_test = np.c_[np.ones((len(X_test), 1)), X_test]

    # initialize weight matrix with random values
    M = np.random.randn(X_train.shape[1], 1)

    # train logistic regression model for specified number of epochs
    for _ in range(epochs):

        # calculate predicted probabilities using sigmoid function
        pred_y = sigmoid(np.dot(X_train, M))

        # calculate cross-entropy loss
        loss = -np.mean(y_train * np.log(pred_y) + (1 - y_train) * np.log(1 - pred_y))

        # calculate gradient of loss with respect to weights
        gm = np.dot(X_train.T, (pred_y - y_train.reshape(-1, 1))) * 2 / len(X_train)

        # update weights using gradient descent
        M -= learning_rate * gm

    # make predictions on test set using trained weights
    predictions = (sigmoid(np.dot(X_test, M)) >= 0.5).astype(int).flatten()

    # return predictions
    return predictions

# Model Evaluation


In [13]:
def evaluate_model(y_true, y_pred):

    # calculate accuracy of model
    accuracy = np.mean(y_true == y_pred)

    # calculate number of false positives
    fp = np.sum((y_pred == 1) & (y_true == 0))

    # calculate number of true positives
    tp = np.sum((y_pred == 1) & (y_true == 1))

    # calculate number of false negatives
    fn = np.sum((y_pred == 0) & (y_true == 1))

    # calculate number of true negatives
    tn = np.sum((y_pred == 0) & (y_true == 0))

    # calculate false positive rate
    fpr = fp / (fp + tn)

    # calculate true positive rate
    tpr = tp / (tp + fn)

    # calculate area under ROC curve
    auc = 0.5 * (tpr + 1 - fpr)

    # return evaluation metrics
    return accuracy, fpr, tpr, auc

In [14]:

def main():
    # load the dataset
    X, y = load_data("spambase.csv")

    # split dataset into training and test sets
    X_train, X_test, y_train, y_test = split_data(X, y)

    # perform 5-fold cross-validation on training set
    kf = KFold(n_splits=5)

    # iterate over each fold
    for train_index, val_index in kf.split(X_train):

        # split training set into training and validation subsets for current fold
        X_sub_train, X_sub_val = X_train[train_index], X_train[val_index]
        y_sub_train, y_sub_val = y_train[train_index], y_train[val_index]

        # train and evaluate Naive Bayes model
        nb_predictions = naive_bayes(X_sub_train, y_sub_train, X_sub_val)
        nb_accuracy, nb_fpr, nb_tpr, nb_auc = evaluate_model(y_sub_val, nb_predictions)
        print("Naive Bayes - Accuracy: {:.2f}, FPR: {:.2f}, TPR: {:.2f}, AUC: {:.2f}".format(
            nb_accuracy, nb_fpr, nb_tpr, nb_auc))

        # train and evaluate KNN model
        knn_predictions = knn(X_sub_train, y_sub_train, X_sub_val)
        knn_accuracy, knn_fpr, knn_tpr, knn_auc = evaluate_model(y_sub_val, knn_predictions)
        print("KNN - Accuracy: {:.2f}, FPR: {:.2f}, TPR: {:.2f}, AUC: {:.2f}".format(
            knn_accuracy, knn_fpr, knn_tpr, knn_auc))

        # train and evaluate Logistic Regression model
        lr_predictions = logistic_regression(X_sub_train, y_sub_train, X_sub_val)
        lr_accuracy, lr_fpr, lr_tpr, lr_auc = evaluate_model(y_sub_val, lr_predictions)
        print("Logistic Regression - Accuracy: {:.2f}, FPR: {:.2f}, TPR: {:.2f}, AUC: {:.2f}".format(
            lr_accuracy, lr_fpr, lr_tpr, lr_auc))

    # evaluate Naive Bayes model on test set
    nb_test_predictions = naive_bayes(X_train, y_train, X_test)
    nb_test_accuracy, nb_test_fpr, nb_test_tpr, nb_test_auc = evaluate_model(y_test, nb_test_predictions)
    print("Naive Bayes (Test Set) - Accuracy: {:.2f}, FPR: {:.2f}, TPR: {:.2f}, AUC: {:.2f}".format(
        nb_test_accuracy, nb_test_fpr, nb_test_tpr, nb_test_auc))

    # evaluate KNN model on test set
    knn_test_predictions = knn(X_train, y_train, X_test)
    knn_test_accuracy, knn_test_fpr, knn_test_tpr, knn_test_auc = evaluate_model(y_test, knn_test_predictions)
    print("KNN (Test Set) - Accuracy: {:.2f}, FPR: {:.2f}, TPR: {:.2f}, AUC: {:.2f}".format(
        knn_test_accuracy, knn_test_fpr, knn_test_tpr, knn_test_auc))

    # evaluate Logistic Regression model on test set
    lr_test_predictions = logistic_regression(X_train, y_train, X_test)
    lr_test_accuracy, lr_test_fpr, lr_test_tpr, lr_test_auc = evaluate_model(y_test, lr_test_predictions)
    print("Logistic Regression (Test Set) - Accuracy: {:.2f}, FPR: {:.2f}, TPR: {:.2f}, AUC: {:.2f}".format(
        lr_test_accuracy, lr_test_fpr, lr_test_tpr, lr_test_auc))

if __name__ == "__main__":
    # Calling the main function
    main()

Naive Bayes - Accuracy: 0.86, FPR: 0.21, TPR: 0.97, AUC: 0.88


  return np.cos(np.arccos(cos_sim))


KNN - Accuracy: 0.84, FPR: 0.18, TPR: 0.87, AUC: 0.85


  loss = -np.mean(y_train * np.log(pred_y) + (1 - y_train) * np.log(1 - pred_y))
  loss = -np.mean(y_train * np.log(pred_y) + (1 - y_train) * np.log(1 - pred_y))
  return 1 / (1 + np.exp(-z))


Logistic Regression - Accuracy: 0.62, FPR: 0.02, TPR: 0.03, AUC: 0.51
Naive Bayes - Accuracy: 0.85, FPR: 0.22, TPR: 0.96, AUC: 0.87
KNN - Accuracy: 0.80, FPR: 0.23, TPR: 0.86, AUC: 0.82
Logistic Regression - Accuracy: 0.43, FPR: 0.88, TPR: 0.95, AUC: 0.53
Naive Bayes - Accuracy: 0.89, FPR: 0.16, TPR: 0.96, AUC: 0.90
KNN - Accuracy: 0.84, FPR: 0.18, TPR: 0.86, AUC: 0.84
Logistic Regression - Accuracy: 0.46, FPR: 0.93, TPR: 1.00, AUC: 0.53
Naive Bayes - Accuracy: 0.87, FPR: 0.18, TPR: 0.97, AUC: 0.89
KNN - Accuracy: 0.82, FPR: 0.19, TPR: 0.85, AUC: 0.83
Logistic Regression - Accuracy: 0.45, FPR: 0.86, TPR: 0.96, AUC: 0.55
Naive Bayes - Accuracy: 0.88, FPR: 0.16, TPR: 0.94, AUC: 0.89
KNN - Accuracy: 0.86, FPR: 0.15, TPR: 0.89, AUC: 0.87
Logistic Regression - Accuracy: 0.60, FPR: 0.60, TPR: 0.92, AUC: 0.66
Naive Bayes (Test Set) - Accuracy: 0.89, FPR: 0.18, TPR: 0.98, AUC: 0.90
KNN (Test Set) - Accuracy: 0.85, FPR: 0.18, TPR: 0.88, AUC: 0.85
Logistic Regression (Test Set) - Accuracy: 0.51,