In [12]:
#PREDICTIVE MODELING WITH CLASSIFICATION

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import joblib

In [3]:
file_path = 'spambase.csv'
data = pd.read_csv(file_path, header=None)
print("Dataset structure:")
print(data.head())

Dataset structure:
                                                   0
0  0,0.64,0.64,0,0.32,0,0,0,0,0,0,0.64,0,0,0,0.32...
1  0.21,0.28,0.5,0,0.14,0.28,0.21,0.07,0,0.94,0.2...
2  0.06,0,0.71,0,1.23,0.19,0.19,0.12,0.64,0.25,0....
3  0,0,0,0,0.63,0,0.31,0.63,0.31,0.63,0.31,0.31,0...
4  0,0,0,0,0.63,0,0.31,0.63,0.31,0.63,0.31,0.31,0...


In [6]:
X = data.apply(lambda row: pd.Series([float(x) for x in row[0].split(',')]), axis=1)
y = X.iloc[:, -1] 
X = X.iloc[:, :-1]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [8]:
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)
pred_nb = nb_classifier.predict(X_test)
accuracy_nb = accuracy_score(y_test, pred_nb)
precision_nb = precision_score(y_test, pred_nb, average='binary', zero_division=1)
recall_nb = recall_score(y_test, pred_nb, average='binary', zero_division=1)
f1_nb = f1_score(y_test, pred_nb, average='binary', zero_division=1)
cm_nb = confusion_matrix(y_test, pred_nb)

print("Naive Bayes Classifier:")
print(f"Accuracy: {accuracy_nb:.4f}, Precision: {precision_nb:.4f}, Recall: {recall_nb:.4f}, F1-score: {f1_nb:.4f}")
print("Confusion Matrix:")
print(cm_nb)

Naive Bayes Classifier:
Accuracy: 0.8219, Precision: 0.7233, Recall: 0.9385, F1-score: 0.8170
Confusion Matrix:
[[391 140]
 [ 24 366]]


In [10]:
#logistic regression
lr_classifier = LogisticRegression(max_iter=1000)
lr_classifier.fit(X_train, y_train)
pred_lr = lr_classifier.predict(X_test)
accuracy_lr = accuracy_score(y_test, pred_lr)
precision_lr = precision_score(y_test, pred_lr, average='macro', zero_division=1)
recall_lr = recall_score(y_test, pred_lr, average='macro', zero_division=1)
f1_lr = f1_score(y_test, pred_lr, average='macro', zero_division=1)
conf_matrix_lr = confusion_matrix(y_test, pred_lr)

print("Logistic Regression:")
print(f"Accuracy: {accuracy_lr:.4f}, Precision: {precision_lr:.4f}, Recall: {recall_lr:.4f}, F1-score: {f1_lr:.4}")
print("Confusion Matrix:")
print(conf_matrix_lr)
print("\n")


Logistic Regression:
Accuracy: 0.9197, Precision: 0.9217, Recall: 0.9136, F1-score: 0.917
Confusion Matrix:
[[506  25]
 [ 49 341]]




In [15]:
#Support Vector Machines (SVM)
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)
pred_svm = svm_classifier.predict(X_test)
accuracy_svm = accuracy_score(y_test, pred_svm)
precision_svm = precision_score(y_test, pred_svm, average='macro', zero_division=1)
recall_svm = recall_score(y_test, pred_svm, average='macro', zero_division=1)
f1_svm = f1_score(y_test, pred_svm, average='macro', zero_division=1)
conf_matrix_svm = confusion_matrix(y_test, pred_svm)

print("Support Vector Machines (SVM):")
print(f"Accuracy: {accuracy_svm:.4f}, Precision: {precision_svm:.4f}, Recall: {recall_svm:.4f}, F1-score: {f1_svm:.4f}")
print("Confusion Matrix:")
print(conf_matrix_svm)
print("\n")


Support Vector Machines (SVM):
Accuracy: 0.9251, Precision: 0.9267, Recall: 0.9197, F1-score: 0.9227
Confusion Matrix:
[[507  24]
 [ 45 345]]




In [16]:
# Decision Tree classifier
accuracy_dt = accuracy_score(y_test, pred_dt)
precision_dt = precision_score(y_test, pred_dt, average='macro', zero_division=1)
recall_dt = recall_score(y_test, pred_dt, average='macro', zero_division=1)
f1_dt = f1_score(y_test, pred_dt, average='macro', zero_division=1)
conf_matrix_dt = confusion_matrix(y_test, pred_dt)

print("Decision Tree Classifier:")
print(f"Accuracy: {accuracy_dt:.4f}, Precision: {precision_dt:.4f}, Recall: {recall_dt:.4f}, F1-score: {f1_dt:.4f}")
print("Confusion Matrix:")
print(conf_matrix_dt)
print("\n")

Decision Tree Classifier:
Accuracy: 0.9175, Precision: 0.9175, Recall: 0.9131, F1-score: 0.9151
Confusion Matrix:
[[500  31]
 [ 45 345]]


