In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load the dataset (decoded as latin-1)
file_path = '/content/spam.csv'
df = pd.read_csv(file_path, encoding='latin-1')

# Clean the data by selecting only relevant columns and giving them
# descriptive names
sms_df = df[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})

# Encode labels: ham (0), spam (1)
label_codes = {'ham': 0, 'spam': 1}
encoded_labels = sms_df['label'].map(label_codes)

# Series.map turns every value that is not a key of the mapping into NaN.
# Fail loudly here rather than letting NaN labels reach the stratified split
# and the classifiers further down.
if encoded_labels.isna().any():
    bad_values = sms_df.loc[encoded_labels.isna(), 'label'].unique().tolist()
    raise ValueError(
        f"Unexpected label values in {file_path}: {bad_values}; "
        f"expected one of {sorted(label_codes)}"
    )

sms_df['label'] = encoded_labels

# Hold out 20% of the messages as a test set. The split is stratified on the
# label column and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    sms_df['message'],
    sms_df['label'],
    test_size=0.2,
    random_state=42,
    stratify=sms_df['label'],
)

# Turn the raw text into TF-IDF features. The vectorizer is fitted on the
# training messages only; the test messages are transformed with that same
# fitted vectorizer.
tfidf = TfidfVectorizer(
    stop_words='english',
    max_df=0.95,
    min_df=2,
)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Fit the three classifiers on the same TF-IDF training matrix.
# fit() returns the estimator itself, so construction and training are
# chained into a single expression per model.

# Multinomial Naive Bayes
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Support vector machine with a linear kernel
svm_model = SVC(
    kernel='linear',
    C=1.0,
    random_state=42,
).fit(X_train_tfidf, y_train)

# Logistic regression, allowing up to 1000 solver iterations
lr_model = LogisticRegression(
    max_iter=1000,
    random_state=42,
).fit(X_train_tfidf, y_train)

# Define a function to evaluate models
def evaluate_model(name, model, X=None, y=None):
    """Print accuracy, a per-class report and the confusion matrix for a model.

    Args:
        name: Display name used in the printed header.
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix to predict on. When omitted, the notebook-level
            ``X_test_tfidf`` is used.
        y: True labels aligned with ``X``. When omitted, the notebook-level
            ``y_test`` is used.

    Returns:
        None. All results are written to stdout, so calling this as the last
        statement of a cell produces no extra ``Out[]`` value.
    """
    # Fall back to the held-out split created earlier in the notebook so the
    # existing two-argument calls keep working unchanged.
    if X is None:
        X = X_test_tfidf
    if y is None:
        y = y_test

    # Pin the class ids (0 = ham, 1 = spam) explicitly. Without `labels`, the
    # report infers the classes from the data and raises if their count does
    # not match `target_names`; the confusion matrix would also change shape.
    class_ids = [0, 1]

    y_pred = model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    report = classification_report(
        y, y_pred, labels=class_ids, target_names=['Ham', 'Spam']
    )
    conf_matrix = confusion_matrix(y, y_pred, labels=class_ids)

    print(f"\n{name} Model Results:")
    print(f"Accuracy: {accuracy:.4f}\n")
    print("Classification Report:\n", report)
    print("Confusion Matrix:\n", conf_matrix)

# Report test-set metrics for every trained classifier, in training order
for model_label, fitted_model in (
    ("Naive Bayes", nb_model),
    ("SVM", svm_model),
    ("Logistic Regression", lr_model),
):
    evaluate_model(model_label, fitted_model)





Naive Bayes Model Results:
Accuracy: 0.9740

Classification Report:
               precision    recall  f1-score   support

         Ham       0.97      1.00      0.99       966
        Spam       0.99      0.81      0.89       149

    accuracy                           0.97      1115
   macro avg       0.98      0.91      0.94      1115
weighted avg       0.97      0.97      0.97      1115

Confusion Matrix:
 [[965   1]
 [ 28 121]]

SVM Model Results:
Accuracy: 0.9821

Classification Report:
               precision    recall  f1-score   support

         Ham       0.98      1.00      0.99       966
        Spam       0.98      0.88      0.93       149

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

Confusion Matrix:
 [[964   2]
 [ 18 131]]

Logistic Regression Model Results:
Accuracy: 0.9731

Classification Report:
               precision    recall  f1-score   suppor