In [1]:
# -----------------------------------------------------------
# 1. Import Libraries
# -----------------------------------------------------------
import pandas as pd
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report



In [9]:
# -----------------------------------------------------------
# 2. Load Dataset
# -----------------------------------------------------------
df = pd.read_csv("C:/Users/Saurabh/Trim 4/ML/Codsoft/archive (9)/spam.csv", encoding='latin-1')

df = df[['v1', 'v2']]
df.columns = ['v1', 'v2']

df['v1'] = df['v1'].map({'spam':1, 'ham':0})

print("Table=",df.head())
print("Data Shape:",df.shape)

Table=    v1                                                 v2
0   0  Go until jurong point, crazy.. Available only ...
1   0                      Ok lar... Joking wif u oni...
2   1  Free entry in 2 a wkly comp to win FA Cup fina...
3   0  U dun say so early hor... U c already then say...
4   0  Nah I don't think he goes to usf, he lives aro...
Data Shape: (5572, 2)


In [15]:
# -----------------------------------------------------------
# 3. Text Cleaning Function
# -----------------------------------------------------------
def clean_text(v2):
    text = v2.lower()
    text = re.sub(r"http\S+|www\S+", " ", v2)
    text = v2.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"\d+", " ", v2)
    text = re.sub(r"\s+", " ", v2).strip()
    return text

df['clean_text'] = df['v2'].apply(clean_text)

In [19]:
# -----------------------------------------------------------
# 4. Train-test Split
# -----------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'], df['v1'], test_size=0.2, random_state=42
)

# -----------------------------------------------------------
# 5. TF-IDF Vectorization
# -----------------------------------------------------------
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [21]:
# -----------------------------------------------------------
# 6. Train Models
# -----------------------------------------------------------

# Multinomial Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)

# Linear SVM
svm_model = LinearSVC()
svm_model.fit(X_train_tfidf, y_train)

In [23]:
# -----------------------------------------------------------
# 7. Evaluate Models
# -----------------------------------------------------------
def evaluate_model(model, X_test, y_test, name):
    print(f"\n============= {name} =============")
    y_pred = model.predict(X_test)

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

evaluate_model(nb_model, X_test_tfidf, y_test, "Multinomial Naive Bayes")
evaluate_model(lr_model, X_test_tfidf, y_test, "Logistic Regression")
evaluate_model(svm_model, X_test_tfidf, y_test, "Linear SVM")


Accuracy: 0.9713004484304932

Confusion Matrix:
 [[965   0]
 [ 32 118]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       965
           1       1.00      0.79      0.88       150

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115


Accuracy: 0.9730941704035875

Confusion Matrix:
 [[965   0]
 [ 30 120]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       965
           1       1.00      0.80      0.89       150

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115


Accuracy: 0.9847533632286996

Confusion Matrix:
 [[962   3]
 [ 14 136]]

Classification Report:
               precision    recall  f1-score   suppor

In [31]:
# -----------------------------------------------------------
# 8. Predict Spam/Ham for Any Message
# -----------------------------------------------------------
def predict_message(model, message):
    cleaned = clean_text(message)
    vector = tfidf.transform([cleaned])
    prediction = model.predict(vector)[0]

    if prediction == 1:
        return "ðŸ”¥ SPAM"
    else:
        return "âœ… HAM (Legitimate)"

# Example usage:
print("\n================ Example Prediction (LR_Model ================")
print(predict_message(lr_model, "Congratulations! You won a free iPhone. Click here now"))
print(predict_message(lr_model, "Hi Ishaan, are we meeting tomorrow?"))
print("\n================ Example Prediction (NB_Model ================")
print(predict_message(nb_model, "Congratulations! You won a free iPhone. Click here now"))
print(predict_message(nb_model, "Hi Ishaan, are we meeting tomorrow?"))
print("\n================ Example Prediction (SVM_Model ================")
print(predict_message(svm_model, "Congratulations! You won a free iPhone. Click here now"))
print(predict_message(svm_model, "Hi Ishaan, are we meeting tomorrow?"))


âœ… HAM (Legitimate)
âœ… HAM (Legitimate)

ðŸ”¥ SPAM
âœ… HAM (Legitimate)

ðŸ”¥ SPAM
âœ… HAM (Legitimate)
