In [4]:
#Loading the Dataset

import pandas as pd
df = pd.read_csv("data/SMSSpamCollection", sep='\t', names=["label", "text"])
df.head()
print("Dataset shape", df.shape)
print(df['label'].value_counts())

Dataset shape (5572, 2)
label
ham     4825
spam     747
Name: count, dtype: int64


In [14]:
#Step 2: Basic Text Cleaning
import re
import string

def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '',text)
    text = text.translate(str.maketrans('','',string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['clean_text']= df['text'].apply(clean_text)
df.head()


Unnamed: 0,label,text,clean_text
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup final ...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...


In [16]:
#Step 3: Tokenization & Stopword Removal
import  nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def tok(text):
    tokens = word.tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Purnima\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Purnima\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
#Step 4: Feature Extraction with BoW & TF-IDF
from sklearn.feature_extraction.text import CountVectorizer

bow_vectorizer = CountVectorizer()
X_bow = bow_vectorizer.fit_transform(df['clean_text'])
print("Shape of Bow matrix:", X_bow.shape)
print("First 20 features:", bow_vectorizer.get_feature_names_out()[:20])
print( )
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['clean_text'])
print("Shape of tf-idf matrix:", X_tfidf.shape)
print("First 20 features:", tfidf_vectorizer.get_feature_names_out()[:20])


Shape of Bow matrix: (5572, 8608)
First 20 features: ['aa' 'aah' 'aaniye' 'aaooooright' 'aathilove' 'aathiwhere' 'ab' 'abbey'
 'abdomen' 'abeg' 'abelu' 'aberdeen' 'abi' 'ability' 'abiola' 'abj' 'able'
 'abnormally' 'about' 'aboutas']

Shape of tf-idf matrix: (5572, 8608)
First 20 features: ['aa' 'aah' 'aaniye' 'aaooooright' 'aathilove' 'aathiwhere' 'ab' 'abbey'
 'abdomen' 'abeg' 'abelu' 'aberdeen' 'abi' 'ability' 'abiola' 'abj' 'able'
 'abnormally' 'about' 'aboutas']


In [23]:
# --- 1. Train/Test Split ---
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, df['label'], test_size=0.2, random_state=42, stratify=df['label']
)

print("Training size:", X_train.shape)
print("Test size:", X_test.shape)

# --- 2. Multinomial Naive Bayes ---
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Predictions
y_pred_nb = nb_model.predict(X_test)

# Metrics
print("Naive Bayes Metrics:")
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Precision:", precision_score(y_test, y_pred_nb, pos_label='spam'))
print("Recall:", recall_score(y_test, y_pred_nb, pos_label='spam'))
print("F1 Score:", f1_score(y_test, y_pred_nb, pos_label='spam'))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nb))

# --- 3. Logistic Regression ---
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)

y_pred_log = log_model.predict(X_test)

print("\nLogistic Regression Metrics:")
print("Accuracy:", accuracy_score(y_test, y_pred_log))
print("Precision:", precision_score(y_test, y_pred_log, pos_label='spam'))
print("Recall:", recall_score(y_test, y_pred_log, pos_label='spam'))
print("F1 Score:", f1_score(y_test, y_pred_log, pos_label='spam'))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log))


Training size: (4457, 8608)
Test size: (1115, 8608)
Naive Bayes Metrics:
Accuracy: 0.9506726457399103
Precision: 1.0
Recall: 0.6308724832214765
F1 Score: 0.7736625514403292
Confusion Matrix:
 [[966   0]
 [ 55  94]]

Logistic Regression Metrics:
Accuracy: 0.9650224215246637
Precision: 1.0
Recall: 0.738255033557047
F1 Score: 0.8494208494208495
Confusion Matrix:
 [[966   0]
 [ 39 110]]


In [27]:
#Step 6: Transformer Embeddings with DistilBERT

%pip install transformers torch

Note: you may need to restart the kernel to use updated packages.


In [33]:
%pip install --upgrade typing_extensions

Note: you may need to restart the kernel to use updated packages.


In [10]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
import numpy as np
from tqdm import tqdm
import pandas as pd

# ✅ Step 1: Load dataset correctly
df = pd.read_csv("data/SMSSpamCollection", sep='\t', header=None, names=['label', 'text'])
df.columns = ['label', 'text']

# ✅ Step 2: Clean the text
df['clean_text'] = df['text'].str.replace(r'[^a-zA-Z ]', '', regex=True).str.lower()

# ✅ Step 3: Load BERT tokenizer & model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# ✅ Step 4: Function to extract embedding from BERT
def get_bert_embedding(text):
    tokens = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=50,
        return_tensors='pt'
    )
    with torch.no_grad():
        outputs = bert_model(**tokens)
    # Take the mean across all tokens to get 1 vector per text
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# ✅ Step 5: Apply to the whole dataset
X_bert = np.array([get_bert_embedding(t) for t in tqdm(df['clean_text'])])
print("BERT Embeddings shape:", X_bert.shape)


100%|██████████████████████████████████████████████████████████████████████████████| 5572/5572 [09:06<00:00, 10.20it/s]

BERT Embeddings shape: (5572, 768)





In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
df['label_num'] = df['label'].map({'ham':0,'spam':1})
X_train, X_test, y_train, y_test = train_test_split(
    X_bert,
    df['label_num'],
    test_size=0.2,
    random_state=42
)

clf=LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Accuracy:",accuracy_score(y_test,y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=["Ham", "Spam"]))


Accuracy: 0.9838565022421525

Classification Report:
               precision    recall  f1-score   support

         Ham       0.99      0.99      0.99       966
        Spam       0.96      0.92      0.94       149

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [22]:
def predict_spam(text):
    tokens= tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=50,
        return_tensors='pt'
    )
    with torch.no_grad():
        outputs=bert_model(**tokens)
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy().reshape(1, -1)
    prediction = clf.predict(embedding)[0]
    return "Spam" if prediction == 1 else "Ham"

In [23]:
print(predict_spam("Congratulations! You've won a free ticket to Bahamas. Reply YES to claim."))
print(predict_spam("Win $1000 now! Just click this link."))
print(predict_spam("Hey, let's meet for lunch."))

Spam
Spam
Ham
