The main purpose is to build a deep learning model that classifies SMS messages as spam or ham. It preprocesses the text, vectorizes it using TF-IDF, trains a PyTorch ANN model, and evaluates its performance. The best precision achieved was 0.9432, and the model predicts whether a new message is SPAM or HAM.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.preprocessing import LabelEncoder

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import string

# Fetch only the NLTK resources this notebook actually uses instead of the
# whole 'all' collection: the Punkt models back nltk.word_tokenize and the
# stopwords corpus backs stopwords.words('english').
# 'punkt_tab' is the resource name recent NLTK releases look up for the Punkt
# models; 'punkt' is kept for older releases.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# -------------------
# 1. Load Dataset
# -------------------
df = pd.read_csv("sms.csv")

# Encode labels: ham = 0, spam = 1
label_to_id = {'ham': 0, 'spam': 1}
df['target'] = df['target'].map(label_to_id)

# Series.map turns every value missing from the mapping into NaN. Left alone,
# that would only surface later as NaN targets during training, so fail here
# with a message that points at the data instead.
if df['target'].isna().any():
    raise ValueError(
        "sms.csv contains 'target' values other than 'ham'/'spam' "
        f"({int(df['target'].isna().sum())} row(s))"
    )


# 2. Text Preprocessing


ps = PorterStemmer()

# Build the stopword lookup once. stopwords.words('english') returns a list,
# and the previous code called it again for every single token; a set built
# up front gives constant-time membership tests.
stop_words = set(stopwords.words('english'))


def preprocess(text):
    """Normalize one SMS message into a space-separated string of stems.

    Steps: lowercase, strip URLs, tokenize with NLTK, keep only purely
    alphanumeric tokens, drop English stopwords, Porter-stem the rest.

    Args:
        text: Raw message text (str).

    Returns:
        The surviving stems joined by single spaces; an empty string when
        no token survives the filters.
    """
    text = text.lower()

    # Remove URLs before tokenizing so their fragments do not become tokens.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    tokens = nltk.word_tokenize(text)

    # str.isalnum() already rejects punctuation-only tokens, so no separate
    # string.punctuation check is needed after it.
    kept = [
        word for word in tokens
        if word.isalnum() and word not in stop_words
    ]

    return ' '.join(ps.stem(word) for word in kept)

df['clean_text'] = df['text'].apply(preprocess)

# -------------------
# 3. Train-Test Split
# -------------------
# Split the cleaned text BEFORE vectorizing. Fitting TF-IDF on the full
# dataset lets test messages influence the vocabulary and IDF weights, which
# leaks test information into training and inflates the reported metrics.
y = df['target'].values
text_train, text_test, y_train, y_test = train_test_split(
    df['clean_text'].values, y, test_size=0.25, stratify=y, random_state=42
)

# -------------------
# 4. Vectorize Text
# -------------------
# Fit on the training messages only; the test messages are merely transformed.
# NOTE: the inference cell builds the network with 5000 inputs, which matches
# only while the training vocabulary holds at least max_features terms.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(text_train).toarray()
X_test = vectorizer.transform(text_test).toarray()

# float32 features for the linear layers; float32 targets because
# BCEWithLogitsLoss expects targets of the same dtype as the logits.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# -------------------
# 5. Dataset & Dataloader
# -------------------
class SpamDataset(Dataset):
    """Pairs a feature container with a label container, index by index.

    Args:
        X: Indexable collection of feature rows.
        y: Indexable collection of labels, aligned with ``X``.
    """

    def __init__(self, X, y):
        self.X, self.y = X, y

    def __len__(self):
        # The dataset length is defined by the features alone.
        return len(self.X)

    def __getitem__(self, idx):
        features = self.X[idx]
        label = self.y[idx]
        return features, label

# Wrap the tensors as datasets first; only the training batches are shuffled.
train_dataset = SpamDataset(X_train_tensor, y_train_tensor)
test_dataset = SpamDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# -------------------
# 6. Model Definition
# -------------------
class SpamANN(nn.Module):
    """Feed-forward binary classifier producing one raw logit per sample.

    Layout: input_dim -> 128 -> 64 -> 1, with ReLU after each hidden layer
    and dropout (p=0.3) after the first one. No sigmoid is applied here;
    callers apply it (or a logits-based loss) themselves.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Keep the layers in a Sequential stored as ``self.model`` so the
        # parameter names stay ``model.0.*``, ``model.3.*`` and ``model.5.*``.
        layers = [
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        logits = self.model(x)
        return logits

# One input unit per TF-IDF feature column.
model = SpamANN(X_train.shape[1])

# -------------------
# 7. Training Setup
# -------------------
# Class imbalance: weight the positive (spam) term of the loss by the
# ham-to-spam ratio of the training labels.
n_ham = (y_train == 0).sum()
n_spam = (y_train == 1).sum()
pos_weight = torch.tensor([n_ham / n_spam], dtype=torch.float32)

# The model emits raw logits, so the sigmoid lives inside the loss.
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# -------------------
# 8. Evaluation Function
# -------------------
from sklearn.metrics import classification_report, precision_score

def evaluate(model, loader):
    """Score ``model`` on every batch of ``loader`` and print a report.

    Puts the model in eval mode (it is left that way), runs it without
    gradient tracking, and turns logits into 0/1 predictions by rounding
    the sigmoid output.

    Args:
        model: Network returning one logit per sample with shape (batch, 1).
        loader: Iterable yielding ``(features, labels)`` batches.

    Returns:
        Precision for the positive class (label 1); 0 when nothing was
        predicted positive. The training loop uses it to pick the best
        checkpoint.
    """
    model.eval()
    y_pred = []
    y_true = []

    with torch.no_grad():
        for features, targets in loader:
            logits = model(features).squeeze(1)
            probabilities = torch.sigmoid(logits)
            y_pred += torch.round(probabilities).tolist()
            y_true += targets.tolist()

    report = classification_report(y_true, y_pred, digits=4)
    print(report)

    return precision_score(y_true, y_pred, zero_division=0)

# -------------------
# 9. Training Loop
# -------------------
EPOCHS = 30
patience = 5            # epochs without a new best before giving up
best_precision = 0
no_improve_epochs = 0

# NOTE(review): checkpoint selection and early stopping both use test_loader,
# so the reported best precision is chosen on the test split itself. Consider
# a separate validation split if an unbiased estimate is needed.
for epoch in range(EPOCHS):
    # evaluate() leaves the model in eval mode, so re-enable training mode
    # (dropout active) at the start of every epoch.
    model.train()
    for features, targets in train_loader:
        optimizer.zero_grad()
        logits = model(features).squeeze(1)
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()

    precision = evaluate(model, test_loader)
    is_best = precision > best_precision

    if is_best:
        # New best epoch: remember it and overwrite the saved weights.
        best_precision = precision
        torch.save(model.state_dict(), "torch.pth")
        no_improve_epochs = 0
    else:
        no_improve_epochs += 1

    improvement = " <-- Best Precision!" if is_best else ""
    print(f"Epoch {epoch+1}: Precision={precision:.4f}{improvement}")

    if no_improve_epochs >= patience:
        print("Early stopping due to no improvement in precision.")
        break

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

              precision    recall  f1-score   support

         0.0     0.9849    0.9768    0.9808      1206
         1.0     0.8579    0.9037    0.8802       187

    accuracy                         0.9670      1393
   macro avg     0.9214    0.9403    0.9305      1393
weighted avg     0.9679    0.9670    0.9673      1393

Epoch 1: Precision=0.8579 <-- Best Precision!
              precision    recall  f1-score   support

         0.0     0.9881    0.9677    0.9778      1206
         1.0     0.8160    0.9251    0.8672       187

    accuracy                         0.9620      1393
   macro avg     0.9021    0.9464    0.9225      1393
weighted avg     0.9650    0.9620    0.9629      1393

Epoch 2: Precision=0.8160
              precision    recall  f1-score   support

         0.0     0.9889    0.9635    0.9761      1206
         1.0     0.7982    0.9305    0.8593       187

    accuracy                         0.9591      1393
   macro avg     0.8936    0.9470    0.9177      1393
we

In [None]:
# Scratch check: str.join glues the list items together with single spaces,
# the same way preprocess() reassembles its tokens.
' '.join(['hi','hello','nepal'])

'hi hello nepal'

In [None]:
import pickle

# Persist the fitted TF-IDF vectorizer so inference can reuse the exact
# vocabulary and IDF weights the model was trained with.
with open("tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [None]:
import pickle
import re
import string

import nltk  # used below and by preprocess(); this cell did not import it before
import torch
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# preprocess() needs the Punkt models (nltk.word_tokenize) as well as the
# stopwords corpus. 'punkt_tab' is the resource name recent NLTK releases
# look up for the Punkt models; 'punkt' is kept for older releases.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource, quiet=True)

stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Normalize one message exactly as the training data was normalized.

    Steps: lowercase, strip URLs, tokenize with NLTK, keep only purely
    alphanumeric tokens, drop English stopwords, Porter-stem the rest.

    Uses the ``stemmer`` and ``stop_words`` objects created in this cell.
    The previous version referenced ``ps``, which this cell never defines,
    and rebuilt the stopword list for every token.

    Args:
        text: Raw message text (str).

    Returns:
        The surviving stems joined by single spaces; an empty string when
        no token survives the filters.
    """
    text = text.lower()

    # Remove URLs before tokenizing so their fragments do not become tokens.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    tokens = nltk.word_tokenize(text)

    # str.isalnum() already rejects punctuation-only tokens, so no separate
    # string.punctuation check is needed after it.
    kept = [
        word for word in tokens
        if word.isalnum() and word not in stop_words
    ]

    return ' '.join(stemmer.stem(word) for word in kept)

# Restore the TF-IDF vectorizer that was fitted during training.
# NOTE: pickle.load can execute arbitrary code; only load a file this
# notebook produced itself.
with open("tfidf_vectorizer.pkl", "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

class SpamANN(torch.nn.Module):
    """Inference-side copy of the training network: input -> 128 -> 64 -> 1.

    The layers sit in a Sequential stored as ``self.model`` so the parameter
    names (``model.0.*``, ``model.3.*``, ``model.5.*``) line up with the
    state dict saved by the training loop.

    Args:
        input_dim: Number of input features. Defaults to 5000, the value this
            class previously hard-coded; pass the real feature count when
            the vectorizer produces fewer columns.
    """

    def __init__(self, input_dim=5000):
        super().__init__()
        self.model = torch.nn.Sequential(
            torch.nn.Linear(input_dim, 128),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.3),
            torch.nn.Linear(128, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, 1),
        )

    def forward(self, x):
        """Return raw logits of shape (batch, 1); no sigmoid is applied."""
        return self.model(x)

# Rebuild the network and restore the weights saved by the training loop.
# NOTE: SpamANN() is built with 5000 inputs; the checkpoint only loads if the
# trained model had the same input width.
model = SpamANN()
checkpoint = torch.load("torch.pth")
model.load_state_dict(checkpoint)

# Inference only: switch dropout off.
model.eval()

def predict(text, threshold=0.5):
    """Classify a single message as "SPAM" or "HAM".

    The text is cleaned with ``preprocess``, vectorized with the loaded
    TF-IDF ``vectorizer``, and scored by the loaded ``model``.

    Args:
        text: Raw message text (str).
        threshold: Spam probability at or above which the message is
            labelled "SPAM". Defaults to 0.5, the cut-off that was previously
            hard-coded; raise it to trade recall for precision.

    Returns:
        "SPAM" if the predicted spam probability is >= ``threshold``,
        otherwise "HAM".
    """
    features = vectorizer.transform([preprocess(text)]).toarray()
    with torch.no_grad():
        logit = model(torch.tensor(features, dtype=torch.float32))
        # A single row in means a single logit out, so .item() is safe here.
        spam_probability = torch.sigmoid(logit).item()
    return "SPAM" if spam_probability >= threshold else "HAM"


In [None]:
# Smoke test: prize / gift-card promotion; the recorded output is 'SPAM'.
predict("Congratulations! You've won a $1000 Walmart gift card. Click here to claim your prize.")

'SPAM'

In [None]:
# Smoke test: text-to-win contest message; the recorded output is 'SPAM'.
predict("Win a trip to Paris! Text WIN to 12345 to enter the contest.")

'SPAM'

In [None]:
# Smoke test: casual personal message; the recorded output is 'HAM'.
predict("Hey, how are you doing? Let's catch up soon!")

'HAM'

In [None]:
# Smoke test: personal scheduling question; the recorded output is 'HAM'.
predict("Hey, are we still meeting for coffee tomorrow at 3 PM?")

'HAM'

In [None]:
# Smoke test: reservation confirmation; the recorded output is 'HAM'.
predict("The restaurant reservation for 7 PM is confirmed. Looking forward to seeing you.")

'HAM'

In [None]:
# Smoke test: "free entry" competition message; the printed output is SPAM.
print(predict("Free entry in 2 a weekly competition!"))

SPAM


In [None]:
# Repeat of the reservation example from an earlier cell; output is again 'HAM'.
predict("The restaurant reservation for 7 PM is confirmed. Looking forward to seeing you.")

'HAM'