<a href="https://colab.research.google.com/github/shiffa-04/NLP_SMS_Spam_Classifier/blob/main/SMS_Spam_Collection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from collections import Counter
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords, wordnet
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sentence_transformers import SentenceTransformer

np.random.seed(42)

In [None]:
df = pd.read_csv("spam.csv", encoding='ISO-8859-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
df.shape

(5572, 2)

In [None]:
df.rename(columns={'v1': 'labels', 'v2': 'messages'}, inplace=True)
df.head()

Unnamed: 0,labels,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
classes = df['labels'].value_counts()
classes

Unnamed: 0_level_0,count
labels,Unnamed: 1_level_1
ham,4825
spam,747


In [None]:
df["messages"] = df["messages"].str.lower()
df.head(3)

Unnamed: 0,labels,messages
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...


In [None]:
exclude = string.punctuation
exclude

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
df['messages'] = df['messages'].apply(lambda x: x.translate(str.maketrans('', '', exclude)))

In [None]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [None]:
def preprocess_text(messages, method='stemming'):
    processed_corpus = []
    for message in messages:
        # Remove non-alphabet characters
        cleaned_message = re.sub('[^a-zA-Z]', ' ', message).lower()
        # Split into words and remove stopwords
        words = cleaned_message.split()
        filtered_words = [word for word in words if word not in stopwords.words('english')]

        if method == 'stemming':
            # Apply stemming
            processed_words = [ps.stem(word) for word in filtered_words]
        elif method == 'lemmatization':
            # Apply lemmatization
            processed_words = [lemmatizer.lemmatize(word) for word in filtered_words]
        else:
            raise ValueError("Method should be either 'stemming' or 'lemmatization'")

        # Join processed words back into a single string
        processed_message = ' '.join(processed_words)
        processed_corpus.append(processed_message)
    return processed_corpus

In [None]:
stemmed_corpus = preprocess_text(df['messages'], method='stemming')
lemmatized_corpus = preprocess_text(df['messages'], method='lemmatization')

In [None]:
df['labels'] = df['labels'].map({'spam': 1, 'ham': 0})
print(df.head())
y = df['labels']

   labels                                           messages
0       0  go until jurong point crazy available only in ...
1       0                            ok lar joking wif u oni
2       1  free entry in 2 a wkly comp to win fa cup fina...
3       0        u dun say so early hor u c already then say
4       0  nah i dont think he goes to usf he lives aroun...


**BAG OF WORDS (BOW)  AND TF-IDF**

In [None]:
def vectorize_and_evaluate(corpus, vectorizer, vectorizer_name, method_name):
    X = vectorizer.fit_transform(corpus).toarray()

    smote = SMOTE(random_state=42, sampling_strategy=0.5)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

    model = MultinomialNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f"Classification report using {vectorizer_name} with {method_name}:")
    print(classification_report(y_test, y_pred))

In [None]:
vectorize_and_evaluate(stemmed_corpus, CountVectorizer(ngram_range=(1, 2), max_features=3000), "Bag of Words", "Stemming")
vectorize_and_evaluate(lemmatized_corpus, CountVectorizer(ngram_range=(1, 2), max_features=3000), "Bag of Words", "Lemmatization")

Classification report using Bag of Words with Stemming:
              precision    recall  f1-score   support

           0       0.91      0.99      0.94       950
           1       0.97      0.80      0.88       498

    accuracy                           0.92      1448
   macro avg       0.94      0.89      0.91      1448
weighted avg       0.93      0.92      0.92      1448

Classification report using Bag of Words with Lemmatization:
              precision    recall  f1-score   support

           0       0.91      0.99      0.95       950
           1       0.97      0.81      0.88       498

    accuracy                           0.93      1448
   macro avg       0.94      0.90      0.91      1448
weighted avg       0.93      0.93      0.92      1448



In [None]:
vectorize_and_evaluate(stemmed_corpus, TfidfVectorizer(ngram_range=(1, 2), max_features=3000), "TF-IDF", "Stemming")
vectorize_and_evaluate(lemmatized_corpus, TfidfVectorizer(ngram_range=(1, 2), max_features=3000), "TF-IDF", "Lemmatization")

Classification report using TF-IDF with Stemming:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       950
           1       0.97      0.91      0.94       498

    accuracy                           0.96      1448
   macro avg       0.96      0.95      0.96      1448
weighted avg       0.96      0.96      0.96      1448

Classification report using TF-IDF with Lemmatization:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       950
           1       0.98      0.91      0.94       498

    accuracy                           0.96      1448
   macro avg       0.97      0.95      0.96      1448
weighted avg       0.96      0.96      0.96      1448



**SENTENCE EMBEDDINGS**

In [None]:
!pip install sentence-transformers

In [None]:
# Function to generate sentence embeddings and evaluate model
def embed_and_evaluate(corpus, model_name, method_name):
    # Load the pre-trained SBERT model
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Generate sentence embeddings
    embeddings = model.encode(corpus)

    # Apply SMOTE to handle class imbalance
    smote = SMOTE(random_state=42, sampling_strategy=0.5)
    X_resampled, y_resampled = smote.fit_resample(embeddings, y)

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

    # Train and evaluate the model
    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    # Print classification report
    print(f"Classification report using {model_name} embeddings with {method_name}:")
    print(classification_report(y_test, y_pred))


In [None]:
# Evaluate using SBERT embeddings
embed_and_evaluate(stemmed_corpus, "SBERT", "Stemming")
embed_and_evaluate(lemmatized_corpus, "SBERT", "Lemmatization")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Classification report using SBERT embeddings with Stemming:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       950
           1       0.97      0.96      0.96       498

    accuracy                           0.98      1448
   macro avg       0.97      0.97      0.97      1448
weighted avg       0.98      0.98      0.98      1448

Classification report using SBERT embeddings with Lemmatization:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       950
           1       0.98      0.95      0.97       498

    accuracy                           0.98      1448
   macro avg       0.98      0.97      0.97      1448
weighted avg       0.98      0.98      0.98      1448

