<a href="https://colab.research.google.com/github/shiffa-04/NLP_SMS_Spam_Classifier/blob/main/SMS_Spam_Collection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from collections import Counter
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords, wordnet
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

np.random.seed(42)

In [29]:
df = pd.read_csv("spam.csv", encoding='ISO-8859-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [30]:
df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [31]:
df.shape

(5572, 2)

In [32]:
df.rename(columns={'v1': 'labels', 'v2': 'messages'}, inplace=True)
df.head()

Unnamed: 0,labels,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [33]:
classes = df['labels'].value_counts()
classes

labels
ham     4825
spam     747
Name: count, dtype: int64

In [34]:
df["messages"] = df["messages"].str.lower()
df.head(3)

Unnamed: 0,labels,messages
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...


In [35]:
exclude = string.punctuation
exclude

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [38]:
df['messages'] = df['messages'].apply(lambda x: x.translate(str.maketrans('', '', exclude)))

In [39]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [41]:
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [42]:
def preprocess_text(messages, method='stemming'):
    processed_corpus = []
    for message in messages:
        # Remove non-alphabet characters
        cleaned_message = re.sub('[^a-zA-Z]', ' ', message).lower()
        # Split into words and remove stopwords
        words = cleaned_message.split()
        filtered_words = [word for word in words if word not in stopwords.words('english')]

        if method == 'stemming':
            # Apply stemming
            processed_words = [ps.stem(word) for word in filtered_words]
        elif method == 'lemmatization':
            # Apply lemmatization
            processed_words = [lemmatizer.lemmatize(word) for word in filtered_words]
        else:
            raise ValueError("Method should be either 'stemming' or 'lemmatization'")

        # Join processed words back into a single string
        processed_message = ' '.join(processed_words)
        processed_corpus.append(processed_message)
    return processed_corpus

In [43]:
stemmed_corpus = preprocess_text(df['messages'], method='stemming')
lemmatized_corpus = preprocess_text(df['messages'], method='lemmatization')

In [44]:
y = df['labels']

In [45]:
def vectorize_and_evaluate(corpus, vectorizer, vectorizer_name, method_name):
    X = vectorizer.fit_transform(corpus).toarray()

    smote = SMOTE(random_state=42, sampling_strategy=0.5)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

    model = MultinomialNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f"Classification report using {vectorizer_name} with {method_name}:")
    print(classification_report(y_test, y_pred))

In [46]:
vectorize_and_evaluate(stemmed_corpus, CountVectorizer(ngram_range=(1, 2), max_features=3000), "Bag of Words", "Stemming")
vectorize_and_evaluate(lemmatized_corpus, CountVectorizer(ngram_range=(1, 2), max_features=3000), "Bag of Words", "Lemmatization")

Classification report using Bag of Words with Stemming:
              precision    recall  f1-score   support

         ham       0.91      0.99      0.94       950
        spam       0.97      0.80      0.88       498

    accuracy                           0.92      1448
   macro avg       0.94      0.89      0.91      1448
weighted avg       0.93      0.92      0.92      1448

Classification report using Bag of Words with Lemmatization:
              precision    recall  f1-score   support

         ham       0.91      0.99      0.95       950
        spam       0.97      0.81      0.88       498

    accuracy                           0.93      1448
   macro avg       0.94      0.90      0.91      1448
weighted avg       0.93      0.93      0.92      1448



In [47]:
vectorize_and_evaluate(stemmed_corpus, TfidfVectorizer(ngram_range=(1, 2), max_features=3000), "TF-IDF", "Stemming")
vectorize_and_evaluate(lemmatized_corpus, TfidfVectorizer(ngram_range=(1, 2), max_features=3000), "TF-IDF", "Lemmatization")

Classification report using TF-IDF with Stemming:
              precision    recall  f1-score   support

         ham       0.96      0.99      0.97       950
        spam       0.97      0.91      0.94       498

    accuracy                           0.96      1448
   macro avg       0.96      0.95      0.96      1448
weighted avg       0.96      0.96      0.96      1448

Classification report using TF-IDF with Lemmatization:
              precision    recall  f1-score   support

         ham       0.96      0.99      0.97       950
        spam       0.98      0.91      0.94       498

    accuracy                           0.96      1448
   macro avg       0.97      0.95      0.96      1448
weighted avg       0.96      0.96      0.96      1448

