<a href="https://colab.research.google.com/github/shiffa-04/NLP_SMS_Spam_Classifier/blob/main/SMS_Spam_Collection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [142]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from collections import Counter
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

np.random.seed(42)

In [64]:
df = pd.read_csv("spam.csv", encoding='ISO-8859-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [65]:
df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [66]:
df.shape

(5572, 2)

In [113]:
classes = df['labels'].value_counts()
classes

labels
ham     4825
spam     747
Name: count, dtype: int64

In [67]:
df.rename(columns={'v1': 'labels', 'v2': 'messages'}, inplace=True)
df.head()

Unnamed: 0,labels,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [68]:
df["messages"] = df["messages"].str.lower()
df.head(3)

Unnamed: 0,labels,messages
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...


In [69]:
exclude = string.punctuation
exclude

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [70]:
def remove_punc(text):
  return text.translate(str.maketrans('', '', exclude))

df['messages'] = df['messages'].apply(remove_punc)
df.head(10)

Unnamed: 0,labels,messages
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...
5,spam,freemsg hey there darling its been 3 weeks now...
6,ham,even my brother is not like to speak with me t...
7,ham,as per your request melle melle oru minnaminun...
8,spam,winner as a valued network customer you have b...
9,spam,had your mobile 11 months or more u r entitled...


In [71]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [72]:
ps = PorterStemmer()

In [73]:
messages = df['messages']

In [74]:
corpus = []

for message in messages:
    # Remove non-alphabet characters and convert to lowercase
    cleaned_message = re.sub('[^a-zA-Z0-9]', ' ', message).lower()

    # Split into words and remove stopwords
    words = cleaned_message.split()
    # print(words)
    filtered_words = [word for word in words if word not in stopwords.words('english')]

    # Apply stemming
    stemmed_words = [ps.stem(word) for word in filtered_words]
    # print(stemmed_words)

    # Join the processed words back into a single string and add to the corpus
    processed_message = ' '.join(stemmed_words)
    corpus.append(processed_message)
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri questionstd txt ratetc appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah dont think goe usf live around though',
 'freemsg hey darl 3 week word back id like fun still tb ok xxx std chg send 150 rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner valu network custom select receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030',
 'im gonna home soon dont want talk stuff anymor tonight k ive cri enough today',
 'six chanc win cash 100 20000 pound txt csh11 send 87575 cost 150pday 6day 16 tsandc appli repli hl 4 info',
 'urgent

In [76]:
y = df['labels']

In [178]:
vectorizer_bow = CountVectorizer(ngram_range=(1, 2), max_features= 3000)

X_bow = vectorizer_bow.fit_transform(corpus).toarray()

In [179]:
smote = SMOTE(random_state=42, sampling_strategy= 0.5)
X_resampled, y_resampled = smote.fit_resample(X_bow, y)

In [180]:
resampled_class_distribution = Counter(y_resampled)
print("Resampled class distribution:", resampled_class_distribution)

Resampled class distribution: Counter({'ham': 4825, 'spam': 2412})


In [181]:
X_train_bow, X_test_bow, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [182]:
model_bow = MultinomialNB()
model_bow.fit(X_train_bow, y_train)
y_pred_bow = model_bow.predict(X_test_bow)

In [183]:
print("Classification report of Bag of words:")
print(classification_report(y_test, y_pred_bow))

Classification report of Bag of words:
              precision    recall  f1-score   support

         ham       0.90      0.99      0.94       950
        spam       0.97      0.80      0.88       498

    accuracy                           0.92      1448
   macro avg       0.94      0.89      0.91      1448
weighted avg       0.93      0.92      0.92      1448



In [136]:
vectorizer_tfidf = TfidfVectorizer()
X_tfidf = vectorizer_tfidf.fit_transform(corpus).toarray()

In [138]:
smote = SMOTE(random_state=42, sampling_strategy= 0.5)
X_resampled_tf, y_resampled_tf = smote.fit_resample(X_tfidf, y)

In [139]:
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_resampled_tf, y_resampled_tf, test_size=0.2, random_state=42)

In [148]:
model_tfidf = MultinomialNB()
model_tfidf.fit(X_train_tfidf, y_train_tfidf)
y_pred_tfidf = model_tfidf.predict(X_test_tfidf)

print("Classification report of Bag of words:")
print(classification_report(y_test_tfidf, y_pred_tfidf))

Classification report of Bag of words:
              precision    recall  f1-score   support

         ham       0.98      0.99      0.99       950
        spam       0.99      0.96      0.98       498

    accuracy                           0.98      1448
   macro avg       0.98      0.98      0.98      1448
weighted avg       0.98      0.98      0.98      1448

