# Spam SMS classification
## Name: Soumyadeep Deb

## Importing Libraries and downloading resources

In [1]:
import numpy as np
import pandas as pd
import nltk
# Fetch the NLTK resources this notebook relies on further down:
# 'punkt' for word_tokenize and 'stopwords' for the stop-word lists.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Error loading WordnetLemmatizer: Package
[nltk_data]     'WordnetLemmatizer' not found in index


False

## Getting some info on the dataset

In [81]:
# Read the SMS dataset from spam.csv, decoding the file as latin-1.
df= pd.read_csv('spam.csv',encoding='latin-1')
df.head()   # preview the first rows

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# Column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


### We can simply drop columns 2-4

In [82]:
# Discard the three mostly empty "Unnamed" columns, then look at a random sample.
unused_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
df = df.drop(columns=unused_columns)
df.sample(5)

Unnamed: 0,v1,v2
3090,spam,LORD OF THE RINGS:RETURN OF THE KING in store ...
3984,ham,"Whatever, juliana. Do whatever you want."
4658,ham,I cant pick the phone right now. Pls send a me...
5315,ham,Hahaha..use your brain dear
1527,ham,No screaming means shouting..


## Text Pre-processing
## Steps:
1. Remove Punctuations
2. Lowercase each word
3. Word Tokenization
4. Remove stopwords eg: the, a, is etc.
5. Lemmatize / Stem each word
6. Build the vocabulary

In [37]:
# Raw messages and their labels, plus the container for the cleaned messages.
sentences = df['v2'].to_numpy()   # raw SMS texts as an array
y = df['v1']                      # label column
vocab = []                        # the processed messages will be stored here

In [38]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re  # For Regular expression

# Short aliases used by the pre-processing cells below.
tokenize = word_tokenize      # NLTK word tokenizer
stemmer = PorterStemmer()     # reduces each token to its stem

In [None]:
# The pre-processing part: turn every raw SMS into a cleaned, stemmed string.

# Build the stop-word collection ONCE. Calling stopwords.words(...) inside the
# comprehension would reload the word lists for every single token, and a set
# gives constant-time membership tests instead of a linear list scan.
stop_words = set(stopwords.words(['english','french','german','spanish']))

# Start from an empty list so that re-running this cell does not append a
# second copy of every message.
vocab = []

for raw_sentence in sentences:
    cleaned = re.sub('[^a-zA-Z]', ' ', raw_sentence)   # Remove punctuation (and digits)
    cleaned = cleaned.lower()                          # Lowercase each word
    tokens = tokenize(cleaned)                         # Tokenize into words
    # Remove stop words, then stem what is left
    stems = [stemmer.stem(token) for token in tokens if token not in stop_words]

    # Join the stems back into one processed sentence and store it
    vocab.append(' '.join(stems))

# Show only a few processed messages instead of dumping the whole corpus.
print(vocab[:5])

In [45]:
# Number of processed messages collected in vocab.
len(vocab)

5572


In [73]:
# Class distribution of the label column.
y.value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [83]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping first, then apply it to the target column.
le = LabelEncoder().fit(y)
y = le.transform(y)
y

## ham - 0 , spam - 1 ~ Encoding of target variable

array([0, 0, 1, ..., 0, 0, 0])

## Building the model
### The function deploy_and_evaluate trains and evaluates a classifier on a given feature matrix; we use it to test both the bag of words and tf-idf models.
### Multinomial Naive Bayes has been used here.

In [79]:
def deploy_and_evaluate(X,y,test_size=0.3,random_state=5):
    """Train a Multinomial Naive Bayes model on a hold-out split and report metrics.

    Parameters
    ----------
    X : feature matrix, handed to ``train_test_split`` and ``MultinomialNB``.
    y : target labels, one per row of ``X``.
    test_size : forwarded to ``train_test_split`` (default 0.3).
    random_state : forwarded to ``train_test_split`` (default 5).

    Prints the confusion matrix followed by the accuracy, precision and
    recall scores (multiplied by 100) computed on the test split.

    Returns
    -------
    The fitted ``MultinomialNB`` model.
    """
    from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import MultinomialNB

    # Hold out part of the data for evaluation
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # fit() returns the estimator itself, so it can be chained
    model = MultinomialNB().fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Report the evaluation on the held-out split
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    metrics = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
    )
    for label, scorer in metrics:
        print("\n" + label + " Score:", scorer(y_test, y_pred) * 100, "%")

    return model

## Now, let's form a Bag of Words model and test it.

In [52]:
from sklearn.feature_extraction.text import CountVectorizer
bag_vectorizer = CountVectorizer()

# Keep the document-term matrix sparse. Converting it with .toarray() would
# allocate a dense rows x vocabulary array that is almost entirely zeros, and
# the model used in this notebook is trained on sparse input just as well
# (the TF-IDF features are passed to it in sparse form).
X_bag = bag_vectorizer.fit_transform(vocab)
X_bag.shape

(5572, 6177)

In [84]:
# Train and evaluate the classifier on the bag-of-words features.
model_bag = deploy_and_evaluate(X_bag , y)

Confusion Matrix:
 [[1440   25]
 [  11  196]]

Accuracy Score: 97.84688995215312 %

Precision Score: 88.68778280542986 %

Recall Score: 94.68599033816425 %


## Now, let's form a TF-IDF model and test it.

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# The TF-IDF matrix stays sparse. The former bare `X_tfidf.toarray()` call
# built a large dense copy and threw it away immediately, so it was removed.
X_tfidf = tfidf_vectorizer.fit_transform(vocab)
X_tfidf.shape

(5572, 6177)

In [85]:
# Train and evaluate the classifier on the TF-IDF features.
model_tfidf = deploy_and_evaluate(X_tfidf , y)

Confusion Matrix:
 [[1465    0]
 [  48  159]]

Accuracy Score: 97.1291866028708 %

Precision Score: 100.0 %

Recall Score: 76.81159420289855 %


## Custom SMS test

In [75]:
# Ask for a message to classify. NOTE(review): the cells below depend on what
# is typed here, so their outputs are not reproducible from code alone.
sms = input("Enter the text message: ")

Enter the text message: Congo! you won 10,000 pounds


In [98]:
# Apply the same cleaning steps used for the training messages to the custom SMS.

# Build the stop-word set once instead of reloading the word lists for every token.
stop_words = set(stopwords.words(['english','french','spanish','german']))

text = re.sub('[^a-zA-Z]', ' ', sms)   # keep letters only
text = text.lower()
words = [stemmer.stem(word) for word in tokenize(text) if word not in stop_words]

# The vectorizers expect an iterable of documents, hence the one-element list.
text = [' '.join(words)]
print("Processed text:",text)

Processed text: ['congo pound']


In [99]:
# Both feature pipelines start from the same processed message.
text_bag = text_tfidf = text

In [100]:
# Vectorise the processed message with the fitted bag-of-words vectorizer and classify it.
text_bag = bag_vectorizer.transform(text)
print(text_bag)
prediction = model_bag.predict(text_bag)
print("Spam" if prediction[0] == 1 else "Harmless")

  (0, 4058)	1
Spam


In [101]:
# Vectorise the processed message with the fitted TF-IDF vectorizer and classify it.
text_tfidf = tfidf_vectorizer.transform(text_tfidf)
print(text_tfidf)
prediction = model_tfidf.predict(text_tfidf)
print("Spam" if prediction[0] == 1 else "Harmless")

  (0, 4058)	1.0
Harmless


## It is clear that the Bag of Words model performs better than the TF-IDF model in this case.

## **In the TF-IDF model, the recall score came low.**
## That means that it is misclassifying many spam messages as harmless.
## But since the precision came out at 100% for TF-IDF, we can infer that no harmless message was misclassified as spam.