In [38]:
# The idea is to quickly perform basic NLP related tasks using NLTK library for the spam classification task
# DATASET USED: SMS Spam Collection Data Set (Source: https://archive.ics.uci.edu/ml/datasets/sms+spam+collection)

import pandas as pd
import numpy as np

In [39]:
msg=pd.read_csv('/content/drive/MyDrive/SMSSpamCollection',sep='\t',names=['label','text']) #read the tab seperated file

In [40]:
msg.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [41]:
msg.shape

(5572, 2)

In [42]:
msg.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [8]:
# STEPS
#1. Clean Text: Remove punctuations, numbers, stopwords. Also, Stemming/lemmatizatino is performed
#2. Create BOW/TFIDF to generate vectorized independent features
#3. Create and Train model for spam detection

In [21]:
import nltk
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> l
Packages:
  [ ] abc................. Australian Broadcasting Commission 2006
  [ ] alpino.............. Alpino Dutch Treebank
  [ ] averaged_perceptron_tagger Averaged Perceptron Tagger
  [ ] averaged_perceptron_tagger_ru Averaged Perceptron Tagger (Russian)
  [ ] basque_grammars..... Grammars for Basque
  [ ] biocreative_ppi..... BioCreAtIvE (Critical Assessment of Information
                           Extraction Systems in Biology)
  [ ] bllip_wsj_no_aux.... BLLIP Parser: WSJ Model
  [ ] book_grammars....... Grammars from NLTK Book
  [ ] brown............... Brown Corpus
  [ ] brown_tei........... Brown Corpus (TEI XML Version)
  [ ] cess_cat............ CESS-CAT Treebank
  [

    Downloading collection 'all'
       | 
       | Downloading package abc to /root/nltk_data...
       |   Unzipping corpora/abc.zip.
       | Downloading package alpino to /root/nltk_data...
       |   Unzipping corpora/alpino.zip.
       | Downloading package averaged_perceptron_tagger to
       |     /root/nltk_data...
       |   Unzipping taggers/averaged_perceptron_tagger.zip.
       | Downloading package averaged_perceptron_tagger_ru to
       |     /root/nltk_data...
       |   Unzipping taggers/averaged_perceptron_tagger_ru.zip.
       | Downloading package basque_grammars to /root/nltk_data...
       |   Unzipping grammars/basque_grammars.zip.
       | Downloading package biocreative_ppi to /root/nltk_data...
       |   Unzipping corpora/biocreative_ppi.zip.
       | Downloading package bllip_wsj_no_aux to /root/nltk_data...
       |   Unzipping models/bllip_wsj_no_aux.zip.
       | Downloading package book_grammars to /root/nltk_data...
       |   Unzipping grammars/book_gr


---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


True

In [43]:
# DATA PRE_PROCESSING
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re

stemmer=PorterStemmer()

sentences=list(msg.text)

for i in range(len(sentences)):
  sentences[i]=re.sub('[^a-zA-Z]',' ',sentences[i]).lower() #Substitute non-alphabetic char with space and convert char into lowercase
  words=nltk.word_tokenize(sentences[i])  #Tokenize sentences into words
  words=[stemmer.stem(x) for x in words if x not in stopwords.words('english')] #Perform Stemming/Lemmatization after stop words removal
  sentences[i]=' '.join(words)
  


In [44]:
#VECTORIZATION
from sklearn.feature_extraction.text import CountVectorizer #BOW technique
cv=CountVectorizer()
X=cv.fit_transform(sentences).toarray()
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [45]:
y=pd.get_dummies(msg.label)
y=y.iloc[:,0].values

In [48]:
from sklearn.model_selection import train_test_split #Train Test SPlit process
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [56]:
from sklearn.naive_bayes import MultinomialNB
model=MultinomialNB() #Create model
model.fit(X_train,y_train) #Train model

MultinomialNB()

In [57]:
y_pred=model.predict(X_test) #Prediction

In [62]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score #Model evaluation
print(f"****CONFUSION MATRIX****\n {confusion_matrix(y_pred,y_test)}")
print(f"*******CLASSIFICATION REPORT******\n {classification_report(y_pred,y_test)}")
print(f"*******ACCURACY SCORE******\n {accuracy_score(y_pred,y_test)}")

****CONFUSION MATRIX****
 [[141  16]
 [  6 952]]
*******CLASSIFICATION REPORT******
               precision    recall  f1-score   support

           0       0.96      0.90      0.93       157
           1       0.98      0.99      0.99       958

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

*******ACCURACY SCORE******
 0.9802690582959641
