In [1]:
import pandas as pd
import nltk

In [2]:
from nltk.corpus import stopwords

In [3]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [4]:
messages = pd.read_csv('SMSSpamCollection',sep='\t', names = ['labels','messages'])

In [5]:
messages.head()

Unnamed: 0,labels,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
messages.groupby('labels').describe()

Unnamed: 0_level_0,messages,messages,messages,messages
Unnamed: 0_level_1,count,unique,top,freq
labels,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


Calculate polarity of message

In [7]:
from textblob import TextBlob

In [8]:
def get_polarity(text):
    textblob = TextBlob(str(text))
    pol = textblob.sentiment.polarity
    return pol

messages['polarity'] = messages['messages'].apply(get_polarity)

In [9]:
messages.head()

Unnamed: 0,labels,messages,polarity
0,ham,"Go until jurong point, crazy.. Available only ...",0.15
1,ham,Ok lar... Joking wif u oni...,0.5
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,0.3
3,ham,U dun say so early hor... U c already then say...,0.1
4,ham,"Nah I don't think he goes to usf, he lives aro...",0.0


Text Processing

Remove all the punctuation from the messages 

In [10]:
import string

In [11]:
def punctuation_removal(text):
    clean_list = [char for char in text if char not in string.punctuation]
    clean_str = ''.join(clean_list)
    return clean_str

messages['messages']=messages['messages'].apply(punctuation_removal)

Remove all the numbers from messages

In [12]:
import re
def drop_number(text):
    list_text_new = []
    for i in text:
        if not re.search('\d',i):
            list_text_new.append(i)
    return ''.join(list_text_new)

messages['messages'] = messages['messages'].apply(drop_number)

Convert all messages to lowercase and lemmitize them

In [13]:
from nltk.tokenize import word_tokenize

In [14]:
from nltk.stem import WordNetLemmatizer

In [15]:
lm = WordNetLemmatizer()

In [16]:
def lemmatize(text):
    text_tokens = word_tokenize(text)
    text_lemm = [lm.lemmatize(word.lower()) for word in text_tokens]
    return ' '.join(text_lemm)

messages['messages'] = messages['messages'].apply(lemmatize)

In [17]:
messages.head()

Unnamed: 0,labels,messages,polarity
0,ham,go until jurong point crazy available only in ...,0.15
1,ham,ok lar joking wif u oni,0.5
2,spam,free entry in a wkly comp to win fa cup final ...,0.3
3,ham,u dun say so early hor u c already then say,0.1
4,ham,nah i dont think he go to usf he life around h...,0.0


Remove all stopwords from messages

In [18]:
from nltk.corpus import stopwords

In [20]:
def remove_stopwords(text):
    text_tokens  = word_tokenize(text)
    tokens = [word for word in text_tokens if not word in stopwords.words('english')]
    tokens_text = ' '.join(tokens)
    return tokens_text

messages['messages'] = messages['messages'].apply(remove_stopwords)

In [23]:
messages.messages

0       go jurong point crazy available bugis n great ...
1                                 ok lar joking wif u oni
2       free entry wkly comp win fa cup final tkts st ...
3                     u dun say early hor u c already say
4                nah dont think go usf life around though
                              ...                        
5567    nd time tried contact u u £ pound prize claim ...
5568                          ü b going esplanade fr home
5569                        pity wa mood soany suggestion
5570    guy bitching acted like id interested buying s...
5571                                       rofl true name
Name: messages, Length: 5572, dtype: object

Convert textual data to numbers

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

In [25]:
bow_transformer = CountVectorizer().fit(messages['messages'])

In [26]:
len(bow_transformer.vocabulary_)

7954

In [27]:
message_bow = bow_transformer.transform(messages['messages'])

In [28]:
message_bow.shape

(5572, 7954)

In [30]:
from sklearn.feature_extraction.text import TfidfTransformer

In [31]:
tfidftransformer = TfidfTransformer().fit(message_bow)

In [32]:
messagesTfidf = tfidftransformer.transform(message_bow)

In [48]:
messagesTfidf.shape

(5572, 7954)

Naive Bayes Classifier

In [34]:
from sklearn.naive_bayes import MultinomialNB

In [35]:
spamDetectionModel = MultinomialNB().fit(messagesTfidf,messages['labels'])

Model Evaluation

In [36]:
allPrediction = spamDetectionModel.predict(messagesTfidf)

In [37]:
allPrediction

array(['ham', 'ham', 'spam', ..., 'ham', 'ham', 'ham'], dtype='<U4')

In [38]:
from sklearn.metrics import classification_report

In [39]:
print(classification_report(messages['labels'], allPrediction))

              precision    recall  f1-score   support

         ham       0.97      1.00      0.99      4825
        spam       1.00      0.83      0.91       747

    accuracy                           0.98      5572
   macro avg       0.99      0.92      0.95      5572
weighted avg       0.98      0.98      0.98      5572



Train_Test_split

In [40]:
from sklearn.model_selection import train_test_split

In [41]:
msg_train, msg_test, label_train, label_test = train_test_split(messages['messages'], messages['labels'], test_size=0.2)

In [59]:
print(len(msg_train), len(msg_test), len(label_train), len(label_test))

4457 1115 4457 1115


Using Pipelines

In [60]:
from sklearn.pipeline import Pipeline

In [61]:
pipeline = Pipeline([('bow',CountVectorizer()),
                     ('tfidf',TfidfTransformer()),
                     ('classifier',MultinomialNB())
                    ])

In [62]:
pipeline.fit(msg_train,label_train)

Pipeline(steps=[('bow', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('classifier', MultinomialNB())])

In [66]:
piplinedPred = pipeline.predict(msg_test)

In [68]:
piplinedPred

array(['ham', 'spam', 'ham', ..., 'spam', 'ham', 'ham'], dtype='<U4')

In [70]:
print(classification_report(piplinedPred, label_test))

              precision    recall  f1-score   support

         ham       1.00      0.96      0.98       986
        spam       0.77      1.00      0.87       129

    accuracy                           0.97      1115
   macro avg       0.88      0.98      0.92      1115
weighted avg       0.97      0.97      0.97      1115



In [71]:
piplinedPred = pipeline.predict(["this was all about Natural language processing"])

In [74]:
piplinedPred[0]

'ham'