In [56]:
import pandas as pd
import numpy as np
import nltk

In [57]:
raw_data = open('/Users/pranayrastogi/Downloads/smsspamcollection/SMSSpamCollection').read()

parsed_data = raw_data.replace('\t','\n').split('\n')
parsed_data[:10]

['ham',
 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 'ham',
 'Ok lar... Joking wif u oni...',
 'spam',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'ham',
 'U dun say so early hor... U c already then say...',
 'ham',
 "Nah I don't think he goes to usf, he lives around here though"]

The data can be downloaded from the link https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection . The data is unstructured and so the first task is to convert it into structured data.

In [58]:
label_list = parsed_data[0::2]
msg_list = parsed_data[1::2]
label_list[:5],msg_list[:5]
len(label_list),len(msg_list)

(5575, 5574)

In [59]:
label_list = label_list[:-1]
df = pd.DataFrame({'sms':msg_list,'label':label_list})
df.head()

Unnamed: 0,sms,label
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


## Data Exploration

In [60]:
df.shape

(5574, 2)

In [61]:
df.describe()

Unnamed: 0,sms,label
count,5574,5574
unique,5171,2
top,"Sorry, I'll call later",ham
freq,30,4827


In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5574 entries, 0 to 5573
Data columns (total 2 columns):
sms      5574 non-null object
label    5574 non-null object
dtypes: object(2)
memory usage: 87.2+ KB


In [63]:
df['label'].value_counts(normalize=True)

ham     0.865985
spam    0.134015
Name: label, dtype: float64

In [64]:
df.isnull().sum()

sms      0
label    0
dtype: int64

## Text Preprocessing

We can pre-process the text data by removing the stop words,punctuations and then going for Normalization,Stemming and Lemmatization

In [68]:
import string
def remove_punctuation(txt):
    txt_nopunt = [c for c in txt if c not in string.punctuation]   
    return "".join(txt_nopunt)
    
#df['msg_clean'] = df['sms'].apply(lambda x:remove_punctuation(x))
#df.head()

In [67]:
import re
def tokenize(txt):
    tokens = re.split('\W+',txt)
    return tokens

#df['msg_clean_tokenized'] = df['msg_clean'].apply(lambda x:tokenize(x.lower()))
#df.head()

In [66]:
nltk.download('stopwords')
def remove_stopwords(txt):
    nostopwords = [word for word in txt if word not in nltk.corpus.stopwords.words('english')]
    return nostopwords

#df['msg_no_sw'] = df['msg_clean_tokenized'].apply(lambda x:remove_stopwords(x))
#df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pranayrastogi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [65]:
nltk.download('wordnet')
wn = nltk.WordNetLemmatizer()
def lemmatization(txt):
    text = [wn.lemmatize(word) for word in txt]
    return text

#df['msg_lemmatized'] = df['msg_no_sw'].apply(lambda x:lemmatization(x))
#df.head()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/pranayrastogi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [69]:
def clean_text(txt):
    txt = "".join([c.lower() for c in txt if c not in string.punctuation])
    tokens = re.split('\W+',txt)
    txt = [word for word in tokens if word not in nltk.corpus.stopwords.words('english')]
    txt = [wn.lemmatize(word) for word in txt]
    return txt

In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
tfidf = TfidfVectorizer(analyzer=clean_text)
X = tfidf.fit_transform(df['sms'])
X.shape

(5574, 8917)

In [71]:
from sklearn.naive_bayes import MultinomialNB
#spam_detect_model = MultinomialNB().fit(X,df['label'])

In [72]:
from sklearn.model_selection import train_test_split
msg_train,msg_test,label_train,label_test = train_test_split(df['sms'],df['label'],test_size=0.2)

In [76]:
from sklearn.pipeline import Pipeline   #( 'bow',CountVectorizer(analyzer=clean_text)),
pipeline = Pipeline([
    ('tfidf',TfidfVectorizer(analyzer=clean_text)),
    ('classifier',MultinomialNB()),
])

In [77]:
msg_train.head()

2632    URGENT! Your mobile No 077xxx WON a £2,000 Bon...
5041    Natalie (20/F) is inviting you to be her frien...
542     A famous quote : when you develop the ability ...
1862    Aiyah then i wait lor. Then u entertain me. He...
3044           Your bill at 3 is £33.65 so thats not bad!
Name: sms, dtype: object

In [78]:
pipeline.fit(msg_train,label_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer=<function clean_text at 0x1a15c5b1e0>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('classifier',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [79]:
from sklearn.metrics import classification_report
predictions = pipeline.predict(msg_test)
print(classification_report(predictions,label_test))

              precision    recall  f1-score   support

         ham       1.00      0.96      0.98      1011
        spam       0.72      1.00      0.84       104

    accuracy                           0.96      1115
   macro avg       0.86      0.98      0.91      1115
weighted avg       0.97      0.96      0.97      1115



In [81]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(label_test,predictions))

[[970   0]
 [ 41 104]]
