In [14]:
#import required libraries
import pandas as pd
import string
from nltk.corpus import stopwords

In [15]:
#get the spamDataCollection using pandas
df_data_spam = pd.read_csv('SMSSpamCollection',sep = '\t', names = ['response','message'])

In [16]:
df_data_spam.head()

Unnamed: 0,response,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [17]:
df_data_spam.describe()

Unnamed: 0,response,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [18]:
df_data_spam.groupby('response').describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
response,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [19]:
#add length as column to the data collection
df_data_spam['length'] = df_data_spam['message'].apply(len)

In [20]:
df_data_spam.head()

Unnamed: 0,response,message,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [21]:
#define a function to remove the stopwords
def remove_stopwords(mess):
    #check for the punctuations
    no_punctuation = [char for char in mess if char not in string.punctuation]
    #form the sentence
    no_punctuation = ''.join(no_punctuation)
    #eliminate the stopwords
    return[word for word in no_punctuation.split() if word.lower() not in stopwords.words('english') ]

In [22]:
#verify the function
df_data_spam['message'].head(5).apply(remove_stopwords)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: message, dtype: object

In [23]:
#start text processing
from sklearn.feature_extraction.text import CountVectorizer

In [24]:
#use the bag of words by applying the function and fit the data(message) into it
bag_of_words = CountVectorizer(analyzer=remove_stopwords).fit(df_data_spam['message']);

In [25]:
print('length of Vocubulary',len(bag_of_words.vocabulary_))

length of Vocubulary 11425


In [26]:
#store the bag of words for messages using transform method
message_bag_words = bag_of_words.transform(df_data_spam['message'])

In [30]:
#apply tf-idf transformer and fit the bag of words into it
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(message_bag_words)

In [32]:
#print the shape of the tfidf
message_tfidf = tfidf_transformer.transform(message_bag_words)
print(message_tfidf.shape)

(5572, 11425)


In [40]:
#use naive Bayes model to detect the spam and fit the tfidf data into it
from sklearn.naive_bayes import MultinomialNB
spam_detection_model = MultinomialNB().fit(message_tfidf,df_data_spam['response'])

In [41]:
#use the model to predict the second message in the dataset is spam or not a spam
message_4 = df_data_spam['message'][4]
bag_message4 = bag_of_words.transform([message_4])
tfidf = tfidf_transformer.transform(bag_message4)

In [42]:
print("Predicted: ", spam_detection_model.predict(tfidf)[0])

Predicted:  ham


In [43]:
print ("Expected: ",df_data_spam.response[4])

Expected:  ham
