In [2]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\smrithkala\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [4]:
df_spam = pd.read_csv('NB.csv', encoding = 'ISO-8859-1', names = ['response','message'])

In [5]:
df_spam.head()

Unnamed: 0,response,message
0,type,text
1,ham,Hope you are having a good week. Just checking in
2,ham,K..give back my thanks.
3,ham,Am also doing in cbe only. But have to pay.
4,spam,"complimentary 4 STAR Ibiza Holiday or å£10,000..."


In [6]:
df_spam.describe()

Unnamed: 0,response,message
count,5560,5560
unique,3,5157
top,ham,"Sorry, I'll call later"
freq,4812,30


In [7]:
df_spam.groupby('response').describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
response,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4812,4503,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4
type,1,1,text,1


In [8]:
df_spam['length'] = df_spam['message'].apply(len)

In [9]:
df_spam.head()

Unnamed: 0,response,message,length
0,type,text,4
1,ham,Hope you are having a good week. Just checking in,49
2,ham,K..give back my thanks.,23
3,ham,Am also doing in cbe only. But have to pay.,43
4,spam,"complimentary 4 STAR Ibiza Holiday or å£10,000...",150


In [10]:
def message_text_process(mess):
    no_punctuation = [char for char in mess if char not in string.punctuation]
    no_punctuation = ''.join(no_punctuation)
    return [word for word in no_punctuation.split() if word.lower() not in stopwords.words('english')]

In [11]:
df_spam['message'].head(5).apply(message_text_process)

0                                               [text]
1                         [Hope, good, week, checking]
2                                [Kgive, back, thanks]
3                                     [also, cbe, pay]
4    [complimentary, 4, STAR, Ibiza, Holiday, å£100...
Name: message, dtype: object

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
bag_words = CountVectorizer(analyzer = message_text_process).fit(df_spam['message'])

In [20]:
bag_words

CountVectorizer(analyzer=<function message_text_process at 0x000002331EFC30D0>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None,
        stop_words=None, strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)

In [21]:
print(len(bag_words.vocabulary_))

11356


In [22]:
message_bagwords = bag_words.transform(df_spam['message'])

In [23]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer().fit(message_bagwords)

In [24]:
message_tfidf = tfidf_transformer.transform(message_bagwords)

In [25]:
print(message_tfidf.shape)

(5560, 11356)


In [26]:
from sklearn.naive_bayes import MultinomialNB

spam_detect = MultinomialNB().fit(message_tfidf,df_spam['response'])

In [27]:
predicted = spam_detect.predict(message_tfidf)

In [28]:
predicted

array(['ham', 'ham', 'ham', ..., 'spam', 'spam', 'ham'], dtype='<U4')

In [29]:
expected = df_spam['response']

In [31]:
from sklearn import metrics
print(metrics.classification_report(expected,predicted))
print(metrics.confusion_matrix(expected,predicted))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      4812
        spam       1.00      0.85      0.92       747
        type       0.00      0.00      0.00         1

   micro avg       0.98      0.98      0.98      5560
   macro avg       0.66      0.62      0.63      5560
weighted avg       0.98      0.98      0.98      5560

[[4812    0    0]
 [ 115  632    0]
 [   1    0    0]]
