In [1]:
import pandas as pd
import string
import nltk

In [2]:
from nltk.corpus import stopwords

In [3]:
#Read the SpamCollection file
df = pd.read_csv('SpamCollection',sep='\t',names=['Response','Message'])

In [4]:
df

Unnamed: 0,Response,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
#Display first 5 Records
df.head()

Unnamed: 0,Response,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Group by to find the count of spam and ham messages
df.groupby('Response').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Response,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [7]:
# Find the length of messages
df['length'] = df['Message'].apply(len)

In [8]:
df['length']

0       111
1        29
2       155
3        49
4        61
       ... 
5567    160
5568     36
5569     57
5570    125
5571     26
Name: length, Length: 5572, dtype: int64

In [9]:
# STEPS:
# 1. Remove punctuations and stopwords. Split the sentence into words. This process is called tokenization. 
# 2. Apply CountVectorizer and Transform. This converts the words to a integer or float. This process is called as Feature Extraction.
# 3. Apply TF/IDF transform - Term Frequency and Inverse Document Frequency 
# 4. Split data into train and test
# 5. Using Naive Bayes Classification, first train the model with train data.
# 6. Test the model and get the prediction.
# 7. Compare prediction vs Actual and get the confusion matrix.
# 8. Get the accuracy score for the model.

In [10]:
# function to remove punctuations and stopwords
def message_text_process(message):   
    no_punct = [char for char in message if char not in string.punctuation]
    no_punct = ''.join(no_punct)    
    return [word for word in no_punct.split() if word.lower() not in stopwords.words('english')]           
                

In [11]:
# test the above function to see if its working
df['Message'].head(5).apply(message_text_process)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: Message, dtype: object

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
# Apply CountVectorizer - Convert a collection of text documents to a matrix of token counts.
vectorization = CountVectorizer(analyzer = message_text_process )


In [14]:
# TRIAL ---- try out the Count Vectorizer and Transform for only 1 record to see the output.
# try out the count vectorizer for the first record alone
bag_of_words_transformer_try = vectorization.fit(df['Message'].head(1))
# print out the bag_of_words_transformer
print(bag_of_words_transformer_try.vocabulary_)
# transform the first record alone
message_try = bag_of_words_transformer_try.transform(df['Message'].head(1))
print(message_try)


{'Go': 2, 'jurong': 10, 'point': 13, 'crazy': 6, 'Available': 0, 'bugis': 5, 'n': 12, 'great': 9, 'world': 15, 'la': 11, 'e': 7, 'buffet': 4, 'Cine': 1, 'got': 8, 'amore': 3, 'wat': 14}
  (0, 0)	1
  (0, 1)	1
  (0, 2)	1
  (0, 3)	1
  (0, 4)	1
  (0, 5)	1
  (0, 6)	1
  (0, 7)	1
  (0, 8)	1
  (0, 9)	1
  (0, 10)	1
  (0, 11)	1
  (0, 12)	1
  (0, 13)	1
  (0, 14)	1
  (0, 15)	1


In [15]:
#Apply CountVectorizer to the entire df
bag_of_words_transformer = vectorization.fit(df['Message'])
print(len(bag_of_words_transformer.vocabulary_))
message = bag_of_words_transformer.transform(df['Message'])
print(message)

11425
  (0, 1110)	1
  (0, 1483)	1
  (0, 2060)	1
  (0, 4653)	1
  (0, 5217)	1
  (0, 5218)	1
  (0, 5769)	1
  (0, 6217)	1
  (0, 6906)	1
  (0, 6937)	1
  (0, 7555)	1
  (0, 7668)	1
  (0, 8336)	1
  (0, 8917)	1
  (0, 10965)	1
  (0, 11163)	1
  (1, 2451)	1
  (1, 3064)	1
  (1, 7701)	1
  (1, 8590)	1
  (1, 10698)	1
  (1, 11072)	1
  (2, 73)	1
  (2, 423)	1
  (2, 430)	1
  :	:
  (5568, 6691)	1
  (5568, 6882)	1
  (5568, 7159)	1
  (5568, 11418)	1
  (5569, 3228)	1
  (5569, 3721)	1
  (5569, 8252)	1
  (5569, 10199)	1
  (5570, 4508)	1
  (5570, 5055)	1
  (5570, 5251)	1
  (5570, 6282)	1
  (5570, 6699)	1
  (5570, 6799)	1
  (5570, 6984)	1
  (5570, 7287)	1
  (5570, 7394)	1
  (5570, 7800)	1
  (5570, 8420)	1
  (5570, 9915)	1
  (5570, 10787)	1
  (5570, 11006)	1
  (5571, 3431)	1
  (5571, 8348)	1
  (5571, 10648)	1


In [16]:
# Apply Tf/IDF
#TF/IDF transform - eg
#Consider a document containing 100 words wherein the word cat appears 3 times. 
#The term frequency (i.e., tf) for cat is then (3 / 100) = 0.03. 
#Now, assume we have 10 million documents and the word cat appears in one thousand of these. 
#Then, the inverse document frequency (i.e., idf) is calculated as log(10,000,000 / 1,000) = 4. 
#Thus, the Tf-idf weight is the product of these quantities: 0.03 * 4 = 0.12.
from sklearn.feature_extraction.text import TfidfTransformer

In [17]:
#try out the tfidf for only 1 record
tfidf_transformer_try = TfidfTransformer().fit(message_try)
message_tfidf_try = tfidf_transformer_try.transform(message_try)
print(message_tfidf_try.data)


[0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25
 0.25 0.25]


In [18]:
# Apply Tf/IDF for the whole df
tfidf_transformer = TfidfTransformer().fit(message)
message_tfidf = tfidf_transformer.transform(message)
message_tfidf.shape
message_tfidf.data


array([0.23026686, 0.19073429, 0.24704652, ..., 0.53921812, 0.48542915,
       0.68818773])

In [19]:
# Split data into train(70%) and test(30%)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(message_tfidf, df['Response'], test_size=0.30, random_state = 50)    


In [20]:
y_train

2696     ham
1659    spam
4829     ham
5319     ham
1394     ham
        ... 
3330     ham
70       ham
132      ham
2014    spam
1931     ham
Name: Response, Length: 3900, dtype: object

In [21]:
# Use Naive Bayes to detect spam
# Train the model first using train data
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train,y_train)

In [22]:
# Test the model using test data. Make predictions
predictions = spam_detect_model.predict(X_test)

In [23]:
print('predicted', predictions)

predicted ['spam' 'ham' 'ham' ... 'ham' 'ham' 'ham']


In [24]:
print('actual', y_test)

actual 3409    spam
2103     ham
2665     ham
3239     ham
1205    spam
        ... 
1302     ham
69       ham
4928     ham
2944     ham
629     spam
Name: Response, Length: 1672, dtype: object


In [25]:
# Compare Predicted vs Actual using Confusion Matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,predictions))

[[1469    0]
 [  57  146]]


In [26]:
# Calculate accuracy score
from sklearn.metrics import accuracy_score

In [27]:
from sklearn.metrics import accuracy_score
print (accuracy_score(y_test, predictions))

0.9659090909090909
