## Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')
import dill

## Load Data

In [2]:
# Read the tab-separated file (no header row) and expose its first two
# columns as a labelled frame: the class label and the message text.
data = pd.read_csv('data.csv', sep='\t', header=None)
messages = pd.DataFrame({'Class': data[0].values, 'Message': data[1].values})
messages.head()

Unnamed: 0,Class,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Number of messages per class, followed by the overall total
perClassCounts = messages.groupby('Class').count()
print(perClassCounts)
print("\nTotal Message", len(messages))

       Message
Class         
ham       4825
spam       747

Total Message 5572


## Data Preprocessing

In [4]:
# Split message into words
def SplitIntoWords(message):
    """Lower-case `message` and return the words TextBlob extracts from it."""
    return TextBlob(message.lower()).words

print(messages['Message'].head().apply(SplitIntoWords))

0    [go, until, jurong, point, crazy, available, o...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, in, 2, a, wkly, comp, to, win, f...
3    [u, dun, say, so, early, hor, u, c, already, t...
4    [nah, i, do, n't, think, he, goes, to, usf, he...
Name: Message, dtype: object


In [5]:
# Convert words to base form
def WordsIntoBaseForm(message):
    """Tokenize `message` with SplitIntoWords and return the lemma of each word."""
    lemmas = []
    for word in SplitIntoWords(message):
        lemmas.append(word.lemma)
    return lemmas

In [6]:
# Convert each message into a vector
# `stop_words` is intentionally not passed: CountVectorizer only applies it when
# analyzer == 'word', so with a callable analyzer it was silently ignored.
# Any stop-word filtering would have to happen inside WordsIntoBaseForm itself.
trainingVector = CountVectorizer(analyzer=WordsIntoBaseForm).fit(messages['Message'])

In [7]:
# Bag-of-words counts for the message stored at index label 9
tenthMessage = messages.loc[9, 'Message']
message10 = trainingVector.transform([tenthMessage])
print(message10)

  (0, 88)	1
  (0, 359)	1
  (0, 1914)	1
  (0, 1947)	1
  (0, 2208)	1
  (0, 2240)	1
  (0, 3039)	1
  (0, 3382)	1
  (0, 3433)	2
  (0, 3778)	1
  (0, 4645)	1
  (0, 5182)	3
  (0, 5215)	1
  (0, 5222)	1
  (0, 5643)	1
  (0, 5690)	1
  (0, 6301)	1
  (0, 7673)	2
  (0, 7801)	2
  (0, 8002)	1
  (0, 8099)	2
  (0, 8495)	1
  (0, 8747)	1


In [8]:
# Print message #10 (index label 9) for comparison with its vector
print(messages.loc[9, 'Message'])

Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030


In [9]:
# Identify repeated words
# Build the vocabulary list once instead of once per lookup.
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`, so use whichever the installed version provides.
if hasattr(trainingVector, 'get_feature_names_out'):
    featureNames = trainingVector.get_feature_names_out()
else:
    featureNames = trainingVector.get_feature_names()
print ('First word that appears twice:',featureNames[3433])
print ('Word that appears three times:',featureNames[5182])

First word that appears twice: free
Word that appears three times: mobile


In [10]:
# Bag of words for whole training data
messagesBagOfWords = trainingVector.transform(messages["Message"])
# Weight of words in the entire training data.
# The fitted instance gets its own name: rebinding `TfidfTransformer` to an
# instance shadowed the imported class, so running this cell a second time
# failed with "TypeError: 'TfidfTransformer' object is not callable".
tfidfTransformer = TfidfTransformer()
messagesTfidf = tfidfTransformer.fit_transform(messagesBagOfWords)

In [11]:
# Vectorize a one-word message with the fitted vocabulary to inspect the
# resulting sparse row.
example = ["england"]
trainingVector.transform(example)

<1x8859 sparse matrix of type '<class 'numpy.int64'>'
	with 1 stored elements in Compressed Sparse Row format>

## Split Data into train and test

In [12]:
# Fix the seed so the same rows land in train and test on every run, and
# stratify on the label because the classes are imbalanced (far more ham than
# spam), which keeps the spam share equal in both splits.
# NOTE(review): the TF-IDF weights were fitted on all messages before this
# split, so IDF statistics from the test rows leak into training — confirm
# whether that is acceptable for this comparison.
X_train, X_test, y_train, y_test = train_test_split(
    messagesTfidf,
    messages['Class'].values,
    random_state=42,
    stratify=messages['Class'].values,
)

In [13]:
# (rows, feature columns) of the training split
X_train.shape

(4179, 8859)

## Train the model

In [14]:
# Collects one [name, accuracy] pair per classifier cell; re-running a
# classifier cell appends another entry, so re-run this cell first to reset.
classifiers=[]

In [15]:
# Using GaussianNB
from sklearn.naive_bayes import GaussianNB

# GaussianNB needs dense input, hence the .toarray() conversions.
spamDetector = GaussianNB().fit(X_train.toarray(),y_train)
y_pred = spamDetector.predict(X_test.toarray())
# accuracy_score's documented argument order is (y_true, y_pred)
accuracy = accuracy_score(y_test,y_pred)
print("Accuracy is ",accuracy)
classifiers.append(['GaussianNB',accuracy])

Accuracy is  0.89231873654


In [16]:
# Using MultinomialNB
from sklearn.naive_bayes import MultinomialNB

spamDetector = MultinomialNB().fit(X_train,y_train)
y_pred = spamDetector.predict(X_test)
# accuracy_score's documented argument order is (y_true, y_pred)
accuracy = accuracy_score(y_test,y_pred)
print("Accuracy is ",accuracy)
classifiers.append(['MultinomialNB',accuracy])

Accuracy is  0.953338119167


In [17]:
# Using BernoulliNB
from sklearn.naive_bayes import BernoulliNB

spamDetector = BernoulliNB().fit(X_train,y_train)
y_pred = spamDetector.predict(X_test)
# accuracy_score's documented argument order is (y_true, y_pred)
accuracy = accuracy_score(y_test,y_pred)
print("Accuracy is ",accuracy)
classifiers.append(['BernoulliNB',accuracy])

Accuracy is  0.981335247667


In [18]:
# Using Random Forest
from sklearn.ensemble import RandomForestClassifier

# The forest is randomized; seed it so the reported accuracy is reproducible.
spamDetector = RandomForestClassifier(random_state=42).fit(X_train,y_train)
y_pred = spamDetector.predict(X_test)
# accuracy_score's documented argument order is (y_true, y_pred)
accuracy = accuracy_score(y_test,y_pred)
print("Accuracy is ",accuracy)
classifiers.append(['RandomForestClassifier',accuracy])

Accuracy is  0.969849246231


In [19]:
# Using SVC
from sklearn.svm import SVC

# NOTE(review): the recorded accuracy for this model (~0.87) is close to the
# share of ham in the data (4825 / 5572), so the default kernel may be
# predicting nearly everything as ham — confirm with a confusion matrix.
spamDetector = SVC().fit(X_train,y_train)
y_pred = spamDetector.predict(X_test)
# accuracy_score's documented argument order is (y_true, y_pred)
accuracy = accuracy_score(y_test,y_pred)
print("Accuracy is ",accuracy)
classifiers.append(['SVC',accuracy])

Accuracy is  0.870782483848


In [20]:
# Using LinearSVC
from sklearn.svm import LinearSVC

# Seeded so the solver's internal shuffling does not change the result between runs.
spamDetector = LinearSVC(random_state=42).fit(X_train,y_train)
y_pred = spamDetector.predict(X_test)
# accuracy_score's documented argument order is (y_true, y_pred)
accuracy = accuracy_score(y_test,y_pred)
print("Accuracy is ",accuracy)
classifiers.append(['LinearSVC',accuracy])

Accuracy is  0.981335247667


In [21]:
# Using SGDClassifier
from sklearn.linear_model import SGDClassifier

# SGD shuffles the training data each epoch; seed it so the reported accuracy
# is reproducible.
spamDetector = SGDClassifier(random_state=42).fit(X_train,y_train)
y_pred = spamDetector.predict(X_test)
# accuracy_score's documented argument order is (y_true, y_pred)
accuracy = accuracy_score(y_test,y_pred)
print("Accuracy is ",accuracy)
classifiers.append(['SGDClassifier',accuracy])

Accuracy is  0.982053122757


In [22]:
# Show the collected [name, accuracy] pairs
classifiers

[['GaussianNB', 0.89231873653984206],
 ['MultinomialNB', 0.95333811916726485],
 ['BernoulliNB', 0.98133524766690594],
 ['RandomForestClassifier', 0.96984924623115576],
 ['SVC', 0.87078248384781043],
 ['LinearSVC', 0.98133524766690594],
 ['SGDClassifier', 0.98205312275664036]]

In [23]:
import matplotlib.pyplot as plt

# Horizontal bar chart comparing the accuracy (in percent) of every classifier.
objects = [name for name, _ in classifiers]
y_pos = np.arange(len(objects))
# Scale to percent *before* rounding: multiplying an already rounded float by
# 100 can reintroduce float noise (e.g. 0.57 * 100 == 56.99999999999999), which
# then shows up in the bar labels.
performance = [round(score * 100, 2) for _, score in classifiers]

plt.barh(y_pos, performance, align='center', alpha=0.8)
for i, v in enumerate(performance):
    # Draw the value inside the bar, 15 x-units left of the bar's end.
    plt.text(v-15 , i-0.1 , "{:.2f}%".format(v), color='white',fontweight='bold')
plt.yticks(y_pos, objects)
plt.xlabel('Accuracy (%)')
plt.title('Performance')
plt.show()

<matplotlib.figure.Figure at 0x7fb1168d0438>

## Save the best model

In [29]:
import pickle

# Refit the best-scoring classifier (SGDClassifier) and persist it together
# with the fitted count vectorizer.
# NOTE(review): the classifier is fitted on TF-IDF features, but only the count
# vectorizer is persisted below; whoever loads these artefacts also needs the
# fitted TF-IDF transformer — confirm how the consumer handles this.
spamDetector = SGDClassifier(random_state=42).fit(X_train,y_train)  # seeded for a reproducible artefact
y_pred = spamDetector.predict(X_test)
# accuracy_score's documented argument order is (y_true, y_pred)
accuracy = accuracy_score(y_test,y_pred)
print("Accuracy is ",accuracy)

# Dump the model; the context manager closes the file even if pickling fails
with open('model.sav','wb') as model_file:
    pickle.dump(spamDetector,model_file)

# Dump the trainingVector with dill (imported at the top of the notebook) —
# presumably because its analyzer is a notebook-defined callable that plain
# pickle cannot serialize by value.
with open("trainingVector", "wb") as dill_file:
    dill.dump(trainingVector, dill_file)

Accuracy is  0.983488872936


In [None]:
example = ['England v Macedonia - dont miss the goals/team news. Txt ENGLAND to 99999']
# The classifier was trained on TF-IDF weighted counts, so the example has to go
# through the same two steps (counts -> TF-IDF) before prediction; feeding raw
# counts hands the model features on a different scale than it was fitted on.
# The IDF weights are re-fitted on the same bag-of-words matrix used for
# training (the fit is deterministic, so they match the training weights).
# The class is imported under an alias because the name `TfidfTransformer` may
# be bound to a fitted instance rather than the class at this point.
from sklearn.feature_extraction.text import TfidfTransformer as TfidfWeighting
exampleCounts = trainingVector.transform(example)
exampleTfidf = TfidfWeighting().fit(messagesBagOfWords).transform(exampleCounts)
# Result
checkResult = spamDetector.predict(exampleTfidf)
print ('The message [',example[0],'] has been classified as', checkResult)