In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import csv
from textblob import TextBlob
import pandas
import sklearn
from six.moves import cPickle as pickle
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold, cross_val_score, train_test_split 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.learning_curve import learning_curve



In [2]:
# Read the raw corpus, one SMS per line, stripping trailing whitespace.
# The `with` block guarantees the file handle is closed once the list is built.
with open('./SMSSpamCollection') as sms_file:
    messages = [line.rstrip() for line in sms_file]
print(len(messages))

5574


In [3]:
# Show the first ten raw lines together with their position in the list.
preview = messages[:10]
for message_no, message in zip(range(len(preview)), preview):
    print(message_no, message)

0 ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
1 ham	Ok lar... Joking wif u oni...
2 spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3 ham	U dun say so early hor... U c already then say...
4 ham	Nah I don't think he goes to usf, he lives around here though
5 spam	FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv
6 ham	Even my brother is not like to speak with me. They treat me like aids patent.
7 ham	As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
8 spam	WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 

In [4]:
# Re-load the corpus as a two-column table. The file is tab-separated and
# quote characters are treated as ordinary text (QUOTE_NONE).
column_names = ["label", "message"]
messages = pandas.read_csv(
    './SMSSpamCollection',
    sep='\t',
    quoting=csv.QUOTE_NONE,
    names=column_names,
)
print(messages)

     label                                            message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
5     spam  FreeMsg Hey there darling it's been 3 week's n...
6      ham  Even my brother is not like to speak with me. ...
7      ham  As per your request 'Melle Melle (Oru Minnamin...
8     spam  WINNER!! As a valued network customer you have...
9     spam  Had your mobile 11 months or more? U R entitle...
10     ham  I'm gonna be home soon and i don't want to tal...
11    spam  SIX chances to win CASH! From 100 to 20,000 po...
12    spam  URGENT! You have won a 1 week FREE membership ...
13     ham  I've been searching for the right words to tha...
14     ham                I HAVE A DATE ON SUNDAY WITH WILL!!
15    sp

In [5]:
# Add a column holding the character count of every message.
messages['length'] = messages['message'].map(len)
print(messages.head())

  label                                            message  length
0   ham  Go until jurong point, crazy.. Available only ...     111
1   ham                      Ok lar... Joking wif u oni...      29
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...     155
3   ham  U dun say so early hor... U c already then say...      49
4   ham  Nah I don't think he goes to usf, he lives aro...      61


In [6]:
# TODO: handle the str/unicode differences between Python 2 and Python 3
# Data preprocessing
def split_into_tokens(message):
    """Split a message into its word tokens using TextBlob.

    The message is passed to TextBlob unchanged. Under Python 2 a byte
    string would first need decoding, e.g. ``unicode(message, 'utf8')``.

    Returns the ``words`` attribute of the resulting TextBlob.
    """
    blob = TextBlob(message)
    return blob.words
# This is not the last expression of the cell, so its result is not displayed.
messages.message.head().apply(split_into_tokens)

def split_into_lemmas(message):
    """Lower-case a message and return the lemma of each of its words.

    The message is lower-cased, tokenized with TextBlob, and every token is
    replaced by its ``lemma`` (its "base form"). Under Python 2 a byte string
    would first need decoding, e.g. ``unicode(message, 'utf8')``.

    Returns a plain list with one lemma per token.
    """
    lowered = message.lower()
    lemmas = []
    for word in TextBlob(lowered).words:
        lemmas.append(word.lemma)
    return lemmas

# The value of this expression is discarded: only the last expression of a
# cell is displayed.
TextBlob("Hello world, how is it going?").tags

# Displayed result: lemmas of the first five messages.
messages.message.head().apply(split_into_lemmas)

0    [go, until, jurong, point, crazy, available, o...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, in, 2, a, wkly, comp, to, win, f...
3    [u, dun, say, so, early, hor, u, c, already, t...
4    [nah, i, do, n't, think, he, go, to, usf, he, ...
Name: message, dtype: object

In [7]:
# Learn the bag-of-words vocabulary over all messages, using the lemmatizer
# defined above as the analyzer, then report the vocabulary size.
bow_transformer = CountVectorizer(analyzer=split_into_lemmas)
bow_transformer.fit(messages['message'])
print(len(bow_transformer.vocabulary_))

8859


In [8]:
# Pick the message stored under index label 3 (the fourth row) as a worked example.
message4 = messages.loc[3, 'message']
print(message4)

U dun say so early hor... U c already then say...


In [9]:
# The feature indices printed here differ from those in the reference.
bow4 = bow_transformer.transform([message4])
# Print the sparse count vector first, then its shape.
for shown in (bow4, bow4.shape):
    print(shown)

  (0, 1158)	1
  (0, 1897)	1
  (0, 2893)	1
  (0, 2923)	1
  (0, 4017)	1
  (0, 6726)	2
  (0, 7101)	1
  (0, 7687)	1
  (0, 8002)	2
(1, 8859)


In [10]:
# Map two of the feature indices printed above back to their vocabulary terms.
# The feature-name list is built once and reused for both lookups.
feature_names = bow_transformer.get_feature_names()
for feature_idx in (6726, 8002):
    print(feature_names[feature_idx])

say
u


In [11]:
# Vectorize the whole corpus into a sparse document-term count matrix.
messages_bow = bow_transformer.transform(messages['message'])
n_docs, n_terms = messages_bow.shape
nonzero_count = messages_bow.nnz
print('sparse matrix shape:', messages_bow.shape)
print('number of non-zeros:', nonzero_count)
# The figure labelled "sparsity" is the percentage of cells that are non-zero.
print('sparsity: %.2f%%' % (100.0 * nonzero_count / (n_docs * n_terms)))

sparse matrix shape: (5574, 8859)
number of non-zeros: 80338
sparsity: 0.16%


In [12]:
# Fit the TF-IDF weighting on the corpus counts, then apply it to the single
# example vector to see its weighted form.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(messages_bow)
tfidf4 = tfidf_transformer.transform(bow4)
print(tfidf4)

  (0, 8002)	0.305003902449
  (0, 7687)	0.225308304351
  (0, 7101)	0.191397477878
  (0, 6726)	0.523390707413
  (0, 4017)	0.456371992578
  (0, 2923)	0.329688073969
  (0, 2893)	0.303704626273
  (0, 1897)	0.246652416565
  (0, 1158)	0.274944401638


In [13]:
# Print the learned inverse-document-frequency weight of two vocabulary terms.
for term in ('u', 'university'):
    term_index = bow_transformer.vocabulary_[term]
    print(tfidf_transformer.idf_[term_index])

2.84954060396
8.23975323521


In [14]:
# Apply the fitted TF-IDF weighting to the full count matrix and report its shape.
messages_tfidf = tfidf_transformer.transform(messages_bow)
tfidf_shape = messages_tfidf.shape
print(tfidf_shape)

(5574, 8859)


In [15]:
# Hold out 20% of the messages for testing. A fixed random_state makes the
# split, and therefore every score reported below, reproducible across kernel
# restarts; without it each run evaluates on a different test set.
msg_train, msg_test, label_train, label_test = \
    train_test_split(messages['message'], messages['label'],
                     test_size=0.2, random_state=42)


In [16]:
# Text-classification pipeline: lemma counts -> TF-IDF weights -> SVM.
svm_steps = [
    ('bow', CountVectorizer(analyzer=split_into_lemmas)),
    ('tfidf', TfidfTransformer()),
    ('classifier', SVC()),  # swap the final estimator here to try another model
]
pipeline_svm = Pipeline(svm_steps)

# Hyper-parameter grid explored by the search: the same C values for both
# kernels, plus two gamma values for the RBF kernel.
c_values = [1, 10, 100, 1000]
param_svm = [
    {'classifier__C': list(c_values), 'classifier__kernel': ['linear']},
    {'classifier__C': list(c_values), 'classifier__gamma': [0.001, 0.0001], 'classifier__kernel': ['rbf']},
]

# 10-fold stratified cross validation over the training labels.
train_folds = StratifiedKFold(label_train, n_folds=10)

grid_svm = GridSearchCV(
    pipeline_svm,
    param_grid=param_svm,
    refit=True,          # refit on all data using the best parameter combination
    n_jobs=-1,           # -1 means "use all cores"
    scoring='accuracy',  # the score being optimized
    cv=train_folds,
)

In [17]:
%time svm_detector = grid_svm.fit(msg_train, label_train) # find the best combination from param_svm
print (svm_detector.grid_scores_)
print (svm_detector.predict(["Hi mom, how are you?"])[0])
print (svm_detector.predict(["WINNER! Credit for free!"])[0])


predictions = svm_detector.predict(msg_test)
print (predictions)
print(type(predictions))


print (confusion_matrix(label_test, predictions))
print (classification_report(label_test, predictions))
#for i in label_test:
 #   print(i[1])
#print(label_test)

label_test_value=label_test.values
label_test_index=label_test.index
#print(msg_train.shape)
#print(label_train.shape)
#print(label_test.as_matrix)
for i in range(len(predictions)):
    if (predictions[i] != label_test_value[i]):
        print(messages['message'][i])
        print(predictions[i])
        print(label_test_value[i])
        indexResult= str(label_test_index[i])
        #print(indexResult)
        #print(type(indexResult))
        mess= messages['message'][i]
        #print(mess)
        #print(type(mess))
        s1=pandas.Series([mess],index=[indexResult])
        msg_train=msg_train.append(s1)
        label_train=label_train.append(pandas.Series(label_test_value[i], index=[indexResult]))
        print(msg_train.shape)
        print(label_train.shape)
print (confusion_matrix(label_test, predictions))
print (classification_report(label_test, predictions))

CPU times: user 4.16 s, sys: 196 ms, total: 4.35 s
Wall time: 3min 23s
[mean: 0.98610, std: 0.00423, params: {'classifier__C': 1, 'classifier__kernel': 'linear'}, mean: 0.98610, std: 0.00386, params: {'classifier__C': 10, 'classifier__kernel': 'linear'}, mean: 0.98632, std: 0.00419, params: {'classifier__C': 100, 'classifier__kernel': 'linear'}, mean: 0.98632, std: 0.00419, params: {'classifier__C': 1000, 'classifier__kernel': 'linear'}, mean: 0.86679, std: 0.00083, params: {'classifier__C': 1, 'classifier__gamma': 0.001, 'classifier__kernel': 'rbf'}, mean: 0.86679, std: 0.00083, params: {'classifier__C': 1, 'classifier__gamma': 0.0001, 'classifier__kernel': 'rbf'}, mean: 0.86679, std: 0.00083, params: {'classifier__C': 10, 'classifier__gamma': 0.001, 'classifier__kernel': 'rbf'}, mean: 0.86679, std: 0.00083, params: {'classifier__C': 10, 'classifier__gamma': 0.0001, 'classifier__kernel': 'rbf'}, mean: 0.97062, std: 0.00588, params: {'classifier__C': 100, 'classifier__gamma': 0.001, 'c

In [18]:
# Rebuild the grid search: the stratified folds are constructed from
# label_train, which was extended by the preceding evaluation cell.
augmented_folds = StratifiedKFold(label_train, n_folds=10)

grid_svm = GridSearchCV(
    pipeline_svm,
    param_grid=param_svm,
    refit=True,          # refit on all data using the best parameter combination
    n_jobs=-1,           # -1 means "use all cores"
    scoring='accuracy',  # the score being optimized
    cv=augmented_folds,
)

In [19]:
%time svm_detector = grid_svm.fit(msg_train, label_train) # find the best combination from param_svm
print (svm_detector.grid_scores_)

CPU times: user 4.24 s, sys: 140 ms, total: 4.38 s
Wall time: 3min 21s
[mean: 0.98457, std: 0.00545, params: {'classifier__C': 1, 'classifier__kernel': 'linear'}, mean: 0.98300, std: 0.00503, params: {'classifier__C': 10, 'classifier__kernel': 'linear'}, mean: 0.98323, std: 0.00462, params: {'classifier__C': 100, 'classifier__kernel': 'linear'}, mean: 0.98323, std: 0.00462, params: {'classifier__C': 1000, 'classifier__kernel': 'linear'}, mean: 0.86491, std: 0.00088, params: {'classifier__C': 1, 'classifier__gamma': 0.001, 'classifier__kernel': 'rbf'}, mean: 0.86491, std: 0.00088, params: {'classifier__C': 1, 'classifier__gamma': 0.0001, 'classifier__kernel': 'rbf'}, mean: 0.86491, std: 0.00088, params: {'classifier__C': 10, 'classifier__gamma': 0.001, 'classifier__kernel': 'rbf'}, mean: 0.86491, std: 0.00088, params: {'classifier__C': 10, 'classifier__gamma': 0.0001, 'classifier__kernel': 'rbf'}, mean: 0.96913, std: 0.00721, params: {'classifier__C': 100, 'classifier__gamma': 0.001, 'c

In [20]:
# Spot-check the refitted detector on two hand-written messages.
print(svm_detector.predict(["Hi mom, how are you?"])[0])
print(svm_detector.predict(["WINNER! Credit for free!"])[0])

# Predict the test set once and reuse the result for both reports.
# NOTE(review): the training data was extended in an earlier cell with rows
# selected from test-set errors, so these numbers are not a clean held-out
# estimate.
test_predictions = svm_detector.predict(msg_test)
print(confusion_matrix(label_test, test_predictions))
print(classification_report(label_test, test_predictions))

# To persist the trained detector, uncomment the lines below. They are left
# disabled so that running the notebook does not write to disk. The pickle
# module is bound to the name `pickle` by the import cell.
#
# with open('sms_spam_detector.pkl', 'wb') as fout:
#     pickle.dump(svm_detector, fout)
#
# ...and load it back, whenever needed, possibly on a different machine.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
#
# with open('sms_spam_detector.pkl', 'rb') as fin:
#     svm_detector_reloaded = pickle.load(fin)
# print('before:', svm_detector.predict([message4])[0])
# print('after:', svm_detector_reloaded.predict([message4])[0])

ham
spam
[[961   1]
 [ 13 140]]
             precision    recall  f1-score   support

        ham       0.99      1.00      0.99       962
       spam       0.99      0.92      0.95       153

avg / total       0.99      0.99      0.99      1115



"# store the spam detector to disk after training\nwith open('sms_spam_detector.pkl', 'wb') as fout:\n    cPickle.dump(svm_detector, fout)\n\n# ...and load it back, whenever needed, possibly on a different machine\nsvm_detector_reloaded = cPickle.load(open('sms_spam_detector.pkl'))\nprint ('before:', svm_detector.predict([message4])[0])\nprint ('after:', svm_detector_reloaded.predict([message4])[0])"