In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import csv
from textblob import TextBlob
import pandas
import sklearn
from six.moves import cPickle as pickle
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold, cross_val_score, train_test_split 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.learning_curve import learning_curve



In [2]:
messages = [line.rstrip() for line in open('./SMSSpamCollection')]
print(len(messages))

5574


In [3]:
for message_no, message in enumerate(messages[:10]):
    print(message_no, message)

0 ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
1 ham	Ok lar... Joking wif u oni...
2 spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3 ham	U dun say so early hor... U c already then say...
4 ham	Nah I don't think he goes to usf, he lives around here though
5 spam	FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv
6 ham	Even my brother is not like to speak with me. They treat me like aids patent.
7 ham	As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
8 spam	WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 

In [4]:
messages = pandas.read_csv('./SMSSpamCollection', sep='\t', quoting=csv.QUOTE_NONE,
                           names=["label", "message"])
print(messages)

     label                                            message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
5     spam  FreeMsg Hey there darling it's been 3 week's n...
6      ham  Even my brother is not like to speak with me. ...
7      ham  As per your request 'Melle Melle (Oru Minnamin...
8     spam  WINNER!! As a valued network customer you have...
9     spam  Had your mobile 11 months or more? U R entitle...
10     ham  I'm gonna be home soon and i don't want to tal...
11    spam  SIX chances to win CASH! From 100 to 20,000 po...
12    spam  URGENT! You have won a 1 week FREE membership ...
13     ham  I've been searching for the right words to tha...
14     ham                I HAVE A DATE ON SUNDAY WITH WILL!!
15    sp

In [5]:
messages['length'] = messages['message'].map(lambda text: len(text))
print (messages.head())

  label                                            message  length
0   ham  Go until jurong point, crazy.. Available only ...     111
1   ham                      Ok lar... Joking wif u oni...      29
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...     155
3   ham  U dun say so early hor... U c already then say...      49
4   ham  Nah I don't think he goes to usf, he lives aro...      61


In [6]:
 ## TODO: unicode python3 vs python 2
#Data preprocessing
def split_into_tokens(message):
    #message = unicode(message, 'utf8')  # convert bytes into proper unicode
    return TextBlob(message).words
messages.message.head().apply(split_into_tokens)

def split_into_lemmas(message):
    #message = unicode(message, 'utf8').lower()
    words = TextBlob(message.lower() ).words
    # for each word, take its "base form" = lemma 
    return [word.lemma for word in words]

TextBlob("Hello world, how is it going?").tags

messages.message.head().apply(split_into_lemmas)

0    [go, until, jurong, point, crazy, available, o...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, in, 2, a, wkly, comp, to, win, f...
3    [u, dun, say, so, early, hor, u, c, already, t...
4    [nah, i, do, n't, think, he, go, to, usf, he, ...
Name: message, dtype: object

In [7]:
bow_transformer = CountVectorizer(analyzer=split_into_lemmas).fit(messages['message'])
print (len(bow_transformer.vocabulary_))

8859


In [8]:
message4 = messages['message'][3]
print (message4)

U dun say so early hor... U c already then say...


In [9]:
# result number is different with reference
bow4 = bow_transformer.transform([message4])
print ( bow4)
print (bow4.shape)

  (0, 1158)	1
  (0, 1897)	1
  (0, 2893)	1
  (0, 2923)	1
  (0, 4017)	1
  (0, 6726)	2
  (0, 7101)	1
  (0, 7687)	1
  (0, 8002)	2
(1, 8859)


In [10]:
print (bow_transformer.get_feature_names()[6726])
print (bow_transformer.get_feature_names()[8002])

say
u


In [11]:
messages_bow = bow_transformer.transform(messages['message'])
print ('sparse matrix shape:', messages_bow.shape)
print ('number of non-zeros:', messages_bow.nnz)
print ('sparsity: %.2f%%' % (100.0 * messages_bow.nnz / (messages_bow.shape[0] * messages_bow.shape[1])))

sparse matrix shape: (5574, 8859)
number of non-zeros: 80338
sparsity: 0.16%


In [12]:
tfidf_transformer = TfidfTransformer().fit(messages_bow)
tfidf4 = tfidf_transformer.transform(bow4)
print (tfidf4)

  (0, 8002)	0.305003902449
  (0, 7687)	0.225308304351
  (0, 7101)	0.191397477878
  (0, 6726)	0.523390707413
  (0, 4017)	0.456371992578
  (0, 2923)	0.329688073969
  (0, 2893)	0.303704626273
  (0, 1897)	0.246652416565
  (0, 1158)	0.274944401638


In [13]:
print (tfidf_transformer.idf_[bow_transformer.vocabulary_['u']])
print (tfidf_transformer.idf_[bow_transformer.vocabulary_['university']])

2.84954060396
8.23975323521


In [14]:
messages_tfidf = tfidf_transformer.transform(messages_bow)
print (messages_tfidf.shape)

(5574, 8859)


In [15]:
%time spam_detector = MultinomialNB().fit(messages_tfidf, messages['label'])

CPU times: user 20 ms, sys: 0 ns, total: 20 ms
Wall time: 15.6 ms


In [16]:
print ('predicted:', spam_detector.predict(tfidf4)[0])
print ('expected:', messages.label[3])

predicted: ham
expected: ham


In [17]:
all_predictions = spam_detector.predict(messages_tfidf)
print (all_predictions)

['ham' 'ham' 'spam' ..., 'ham' 'ham' 'ham']


In [18]:
print ('accuracy', accuracy_score(messages['label'], all_predictions))
print ('confusion matrix\n', confusion_matrix(messages['label'], all_predictions))
print ('(row=expected, col=predicted)')

accuracy 0.969501255831
confusion matrix
 [[4827    0]
 [ 170  577]]
(row=expected, col=predicted)


In [19]:
print (classification_report(messages['label'], all_predictions))

             precision    recall  f1-score   support

        ham       0.97      1.00      0.98      4827
       spam       1.00      0.77      0.87       747

avg / total       0.97      0.97      0.97      5574



In [20]:
msg_train, msg_test, label_train, label_test = \
    train_test_split(messages['message'], messages['label'], test_size=0.2)

print (len(msg_train), len(msg_test), len(msg_train) + len(msg_test))
print(msg_train.iloc[:3])
print(type(msg_train.iloc[1]))
print(label_train.iloc[:3])
print(type(label_train.iloc[1]))
print(msg_train.shape)
print(label_train.shape)

4459 1115 5574
614     THANX4 TODAY CER IT WAS NICE 2 CATCH UP BUT WE...
2098         Are you the cutest girl in the world or what
2323                              Should I be stalking u?
Name: message, dtype: object
<class 'str'>
614     ham
2098    ham
2323    ham
Name: label, dtype: object
<class 'str'>
(4459,)
(4459,)


In [21]:
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=split_into_lemmas)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [22]:
scores = cross_val_score(pipeline,  # steps to convert raw messages into models
                         msg_train,  # training data
                         label_train,  # training labels
                         cv=10,  # split data randomly into 10 parts: 9 for training, 1 for scoring
                         scoring='accuracy',  # which scoring metric?
                         n_jobs=-1,  # -1 = use all cores = faster
                         )
print (scores)

[ 0.93959732  0.94854586  0.94183445  0.94630872  0.94843049  0.96404494
  0.94831461  0.95730337  0.9505618   0.95730337]


In [23]:
#tune paramaters
params = {
    'tfidf__use_idf': (True, False),
    'bow__analyzer': (split_into_lemmas, split_into_tokens),
}

grid = GridSearchCV(
    pipeline,  # pipeline from above
    params,  # parameters to tune via cross validation
    refit=True,  # fit using all available data at the end, on the best found param combination
    n_jobs=-1,  # number of cores to use for parallelization; -1 for "all cores"
    scoring='accuracy',  # what score are we optimizing?
    cv=StratifiedKFold(label_train, n_folds=5),  # what type of cross validation to use
)

In [24]:
%time nb_detector = grid.fit(msg_train, label_train)
print (nb_detector.grid_scores_)

CPU times: user 2.26 s, sys: 56 ms, total: 2.32 s
Wall time: 26.7 s
[mean: 0.94573, std: 0.00710, params: {'bow__analyzer': <function split_into_lemmas at 0x7fe2b8597378>, 'tfidf__use_idf': True}, mean: 0.92980, std: 0.00658, params: {'bow__analyzer': <function split_into_lemmas at 0x7fe2b8597378>, 'tfidf__use_idf': False}, mean: 0.94595, std: 0.00455, params: {'bow__analyzer': <function split_into_tokens at 0x7fe279dc4e18>, 'tfidf__use_idf': True}, mean: 0.92801, std: 0.00432, params: {'bow__analyzer': <function split_into_tokens at 0x7fe279dc4e18>, 'tfidf__use_idf': False}]


In [25]:
print( nb_detector.predict(["Hi mom, how are you?"])[0])
print( nb_detector.predict(["WINNER! Credit for free!"])[0])

ham
ham


In [26]:
predictions = nb_detector.predict(msg_test)
print (predictions)
print(type(predictions))


print (confusion_matrix(label_test, predictions))
print (classification_report(label_test, predictions))
#for i in label_test:
 #   print(i[1])
#print(label_test)

label_test_value=label_test.values
label_test_index=label_test.index
#print(msg_train.shape)
#print(label_train.shape)
#print(label_test.as_matrix)
for i in range(len(predictions)):
    if (predictions[i] != label_test_value[i]):
        print(messages['message'][i])
        print(predictions[i])
        print(label_test_value[i])
        indexResult= str(label_test_index[i])
        #print(indexResult)
        #print(type(indexResult))
        mess= messages['message'][i]
        #print(mess)
        #print(type(mess))
        s1=pandas.Series([mess],index=[indexResult])
        msg_train=msg_train.append(s1)
        label_train=label_train.append(pandas.Series(label_test_value[i], index=[indexResult]))
        print(msg_train.shape)
        print(label_train.shape)
print (confusion_matrix(label_test, predictions))
print (classification_report(label_test, predictions))

['ham' 'ham' 'ham' ..., 'ham' 'spam' 'ham']
<class 'numpy.ndarray'>
[[952   0]
 [ 58 105]]
             precision    recall  f1-score   support

        ham       0.94      1.00      0.97       952
       spam       1.00      0.64      0.78       163

avg / total       0.95      0.95      0.94      1115

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
ham
spam
(4460,)
(4460,)
England v Macedonia - dont miss the goals/team news. Txt ur national team to 87077 eg ENGLAND to 87077 Try:WALES, SCOTLAND 4txt/ú1.20 POBOXox36504W45WQ 16+
ham
spam
(4461,)
(4461,)
I see the letter B on my car
ham
spam
(4462,)
(4462,)
No calls..messages..missed calls
ham
spam
(4463,)
(4463,)
SMS. ac Sptv: The New Jersey Devils and the Detroit Red Wings play Ice Hockey. Correct or Incorrect? End? Reply END SPTV
ham
spam
(4464,)
(4464,)
Dear, will call Tmorrow.pls accomodate.
ham
spam
(4465,)
(4465,)
Hi frnd, which is best way to avoid missunderstding w

In [27]:
scores = cross_val_score(pipeline,  # steps to convert raw messages into models
                         msg_train,  # training data
                         label_train,  # training labels
                         cv=10,  # split data randomly into 10 parts: 9 for training, 1 for scoring
                         scoring='accuracy',  # which scoring metric?
                         n_jobs=-1,  # -1 = use all cores = faster
                         )
print (scores)

grid = GridSearchCV(
    pipeline,  # pipeline from above
    params,  # parameters to tune via cross validation
    refit=True,  # fit using all available data at the end, on the best found param combination
    n_jobs=-1,  # number of cores to use for parallelization; -1 for "all cores"
    scoring='accuracy',  # what score are we optimizing?
    cv=StratifiedKFold(label_train, n_folds=5),  # what type of cross validation to use
)

%time nb_detector = grid.fit(msg_train, label_train)
print (nb_detector.grid_scores_)

print( nb_detector.predict(["Hi mom, how are you?"])[0])
print( nb_detector.predict(["WINNER! Credit for free!"])[0])

predictions = nb_detector.predict(msg_test)
print (predictions)
print(type(predictions))


print (confusion_matrix(label_test, predictions))
print (classification_report(label_test, predictions))

[ 0.93818985  0.94701987  0.94247788  0.95132743  0.95132743  0.95121951
  0.94678492  0.9556541   0.96008869  0.88248337]
CPU times: user 2.8 s, sys: 56 ms, total: 2.85 s
Wall time: 26.4 s
[mean: 0.93978, std: 0.01000, params: {'bow__analyzer': <function split_into_lemmas at 0x7fe2b8597378>, 'tfidf__use_idf': True}, mean: 0.92251, std: 0.00782, params: {'bow__analyzer': <function split_into_lemmas at 0x7fe2b8597378>, 'tfidf__use_idf': False}, mean: 0.93890, std: 0.00836, params: {'bow__analyzer': <function split_into_tokens at 0x7fe279dc4e18>, 'tfidf__use_idf': True}, mean: 0.91942, std: 0.00777, params: {'bow__analyzer': <function split_into_tokens at 0x7fe279dc4e18>, 'tfidf__use_idf': False}]
ham
spam
['spam' 'ham' 'ham' ..., 'ham' 'spam' 'ham']
<class 'numpy.ndarray'>
[[952   0]
 [ 53 110]]
             precision    recall  f1-score   support

        ham       0.95      1.00      0.97       952
       spam       1.00      0.67      0.81       163

avg / total       0.95      0.95 