In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import csv
from textblob import TextBlob
import pandas
import sklearn
import cPickle
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold, cross_val_score, train_test_split 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.learning_curve import learning_curve

In [2]:
messages = pandas.read_csv('./data/SMSSpamCollection', sep='\t', quoting=csv.QUOTE_NONE,
                           names=["label", "message"])
#print messages

In [3]:
messages.groupby('label').describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,message
label,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,count,4827
ham,unique,4518
ham,top,"Sorry, I'll call later"
ham,freq,30
spam,count,747
spam,unique,653
spam,top,Please call our customer service representativ...
spam,freq,4


In [4]:
messages['length'] = messages['message'].map(lambda text: len(text))
print messages.head()

  label                                            message  length
0   ham  Go until jurong point, crazy.. Available only ...     111
1   ham                      Ok lar... Joking wif u oni...      29
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...     155
3   ham  U dun say so early hor... U c already then say...      49
4   ham  Nah I don't think he goes to usf, he lives aro...      61


In [None]:
messages.hist(column='length', by='label', bins=50)
plt.axis([0, 50, 0, 180], [0, 50, 0, 180])

In [5]:
def split_into_tokens(message):
    message = unicode(message, 'utf8')  # convert bytes into proper unicode
    return TextBlob(message).words

In [6]:
messages.message.head().apply(split_into_tokens)

0    [Go, until, jurong, point, crazy, Available, o...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, in, 2, a, wkly, comp, to, win, F...
3    [U, dun, say, so, early, hor, U, c, already, t...
4    [Nah, I, do, n't, think, he, goes, to, usf, he...
Name: message, dtype: object

In [7]:
def split_into_lemmas(message):
    message = unicode(message, 'utf8').lower()
    words = TextBlob(message).words
    # for each word, take its "base form" = lemma 
    return [word.lemma for word in words]

messages.message.head().apply(split_into_lemmas)

0    [go, until, jurong, point, crazy, available, o...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, in, 2, a, wkly, comp, to, win, f...
3    [u, dun, say, so, early, hor, u, c, already, t...
4    [nah, i, do, n't, think, he, go, to, usf, he, ...
Name: message, dtype: object

In [8]:
bow_transformer = CountVectorizer(analyzer=split_into_lemmas).fit(messages['message'])


In [9]:
print (bow_transformer.vocabulary_)



In [10]:
vocab = bow_transformer.vocabulary_
#q = np.empty([8874,1],dtype=str)
q = np.array(vocab)
#q = np.dtype(np.str)

w = ['none']*8874;
for key, value in vocab.iteritems():
    w[value-1] = key
    #print w[value-1]
    #print key
for i in range(0,8873):
   print w[i]
#vocab.to_csv('data.csv')

'an
'anything
'comfort
'd
'doctors
'heart
'help
'hex
'hw
'if
'its
'll
'luxury
'm
'maangalyam
'married
'melle
'need
're
'rencontre
's
'simple
'taxless
'terrorist
'that
'til
'uptown
've
'wnevr
'xam
0
008704050406
0089
0121
01223585236
01223585334
0125698789
02
02/06/03
02/09/03
0207
0207-083-6089
02072069400
02073162414
02085076972
021
050703
0578
06
06.05.05
06/11/04
07/11/04
07008009200
07046744435
07090201529
07090298926
07099833605
07123456789
0721072
07732584351
07734396839
07742676969
07753741225
0776xxxxxxx
07786200117
077xxx
078
07801543489
07808
07808247860
07808726822
07815296484
07821230901
078498****7
0789xxxxxxx
07946746291/07880867867
0796xxxxxx
07973788240
07xxxxxxxxx
0800
08000407165
08000776320
08000839402
08000930705
08000938767
08001950382
08002888812
08002986030
08002986906
08002988890
08006344447
0808
08081263000
08081560665
0825
0844
08448350055
08448714184
0845
08450542832
08452810071
08452810073
08452810075over18
0870
0870..k
08700621170150p
08701213186
0870123739

In [11]:
messages_bow = bow_transformer.transform(messages['message'])
print 'sparse matrix shape:', messages_bow.shape
print 'number of non-zeros:', messages_bow.nnz
print 'sparsity: %.2f%%' % (100.0 * messages_bow.nnz / (messages_bow.shape[0] * messages_bow.shape[1]))

sparse matrix shape: (5574, 8874)
number of non-zeros: 80272
sparsity: 0.16%


In [12]:
tfidf_transformer = TfidfTransformer().fit(messages_bow)

#print tfidf_transformer.idf

In [13]:
messages_tfidf = tfidf_transformer.transform(messages_bow)
#print messages_tfidf
messages_tfidf = messages_tfidf.toarray()
#print type(messages_tfidf)
#print tfidf_transformer.idf_



print messages_tfidf[5572][5427]
#print messages_tfidf.shape

0.206932692661


In [14]:
std_tfidf=messages_tfidf.std(axis=0)
print std_tfidf

[ 0.00595961  0.00364522  0.00373358 ...,  0.00772061  0.00398291
  0.00398291]


In [None]:
avg_tfidf=messages_tfidf.max(axis=0)
print avg_tfidf[5427]

In [None]:
#avg_sorted_tfidf = avg_tfidf.sort()
avg_sorted_tfidf = np.sort(avg_tfidf)
print avg_sorted_tfidf.shape

In [None]:
print type(avg_sorted_tfidf)
temp = avg_sorted_tfidf[avg_sorted_tfidf>0.0001]
print temp.shape

In [None]:
#print std_tfidf.shape
print avg_tfidf.shape

In [None]:
print messages_tfidf[:]
print messages_tfidf[100].min(axis=0)

#type(messages_tfidf)

In [None]:
print messages.message[0]

In [15]:
X = messages_tfidf
y = messages['label']

In [16]:
print X.shape
print y.shape
print type(X)
print type(y)

(5574, 8874)
(5574,)
<type 'numpy.ndarray'>
<class 'pandas.core.series.Series'>


In [None]:
from sklearn import preprocessing
le_sex = preprocessing.LabelEncoder()

#to convert into numbers
y = le_sex.fit_transform(y)
print y

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [38]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
params = logreg.fit(X_train, y_train)
print type(params)
print logreg.coef_.size

<class 'sklearn.linear_model.logistic.LogisticRegression'>
8874


In [41]:
spam_detector = MultinomialNB().fit(messages_tfidf, messages['label'])

In [39]:
y_pred = logreg.predict(X_test)

In [42]:
y_pred = spam_detector.predict(X_test)

In [22]:
print y_pred
print y_pred.shape
print X_test.shape

['ham' 'ham' 'ham' ..., 'ham' 'ham' 'ham']
(2230,)
(2230, 8874)


In [40]:
from sklearn import metrics
print metrics.accuracy_score(y_test, y_pred)

0.968609865471


In [None]:
test_msg = tfidf_transformer.transform(bow_transformer.transform("Hello!"))
print logreg.predict(test_msg)

<h1>heading</h1>

In [25]:
test_string ="learn c, java, python @niit pritampura"
words = test_string.split(" ")
#words = split_into_lemmas(words)
#test_string.apply(split_into_tokens)
test = [0]*8874
tf_test = [0]*8874
print words
for word in words:
    if(vocab.has_key(word)):
        tf_test[vocab[word]]=tf_test[vocab[word]]+1

length = len(words)        
        
for word in words:
    if(vocab.has_key(word)):
        test[vocab[word]]=(tf_test[vocab[word]]*tfidf_transformer.idf_[vocab[word]])/length
        #print test[vocab[word]]
    



['learn', 'c,', 'java,', 'python', '@niit', 'pritampura']


In [26]:
#logreg.predict(test)[0]
spam_detector.predict(test)[0]



'ham'

In [27]:
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=split_into_lemmas)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [29]:
params = {
    'tfidf__use_idf': (True, False),
    'bow__analyzer': (split_into_lemmas, split_into_tokens),
}

grid = GridSearchCV(
    pipeline,  # pipeline from above
    params,  # parameters to tune via cross validation
    refit=True,  # fit using all available data at the end, on the best found param combination
    n_jobs=-1,  # number of cores to use for parallelization; -1 for "all cores"
    scoring='accuracy',  # what score are we optimizing?
    cv=StratifiedKFold(y_train, n_folds=5),  # what type of cross validation to use
)

In [32]:
#nb_detector = grid.fit(X_train, y_train)
#print nb_detector.grid_scores_