In [96]:
import numpy as np
import pandas as pd

#import nltk
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
%matplotlib inline

In [72]:
with open('SMSSpamCollection') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
dataset_lines = [x.strip() for x in content] 

In [73]:
dataset_lines[0]

'ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [74]:
labels, texts = zip(*map(lambda line: line.split('\t', 1), dataset_lines))
#print (texts)
labels = np.asarray(list((map(lambda label: (label == 'spam'), labels))))
#print (labels)
texts = np.asarray(texts)

In [75]:
len(texts), len(labels)

(5574, 5574)

In [71]:
l = list(map(lambda label: (label == 'spam'),labels))
np.asarray(l)

array([False, False, False, ..., False, False, False], dtype=bool)

In [76]:
labels

array([False, False,  True, ..., False, False, False], dtype=bool)

In [77]:
count_vectorizer = CountVectorizer()
X_data = count_vectorizer.fit_transform(texts)

In [78]:
X_data

<5574x8713 sparse matrix of type '<class 'numpy.int64'>'
	with 74169 stored elements in Compressed Sparse Row format>

In [80]:
estimator = LogisticRegression()
score = cross_val_score(estimator, X_data, labels, scoring='f1', cv=10)
np.mean(score.flat)


0.93334852685794145

In [81]:
log_reg = LogisticRegression()
log_reg.fit(X_data, labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [88]:
test_messages = [
    "FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! Subscribe6GB",
    "FreeMsg: Txt: claim your reward of 3 hours talk time",]

In [89]:
print (' '.join(map(lambda b: str(int(b)), log_reg.predict(count_vectorizer.transform(test_messages)))))

1 1


In [86]:
test_messages2=[
    "Only 99$"
]

In [90]:
print (' '.join(map(lambda b: str(int(b)), log_reg.predict(count_vectorizer.transform(test_messages2)))))

0


In [98]:
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': np.logspace(-2, 5, num=10),
    'class_weight': ['balanced'],
    'max_iter': [10, 100]
}

In [99]:
gscv = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, scoring='f1', cv=10, verbose=2)
gscv.fit(X_data, labels)
gscv.best_params_


Fitting 10 folds for each of 40 candidates, totalling 400 fits
[CV] C=0.01, class_weight=balanced, max_iter=10, penalty=l1 ..........
[CV] . C=0.01, class_weight=balanced, max_iter=10, penalty=l1 -   0.0s
[CV] C=0.01, class_weight=balanced, max_iter=10, penalty=l1 ..........
[CV] . C=0.01, class_weight=balanced, max_iter=10, penalty=l1 -   0.0s
[CV] C=0.01, class_weight=balanced, max_iter=10, penalty=l1 ..........
[CV] . C=0.01, class_weight=balanced, max_iter=10, penalty=l1 -   0.0s
[CV] C=0.01, class_weight=balanced, max_iter=10, penalty=l1 ..........
[CV] . C=0.01, class_weight=balanced, max_iter=10, penalty=l1 -   0.0s
[CV] C=0.01, class_weight=balanced, max_iter=10, penalty=l1 ..........
[CV] . C=0.01, class_weight=balanced, max_iter=10, penalty=l1 -   0.0s
[CV] C=0.01, class_weight=balanced, max_iter=10, penalty=l1 ..........
[CV] . C=0.01, class_weight=balanced, max_iter=10, penalty=l1 -   0.0s
[CV] C=0.01, class_weight=balanced, max_iter=10, penalty=l1 ..........
[CV] . C=0.01,

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] . C=0.01, class_weight=balanced, max_iter=10, penalty=l1 -   0.0s
[CV] C=0.01, class_weight=balanced, max_iter=10, penalty=l2 ..........
[CV] . C=0.01, class_weight=balanced, max_iter=10, penalty=l2 -   0.0s
[CV] C=0.01, class_weight=balanced, max_iter=10, penalty=l2 ..........
[CV] . C=0.01, class_weight=balanced, max_iter=10, penalty=l2 -   0.0s
[CV] C=0.01, class_weight=balanced, max_iter=10, penalty=l2 ..........
[CV] . C=0.01, class_weight=balanced, max_iter=10, penalty=l2 -   0.0s
[CV] C=0.01, class_weight=balanced, max_iter=10, penalty=l2 ..........
[CV] . C=0.01, class_weight=balanced, max_iter=10, penalty=l2 -   0.0s
[CV] C=0.01, class_weight=balanced, max_iter=10, penalty=l2 ..........
[CV] . C=0.01, class_weight=balanced, max_iter=10, penalty=l2 -   0.0s
[CV] C=0.01, class_weight=balanced, max_iter=10, penalty=l2 ..........
[CV] . C=0.01, class_weight=balanced, max_iter=10, penalty=l2 -   0.0s
[CV] C=0.01, class_weight=balanced, max_iter=10, penalty=l2 ..........
[CV] .

[CV]  C=0.0599484250319, class_weight=balanced, max_iter=100, penalty=l1 -   0.0s
[CV] C=0.0599484250319, class_weight=balanced, max_iter=100, penalty=l1 
[CV]  C=0.0599484250319, class_weight=balanced, max_iter=100, penalty=l1 -   0.0s
[CV] C=0.0599484250319, class_weight=balanced, max_iter=100, penalty=l1 
[CV]  C=0.0599484250319, class_weight=balanced, max_iter=100, penalty=l1 -   0.0s
[CV] C=0.0599484250319, class_weight=balanced, max_iter=100, penalty=l2 
[CV]  C=0.0599484250319, class_weight=balanced, max_iter=100, penalty=l2 -   0.0s
[CV] C=0.0599484250319, class_weight=balanced, max_iter=100, penalty=l2 
[CV]  C=0.0599484250319, class_weight=balanced, max_iter=100, penalty=l2 -   0.0s
[CV] C=0.0599484250319, class_weight=balanced, max_iter=100, penalty=l2 
[CV]  C=0.0599484250319, class_weight=balanced, max_iter=100, penalty=l2 -   0.0s
[CV] C=0.0599484250319, class_weight=balanced, max_iter=100, penalty=l2 
[CV]  C=0.0599484250319, class_weight=balanced, max_iter=100, penalty=

[CV]  C=2.15443469003, class_weight=balanced, max_iter=10, penalty=l1 -   0.0s
[CV] C=2.15443469003, class_weight=balanced, max_iter=10, penalty=l1 .
[CV]  C=2.15443469003, class_weight=balanced, max_iter=10, penalty=l1 -   0.0s
[CV] C=2.15443469003, class_weight=balanced, max_iter=10, penalty=l1 .
[CV]  C=2.15443469003, class_weight=balanced, max_iter=10, penalty=l1 -   0.1s
[CV] C=2.15443469003, class_weight=balanced, max_iter=10, penalty=l1 .
[CV]  C=2.15443469003, class_weight=balanced, max_iter=10, penalty=l1 -   0.0s
[CV] C=2.15443469003, class_weight=balanced, max_iter=10, penalty=l1 .
[CV]  C=2.15443469003, class_weight=balanced, max_iter=10, penalty=l1 -   0.0s
[CV] C=2.15443469003, class_weight=balanced, max_iter=10, penalty=l2 .
[CV]  C=2.15443469003, class_weight=balanced, max_iter=10, penalty=l2 -   0.1s
[CV] C=2.15443469003, class_weight=balanced, max_iter=10, penalty=l2 .
[CV]  C=2.15443469003, class_weight=balanced, max_iter=10, penalty=l2 -   0.0s
[CV] C=2.15443469003,

[CV]  C=12.9154966501, class_weight=balanced, max_iter=100, penalty=l1 -   0.0s
[CV] C=12.9154966501, class_weight=balanced, max_iter=100, penalty=l1 
[CV]  C=12.9154966501, class_weight=balanced, max_iter=100, penalty=l1 -   0.0s
[CV] C=12.9154966501, class_weight=balanced, max_iter=100, penalty=l1 
[CV]  C=12.9154966501, class_weight=balanced, max_iter=100, penalty=l1 -   0.0s
[CV] C=12.9154966501, class_weight=balanced, max_iter=100, penalty=l1 
[CV]  C=12.9154966501, class_weight=balanced, max_iter=100, penalty=l1 -   0.0s
[CV] C=12.9154966501, class_weight=balanced, max_iter=100, penalty=l1 
[CV]  C=12.9154966501, class_weight=balanced, max_iter=100, penalty=l1 -   0.0s
[CV] C=12.9154966501, class_weight=balanced, max_iter=100, penalty=l1 
[CV]  C=12.9154966501, class_weight=balanced, max_iter=100, penalty=l1 -   0.0s
[CV] C=12.9154966501, class_weight=balanced, max_iter=100, penalty=l1 
[CV]  C=12.9154966501, class_weight=balanced, max_iter=100, penalty=l1 -   0.0s
[CV] C=12.9154

[CV]  C=464.158883361, class_weight=balanced, max_iter=10, penalty=l1 -   0.0s
[CV] C=464.158883361, class_weight=balanced, max_iter=10, penalty=l1 .
[CV]  C=464.158883361, class_weight=balanced, max_iter=10, penalty=l1 -   0.0s
[CV] C=464.158883361, class_weight=balanced, max_iter=10, penalty=l1 .
[CV]  C=464.158883361, class_weight=balanced, max_iter=10, penalty=l1 -   0.1s
[CV] C=464.158883361, class_weight=balanced, max_iter=10, penalty=l1 .
[CV]  C=464.158883361, class_weight=balanced, max_iter=10, penalty=l1 -   0.0s
[CV] C=464.158883361, class_weight=balanced, max_iter=10, penalty=l1 .
[CV]  C=464.158883361, class_weight=balanced, max_iter=10, penalty=l1 -   0.0s
[CV] C=464.158883361, class_weight=balanced, max_iter=10, penalty=l1 .
[CV]  C=464.158883361, class_weight=balanced, max_iter=10, penalty=l1 -   0.0s
[CV] C=464.158883361, class_weight=balanced, max_iter=10, penalty=l1 .
[CV]  C=464.158883361, class_weight=balanced, max_iter=10, penalty=l1 -   0.0s
[CV] C=464.158883361,

[CV]  C=2782.55940221, class_weight=balanced, max_iter=10, penalty=l2 -   0.1s
[CV] C=2782.55940221, class_weight=balanced, max_iter=10, penalty=l2 .
[CV]  C=2782.55940221, class_weight=balanced, max_iter=10, penalty=l2 -   0.1s
[CV] C=2782.55940221, class_weight=balanced, max_iter=100, penalty=l1 
[CV]  C=2782.55940221, class_weight=balanced, max_iter=100, penalty=l1 -   0.0s
[CV] C=2782.55940221, class_weight=balanced, max_iter=100, penalty=l1 
[CV]  C=2782.55940221, class_weight=balanced, max_iter=100, penalty=l1 -   0.0s
[CV] C=2782.55940221, class_weight=balanced, max_iter=100, penalty=l1 
[CV]  C=2782.55940221, class_weight=balanced, max_iter=100, penalty=l1 -   0.0s
[CV] C=2782.55940221, class_weight=balanced, max_iter=100, penalty=l1 
[CV]  C=2782.55940221, class_weight=balanced, max_iter=100, penalty=l1 -   0.0s
[CV] C=2782.55940221, class_weight=balanced, max_iter=100, penalty=l1 
[CV]  C=2782.55940221, class_weight=balanced, max_iter=100, penalty=l1 -   0.0s
[CV] C=2782.5594

[CV]  C=16681.005372, class_weight=balanced, max_iter=100, penalty=l2 -   0.1s
[CV] C=16681.005372, class_weight=balanced, max_iter=100, penalty=l2 .
[CV]  C=16681.005372, class_weight=balanced, max_iter=100, penalty=l2 -   0.1s
[CV] C=16681.005372, class_weight=balanced, max_iter=100, penalty=l2 .
[CV]  C=16681.005372, class_weight=balanced, max_iter=100, penalty=l2 -   0.1s
[CV] C=16681.005372, class_weight=balanced, max_iter=100, penalty=l2 .
[CV]  C=16681.005372, class_weight=balanced, max_iter=100, penalty=l2 -   0.1s
[CV] C=16681.005372, class_weight=balanced, max_iter=100, penalty=l2 .
[CV]  C=16681.005372, class_weight=balanced, max_iter=100, penalty=l2 -   0.1s
[CV] C=100000.0, class_weight=balanced, max_iter=10, penalty=l1 ......
[CV]  C=100000.0, class_weight=balanced, max_iter=10, penalty=l1 -   0.0s
[CV] C=100000.0, class_weight=balanced, max_iter=10, penalty=l1 ......
[CV]  C=100000.0, class_weight=balanced, max_iter=10, penalty=l1 -   0.0s
[CV] C=100000.0, class_weight=b

[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:   17.8s finished


{'C': 77.426368268112697,
 'class_weight': 'balanced',
 'max_iter': 10,
 'penalty': 'l2'}