In [1]:
import numpy as np

In [2]:
# Read the whole dataset file into memory as a single string.
# The context manager closes the file handle as soon as the read finishes
# (the bare open() call never closed it).
with open("SMSSpamCollection", encoding="utf8") as f:
    text = f.read()

In [3]:
# One list element per line of the file. If the file ends with a newline,
# the last element is an empty string.
mes = text.split(sep="\n")

In [4]:
# Parse the raw lines into message texts (`data`) and a 0/1 label vector
# (`label`). Each record is "<label>\t<message>", where <label> is a key of
# `labelmap`.
labelmap = {"spam": 1, "ham": 0}

# Drop blank lines explicitly (e.g. the empty string split("\n") leaves after
# a trailing newline) instead of assuming there is exactly one such element
# at the end of the list.
records = [line for line in mes if line.strip()]

N = len(records)
data = []
label = np.zeros(N)
for i, record in enumerate(records):
    # maxsplit=1: only the first tab separates the label from the text, so a
    # tab inside the message body cannot break the two-value unpacking.
    lab, sms = record.split("\t", 1)
    data.append(sms)
    label[i] = labelmap[lab]

In [5]:
# Number of messages parsed from the file.
N

5574

In [6]:
# Labels are 1 for "spam" and 0 for "ham", so the sum is the number of spam messages.
label.sum()

747.0

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [78]:
# Bag-of-words features with the default CountVectorizer settings:
# learn the vocabulary from all messages and build the document-term
# count matrix in one pass.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(raw_documents=data)

In [9]:
# Vocabulary size (= number of feature columns in X).
# `vocabulary_` exists in every scikit-learn release, whereas
# `get_feature_names()` was deprecated in 1.0 and removed in 1.2.
len(vectorizer.vocabulary_)

8713

### LogisticRegression

In [79]:
from sklearn.linear_model import LogisticRegression

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; `cross_val_score` now lives in `sklearn.model_selection`.
# The fallback keeps the notebook runnable on very old installations.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

In [80]:
# Baseline: mean F1 of a default LogisticRegression over 10 CV folds
# on the bag-of-words matrix.
clf = LogisticRegression()
res = cross_val_score(estimator=clf, X=X, y=label, scoring="f1", cv=10)
print("{:.4f}".format(res.mean()))

0.9333


In [81]:
# Ad-hoc messages used to eyeball the classifier's predictions.
Test = [
    "FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! Subscribe6GB",
    "FreeMsg: Txt: claim your reward of 3 hours talk time",
    "Have you visited the last lecture on physics?",
    "Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$",
    "Only 99$",
]

In [82]:
# Encode the probe messages with the vocabulary already learned by
# `vectorizer` (transform only, no re-fitting).
X_Test = vectorizer.transform(raw_documents=Test)

In [83]:
# Train on the full dataset and print the predicted class for each probe
# message (1 = spam, 0 = ham, per `labelmap`).
clf = LogisticRegression().fit(X, label)
print(clf.predict(X_Test))

[ 1.  1.  0.  0.  0.]


### LogisticRegression на различных n-граммах

In [16]:
# Compare word n-gram ranges for LogisticRegression by mean 10-fold F1.
ngram_ranges = [(2, 2), (3, 3), (1, 3), (1, 1)]
for ngram in ngram_ranges:
    # Loop-local names on purpose: rebinding the notebook-wide `vectorizer`,
    # `X` and `clf` here would leave other cells that read them (building
    # X_Test, clf.predict) working on whatever n-gram range ran last if
    # cells are re-executed out of order.
    ngram_vectorizer = CountVectorizer(ngram_range=ngram)
    X_ngram = ngram_vectorizer.fit_transform(data)
    ngram_clf = LogisticRegression()
    res = cross_val_score(ngram_clf, X_ngram, label, scoring="f1", cv=10)
    print(ngram, ": ", "%.2f" % res.mean())

(2, 2) :  0.82
(3, 3) :  0.73
(1, 3) :  0.93
(1, 1) :  0.93


### MultinomialNB

In [17]:
from sklearn.naive_bayes import MultinomialNB

# Compare word n-gram ranges for Multinomial Naive Bayes by mean 10-fold F1.
ngram_ranges = [(2, 2), (3, 3), (1, 3), (1, 1)]
for ngram in ngram_ranges:
    # Loop-local names on purpose: rebinding the notebook-wide `vectorizer`,
    # `X` and `clf` here would leave other cells that read them working on
    # whatever n-gram range ran last if cells are re-executed out of order.
    ngram_vectorizer = CountVectorizer(ngram_range=ngram)
    X_ngram = ngram_vectorizer.fit_transform(data)
    ngram_clf = MultinomialNB()
    res = cross_val_score(ngram_clf, X_ngram, label, scoring="f1", cv=10)
    print(ngram, ": ", "%.2f" % res.mean())

(2, 2) :  0.65
(3, 3) :  0.38
(1, 3) :  0.89
(1, 1) :  0.93


## Tfidf

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same experiment as the unigram bag-of-words baseline, but with TF-IDF
# weights instead of raw counts.
tfidf = TfidfVectorizer(ngram_range=(1, 1))
X = tfidf.fit_transform(raw_documents=data)

clf = LogisticRegression()
res = cross_val_score(estimator=clf, X=X, y=label, scoring="f1", cv=10)
print("{:.2f}".format(res.mean()))

0.85


### Можно улучшить результат, если рассмотреть ещё и символьные n-граммы

In [19]:
from sklearn.pipeline import FeatureUnion

In [62]:
# Word-level features: unigrams and bigrams seen in at least 2 messages.
vectorizer_word = CountVectorizer(analyzer="word", ngram_range=(1, 2), min_df=2)

# Character-level features: 3- to 5-grams seen in at least 2 and at most
# 200 messages.
vectorizer_char = CountVectorizer(
    analyzer="char", ngram_range=(3, 5), min_df=2, max_df=200
)

# Concatenate both feature sets column-wise into one matrix.
combined_features = FeatureUnion(
    transformer_list=[("word", vectorizer_word), ("char", vectorizer_char)]
)
X = combined_features.fit_transform(data)

In [63]:
from sklearn.svm import LinearSVC

In [64]:
# Multinomial Naive Bayes on the combined word + char n-gram matrix,
# scored by mean F1 over 10 CV folds.
clf = MultinomialNB()
res = cross_val_score(estimator=clf, X=X, y=label, scoring="f1", cv=10)
print("{:.4f}".format(res.mean()))

0.9614


In [72]:
# Linear models on the combined word + char n-gram matrix.
# Both estimators are seeded so repeated runs print the same scores:
# LinearSVC (and LogisticRegression with the liblinear/sag/saga solvers)
# shuffle the data internally. 42 matches the seed used for TruncatedSVD
# elsewhere in this notebook.
logreg = LogisticRegression(random_state=42)
svc = LinearSVC(random_state=42)

res = cross_val_score(logreg, X, label, scoring="f1", cv=10)
# Single format string; the original relied on implicit concatenation of
# two adjacent literals, which reads like a missing comma.
print("logreg: %.4f" % res.mean())

res = cross_val_score(svc, X, label, scoring="f1", cv=10)
print("svc: %.4f" % res.mean())

logreg: 0.9477
svc: 0.9529


### Добавим SVD

In [74]:
from sklearn.decomposition import TruncatedSVD

In [75]:
# Project the sparse feature matrix onto 100 SVD components, then score the
# two linear models (defined in an earlier cell) on the dense projection.
# NOTE(review): the SVD is fitted on all rows before cross-validation, so the
# held-out folds influence the projection; wrap it in a Pipeline if a strict
# estimate is needed.
svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)
X_SVD = svd.fit_transform(X)

for name, estimator in (("logreg", logreg), ("svc", svc)):
    res = cross_val_score(estimator, X_SVD, label, scoring="f1", cv=10)
    print("%s: %.4f" % (name, res.mean()))

logreg: 0.9631
svc: 0.9559


### Вывод 
Используя простые признаки, такие как мешок слов, можно неплохо решать некоторые задачи анализа текстов.
Среди рассмотренных словесных n-грамм лучшими признаками являются 1-граммы слов (F1 ≈ 0.93); добавление символьных n-грамм повышает F1 до ≈ 0.96.
