# SMS AntiSpam model

*by Ilya Zakharkin (2017)*

In [147]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

### Working with data

In [148]:
# Load the raw SMS corpus: tab-separated, no header row, so the two
# columns get the default integer labels 0 and 1.
df = pd.read_csv(
    './SMSSpamCollection.txt',
    sep='\t',
    header=None,
)

In [149]:
# Peek at the first rows to see how labels and message texts are laid out.
df.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [150]:
# Check the row count, column dtypes and whether any values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [151]:
# Summary of both columns: number of distinct values and the most frequent one.
df.describe()

Unnamed: 0,0,1
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [152]:
# Replace the default integer column labels (0, 1) with descriptive names.
# Note: this mutates `df` in place.
df.columns = ['is_spam', 'message']

In [153]:
# Confirm that the new column names were applied.
df.head()

Unnamed: 0,is_spam,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [154]:
# Message texts as a NumPy unicode array; this is what the vectorizers consume.
messages = df['message'].values.astype('str')
# Binary target as a plain Python list: 1 for 'spam', 0 for every other label.
y = (df['is_spam'] == 'spam').astype(int).tolist()

In [155]:
# Show the first five message texts (untruncated, unlike df.head()).
messages[:5]

array([ 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
       'Ok lar... Joking wif u oni...',
       "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
       'U dun say so early hor... U c already then say...',
       "Nah I don't think he goes to usf, he lives around here though"],
      dtype='<U910')

In [156]:
# First five encoded targets; should line up with the labels shown by df.head().
y[:5]

[0, 0, 1, 0, 0]

### Let's use CountVectorizer

In [157]:
# Bag-of-words features with default settings: the vocabulary is learned
# from the whole corpus and each message becomes a row of token counts.
vect_count = CountVectorizer()
X = vect_count.fit_transform(messages)

In [158]:
# Inspect the shape and sparsity of the document-term matrix.
X

<5572x8713 sparse matrix of type '<class 'numpy.int64'>'
	with 74169 stored elements in Compressed Sparse Row format>

### Now it is time to apply some model (5th task)

In [159]:
# Logistic regression with default hyper-parameters; random_state is pinned
# so repeated runs use the same seed.
clf = LogisticRegression(random_state=2)

In [160]:
# 10-fold cross-validated F1 of the classifier on the count features.
# NOTE(review): X was vectorized on the full corpus before splitting, so each
# fold's vocabulary also includes held-out messages; a Pipeline would avoid
# that -- confirm whether the task requires the current setup.
cv_scores = cross_val_score(clf, X, y, scoring='f1', cv=10)

In [161]:
# Answer file 1.txt: mean F1 over the CV folds, rounded to one decimal place.
# The same value is echoed below.
mean_f1 = round(cv_scores.mean(), 1)
with open('1.txt', 'w') as wfile:
    wfile.write(str(mean_f1))
print(mean_f1)

0.9


### Checking the model on various inputs (6th task)

In [162]:
# Hand-written messages used to sanity-check the fitted classifier.
test_texts = [
    "FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! Subscribe6GB",
    "FreeMsg: Txt: claim your reward of 3 hours talk time",
    "Have you visited the last lecture on physics?",
    "Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$",
    "Only 99$",
]

In [163]:
# cross_val_score does not fit the estimator passed to it, so train `clf`
# on the full count-vectorized corpus before predicting on new texts.
clf.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=2, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [164]:
# Encode the test messages with the vocabulary learned by `vect_count`
# (transform only, no refit) and classify them with the fitted model.
# The estimator trained above is `clf`; the name `logreg` is never defined
# in this notebook and raised NameError on a fresh kernel.
preds = clf.predict(vect_count.transform(test_texts))

In [165]:
# Answer file 2.txt: the predicted labels, each followed by a single space.
# The prediction array is echoed below.
with open('2.txt', 'w') as wfile:
    wfile.write(''.join(str(pred) + ' ' for pred in preds))
print(preds)

[1 1 0 0 0]


### Trying n-grams (7th task)

In [166]:
# Three count vectorizers differing only in n-gram range:
# bigrams only, trigrams only, and unigrams through trigrams.
vect_count2, vect_count3, vect_count123 = (
    CountVectorizer(ngram_range=bounds)
    for bounds in [(2, 2), (3, 3), (1, 3)]
)

In [167]:
# Build one document-term matrix per n-gram configuration, in the order
# bigrams, trigrams, 1-3-grams.
X2, X3, X123 = (
    vectorizer.fit_transform(messages)
    for vectorizer in (vect_count2, vect_count3, vect_count123)
)

In [168]:
# 10-fold F1 of a fresh logistic regression on the bigram-only features.
clf = LogisticRegression(random_state=2)
cv_scores2 = cross_val_score(
    clf,
    X2,
    y,
    scoring='f1',
    cv=10,
)

In [169]:
# 10-fold F1 of a fresh logistic regression on the trigram-only features.
clf = LogisticRegression(random_state=2)
cv_scores3 = cross_val_score(
    clf,
    X3,
    y,
    scoring='f1',
    cv=10,
)

In [170]:
# 10-fold F1 of a fresh logistic regression on the combined 1-3-gram features.
clf = LogisticRegression(random_state=2)
cv_scores123 = cross_val_score(
    clf,
    X123,
    y,
    scoring='f1',
    cv=10,
)

In [171]:
# Answer file 3.txt: mean F1 for bigrams, trigrams and 1-3-grams, each rounded
# to two decimals and followed by a space. Each value is also printed on its
# own line.
scores = [fold_scores.mean() for fold_scores in (cv_scores2, cv_scores3, cv_scores123)]
with open('3.txt', 'w') as wfile:
    for score in scores:
        token = str(round(score, 2)) + ' '
        wfile.write(token)
        print(token)

0.82 
0.73 
0.93 


### Trying MultinomialNB as classifier (8th task)

In [173]:
# Switch the classifier to multinomial Naive Bayes with default settings.
# This rebinds `clf`, so the logistic regression from the previous section
# is no longer reachable under that name.
clf = MultinomialNB()

In [174]:
# 10-fold F1 of the current `clf` on the bigram-only matrix X2.
# Rebinds cv_scores2, replacing the scores from the previous section.
cv_scores2 = cross_val_score(clf, X2, y, scoring='f1', cv=10)

In [175]:
# 10-fold F1 of the current `clf` on the trigram-only matrix X3.
# Rebinds cv_scores3, replacing the scores from the previous section.
cv_scores3 = cross_val_score(clf, X3, y, scoring='f1', cv=10)

In [176]:
# 10-fold F1 of the current `clf` on the 1-3-gram matrix X123.
# Rebinds cv_scores123, replacing the scores from the previous section.
cv_scores123 = cross_val_score(clf, X123, y, scoring='f1', cv=10)

In [177]:
# Answer file 4.txt: mean F1 for bigrams, trigrams and 1-3-grams, each rounded
# to two decimals and followed by a space. Each value is also printed on its
# own line.
scores = [fold_scores.mean() for fold_scores in (cv_scores2, cv_scores3, cv_scores123)]
with open('4.txt', 'w') as wfile:
    for score in scores:
        token = str(round(score, 2)) + ' '
        wfile.write(token)
        print(token)

0.65 
0.38 
0.89 


### Let's use TfidfVectorizer (9th task)

In [187]:
# TF-IDF features with default settings, fitted on the whole corpus.
# Rebinds X, replacing the count-based matrix built earlier.
vect_tfidf = TfidfVectorizer()
X = vect_tfidf.fit_transform(messages)

In [188]:
# Back to logistic regression with the same pinned seed, so the TF-IDF
# result is comparable with the count-vectorizer run.
clf = LogisticRegression(random_state=2)

In [189]:
# 10-fold F1 of the current `clf` on the current feature matrix X.
# Rebinds cv_scores, replacing any earlier value.
cv_scores = cross_val_score(clf, X, y, scoring='f1', cv=10)

In [190]:
# Write the mean CV F1, rounded to two decimals, to 5.txt and echo it below.
mean_f1 = round(cv_scores.mean(), 2)
with open('5.txt', 'w') as wfile:
    wfile.write(str(mean_f1))
print(mean_f1)

0.85


In [192]:
# Overwrite 5.txt with the literal '-1', replacing the score written by the
# previous cell.
# NOTE(review): the value is hard-coded rather than derived from the scores;
# presumably the task expects -1/0/1 for "quality went down/unchanged/up"
# compared with CountVectorizer -- confirm against the assignment.
with open('5.txt', 'w') as wfile:
    wfile.write('-1')