In [5]:
import pandas as pd
# Load the raw SMS file: tab-separated fields, and no header row in the file.
sms = pd.read_csv(
    "./data/SMSSpamCollection",
    sep='\t',
    header=None,
)

In [6]:
# The file was read without a header, so name the two columns explicitly,
# then preview the first five rows.
sms.columns = ['spam', 'message']
sms.head(n=5)

Unnamed: 0,spam,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Lower-cased copy of every message, kept as a plain list for the
# hand-built features computed in later cells.
docs_lower = list(map(str.lower, sms['message']))

In [8]:
def count_word(word, sentence):
    """Return how many whitespace-separated tokens of `sentence` equal `word`.

    Matching is exact: a token with attached punctuation (e.g. ``'free!'``)
    is not counted as ``'free'``.
    """
    return sentence.split().count(word)

# First hand-built feature: occurrences of the token 'free' in each
# lower-cased message, stored as the single column of a new frame.
free_counts = list(map(lambda message: count_word('free', message), docs_lower))
df = pd.DataFrame({'free': free_counts})

In [9]:
import re
def count_numbers(sentence):
    """Return the number of ASCII digit characters (0-9) in `sentence`.

    Each digit is counted individually, so ``'123'`` contributes 3.
    """
    return sum(1 for _match in re.finditer('[0-9]', sentence))

# Second hand-built feature: digit-character count per message.
df['num_char'] = list(map(count_numbers, docs_lower))


In [10]:
# Preview the two hand-built feature columns ('free', 'num_char').
df.head()

Unnamed: 0,free,num_char
0,0,0
1,0,0
2,1,25
3,0,0
4,0,0


In [11]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [12]:
def split_fit_eval(X, y, model=None,epochs=10,random_state=0):
    """Split the data, fit a Keras model on the training part, score it on the rest.

    Parameters
    ----------
    X : 2-D array-like exposing ``.shape``
        Feature matrix; ``X.shape[1]`` sizes the input of the default model.
    y : array-like
        Binary targets aligned with the rows of ``X``.
    model : Keras model, optional
        Model to train. When omitted, a single sigmoid ``Dense`` unit is built
        and compiled with binary cross-entropy, Adam and an accuracy metric.
        A supplied model is used as-is; it is assumed to be already compiled
        with exactly one metric so that ``evaluate`` yields ``(loss, acc)``.
    epochs : int
        Number of training epochs passed to ``model.fit``.
    random_state : int
        Seed forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(loss, acc, model, h)`` where ``h`` is the value returned by
        ``model.fit`` and ``loss``/``acc`` come from the held-out split.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
    # Test identity with None instead of truthiness, so a caller-supplied
    # model is never mistaken for "no model".
    if model is None:
        model = Sequential()
        model.add(Dense(1, input_dim=X.shape[1],activation='sigmoid'))
        model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    # Training and evaluation run for both the default and a supplied model.
    # Previously they sat inside the branch above, so passing a model
    # skipped them and the function returned None.
    h = model.fit(X_train, y_train,epochs=epochs,verbose=0)
    loss, acc = model.evaluate(X_test, y_test)
    return loss, acc, model, h

In [13]:
# Encode the label column as 1 for 'spam' and 0 for anything else, then
# fit the default model on the two hand-built features.
y = sms['spam'].map(lambda label: int(label == 'spam'))
res = split_fit_eval(df.values, y)



In [14]:
# res is (loss, acc, model, history); report the held-out accuracy.
simple_accuracy = res[1]
print("Simple model accuracy: {:0.3f}".format(simple_accuracy))

Simple model accuracy: 0.971


In [15]:
from sklearn.dummy import DummyClassifier
# Majority-class baseline. It is fitted and scored on the full data set,
# so the score is the share of the most frequent label.
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf = dummy_clf.fit(df.values, y)
dummy_clf.score(df.values, y)

0.8659368269921034

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts over the raw messages: lower-cased, English stop
# words removed, vocabulary capped at `vocab_size` terms.
vocab_size = 3000
vect = CountVectorizer(
    max_features=vocab_size,
    lowercase=True,
    stop_words='english',
    decode_error='ignore',
)
X = vect.fit_transform(sms['message'])
X

<5572x3000 sparse matrix of type '<class 'numpy.int64'>'
	with 37142 stored elements in Compressed Sparse Row format>

In [17]:
# Densify the sparse count matrix for Keras. toarray() yields a regular
# ndarray, whereas todense() returns the legacy np.matrix type.
Xd = X.toarray()
# get_feature_names() was removed from scikit-learn in 1.2. Prefer the
# replacement and fall back for older installs. Either way `vocab` is a
# plain list of Python strings.
if hasattr(vect, "get_feature_names_out"):
    vocab = vect.get_feature_names_out().tolist()
else:
    vocab = vect.get_feature_names()
vocab[:10]

['00',
 '000',
 '02',
 '0207',
 '02073162414',
 '03',
 '04',
 '05',
 '06',
 '07123456789']

In [18]:
# Last ten entries of the vocabulary list.
vocab[-10:]

['yogasana', 'yor', 'yr', 'yrs', 'yummy', 'yun', 'yunny', 'yuo', 'yup', 'zed']

In [19]:
# Refit the default model, this time on the dense bag-of-words matrix,
# and report the held-out accuracy (second element of the result tuple).
res = split_fit_eval(Xd, y)
bow_accuracy = res[1]
print("Test set accuracy:\t{:0.3f}".format(bow_accuracy))

Test set accuracy:	0.978


In [20]:
# Take the fitted model out of the result tuple and flatten its first
# weight array (presumably the Dense kernel, one weight per vocabulary
# term; confirm against the model definition).
model = res[2]
w_ = model.get_weights()[0].reshape(-1)
# Label each weight with its term and show the 20 largest values.
vocab_weights = pd.Series(data=w_, index=vocab)
vocab_weights.sort_values(ascending=False).head(20)

www        0.592438
txt        0.562227
claim      0.529760
uk         0.525961
150p       0.511881
service    0.496169
free       0.492492
18         0.491958
16         0.477451
mobile     0.468412
prize      0.467345
stop       0.465982
reply      0.428802
urgent     0.401753
1000       0.388725
won        0.385079
chat       0.368362
text       0.361913
500        0.357648
dating     0.352068
dtype: float32

In [21]:
# Same descending sort; the tail shows the 20 smallest (most negative) weights.
vocab_weights.sort_values(ascending=False).tail(20)

pick    -0.454934
wat     -0.455426
yeah    -0.457931
fine    -0.469903
going   -0.477141
think   -0.488829
gt      -0.491252
way     -0.501204
home    -0.503352
good    -0.503890
lt      -0.507017
lol     -0.515124
lor     -0.517260
da      -0.523463
later   -0.525618
oh      -0.527135
come    -0.545150
sorry   -0.553396
ll      -0.594037
ok      -0.619448
dtype: float32