# SMS Spam Filtering

### loading data

In [1]:
# Parse the tab-separated corpus into a list of [label, message] rows.
dt = []
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default (the corpus file is assumed to be UTF-8).
with open('SMSSpamCollection', encoding='utf-8') as f:
    for line in f:
        fields = line.strip().split('\t')
        # Skip blank or malformed lines: later cells index both ex[0] (label)
        # and ex[1] (message) and would fail with IndexError otherwise.
        if len(fields) >= 2:
            dt.append(fields)

### shuffle

In [2]:
from random import seed, shuffle

# Seed the generator so the shuffle (and therefore the train/test split and
# every metric computed from it) is the same on each "Restart & Run All".
# The outputs recorded in this notebook come from an unseeded run, so a
# re-run will show slightly different numbers.
seed(42)
shuffle(dt)

### funció preprocess

In [3]:
import string
import re
from nltk.corpus import stopwords

# English stop words, kept in a set for fast membership tests.
sw = set(stopwords.words('english'))


def preprocess(ex):
    """Turn a raw message into a list of lower-cased, stop-word-free tokens.

    Steps: lower-case the text, drop every character listed in
    ``string.punctuation``, collapse runs of spaces into one space, split on
    single spaces and discard the tokens found in ``sw``.

    Note: because the split is on a literal ``' '``, a leading or trailing
    space (or an empty text) yields an empty-string token ``''``, which is
    kept since it is not a stop word.

    Args:
        ex: the message text (a string).

    Returns:
        The list of remaining tokens, in their original order.
    """
    lowered = ex.lower()
    without_punct = ''.join(filter(lambda ch: ch not in string.punctuation, lowered))
    squeezed = re.sub(' +', ' ', without_punct)
    tokens = squeezed.split(' ')
    return [tok for tok in tokens if tok not in sw]

### train/test split (`llindar` = split index)

In [4]:
# Split index: the first half of the rows is used for training, the rest for testing.
llindar = len(dt) // 2


# Each example becomes a (label, tokens) pair; row[0] is the label and row[1]
# the raw message text.
train = [(row[0], preprocess(row[1])) for row in dt[:llindar]]
test = [(row[0], preprocess(row[1])) for row in dt[llindar:]]

### funció kNN (k=1)

In [16]:
# Peek at the first training example: a (label, tokens) pair.
train[0]

('ham', ['released', 'another', 'italian', 'one', 'today', 'cosign', 'option'])

In [15]:
from nltk.metrics.distance import jaccard_distance
def jaccard(a, b):
    """Return the Jaccard distance between two token collections.

    Both arguments are converted to sets first, so duplicates and ordering
    are ignored.

    Args:
        a: iterable of hashable tokens.
        b: iterable of hashable tokens.

    Returns:
        The value computed by ``jaccard_distance`` on the two sets, or
        ``0.0`` when both collections are empty (two empty sets are treated
        as identical).
    """
    set_a, set_b = set(a), set(b)
    # jaccard_distance divides by the size of the union; with two empty sets
    # that is a division by zero, so handle the case explicitly.
    if not set_a and not set_b:
        return 0.0
    return jaccard_distance(set_a, set_b)

def kNN(train, exTest):
    """Classify ``exTest`` with the label of its nearest training example (k=1).

    Distance is the Jaccard distance between token collections, as computed
    by ``jaccard``. When several training examples are equally close, the
    first one in ``train`` wins (behaviour of ``min``).

    Args:
        train: sequence of examples where item [0] is the label and item [1]
            the token list.
        exTest: a single example with the same layout; only item [1] is read.

    Returns:
        The label (item [0]) of the closest training example.
    """
    query_tokens = exTest[1]

    def distance_to_query(example):
        return jaccard(query_tokens, example[1])

    nearest = min(train, key=distance_to_query)
    return nearest[0]

### kNN evaluation (Jaccard distance)

In [7]:
from nltk.metrics.scores import accuracy
from nltk.metrics import ConfusionMatrix
# Reference labels of the test examples.
refs = [label for label, _tokens in test]
# Predicted label for every test example: each one is compared against the
# whole training set by kNN.
preds = [kNN(train, example) for example in test]

In [8]:
# Reference labels of the test examples, rebuilt so this cell does not rely
# on the previous one for `refs`.
refs = [example[0] for example in test]
# Fraction of test examples whose prediction matches the reference, to 3 decimals.
round(accuracy(refs, preds), 3)

0.977

In [9]:
# Confusion matrix for the labels currently held in `preds`
# (rows: reference labels, columns: predicted labels).
print(ConfusionMatrix(refs, preds).pretty_format())

     |         s |
     |    h    p |
     |    a    a |
     |    m    m |
-----+-----------+
 ham |<2403>  14 |
spam |   51 <319>|
-----+-----------+
(row = reference; col = test)



### sklearn example

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: the token lists are joined back into space-separated
# strings because the vectorizer is fed text documents.
cv = CountVectorizer()
# The vocabulary is fitted on the training documents only and then reused
# to transform the test documents.
Xtrn = cv.fit_transform([' '.join(tokens) for _label, tokens in train])
Xtst = cv.transform([' '.join(tokens) for _label, tokens in test])
# Target labels, aligned with the rows of Xtrn / Xtst.
Ytrn = [label for label, _tokens in train]
Ytst = [label for label, _tokens in test]

In [11]:
from sklearn.neighbors import KNeighborsClassifier

# 1-nearest-neighbour classifier on the count-vector features.
clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(Xtrn, Ytrn)
# Convert the predictions to a plain list before scoring them against `refs`.
preds = clf.predict(Xtst).tolist()
# Accuracy on the test set, to 3 decimals.
round(accuracy(refs, preds), 3)

0.938

In [12]:
# Confusion matrix for the labels currently held in `preds`
# (rows: reference labels, columns: predicted labels).
print(ConfusionMatrix(refs, preds).pretty_format())

     |         s |
     |    h    p |
     |    a    a |
     |    m    m |
-----+-----------+
 ham |<2415>   2 |
spam |  172 <198>|
-----+-----------+
(row = reference; col = test)



In [13]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel, trained on the same
# count-vector features.
clf = SVC(kernel='linear')
clf.fit(Xtrn, Ytrn)
# Convert the predictions to a plain list before scoring them against `refs`.
preds = clf.predict(Xtst).tolist()
# Accuracy on the test set, to 3 decimals.
round(accuracy(refs, preds), 3)

0.979

In [14]:
# Confusion matrix for the labels currently held in `preds`
# (rows: reference labels, columns: predicted labels).
print(ConfusionMatrix(refs, preds).pretty_format())

     |         s |
     |    h    p |
     |    a    a |
     |    m    m |
-----+-----------+
 ham |<2412>   5 |
spam |   54 <316>|
-----+-----------+
(row = reference; col = test)

