In [12]:
# Spam Detector using machine learning

In [33]:
import pandas as pd
df = pd.read_csv("SMSSpamCollection.txt", sep="\t", names=["label", "message"])
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


# Data Preprocessing

In [14]:
import nltk
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [15]:
stemmer = SnowballStemmer("english")
def preprocess(document):
    document = document.lower()
    words = word_tokenize(document)
    words = [word for word in words if word not in stopwords.words("english")]
    words = [stemmer.stem(word) for word in words]
    return words

In [34]:
# [([w1,w2,w3..], label1), ([w3,w4,w5], label2)]
dataset = []
for index, row in df.iterrows():
    dataset.append((preprocess(row['message']), row['label']))
print(dataset[2])

(['free', 'entri', '2', 'wkli', 'comp', 'win', 'fa', 'cup', 'final', 'tkts', '21st', 'may', '2005.', 'text', 'fa', '87121', 'receiv', 'entri', 'question', '(', 'std', 'txt', 'rate', ')', '&', 'c', "'s", 'appli', '08452810075over18', "'s"], 'spam')


In [17]:
len(dataset)

5572

# Split the dataset
Training 80% and Testing 20%

In [18]:
#[1,2,3,4,5,6,7,8,9,10]
# 0,1,2,3,4,5,6,7,8,9
slice_index = int(len(dataset)*0.8)
slice_index

4457

In [19]:
# Add shuffling on dayaset

In [20]:
# 0 to 4457 => Training
train_messages = dataset[:slice_index] # 0:4457
test_messages = dataset[slice_index:] #4457:5572
print(len(train_messages))
print(len(test_messages))

4457
1115


## Feature Extraction

In [21]:
def get_words_in_messages(dataset):
    allwords = []
    for (message, label) in dataset:
        allwords.extend(message)
    return allwords
wordlist = get_words_in_messages(dataset)

In [22]:
len(wordlist)

70131

In [23]:
word_freq = nltk.FreqDist(wordlist)
word_freq

FreqDist({'.': 4738, ',': 1938, '?': 1550, '!': 1397, '...': 1146, 'u': 1136, '&': 922, ';': 768, ':': 722, '..': 697, ...})

In [24]:
word_features = list(word_freq.keys())

In [25]:
len(word_features)

7981

In [26]:
def extract_features(message):
    #print(message)
    features = {}
    for word in word_features:
        features[f'contains({word})'] = word in message
    return features

In [27]:
training_set = nltk.classify.apply_features(extract_features, train_messages)
testing_set = nltk.classify.apply_features(extract_features, test_messages)

In [28]:
training_set[:1]

[({'contains(go)': True, 'contains(jurong)': True, 'contains(point)': True, 'contains(,)': True, 'contains(crazi)': True, 'contains(..)': True, 'contains(avail)': True, 'contains(bugi)': True, 'contains(n)': True, 'contains(great)': True, 'contains(world)': True, 'contains(la)': True, 'contains(e)': True, 'contains(buffet)': True, 'contains(...)': True, 'contains(cine)': True, 'contains(got)': True, 'contains(amor)': True, 'contains(wat)': True, 'contains(ok)': False, 'contains(lar)': False, 'contains(joke)': False, 'contains(wif)': False, 'contains(u)': False, 'contains(oni)': False, 'contains(free)': False, 'contains(entri)': False, 'contains(2)': False, 'contains(wkli)': False, 'contains(comp)': False, 'contains(win)': False, 'contains(fa)': False, 'contains(cup)': False, 'contains(final)': False, 'contains(tkts)': False, 'contains(21st)': False, 'contains(may)': False, 'contains(2005.)': False, 'contains(text)': False, 'contains(87121)': False, 'contains(receiv)': False, 'contains(

In [29]:
model = nltk.NaiveBayesClassifier.train(training_set)

In [30]:
nltk.classify.accuracy(model, testing_set)

0.9820627802690582

In [31]:
test_message = 'CONGRATULATIONS!! As a valued account holder you have been selected to receive a £900 prize reward! Valid 12 hours only.'
test_features = extract_features(preprocess(test_message))
model.classify(test_features)

'spam'

In [32]:
19/20

0.95