In [102]:
import random
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [110]:
## Reading the given dataset
spam = pd.read_csv('spam.csv',encoding='Windows 1250')
# spam = pd.read_csv("SMSSpamCollection.txt", sep = "\t", names=["label", "message"])

In [111]:
spam.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [112]:
spam = spam[['v1','v2']]

In [107]:
spam.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [115]:
## Converting the read dataset in to a list of tuples, each tuple(row) containing the message and it's label
data_set = []
for index,row in spam.iterrows():
    data_set.append((row['v2'], row['v1']))

In [118]:
data_set[:5]

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  'ham'),
 ('Ok lar... Joking wif u oni...', 'ham'),
 ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
  'spam'),
 ('U dun say so early hor... U c already then say...', 'ham'),
 ("Nah I don't think he goes to usf, he lives around here though", 'ham')]

In [113]:
wordnet = WordNetLemmatizer()
porterstemmer = PorterStemmer()

In [123]:
def preprocess(message,stem=True):

    message = message.lower()
    tokens = word_tokenize(message)
    
    tokens = [token for token in tokens if token not in stopwords.words('english') ]
    if stem is True:
        words = [porterstemmer.stem(token) for token in tokens]
    else:
        words = [wordnet.lemmatize(message) for token in tokens]
    processedmessage = " ".join(words)

    return processedmessage

In [124]:
message_set = []
for (message,label) in data_set:
    words_filtered = [e for e in preprocess(message).split() if len(e)>3]
    message_set.append((words_filtered,label))

In [126]:
print(message_set[:5])

[(['jurong', 'point', 'crazy..', 'avail', 'bugi', 'great', 'world', 'buffet', 'cine', 'amor'], 'ham'), (['joke'], 'ham'), (['free', 'entri', 'wkli', 'comp', 'final', '21st', '2005.', 'text', '87121', 'receiv', 'entri', 'question', 'rate', 'appli', '08452810075over18'], 'spam'), (['earli', 'alreadi'], 'ham'), (['think', 'live', 'around', 'though'], 'ham')]


In [142]:
len(message_set)

5572

In [127]:
def get_all_words(messages):
    all_words = []
    for (message,label) in messages:
        all_words.extend(message)
    return all_words

In [129]:
## - creating a final feature list using an intuitive FreqDist, to eliminate all the duplicate words
## Note : we can use the Frequency Distribution of the entire dataset to calculate Tf-Idf scores like we did earlier.
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

In [132]:
word_features = get_word_features(get_all_words(message_set))

In [140]:
len(word_features)

7149

In [144]:
sliceIndex = int((len(message_set)*.8))

In [146]:
## - shuffle the pack to create a random and unbiased split of the dataset
random.shuffle(message_set)

In [147]:
train_messages, test_messages = message_set[:sliceIndex], message_set[sliceIndex:]

In [174]:
## creating a LazyMap of feature presence for each of the 8K+ features with respect to each of the SMS messages
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [175]:
## - creating the feature map of train and test data

training_set = nltk.classify.apply_features(extract_features, train_messages)
testing_set = nltk.classify.apply_features(extract_features, test_messages)

In [178]:
print(training_set[:1])

 False, 'contains(prasanth)': False, 'contains(ettan)': False, 'contains(08718738002)': False, 'contains(48922)': False, 'contains(21/11/04)': False, 'contains(appi)': False, 'contains(fizz)': False, 'contains(contain)': False, 'contains(genu)': False, 'contains(robinson)': False, 'contains(not..tel)': False, 'contains(name..)': False, 'contains(imat)': False, 'contains(idiot\\)': False, 'contains(msg..sometext)': False, 'contains(07099833605)': False, 'contains(ref:9280114)': False, 'contains(smth..)': False, 'contains(chloe)': False, 'contains(150p/text)': False, 'contains(wewa)': False, 'contains(130.)': False, 'contains(iriv)': False, 'contains(255.)': False, 'contains(da..jst)': False, 'contains(hmmm.but)': False, 'contains(surli)': False, 'contains(07808726822)': False, 'contains(0871-872-9758)': False, 'contains(dokey)': False, 'contains(mmmmmmm)': False, 'contains(*snuggl)': False, 'contains(you*)': False, 'contains(*whispers*)': False, 'contains(healthi)': False, 'contains(2bo

In [179]:
print('Training set size : ', len(training_set))
print('Test set size : ', len(testing_set))

Training set size :  4457
Test set size :  1115


In [180]:
## Training the classifier with NaiveBayes algorithm
spamClassifier = nltk.NaiveBayesClassifier.train(training_set)

In [181]:
## - Analyzing the accuracy of the test set
print(nltk.classify.accuracy(spamClassifier, training_set))

0.9914740857078752


In [182]:
## Analyzing the accuracy of the test set
print(nltk.classify.accuracy(spamClassifier, testing_set))

0.9820627802690582


In [183]:
## Testing a example message with our newly trained classifier
m = 'CONGRATULATIONS!! As a valued account holder you have been selected to receive a £900 prize reward! Valid 12 hours only.'
print('Classification result : ', spamClassifier.classify(extract_features(m.split())))

Classification result :  ham
