In [22]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import random

##### Reading data

In [13]:
data = pd.read_csv('https://cdn.upgrad.com/UpGrad/temp/bab3e784-e601-4911-9000-f1fbc994a62d/SMSSpamCollection.txt',sep= '\t', names=['label','message'])

In [14]:
#Converting data in list of tuples
spam_data = []
for idx,row in data.iterrows():
    spam_data.append((row['message'],row['label']))

In [15]:
spam_data

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  'ham'),
 ('Ok lar... Joking wif u oni...', 'ham'),
 ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
  'spam'),
 ('U dun say so early hor... U c already then say...', 'ham'),
 ("Nah I don't think he goes to usf, he lives around here though", 'ham'),
 ("FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv",
  'spam'),
 ('Even my brother is not like to speak with me. They treat me like aids patent.',
  'ham'),
 ("As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune",
  'ham'),
 ('WINNER!! As a valued network customer you have been selected to receivea £900 prize 

In [16]:
print(len(spam_data))

5572


### Preprocessing data

In [17]:
def preprocessing(doc, stem = True):
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    doc = doc.lower()
    words = word_tokenize(doc)
    words = [word for word in words if word not in stopwords.words('english')]
    if stem:
        words = [stemmer.stem(word) for word in words]
    else:
        words = [lemmatizer.lemmatize(word, pos= 'v') for word in words]
    doc = ' '.join(words)
    return doc

In [18]:
messages = []
for (message,label) in spam_data:
    preprocessed = [e.lower() for e in preprocessing(message, stem=False).split() if len(e) >= 3]
    messages.append((preprocessed,label))

In [20]:
print(messages)

[(['jurong', 'point', 'crazy', 'available', 'bugis', 'great', 'world', 'buffet', '...', 'cine', 'get', 'amore', 'wat', '...'], 'ham'), (['lar', '...', 'joke', 'wif', 'oni', '...'], 'ham'), (['free', 'entry', 'wkly', 'comp', 'win', 'cup', 'final', 'tkts', '21st', 'may', '2005.', 'text', '87121', 'receive', 'entry', 'question', 'std', 'txt', 'rate', 'apply', '08452810075over18'], 'spam'), (['dun', 'say', 'early', 'hor', '...', 'already', 'say', '...'], 'ham'), (['nah', "n't", 'think', 'usf', 'live', 'around', 'though'], 'ham'), (['freemsg', 'hey', 'darling', 'week', 'word', 'back', 'like', 'fun', 'still', 'xxx', 'std', 'chgs', 'send', '£1.50', 'rcv'], 'spam'), (['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'], 'ham'), (['per', 'request', "'melle", 'melle', 'oru', 'minnaminunginte', 'nurungu', 'vettam', 'set', 'callertune', 'callers', 'press', 'copy', 'friends', 'callertune'], 'ham'), (['winner', 'value', 'network', 'customer', 'select', 'receivea', '£900', 'prize', 

### Preparing data to create feature

In [21]:
#Creating single list for all words
def get_words_in_messages(msgs):
    all_words = []
    for (message, label) in msgs:
        all_words.extend(message)
    return all_words

In [23]:
def get_word_features(wordlist):
    w_list = nltk.FreqDist(wordlist)
    word_features = w_list.keys()
    return word_features

In [24]:
# Creating word features
word_features = get_word_features(get_words_in_messages(messages))

In [25]:
len(word_features)

8003

### Train-test split

In [26]:
slice_index = int(len(messages)*0.8)

In [27]:
#Shuffle messages
random.shuffle(messages)

In [28]:
train_msgs, test_msgs = messages[:slice_index],messages[slice_index:]

In [29]:
print(len(train_msgs))
print(len(test_msgs))

4457
1115


### Create feature maps for train and test data set

In [30]:
# Creating lazymap
def feature_extraction(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [31]:
training_set = nltk.classify.apply_features(feature_extraction, train_msgs)
testing_set = nltk.classify.apply_features(feature_extraction, test_msgs)

In [32]:
print(training_set[:5])

[({'contains(jurong)': False, 'contains(point)': False, 'contains(crazy)': False, 'contains(available)': False, 'contains(bugis)': False, 'contains(great)': False, 'contains(world)': False, 'contains(buffet)': False, 'contains(...)': False, 'contains(cine)': False, 'contains(get)': False, 'contains(amore)': False, 'contains(wat)': False, 'contains(lar)': False, 'contains(joke)': False, 'contains(wif)': False, 'contains(oni)': False, 'contains(free)': False, 'contains(entry)': False, 'contains(wkly)': False, 'contains(comp)': False, 'contains(win)': False, 'contains(cup)': False, 'contains(final)': False, 'contains(tkts)': False, 'contains(21st)': False, 'contains(may)': False, 'contains(2005.)': False, 'contains(text)': False, 'contains(87121)': False, 'contains(receive)': False, 'contains(question)': False, 'contains(std)': False, 'contains(txt)': False, 'contains(rate)': False, 'contains(apply)': False, 'contains(08452810075over18)': False, 'contains(dun)': False, 'contains(say)': Fa

### Model training

In [33]:
spam_classifier = nltk.NaiveBayesClassifier.train(training_set)

### Evaluation

In [34]:
#Training accuracy
print(nltk.classify.accuracy(spam_classifier,training_set))

0.9919228180390397


In [35]:
#Test accuracy
print(nltk.classify.accuracy(spam_classifier,testing_set))

0.97847533632287


In [36]:
## Testing a example message with our newly trained classifier
m = 'CONGRATULATIONS!! As a valued account holder you have been selected to receive a £900 prize reward! Valid 12 hours only.'
print('Classification result : ', spam_classifier.classify(feature_extraction(m.split())))

Classification result :  spam


In [39]:
## Priting the most informative features in the classifier
print(spam_classifier.show_most_informative_features(10))

Most Informative Features
         contains(award) = True             spam : ham    =    169.0 : 1.0
         contains(nokia) = True             spam : ham    =    114.5 : 1.0
        contains(camera) = True             spam : ham    =    103.1 : 1.0
       contains(service) = True             spam : ham    =     91.4 : 1.0
          contains(code) = True             spam : ham    =     90.0 : 1.0
        contains(urgent) = True             spam : ham    =     83.7 : 1.0
       contains(attempt) = True             spam : ham    =     81.2 : 1.0
      contains(landline) = True             spam : ham    =     67.1 : 1.0
        contains(todays) = True             spam : ham    =     63.6 : 1.0
           contains(txt) = True             spam : ham    =     63.5 : 1.0
None


In [40]:
## storing the classifier on disk for later usage
import pickle
f = open('nb_spam_classifier.pickle', 'wb')
pickle.dump(spam_classifier,f)
print('Classifier stored at ', f.name)
f.close()

Classifier stored at  nb_spam_classifier.pickle
