<a href="https://colab.research.google.com/github/s10114618/Machine-Learning-NLP-Spam-Detector/blob/master/SpamDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Challenge
In this challenge, we will use Natural Language Processing and Naive Bayes Classifier to detect spam. We will compare Naive Bayes with Logistic Regression to understand which is better for spam detection.


### Concepts
1. Try out Naive Bayes Classifier on another dataset and analyze the outcome
2. Compare Naive Bayes against LogisticRegression

### Useful Links
[Email Dataset](http://nlp.cs.aueb.gr/software_and_datasets/Enron-Spam/index.html) <br>
[Supervised Classification](https://www.nltk.org/book/ch06.html)<br>
[Feature Extraction](https://scikit-learn.org/stable/modules/feature_extraction.html) <br>
[Lemmatization](https://www.machinelearningplus.com/nlp/lemmatization-examples-python/)<br>
[NLP Vectorization](https://towardsdatascience.com/natural-language-processing-count-vectorization-with-scikit-learn-e7804269bb5e)

In [None]:
# Imports and additional functions which will be used for the Lab.
from __future__ import print_function, division
import pandas as pd
import string
import nltk
import random
import os
import tarfile

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction import DictVectorizer
from collections import Counter
from nltk import word_tokenize, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import NaiveBayesClassifier, classify

In [None]:
# Download ntlk data to working directory
dirpath = os.getcwd()
# Nltk is NLP tool kit
nltk.data.path = [dirpath+'/nltk_data']
nltk.download('stopwords', download_dir='nltk_data') #i me my myself we our ours
nltk.download('punkt', download_dir='nltk_data') #
nltk.download('wordnet', download_dir='nltk_data') #
stoplist = stopwords.words('english')
print("Print the length of Stoplist: "+ str(len(stoplist)))
print("Print some example of Stoplist: " + str(stoplist[-5:]))

[nltk_data] Downloading package stopwords to nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Print the length of Stoplist: 179
Print some example of Stoplist: ["weren't", 'won', "won't", 'wouldn', "wouldn't"]


In [None]:
# Download online Spam dataset
import urllib.request
import tarfile
enron2_dataset_url = "http://nlp.cs.aueb.gr/software_and_datasets/Enron-Spam/preprocessed/enron2.tar.gz"
enron2_dataset_fileobj = urllib.request.urlopen(enron2_dataset_url)

tf = tarfile.open(fileobj=enron2_dataset_fileobj, mode="r|gz")
tf.extractall(path=".")

# Read each item in file and store into a List Object
def init_lists(folder):
    key_list = []
    file_content = os.listdir(folder)
    for a_file in file_content:
        f = open(folder + a_file, 'r', encoding='cp437')
        key_list.append(f.read())
    f.close()
    return key_list

spam = init_lists('./enron2/spam/')
ham = init_lists('./enron2/ham/')

print("Print the length of Spam Messages: "+ str(len(spam)))
print("Print some example of Spam: " + str((spam[10])))

Print the length of Spam Messages: 1496
Print some example of Spam: Subject: save your money buy getting this thing here
you have not tried cialls yet ?
than you cannot even imagine what it is like to be a real man in bed !
the thing is that a great errrectlon is provided for you exactly when you want .
ciails has a iot of advantaqes over viaqra
- the effect lasts 36 hours !
- you are ready to start within just 10 minutes !
- you can mix it with aicohol ! we ship to any country !
get it riqht now ! .



In [None]:
print("Print the length of ham Messages: "+ str(len(ham)))
print("Print some example of ham: " + str((ham[10])))

Print the length of ham Messages: 4361
Print some example of ham: Subject: re : grades
mr . kaminsky ,
i still need grades for :
israni , rakhi
lu , feng
planck , jeffrey
so , winny
taylor , orlando
wankhade , sanjay
zhang , ning
i will be available by e - mail this evening or by phone ( 5 : 30 or so ) at
713 - 668 - 1704 . ? i just called the registrar ' s office and if i bring in the
grades by 8 : 30 tomorrow morning we will be fine . ? please advise .
thanks for your help . - pam
at 08 : 23 am 5 / 4 / 01 - 0500 , vince . j . kaminski @ enron . com wrote :
pam ,
the last group .
please , let me know if any name is missing .
( embedded image moved to file : pic 25177 . pcx )
grade : a
thanks a lot . it was a pleasure working with you .
vince kaminski


In [None]:
all_mails = [(mail, 'spam') for mail in spam]
all_mails += [(mail, 'ham') for mail in ham]
print ('Corpus of size = ' + str(len(all_mails)) + ' mails')

Corpus of size = 5857 mails


In [None]:
## Tokenise sentences into word/char
def get_features(text, setting):
    if setting=='bow':
        return {word: count for word, count in Counter(preprocess(text)).items() if not word in stoplist}
    else:
        return {word: True for word in preprocess(text) if not word in stoplist}

def preprocess(sentence):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(sentence)]

all_features = [(get_features(mail, ''), label) for (mail, label) in all_mails]
print ('Fetched ' + str(len(all_features)) + ' feature sets')
print("Example of tokenized word/char: "+str(all_features[0:1][0:5]))

Fetched 5857 feature sets
Example of tokenized word/char: [({'subject': True, ':': True, 'returned': True, 'mail': True, 'message': True, 'sent': True, 'could': True, 'delivered': True, '.': True, 'original': True, 'wa': True, 'received': True, '19': True, 'jul': True, '2005': True, '10': True, '57': True, '00': True, '+': True, '0100': True, '?': True, '-': True, 'following': True, 'address': True, 'delivery': True, 'problem': True, '(': True, 'permanent': True, 'unrecoverable': True, 'error': True, ')': True, 'diese': True, 'e': True, 'enthσlt': True, 'vertrauliche': True, 'und': True, '/': True, 'oder': True, 'rechtlich': True, 'geschⁿtzte': True, 'informationen': True, 'wenn': True, 'sie': True, 'nicht': True, 'der': True, 'richtige': True, 'adressat': True, 'sind': True, 'irrtⁿmlich': True, 'erhalten': True, 'haben': True, ',': True, 'informieren': True, 'bitte': True, 'sofort': True, 'den': True, 'absender': True, 'vernichten': True, 'da': True, 'unerlaubte': True, 'kopieren': Tr

In [None]:
def train(features, samples_proportion):
    train_size = int(len(features) * samples_proportion)
    # initialise the training and test sets
    train_set, test_set = features[:train_size], features[train_size:]
    print ('Training set of size= ' + str(len(train_set)) + ' mails')
    print ('Test set of size = ' + str(len(test_set)) + ' mails')
    # train the classifier
    classifier = NaiveBayesClassifier.train(train_set)
    nbc_accuracy = classify.accuracy(classifier, test_data) * 100
    return train_set, test_set, classifier, nbc_accuracy

# train the classifier and get the training and test dataset
train_set, test_set, classifier, nbc_accuracy = train(all_features, 0.8)

Training set of size= 4685 mails
Test set of size = 1172 mails


In [None]:
def evaluate(train_set, test_set, classifier):
    # test accuracy of classifier on training and test set
    print ('Training set accuracy = ' + str(classify.accuracy(classifier, train_set)))
    print ('Test set accuracy = ' + str(classify.accuracy(classifier, test_set)))
    # check most informative words for the classifier
    classifier.show_most_informative_features(20)

# evaluate performance
evaluate(train_set, test_set, classifier)

Training set accuracy = 0.9980789754535753
Test set accuracy = 0.9982935153583617
Most Informative Features
                   vince = True              ham : spam   =    635.6 : 1.0
                     hou = True              ham : spam   =    338.4 : 1.0
                     ect = True              ham : spam   =    215.4 : 1.0
                 shirley = True              ham : spam   =    167.2 : 1.0
                      cc = True              ham : spam   =    162.6 : 1.0
                     713 = True              ham : spam   =    150.6 : 1.0
                     oem = True             spam : ham    =     90.2 : 1.0
                     853 = True              ham : spam   =     89.3 : 1.0
              macromedia = True             spam : ham    =     87.4 : 1.0
               forwarded = True              ham : spam   =     85.5 : 1.0
                     php = True             spam : ham    =     81.7 : 1.0
                   woman = True             spam : ham    =     80.

#Same dataset, different model - Logistic Regression model accuracy


In [None]:
df_mails = pd.DataFrame(all_features, columns=['words','label'])
df_mails.head()

Unnamed: 0,words,label
0,"{'subject': True, ':': True, 'returned': True,...",spam
1,"{'subject': True, ':': True, 'esecure': True, ...",spam
2,"{'subject': True, ':': True, 'online': True, '...",spam
3,"{'subject': True, ':': True, 'free': True, 'st...",spam
4,"{'subject': True, ':': True, 'catalogue': True...",spam


In [None]:
# Fit the dataframe column into the Vectorizer and get feature names
dictVectorizer = DictVectorizer()
train_data_dict = dictVectorizer.fit_transform(df_mails['words'])

In [None]:
# Following the train function for Naive Bayes, use the first 80% to train, next 20% to test. 
split_size = round(df_mails.shape[0] * 0.8)

# Get the features and result columns
att = train_data_dict
res = df_mails['label']

# Split training data
train_att = att[:split_size]
train_res = res[:split_size]

# Split testing data
test_att = att[split_size:]
test_res = res[split_size:]

In [None]:
# Do LogisticRegression
clf = LogisticRegression()
clf.fit(train_att, train_res)
pred = clf.predict(test_att)
lr_accuracy = 100.0 * accuracy_score(test_res, pred)
print("LR accuracy: " + str(lr_accuracy))

LR accuracy: 99.1460290350128


In [None]:
## Final comparison
print("NB: " + str(nbc_accuracy) + "\nLR: " + str(lr_accuracy))

NB: 99.57337883959045
LR: 99.1460290350128
