# Your Name: Stephanie Buchanan

# import all packages 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import nltk
import contractions
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.naive_bayes import GaussianNB, CategoricalNB, MultinomialNB
from sklearn.metrics import accuracy_score

# Data Prepocessing

In [2]:
#load the data
emails = pd.read_csv('spam.csv')
emails.drop('FILE_NAME', axis = 1, inplace = True)
emails.columns = ['spam', 'email_message']

#check for missing data
emails.isna().sum()

spam             0
email_message    0
dtype: int64

In [3]:
def clean(text, stopwords):
    # remove tags like <tab>
    text = re.sub(r'<[^<>]*>', ' ', text) 
    # split text on whitespace
    text_list = text.split()
    expanded_words = []
    text_words = []
    
    punctuation = set(string.punctuation)
    punctuation.add("“")
    punctuation.add("”")
    
    # keep #tags and @mentions
    ## punctuation.remove("#")
    ## punctuation.remove("@")
    
    for word in text_list:
        expanded_words.append(contractions.fix(word))
    
    for word in expanded_words:
        # remove punctuation marks at the beginning
        # of each word
        while len(word) > 0 and word[0] in punctuation:
            word = word[1:]
        # remove punctuation marks at the end of each word
        while len(word) > 0 and word[-1] in punctuation: 
            word = word[:-1]
        # a rule to eliminate most urls
        if len(word) > 0 and "/" not in word: 
            # eliminate stopwords
            if word.lower() not in stopwords:
                # append the word to the text_words list
                text_words.append(word.lower())
    
    cleaner_text = ' '.join([lemmatizer.lemmatize(w) for w in text_words])
    
    return cleaner_text

In [4]:
sw = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

In [5]:
emails['email_message'] = emails['email_message'].apply(clean, stopwords=sw) 

In [6]:
emails_data = emails[emails['email_message'].str.len() > 60] 

# Data Splitting 

In [7]:
X_train_em, X_test_em, y_train_em, y_test_em = train_test_split(emails_data['email_message'],emails_data['spam'], 
                                                                        test_size=0.30, random_state=42)

# Model Building and Evaluation 

In [8]:
tfidf2 = TfidfVectorizer(ngram_range=(1,2), stop_words="english", min_df=10, max_features=None)
email_pipe = Pipeline([("tfidf",tfidf2), ("mnb", MultinomialNB())])
email_pipe.fit(X_train_em, y_train_em)

scores_em = cross_val_score(estimator=email_pipe, X=X_train_em, y=y_train_em, cv=5) 

print(f'Average cross validation score: {scores_em.mean():.4f}')
print(f'Standard deviation of cross validation scores: {scores_em.std():.4f}')

Average cross validation score: 0.9826
Standard deviation of cross validation scores: 0.0042


In [9]:
print(f'Training accuracy for Email Multinomial NB model: {email_pipe.score(X_train_em, y_train_em):.4f}')
print(f'Test accuracy for Email Multinomial NB model: {email_pipe.score(X_test_em, y_test_em):.4f}')

Training accuracy for Email Multinomial NB model: 0.9871
Test accuracy for Email Multinomial NB model: 0.9823


In [10]:
param_grid_em = [{'mnb__alpha': [0.1, 0.01, 0.001],
                   'mnb__fit_prior': [True, False],
                   'tfidf__min_df':[5, 20],
                   'tfidf__ngram_range':[(1, 1), (1, 2)]}]

grid_em = GridSearchCV(estimator=email_pipe, param_grid =param_grid_em, cv=3) 

grid_em.fit(X_train_em, y_train_em)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('tfidf',
                                        TfidfVectorizer(min_df=10,
                                                        ngram_range=(1, 2),
                                                        stop_words='english')),
                                       ('mnb', MultinomialNB())]),
             param_grid=[{'mnb__alpha': [0.1, 0.01, 0.001],
                          'mnb__fit_prior': [True, False],
                          'tfidf__min_df': [5, 20],
                          'tfidf__ngram_range': [(1, 1), (1, 2)]}])

In [11]:
print("Best Params: ", grid_em.best_params_)
print(f'Training accuracy for Multinomial Grid Search: {grid_em.score(X_train_em, y_train_em):.4f}')
print(f'Test accuracy for Multinomial Grid Search: {grid_em.score(X_test_em, y_test_em):.4f}')

Best Params:  {'mnb__alpha': 0.01, 'mnb__fit_prior': False, 'tfidf__min_df': 5, 'tfidf__ngram_range': (1, 2)}
Training accuracy for Multinomial Grid Search: 0.9970
Test accuracy for Multinomial Grid Search: 0.9929


# Conclusion 

This project was aimed at classifying different email messages as either spam or not-spam.  The dataset was a labeled dataset, and the attributes were the email text and the spam label.  There was also a 'file name' attribute that contained a unique identifier for each email messgae that was dropped from the dataset to simplify the data.

A Multinomial Naive Bayes classifer was selected to classify email messages as spam or not-spam.  The email messages were cleaned for punctuation, the contractions were expanded, the words were lemmatized and the stopwords were removed.  The text was also vectorized using the TfidfVectorizer, which applies a weight to the tokens with higher weights given to more infrequent words relative to the whole corpus.  

The initial model performed well at a 98.3% accuracy using cross-validation with 5-folds. Overfitting was not observed as the training and test scores were both very similar.  A grid search was performed to tune the 'alpha' and 'fit_prior' paramters for the classifier, and the 'min_df' and 'ngram_range' parameters for the TfidfVectorizer.  'Alpha' is the Laplace smoothing parameter, and the 'fit_prior' parameter is a boolean representing whether to learn class prior probabilities or not.  The 'min_df' represents the number of documents that is the threshold, thus instructs the algorithm to ignore terms that have a document frequency strictly lower than the given value. The 'ngram_range' parameter gives the upper and lower range for the n-grams to be extracted.

In conclusion, the Multinomial Naive Bayes classifier is a good choice for detecting spam from email messages.  The training accuracy with the parameters: {'mnb__alpha': 0.01, 'mnb__fit_prior': False, 'tfidf__min_df': 5, 'tfidf__ngram_range': (1, 2)} was 99.7% and the test accuracy was 99.3%.  Overfitting does not seem to be present in this model due to the fact the testing and test accuracies were very similar and not significantly different.  This model can be used to classify future email messages as spam or not-spam.