In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
import re
import nltk
from contractions import contractions_dict
from nltk.corpus import stopwords
from spacy.lang.en.stop_words import STOP_WORDS
from itertools import filterfalse
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,recall_score

In [2]:
data = pd.read_csv("emails.csv")

In [3]:
def strip_titles(text):
    if "Subject: re :" in text:
        return text[13:]
    elif "Subject: news :" in text:
        return text[15:]
    else:
        return text[8:]
    
data['text'] = data['text'].apply(lambda x: strip_titles(x))

data['text'] = data['text'].apply(lambda x: word_tokenize(x))

In [4]:
def normalize_tokens(list_of_tokens):
    return map(lambda x: x.lower(),list_of_tokens)

data['text'] = data['text'].apply(lambda x: normalize_tokens(x))

data['text'] = data['text'].apply(lambda x: list(x))

In [5]:
def contracted_word_expansion(token):
    if token in contractions_dict.keys():
        return contractions_dict[token]
    else:
        return token
    
def contractions_expansion(list_of_tokens):
    return map(contracted_word_expansion,list_of_tokens)

data['text'] = data['text'].apply(lambda x: contractions_expansion(x))

data['text'] = data['text'].apply(lambda x: list(x))

In [6]:
regex = r'^@[a-zA-z0-9]|^#[a-zA-Z0-9]|\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*|\W+|\d+|<("[^"]*"|\'[^\']*\'|[^\'">])*>|_+|[^\u0000-\u007f]+'

def waste_word_or_not(token):
    return re.search(regex,token)

def filter_waste_words(list_of_tokens):
    return filterfalse(waste_word_or_not,list_of_tokens)

data['text'] = data['text'].apply(lambda x: filter_waste_words(x))

data['text'] = data['text'].apply(lambda x: list(x))

def split(list_of_tokens):
    return map(lambda x: re.split(regex,x)[0],list_of_tokens)

data['text'] = data['text'].apply(lambda x: split(x))

data['text'] = data['text'].apply(lambda x: list(x))

In [7]:
en_stop_words = list(set(stopwords.words('english')).union(set(STOP_WORDS)))

def is_stopword(token):
    return not(token in en_stop_words or re.search(r'\b\w\b|[^\u0000-\u007f]+|_+|\W+',token))

def stopwords_removal(list_of_tokens):
    return filter(is_stopword,list_of_tokens)

data['text'] = data['text'].apply(lambda x: stopwords_removal(x))

data['text'] = data['text'].apply(lambda x: list(x))

In [8]:
def get_wnet_pos_tag(treebank_tag):
    
    if treebank_tag[1].startswith('J'):
        return (treebank_tag[0],wordnet.ADJ)
    
    elif treebank_tag[1].startswith('V'):
        return (treebank_tag[0],wordnet.VERB)
    
    elif treebank_tag[1].startswith('N'):
        return (treebank_tag[0],wordnet.NOUN)
    
    elif treebank_tag[1].startswith('R'):
        return (treebank_tag[0],wordnet.ADV)
    
    else:
        (treebank_tag[0],wordnet.NOUN)
        
def get_pos_tag(list_of_tokens):
    return map(get_wnet_pos_tag,pos_tag(list_of_tokens))

data['text'] = data['text'].apply(lambda x: get_pos_tag(x))

data['text'] = data['text'].apply(lambda x: list(x))

In [9]:
lemmatizer = WordNetLemmatizer()

In [10]:
def token_lemmatization(token_pos_tuple):
    if token_pos_tuple == None:
        return ""
    else:
        return lemmatizer.lemmatize(word=token_pos_tuple[0],pos=token_pos_tuple[1])
    
def lemmatization(list_of_tokens):
    if len(list_of_tokens) > 0:
        return map(lambda x: token_lemmatization(x),list_of_tokens)
    
data['text'] = data['text'].apply(lambda x: lemmatization(x))

data['text'] = data['text'].apply(lambda x: list(x))

In [11]:
vocab = set()
for list_of_tokens in data['text']:
    vocab = vocab.union(set(list_of_tokens))
    
vocab = list(vocab)

while('' in vocab):
    vocab.remove('')
    
vocab_dict = dict(zip(vocab,list(range(0,len(vocab)))))

In [12]:
def join_tokens(list_of_tokens):
    return " ".join(list_of_tokens)

data['text'] = data['text'].apply(lambda x: join_tokens(x))

In [13]:
corpus = list()
for email_text in data['text']:
    corpus.append(email_text)

In [14]:
vectorizer = TfidfVectorizer(vocabulary=vocab_dict)

tf_idf_matrix = vectorizer.fit_transform(corpus)

tf_idf_matrix = tf_idf_matrix.toarray()

In [15]:
df = pd.DataFrame(tf_idf_matrix)

df['spam'] = data['spam']

In [16]:
gnb = GaussianNB()

In [17]:
components = [10, 20, 30, 40, 50, 60, 70, 100, 500, 1000, 5000]

def apply_pca(n):
    
    pca = PCA(n_components=n)

    tf_idf_matrix_reduced = pca.fit_transform(tf_idf_matrix)

    df = pd.DataFrame(data=tf_idf_matrix_reduced)

    df['spam'] = data['spam']
    
    X_data = df.iloc[:,0:n]
    
    y_data = df['spam']
    
    X_data = np.array(X_data)
    
    y_data = np.array(y_data)
    
    X_train, X_test, y_train, y_test = train_test_split( X_data, y_data, test_size=0.20)
    
    return  X_train, X_test, y_train, y_test
    


for n in components:

    X_train_data, X_test_data, y_train_data, y_test_data = apply_pca(n) 
    
    gnb.fit(X=X_train_data, y=y_train_data)
    
    predicted_categories = gnb.predict(X_train_data)
    
    print(f"Principle Components: {n}")
    
    print(classification_report(y_true=y_train_data, y_pred=predicted_categories))
    
    print("\n")
    


Principle Components: 10
              precision    recall  f1-score   support

           0       0.85      0.99      0.92      3510
           1       0.95      0.45      0.61      1072

    accuracy                           0.86      4582
   macro avg       0.90      0.72      0.76      4582
weighted avg       0.88      0.86      0.85      4582



Principle Components: 20
              precision    recall  f1-score   support

           0       0.85      0.99      0.91      3489
           1       0.94      0.43      0.59      1093

    accuracy                           0.86      4582
   macro avg       0.89      0.71      0.75      4582
weighted avg       0.87      0.86      0.84      4582



Principle Components: 30
              precision    recall  f1-score   support

           0       0.88      0.99      0.93      3476
           1       0.93      0.56      0.70      1106

    accuracy                           0.88      4582
   macro avg       0.90      0.77      0.81      

Since accuracy is found highest for 60 principle components, hence we will find test accuracy on the data applying pca with 60 principle components.

In [18]:
X_train_data, X_test_data, y_train_data, y_test_data = apply_pca(60)

In [19]:
gnb.fit(X=X_train_data, y=y_train_data)
    
predicted_categories = gnb.predict(X_train_data)

In [20]:
predicted_test_categories = gnb.predict(X_test_data)

In [21]:
print(classification_report(y_true=y_test_data, y_pred= predicted_test_categories))

              precision    recall  f1-score   support

           0       0.96      0.95      0.96       882
           1       0.84      0.88      0.86       264

    accuracy                           0.93      1146
   macro avg       0.90      0.91      0.91      1146
weighted avg       0.93      0.93      0.93      1146

