In [None]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
import re
import nltk
from contractions import contractions_dict
from nltk.corpus import stopwords
from spacy.lang.en.stop_words import STOP_WORDS
from itertools import filterfalse
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

In [None]:
# Load the raw e-mail dataset. Later cells read its 'text' column (message
# body) and its 'spam' column (label).
data = pd.read_csv("emails.csv")

In [None]:
def strip_titles(text):
    """Remove the leading subject header from an e-mail body.

    The longest matching header is removed: ``"Subject: re :"``,
    ``"Subject: news :"`` or plain ``"Subject:"``.

    Args:
        text: Raw e-mail text.

    Returns:
        ``text`` without its leading header. Text that does not start with
        ``"Subject:"`` is returned unchanged.
    """
    # Checked at the *start* of the text only: a quoted "Subject: re :" further
    # down the body must not decide how many leading characters are cut.
    # Ordered most-specific first so "Subject:" does not shadow the others.
    for header in ("Subject: re :", "Subject: news :", "Subject:"):
        if text.startswith(header):
            return text[len(header):]
    return text

In [None]:
# Drop the subject header from every message.
data['text'] = data['text'].apply(strip_titles)

In [None]:
# Tokenise each message into a list of word tokens.
data['text'] = data['text'].apply(word_tokenize)

In [None]:
def normalize_tokens(list_of_tokens):
    """Return a lazy ``map`` yielding the lower-cased form of each token."""
    def _lowercase(token):
        return token.lower()

    return map(_lowercase, list_of_tokens)

In [None]:
# Lower-case every token (each row becomes a lazy map object).
data['text'] = data['text'].apply(normalize_tokens)

In [None]:
# Materialise the lazy map from the previous step into a list.
data['text'] = data['text'].apply(list)

In [None]:
def contracted_word_expansion(token):
    """Return the expansion of ``token`` from ``contractions_dict``.

    Tokens that are not keys of ``contractions_dict`` are returned unchanged.
    """
    return contractions_dict.get(token, token)

In [None]:
def contractions_expansion(list_of_tokens):
    """Return a lazy ``map`` applying ``contracted_word_expansion`` to each token."""
    expanded_tokens = map(contracted_word_expansion, list_of_tokens)
    return expanded_tokens

In [None]:
# Expand contractions token by token (each row becomes a lazy map object).
data['text'] = data['text'].apply(contractions_expansion)

In [None]:
# Materialise the lazy map from the previous step into a list.
data['text'] = data['text'].apply(list)

In [None]:
# Pattern for "waste" token content. It is used with re.search (to drop a token
# that contains any match) and with re.split (to cut a token at the first match).
# The first character class previously read [a-zA-z0-9]; the A-z range also
# covers the punctuation between 'Z' and 'a' ([, \, ], ^, _, `), which was a typo.
regex = (
    r'^@[a-zA-Z0-9]'                                        # leading '@' + alphanumeric
    r'|^#[a-zA-Z0-9]'                                       # leading '#' + alphanumeric
    r'|\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*'    # scheme://host/path
    r'|\W+'                                                 # run of non-word characters
    r'|\d+'                                                 # run of digits
    r'|<("[^"]*"|\'[^\']*\'|[^\'">])*>'                     # <...> markup, quotes allowed inside
    r'|_+'                                                  # run of underscores
    r'|[^\u0000-\u007f]+'                                   # run of non-ASCII characters
)

In [None]:
def waste_word_or_not(token):
    """Search ``token`` for the module-level ``regex`` pattern.

    Returns the first ``re.Match`` found anywhere in the token, or ``None``
    when nothing matches.
    """
    waste_pattern = re.compile(regex)
    return waste_pattern.search(token)

In [None]:
def filter_waste_words(list_of_tokens):
    """Return a lazy iterator over the tokens for which ``waste_word_or_not`` is falsy."""
    kept_tokens = filterfalse(waste_word_or_not, list_of_tokens)
    return kept_tokens

In [None]:
# Discard every token that contains a match of the waste pattern.
data['text'] = data['text'].apply(filter_waste_words)

In [None]:
# Materialise the lazy filter from the previous step into a list.
data['text'] = data['text'].apply(list)

In [None]:
def split(list_of_tokens):
    """Return a lazy ``map`` of each token cut at its first ``regex`` match.

    Only the piece before the first match is kept; tokens with no match
    come through unchanged.
    """
    def _leading_piece(token):
        pieces = re.split(regex, token)
        return pieces[0]

    return map(_leading_piece, list_of_tokens)

In [None]:
# Keep only the part of each token that precedes its first waste-pattern match.
data['text'] = data['text'].apply(split)

In [None]:
# Materialise the lazy map from the previous step into a list.
data['text'] = data['text'].apply(list)

In [None]:
# Union of the NLTK and spaCy English stop-word lists. Stored as a frozenset
# because it is only used for per-token `in` checks: a hash lookup instead of
# a linear scan of a list for every token of every message.
en_stop_words = frozenset(stopwords.words('english')).union(STOP_WORDS)

In [None]:
def is_stopword(token):
    """Filter predicate: ``True`` when ``token`` should be KEPT.

    Despite its name, this returns ``False`` for tokens found in
    ``en_stop_words`` and for tokens containing a single-character word,
    non-ASCII characters, underscores or non-word characters. It returns
    ``True`` for everything else.
    """
    if token in en_stop_words:
        return False
    noise = re.search(r'\b\w\b|[^\u0000-\u007f]+|_+|\W+', token)
    return noise is None

In [None]:
def stopwords_removal(list_of_tokens):
    """Return a lazy iterator over the tokens for which ``is_stopword`` is truthy."""
    kept_tokens = filter(is_stopword, list_of_tokens)
    return kept_tokens

In [None]:
# Remove stop words and other rejected tokens (each row becomes a lazy filter).
data['text'] = data['text'].apply(stopwords_removal)

In [None]:
# Materialise the lazy filter from the previous step into a list.
data['text'] = data['text'].apply(list)

In [None]:
def get_wnet_pos_tag(treebank_tag):
    """Translate a ``(token, treebank_tag)`` pair into ``(token, wordnet_pos)``.

    Args:
        treebank_tag: Two-item sequence whose first item is the token and
            whose second item is its POS tag string.

    Returns:
        ``(token, pos)`` where ``pos`` is ``wordnet.ADJ``, ``wordnet.VERB``,
        ``wordnet.NOUN`` or ``wordnet.ADV`` for tags starting with ``J``,
        ``V``, ``N`` or ``R``. Any other tag falls back to ``wordnet.NOUN``.
    """
    token, tag = treebank_tag[0], treebank_tag[1]
    if tag.startswith('J'):
        return (token, wordnet.ADJ)
    if tag.startswith('V'):
        return (token, wordnet.VERB)
    if tag.startswith('N'):
        return (token, wordnet.NOUN)
    if tag.startswith('R'):
        return (token, wordnet.ADV)
    # The fallback tuple used to be built but never returned, so every token
    # with another tag came back as None.
    return (token, wordnet.NOUN)

In [None]:
def get_pos_tag(list_of_tokens):
    """Tag the tokens with ``pos_tag`` and lazily convert each pair with ``get_wnet_pos_tag``."""
    tagged_tokens = pos_tag(list_of_tokens)
    return map(get_wnet_pos_tag, tagged_tokens)

In [None]:
# Replace each token with a (token, WordNet POS) pair (each row becomes a lazy map).
data['text'] = data['text'].apply(get_pos_tag)

In [None]:
# Materialise the lazy map from the previous step into a list.
data['text'] = data['text'].apply(list)

In [None]:
# Single shared lemmatizer instance, used by token_lemmatization below.
lemmatizer = WordNetLemmatizer()

In [None]:
def token_lemmatization(token_pos_tuple):
    """Lemmatize one ``(token, wordnet_pos)`` pair.

    Args:
        token_pos_tuple: ``(token, pos)`` pair, or ``None``.

    Returns:
        The lemma produced by the module-level ``lemmatizer``, or ``""`` when
        ``token_pos_tuple`` is ``None``.
    """
    # Identity test: `== None` would defer to the operand's own __eq__.
    if token_pos_tuple is None:
        return ""
    token, wordnet_pos = token_pos_tuple[0], token_pos_tuple[1]
    return lemmatizer.lemmatize(word=token, pos=wordnet_pos)

In [None]:
def lemmatization(list_of_tokens):
    """Return a lazy ``map`` of the lemma of every ``(token, pos)`` pair.

    An empty input yields an empty iterator. It previously yielded ``None``,
    which made the subsequent ``list(...)`` conversion raise ``TypeError``.
    """
    return map(token_lemmatization, list_of_tokens)

In [None]:
# Lemmatize every (token, POS) pair of every message.
data['text'] = data['text'].apply(lemmatization)

In [None]:
# Materialise the result of the previous step into a list of lemmas.
data['text'] = data['text'].apply(list)

In [None]:
# Collect every distinct token across all messages. update() grows the set in
# place; union() built a brand-new copy of the whole vocabulary for every row.
vocab = set()
for list_of_tokens in data['text']:
    vocab.update(list_of_tokens)

In [None]:
# Freeze the vocabulary into a list so each term can be given a positional index.
# The resulting order is whatever order the set iterates in.
vocab = list(vocab)

In [None]:
# Remove the empty-string entry (token_lemmatization emits "" for missing
# pairs) by value rather than by position. `vocab` comes from a set, whose
# order is arbitrary, so `vocab.pop(0)` could drop a real term, and dropped
# another one each time the cell was re-run.
# NOTE(review): presumably "" is what pop(0) was meant to discard; confirm.
vocab = [token for token in vocab if token != ""]

In [None]:
# Map each vocabulary term to its position in `vocab`.
vocab_dict = {term: position for position, term in enumerate(vocab)}

In [None]:
def join_tokens(list_of_tokens):
    """Concatenate the tokens into one string, separated by single spaces."""
    separator = " "
    return separator.join(list_of_tokens)

In [None]:
# Turn each token list back into a single space-separated string.
data['text'] = data['text'].apply(join_tokens)

In [None]:
# Plain Python list of the cleaned message strings, in row order.
corpus = [email_text for email_text in data['text']]

In [None]:
# Compute TF-IDF features over the cleaned corpus, restricted to the
# precomputed vocabulary (vocab_dict maps each term to an integer index).
vectorizer = TfidfVectorizer(vocabulary=vocab_dict)
tf_idf_matrix = vectorizer.fit_transform(corpus)

In [None]:
# Convert the TF-IDF result to a dense array; PCA is later fitted on this dense form.
# NOTE(review): documents x vocabulary as a dense array may be very large — confirm it fits in memory.
tf_idf_matrix = tf_idf_matrix.toarray()

In [None]:
# Wrap the full TF-IDF matrix in a DataFrame.
# NOTE(review): `df` is reassigned to the PCA-reduced frame further down before
# any model uses it, so this frame looks unused — confirm and consider removing.
df = pd.DataFrame(tf_idf_matrix)

In [None]:
# Attach the label column from the source data to the feature frame.
df['spam'] = data['spam']

In [None]:
# Reduce the TF-IDF features to 5000 principal components. The same 5000 is
# hard-coded again where the feature columns are sliced for training.
pca = PCA(n_components=5000)

In [None]:
# Fit PCA on the dense TF-IDF matrix and project it onto the retained components.
tf_idf_matrix_reduced = pca.fit_transform(tf_idf_matrix)

In [None]:
# Rebuild `df` from the PCA-reduced features (replaces the earlier full TF-IDF frame).
df = pd.DataFrame(data=tf_idf_matrix_reduced)

In [None]:
# Attach the label column from the source data to the reduced feature frame.
df['spam'] = data['spam']

In [None]:
# Gaussian Naive Bayes classifier with default settings.
gnb = GaussianNB()

In [None]:
# Labels come from the 'spam' column; features are the first 5000 columns
# (the PCA components) of the reduced frame.
y_train = df['spam']
X_train = df.iloc[:, :5000]

In [None]:
# Fit the classifier on all rows of the reduced feature frame.
gnb.fit(X=X_train,y=y_train)

In [None]:
# NOTE(review): predictions are made on the same rows the model was fitted on,
# so they reflect training fit rather than performance on unseen messages.
predicted_categories = gnb.predict(X_train)

In [None]:
# Per-class precision/recall/F1. Both y_true and y_pred come from the training
# rows, so this is a training-set report (no held-out split is made in this notebook).
print(classification_report(y_true=y_train,y_pred=predicted_categories))