In [42]:
import os
import glob
import numpy as np
import email

In [43]:
path = './datasets/spam/'

In [44]:
easy_ham_paths = glob.glob(path+'easy_ham/*')
spam_paths = glob.glob(path+'spam/*')

In [45]:
def get_email_content(email_path):
    file = open(email_path,encoding='latin1')
    try:
        msg = email.message_from_file(file)
        for part in msg.walk():
            if part.get_content_type() == 'text/plain':
                return part.get_payload() 
    except Exception as e:
        print(e)
        
        
def get_email_content_bulk(email_paths):
    email_contents = [get_email_content(o) for o in email_paths]
    return email_contents

In [46]:
from sklearn.model_selection import train_test_split

In [47]:
ham_path=[easy_ham_paths]
spam_path=[spam_paths]

In [48]:
ham_sample = np.array([train_test_split(o) for o in ham_path])

  """Entry point for launching an IPython kernel.


In [49]:
ham_train = np.array([])
ham_test = np.array([])
for o in ham_sample:
    ham_train = np.concatenate((ham_train,o[0]),axis=0)
    ham_test = np.concatenate((ham_test,o[1]),axis=0)

In [50]:
ham_train.shape, ham_test.shape

((1875,), (626,))

In [51]:
spam_sample = np.array([train_test_split(o) for o in spam_path])

  """Entry point for launching an IPython kernel.


In [52]:
spam_train = np.array([])
spam_test = np.array([])
for o in spam_sample:
    spam_train = np.concatenate((spam_train,o[0]),axis=0)
    spam_test = np.concatenate((spam_test,o[1]),axis=0)

In [53]:
spam_train.shape, spam_test.shape

((375,), (126,))

In [54]:
ham_train_label = [0]*ham_train.shape[0]
spam_train_label = [1]*spam_train.shape[0]
x_train = np.concatenate((ham_train,spam_train))
y_train = np.concatenate((ham_train_label,spam_train_label))

In [55]:
ham_test_label = [0]*ham_test.shape[0]
spam_test_label = [1]*spam_test.shape[0]
x_test = np.concatenate((ham_test,spam_test))
y_test = np.concatenate((ham_test_label,spam_test_label))

In [56]:
train_shuffle_index = np.random.permutation(np.arange(0,x_train.shape[0]))
test_shuffle_index = np.random.permutation(np.arange(0,x_test.shape[0]))

In [57]:
x_train = x_train[train_shuffle_index]
y_train = y_train[train_shuffle_index]

In [58]:
x_test = x_test[test_shuffle_index]
y_test = y_test[test_shuffle_index]

In [59]:
x_train = get_email_content_bulk(x_train)
x_test = get_email_content_bulk(x_test)

In [60]:
def remove_null(datas,labels):
    not_null_idx = [i for i,o in enumerate(datas) if o is not None]
    return np.array(datas)[not_null_idx],np.array(labels)[not_null_idx]

In [61]:
x_train,y_train = remove_null(x_train,y_train)
x_test,y_test = remove_null(x_test,y_test)

In [62]:
import re
import string
from nltk.tokenize import word_tokenize
def remove_hyperlink(word):
    return  re.sub(r"http\S+", "", word)
def to_lower(word):
    result = word.lower()
    return result
def remove_number(word):
    result = re.sub(r'\d+', '', word)
    return result
def remove_punctuation(word):
    result = word.translate(str.maketrans(dict.fromkeys(string.punctuation)))
    return result
def remove_whitespace(word):
    result = word.strip()
    return result
def replace_newline(word):
    return word.replace('\n','')
def clean_up_pipeline(sentence):
    cleaning_utils = [remove_hyperlink,
                      replace_newline,
                      to_lower,
                      remove_number,
                      remove_punctuation,remove_whitespace]
    for o in cleaning_utils:
        sentence = o(sentence)
    return sentence

In [63]:
x_train = [clean_up_pipeline(o) for o in x_train]
x_test = [clean_up_pipeline(o) for o in x_test]

In [64]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [65]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [66]:
x_train = [word_tokenize(o) for o in x_train]
x_test = [word_tokenize(o) for o in x_test]

In [67]:
def remove_stop_words(words):
    result = [i for i in words if i not in ENGLISH_STOP_WORDS]
    return result

In [68]:
def word_stemmer(words):
    return [stemmer.stem(o) for o in words]

In [69]:
def word_lemmatizer(words):
    return [lemmatizer.lemmatize(o) for o in words]

In [70]:
def clean_token_pipeline(words):
    cleaning_utils = [remove_stop_words,word_lemmatizer]
    for o in cleaning_utils:
        words = o(words)
    return words

In [71]:
x_train = [clean_token_pipeline(o) for o in x_train]
x_test = [clean_token_pipeline(o) for o in x_test]

In [72]:
x_train = [" ".join(o) for o in x_train]
x_test = [" ".join(o) for o in x_test]

In [73]:
x_train = [o.split(" ") for o in x_train]
x_test = [o.split(" ") for o in x_test]

In [74]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [75]:
vectorizer = TfidfVectorizer()
raw_sentences = [' '.join(o) for o in x_train]
vectorizer.fit(raw_sentences)

TfidfVectorizer()

In [76]:
def convert_to_feature(raw_tokenize_data):
    raw_sentences = [' '.join(o) for o in raw_tokenize_data]
    return vectorizer.transform(raw_sentences)

In [77]:
x_train_features = convert_to_feature(x_train)
x_test_features = convert_to_feature(x_test)

In [78]:
from sklearn.feature_extraction.text import CountVectorizer

In [79]:
vectorizer = CountVectorizer()
raw_sentences = [' '.join(o) for o in x_train]
vectorizer.fit(raw_sentences)

CountVectorizer()

In [80]:
x_train_features = convert_to_feature(x_train)
x_test_features = convert_to_feature(x_test)

In [81]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)
score = cross_val_score(log_clf, x_train_features, y_train, cv=3, verbose=3)
score.mean()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] .................................... , score=0.974, total=   0.5s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s


[CV] .................................... , score=0.976, total=   0.6s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.0s remaining:    0.0s


[CV] .................................... , score=0.981, total=   0.4s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.4s finished


0.9771012758026544