In [1]:
import glob
import numpy as np

def pathToList(path = "data/enron1/ham/*.txt", unnecessary = ["-", ".", ",", "/", ":", "@"]):
    """Read every file matching a glob pattern and return the cleaned texts.

    Each file is decoded as ISO-8859-1, lower-cased, and stripped of every
    character listed in ``unnecessary``.

    Args:
        path: glob pattern selecting the files to read.
        unnecessary: collection of single characters to delete from each text.
            ``None`` or an empty collection disables the filtering. The default
            list is only read, never mutated.

    Returns:
        list of str: one cleaned text per matched file, ordered by file path.
        An empty list when nothing matches the pattern.
    """
    # A set gives O(1) membership tests in the per-character filter below.
    removed = frozenset(unnecessary) if unnecessary is not None else frozenset()

    content_list = []
    # glob.glob returns matches in arbitrary, OS-dependent order; sorting makes
    # the result identical on every run and machine.
    for file in sorted(glob.glob(path)):
        with open(file, encoding="ISO-8859-1") as f:
            content = f.read().lower()
        # Truthiness test replaces `len(...) is not 0`, which compared an int by
        # identity and triggers a SyntaxWarning on Python 3.8+.
        if removed:
            content = ''.join(c for c in content if c not in removed)
        content_list.append(content)

    return content_list

In [10]:
# Gather the ham emails of every corpus into one flat list
ham_paths = ["data/enron1/ham/*.txt", "data/enron2/ham/*.txt"]

ham = [email for pattern in ham_paths for email in pathToList(pattern)]

print(len(ham))

8033


In [3]:
# Gather the spam emails of every corpus into one flat list
spam_paths = ["data/enron1/spam/*.txt", "data/enron2/spam/*.txt"]

spam = [email for pattern in spam_paths for email in pathToList(path = pattern)]

print(len(spam))

2996


In [12]:
# A memory issue occurred on my computer with the full ham set,
# so keep only a random subset of the ham emails.
import random

HAM_SAMPLE_SIZE = 3000  # number of ham emails to keep
HAM_SAMPLE_SEED = 42    # fixed seed so the same subset is drawn on every run

# A private Random instance makes the draw reproducible without reseeding the
# global `random` state (assuming `ham` itself arrives in a stable order).
# sample() returns a new list instead of shuffling `ham` in place, and min()
# keeps every email when fewer than HAM_SAMPLE_SIZE are available.
ham = random.Random(HAM_SAMPLE_SEED).sample(ham, min(HAM_SAMPLE_SIZE, len(ham)))

In [13]:
from collections import Counter

def build_vocab_int_dict(listed_data):
    '''
    Map every distinct word to an integer id ranked by frequency.

    arg: listed_data: iterable of strings (e.g. the ham or spam emails);
         each string is tokenized on whitespace
    return: dict { word_n: id_n, ... } where id 1 belongs to the most frequent
            word, id 2 to the next one, and so on; words with equal counts keep
            the order in which they were first seen. 0 is never assigned
            because it is reserved for padding.
    '''
    # Count email by email. Growing one big list with `all_words + words`
    # re-copied the whole list on every iteration (quadratic in corpus size).
    count_words = Counter()
    for email in listed_data:
        count_words.update(email.split())

    # most_common() without an argument lists every word, most frequent first
    sorted_words = count_words.most_common()

    # ids start from 1, since 0 is reserved for padding
    return {word: index for index, (word, _) in enumerate(sorted_words, start=1)}
    

In [14]:
def encode_words(listed_data, vocab_int_dict):
    """Translate each email into the list of integer ids of its words.

    Every email is tokenized on whitespace and each token is looked up in
    ``vocab_int_dict``; a token missing from the mapping raises ``KeyError``.
    Returns one list of ids per email, in the order of ``listed_data``.
    """
    return [
        [vocab_int_dict[token] for token in text.split()]
        for text in listed_data
    ]

In [15]:
# Build ONE vocabulary over both classes. With a separate vocabulary per class,
# the same integer id stood for different words in ham and in spam emails, so
# the encoded inputs were not comparable across the two classes.
vocab_int_dict = build_vocab_int_dict(ham + spam)
ham_encoded_words = encode_words(ham, vocab_int_dict)
spam_encoded_words = encode_words(spam, vocab_int_dict)
# ham first, then spam
all_encoded_words = ham_encoded_words + spam_encoded_words

In [16]:
# Sanity check: number of encoded emails across both classes
print(len(all_encoded_words))

5996


In [17]:
# One label per email: 0 marks ham, 1 marks spam (ham first, then spam)
ham_label = [0] * len(ham)
spam_label = [1] * len(spam)
all_label = ham_label + spam_label

In [18]:
# Sanity check: number of labels across both classes
print(len(all_label))

5996


In [19]:
def padding(encoded_words):
    '''
    Right-pad every encoded email with 0 so that all of them have the same length.

    arg: encoded_words : list of lists
    return: a new list of lists, each as long as the longest input list;
            an empty input gives an empty list. The list passed in is not modified.
    '''
    if len(encoded_words) == 0:
        return []

    # the longest one will be the size of input to the model
    size = max(len(x) for x in encoded_words)

    # Build a new outer list (0 is padding) instead of overwriting the caller's entries
    return [x + [0] * (size - len(x)) for x in encoded_words]

In [20]:
padded = padding(all_encoded_words)

In [21]:
import random
# python list
# shuffle two lists at the same time
def shuffle(a, b):
    """Shuffle two sequences with the same random order.

    Elements are paired positionally, the pairs are shuffled with the global
    ``random`` state, and the pairs are split again. If the inputs differ in
    length, zip() drops the extra elements of the longer one.

    Returns:
        Two tuples (shuffled a, shuffled b); two empty tuples for empty input.
    """
    c = list(zip(a, b))
    if not c:
        # zip(*[]) yields nothing, so unpacking it into two names would raise ValueError
        return (), ()
    random.shuffle(c)
    a, b = zip(*c)
    return a, b

# np array
# assumes that a and b have the same length along axis 0
import numpy as np
def np_shuffle(a, b):
    """Reorder two NumPy arrays along their first axis with one shared permutation.

    A random permutation of range(a.shape[0]) is drawn from NumPy's global
    random state and applied to both arrays, so entry i of ``a`` and entry i
    of ``b`` stay paired. Returns the two reordered arrays as a tuple.
    """
    order = np.arange(a.shape[0])
    np.random.shuffle(order)
    shuffled_a = a[order]
    shuffled_b = b[order]
    return shuffled_a, shuffled_b

In [28]:
# Convert to arrays, then shuffle emails and labels with the same permutation
inputs, labels = np_shuffle(np.array(padded), np.array(all_label))
print(inputs.shape, labels.shape)

(5996, 5420) (5996,)


In [29]:
PCT_TRAIN = 0.7
PCT_VALID = 0.2

length = len(labels)

# Slice boundaries: [0, train_end) is train, [train_end, valid_end) is
# validation, and everything from valid_end on is test.
train_end = int(length*PCT_TRAIN)
valid_end = int(length*(PCT_TRAIN+PCT_VALID))

train_x, train_y = inputs[:train_end], labels[:train_end]
valid_x, valid_y = inputs[train_end:valid_end], labels[train_end:valid_end]
test_x, test_y = inputs[valid_end:], labels[valid_end:]

# Report the size of each split and confirm they add up to the full data set
split_sizes = (len(train_y), len(valid_y), len(test_y))
print(*split_sizes)
print(sum(split_sizes))

4197 1199 600
5996


In [30]:
# Persist every split as CSV. The arrays hold integers (word ids and 0/1
# labels), so write them with fmt="%d": np.savetxt's default "%.18e" would
# store each value as a long scientific-notation float, inflating the files
# and losing the integer representation.
for name, array in (
    ("train_x", train_x),
    ("train_y", train_y),
    ("valid_x", valid_x),
    ("valid_y", valid_y),
    ("test_x", test_x),
    ("test_y", test_y),
):
    np.savetxt("data/{}.csv".format(name), array, delimiter=",", fmt="%d")

In [31]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# as_tensor() is used to avoid copying the arrays (saves memory)
train_data, valid_data, test_data = [
    TensorDataset(torch.as_tensor(features), torch.as_tensor(targets))
    for features, targets in ((train_x, train_y), (valid_x, valid_y), (test_x, test_y))
]

In [33]:
def prep_loader(data, shuffle, batch_size):
    '''
    Wrap a dataset in a DataLoader.

    argument
        data: dataset handed to DataLoader (this notebook passes TensorDataset objects)
        shuffle: True or False
        batch_size: batch size
    return
        DataLoader object
    '''
    return DataLoader(data, batch_size = batch_size, shuffle = shuffle)

In [34]:
# set shuffle = False since data is already shuffled
batch_size = 30
# Pass the variable rather than repeating the literal 30, so that changing
# batch_size above actually changes the loaders.
train_loader = prep_loader(train_data, False, batch_size)
valid_loader = prep_loader(valid_data, False, batch_size)
test_loader = prep_loader(test_data, False, batch_size)

In [38]:
# make sure it iterates
data_iter = iter(train_loader)
# Use the built-in next(): Python 3 iterators implement __next__, and the
# Python-2-style .next() method is not guaranteed to exist (newer PyTorch
# releases appear to have dropped that alias from DataLoader iterators).
x, y = next(data_iter)
print(x.shape)
print(x[:2])
print(y.shape)
print(y[:2])

torch.Size([30, 5420])
tensor([[   20,   457,   129,  ...,     0,     0,     0],
        [   14,   622, 13133,  ...,     0,     0,     0]])
torch.Size([30])
tensor([1, 0])


In [None]:
# Define Model
'''
1) Embedding Layer
2) LSTM
3) Fully Connected Layer
4) Sigmoid Activation 
'''