In [1]:
import glob
import numpy as np

def pathToList(path = "data/enron1/ham/*.txt", unnecessary = ["-", ".", ",", "/", ":", "@"]):
    """Read every file matching a glob pattern into a list of cleaned strings.

    Each file is decoded as ISO-8859-1, lower-cased, and stripped of every
    character listed in ``unnecessary``.

    Args:
        path: glob pattern selecting the files to read.
        unnecessary: iterable of single characters to delete from the text.
            ``None`` or an empty iterable disables the filtering. The default
            list is only read, never mutated.

    Returns:
        list of str, one entry per matched file, in sorted path order.
    """
    # Compare by value: the original `len(...) is not 0` tested object identity.
    if unnecessary is not None and len(unnecessary) != 0:
        removed = set(unnecessary)  # constant-time membership per character
    else:
        removed = set()

    content_list = []
    # glob returns paths in arbitrary order; sort so repeated runs agree.
    for file in sorted(glob.glob(path)):
        with open(file, encoding="ISO-8859-1") as f:
            content = f.read().lower()
        if removed:
            content = ''.join(c for c in content if c not in removed)
        content_list.append(content)

    return content_list

In [2]:
# Gather the ham (non-spam) emails from every corpus directory
ham_paths = ["data/enron1/ham/*.txt", "data/enron2/ham/*.txt"]

# One flat list with the emails of the first pattern followed by the second
ham = [email for ham_path in ham_paths for email in pathToList(ham_path)]

print(len(ham))

8033


In [3]:
# Gather the spam emails from every corpus directory
spam_paths = ["data/enron1/spam/*.txt", "data/enron2/spam/*.txt"]

# One flat list with the emails of the first pattern followed by the second
spam = [email for spam_path in spam_paths for email in pathToList(path = spam_path)]

print(len(spam))

2996


In [4]:
# A memory issue occurred on my computer,
# so keep only 3000 randomly chosen emails of the ham set
import random
random.shuffle(ham)
del ham[3000:]

In [5]:
# Ham first, then spam: the label lists are concatenated in this same order
ham_and_spam = [*ham, *spam]

In [6]:
'''
arg: listed_data - list of email strings (ham and/or spam)
return: dict { word: index, ... } where index 1 is the most frequent word (0 is reserved for padding)
'''
from collections import Counter

def build_vocab_int_dict(listed_data):
    """Map every word of the corpus to an integer id ranked by frequency.

    Args:
        listed_data: iterable of strings; each one is split on whitespace.

    Returns:
        dict mapping word -> id. Ids start at 1 for the most frequent word;
        0 is left unused so that it can serve as the padding value.
    """
    # Count one email at a time. Rebuilding a single word list with
    # `all_words = all_words + words` copies the whole list on every
    # iteration, which is quadratic in the corpus size.
    count_words = Counter()
    for email in listed_data:
        count_words.update(email.split())

    # Sorted by descending count; equal counts keep first-seen order.
    sorted_words = count_words.most_common()

    # index starts from 1, since 0 is reserved for padding
    return {word: index for index, (word, _) in enumerate(sorted_words, start=1)}
    

In [8]:
def encode_words(listed_data, vocab_int_dict):
    """Turn each email into the list of integer ids of its words.

    Prints the vocabulary size, then splits every email on whitespace and
    looks each word up in ``vocab_int_dict``; a missing word raises KeyError.
    Returns one list of ids per email, in input order.
    """
    print("length of vocab_int_dict", len(vocab_int_dict))
    return [
        [vocab_int_dict[token] for token in text.split()]
        for text in listed_data
    ]

In [9]:
# Build the vocabulary once and keep a reference to it: it is what determines
# the embedding size and is required to encode any email outside this corpus.
vocab_int_dict = build_vocab_int_dict(ham_and_spam)
all_encoded_words = encode_words(ham_and_spam, vocab_int_dict)

length of vocab_int_dict 63964


In [10]:
# Sanity check: number of encoded sequences (one per email)
print(len(all_encoded_words))

5996


In [11]:
# Ham emails are labelled 0 and spam emails 1, ham first then spam
ham_label = [0] * len(ham)
spam_label = [1] * len(spam)
all_label = [*ham_label, *spam_label]

In [12]:
# Sanity check: number of labels (one 0/1 label per ham/spam email)
print(len(all_label))

5996


In [13]:
'''
arg: encoded_words : list of lists
'''
def padding(encoded_words):
    """Right-pad every sequence with zeros to the length of the longest one.

    The outer list is modified in place (each entry is replaced by a new,
    padded list) and that same list object is returned.

    Args:
        encoded_words: list of lists of integer ids.

    Returns:
        ``encoded_words``, with every inner list having the same length.
        An empty list is returned unchanged.
    """
    if len(encoded_words) == 0:
        # Nothing to pad; indexing the "longest" entry would raise IndexError.
        return encoded_words

    # Only the maximum length is needed, so there is no reason to sort the corpus.
    size = max(len(x) for x in encoded_words)  # the longest one will be the size of input to the model
    for i, x in enumerate(encoded_words):
        encoded_words[i] = x + [0] * (size - len(x))  # 0 is padding

    return encoded_words

In [14]:
# NOTE: padding() rewrites the list it receives in place and returns that same list,
# so `padded` and `all_encoded_words` refer to one object from here on.
padded = padding(all_encoded_words)

In [15]:
import random
# python list
# shuffle two lists at the same time
def shuffle(a, b):
    """Shuffle two sequences with one shared random permutation.

    Items at the same position in ``a`` and ``b`` stay paired. If the lengths
    differ, ``zip`` stops at the shorter sequence.

    Args:
        a, b: sequences to shuffle together.

    Returns:
        Two tuples holding the shuffled items of ``a`` and ``b``. Empty input
        yields two empty tuples.
    """
    c = list(zip(a, b))
    if not c:
        # zip(*[]) produces nothing, so the two-name unpacking below would
        # raise ValueError.
        return (), ()
    random.shuffle(c)
    a, b = zip(*c)
    return a, b

# np array
# assumes that a.shape[0] is equal to b.shape[0] (same number of rows)
import numpy as np
def np_shuffle(a, b):
    """Reorder two arrays along their first axis with one shared random permutation.

    Returns the pair (a[order], b[order]); the inputs are left untouched.
    """
    order = np.random.permutation(a.shape[0])
    return a[order], b[order]

In [16]:
# Convert the padded sequences and the labels to arrays, then shuffle them together
inputs, labels = np_shuffle(np.array(padded), np.array(all_label))
print(inputs.shape, labels.shape)

(5996, 7563) (5996,)


In [17]:
PCT_TRAIN = 0.7
PCT_VALID = 0.2

length = len(labels)
# Split boundaries: [0, train_end) is train, [train_end, valid_end) is
# validation, and everything from valid_end on is test
train_end = int(length*PCT_TRAIN)
valid_end = int(length*(PCT_TRAIN+PCT_VALID))

train_x, train_y = inputs[:train_end], labels[:train_end]
valid_x, valid_y = inputs[train_end:valid_end], labels[train_end:valid_end]
test_x, test_y = inputs[valid_end:], labels[valid_end:]

print(train_x.shape)
print(len(train_y), len(valid_y), len(test_y))
print(len(train_y)+len(valid_y)+len(test_y))

(4197, 7563)
4197 1199 600
5996


In [18]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# using as_tensor() method to avoid copy (save memory)
train_data, valid_data, test_data = (
    TensorDataset(torch.as_tensor(features), torch.as_tensor(targets))
    for features, targets in (
        (train_x, train_y),
        (valid_x, valid_y),
        (test_x, test_y),
    )
)

In [19]:
'''
argument
    data: Dataset to iterate over (e.g. a TensorDataset)
    shuffle: True or False
    batch_size: batch size
return
    DataLoader object
'''
def prep_loader(data, shuffle, batch_size):
    """Wrap ``data`` in a DataLoader using the given shuffle flag and batch size."""
    return DataLoader(data, batch_size=batch_size, shuffle=shuffle)

In [20]:
# set shuffle = False since data is already shuffled
batch_size = 30
# Pass the variable instead of repeating the literal, so that changing
# batch_size above keeps all three loaders (and any later use of batch_size)
# in sync.
train_loader = prep_loader(train_data, False, batch_size)
valid_loader = prep_loader(valid_data, False, batch_size)
test_loader = prep_loader(test_data, False, batch_size)

In [21]:
# make sure it iterates
data_iter = iter(train_loader)
# Use the built-in next(): the iterator protocol only guarantees __next__,
# and the Python-2 style .next() method is not available on current
# DataLoader iterators.
x, y = next(data_iter)
print(x.shape)
print(x[:2])
print(y.shape)
print(y[:2])

torch.Size([30, 7563])
tensor([[  18,  223, 1771,  ...,    0,    0,    0],
        [  18, 7220,  560,  ...,    0,    0,    0]])
torch.Size([30])
tensor([1, 0])


In [22]:
# Define Model
'''
1) Embedding Layer
2) LSTM
3) Fully Connected Layer
4) Sigmoid Activation (0 or 1)
'''
import torch.nn as nn

class SpamHamLSTM(nn.Module):
    """Sequence classifier: embedding -> stacked LSTM -> dropout -> linear -> sigmoid.

    Args:
        vocab_size: number of rows of the embedding table.
        output_size: number of outputs of the final linear layer.
        embedding_dim: size of each embedding vector.
        hidden_dim: number of features in the LSTM hidden state.
        n_layers: number of stacked LSTM layers.
        drop_lstm: dropout probability passed to the LSTM.
        drop_out: dropout probability applied before the linear layer.
        train_on_gpu: when True, ``init_hidden`` moves its tensors to CUDA.
    """
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers,\
                 drop_lstm=0.2, drop_out = 0.3, train_on_gpu = False):

        super().__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.train_on_gpu = train_on_gpu

        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=drop_lstm, batch_first=True)

        # dropout layer
        self.dropout = nn.Dropout(drop_out)

        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()


    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.

        Args:
            x: integer token ids of shape (batch, seq_len); the LSTM is
                built with batch_first=True.
            hidden: tuple (h, c) as returned by ``init_hidden``.

        Returns:
            (sig_out, hidden): ``sig_out`` has shape (batch,) and holds the
            sigmoid of the last output unit at the last time step; ``hidden``
            is the LSTM state after the sequence.
        """
        # embeddings and lstm_out
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)

        # Only the last time step is returned, so select it before the
        # dropout / linear / sigmoid layers instead of running them over all
        # batch * seq_len positions and discarding everything but one.
        # NOTE(review): the notebook pads sequences with zeros at the end, so
        # this last step is usually a padding position - consider left
        # padding or packed sequences.
        last_step = lstm_out[:, -1, :]

        # dropout and fully-connected layer
        out = self.fc(self.dropout(last_step))
        # sigmoid function
        sig_out = self.sig(out)

        # keep the last output unit, giving one value per batch row
        sig_out = sig_out[:, -1]

        # return last sigmoid output and hidden state
        return sig_out, hidden


    def init_hidden(self, batch_size):
        ''' Initializes hidden state

        Returns a tuple (h, c) of zero tensors, each of shape
        (n_layers, batch_size, hidden_dim), for the hidden state and the
        cell state of the LSTM.
        '''
        # new_zeros creates tensors with the dtype and device of the model
        # weights, replacing the legacy `.data` / `.new(...).zero_()` idiom.
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_dim)
        hidden = (weight.new_zeros(shape), weight.new_zeros(shape))

        if (self.train_on_gpu):
            hidden = tuple(state.cuda() for state in hidden)

        return hidden
    

In [23]:
# Instantiate the model w/ hyperparams

# Derive the vocabulary size from the encoded data instead of hard-coding the
# count printed by an earlier run: the ham subset is drawn at random, so the
# number of distinct words can differ between runs, and a token id beyond the
# embedding table makes the embedding lookup fail. Ids start at 1 and 0 is
# the padding value, so the table needs (largest id + 1) rows.
vocab_size = int(max(train_x.max(), valid_x.max(), test_x.max())) + 1
output_size = 1
embedding_dim = int(vocab_size ** 0.25) # 4th root of the vocabulary size (15 for 63965)
hidden_dim = 256
n_layers = 2
net = SpamHamLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
print(net)

SpamHamLSTM(
  (embedding): Embedding(63965, 15)
  (lstm): LSTM(15, 256, num_layers=2, batch_first=True, dropout=0.2)
  (dropout): Dropout(p=0.3)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [None]:
# loss and optimization functions
lr=0.001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)


# training params

epochs = 4 # 3-4 is approx where I noticed the validation loss stop decreasing

counter = 0
print_every = 100
clip=5 # gradient clipping

# move model to GPU, if available
train_on_gpu = False
if(train_on_gpu):
    net.cuda()

net.train()
# train for some number of epochs
for e in range(epochs):

    # batch loop
    # (loop variables are named batch_x / batch_y so they do not overwrite
    # the global `inputs` / `labels` arrays the splits were built from)
    for batch_x, batch_y in train_loader:
        counter += 1

        if(train_on_gpu):
            batch_x, batch_y = batch_x.cuda(), batch_y.cuda()

        # Every row of a batch is an independent email, so each batch starts
        # from a fresh zero state. This also (a) avoids backpropagating
        # through the previous batches, which makes the second backward()
        # fail, and (b) sizes the state to the actual batch, since the final
        # batch of the loader can be smaller than batch_size.
        h = net.init_hidden(batch_x.size(0))

        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        # .long() keeps the tensor on its current device, whereas
        # .type(torch.LongTensor) always produces a CPU tensor
        output, h = net(batch_x.long(), h)

        # calculate the loss and perform backprop
        # output already has shape (batch,); squeeze() would turn a batch of
        # one into a 0-d tensor that no longer matches the labels
        loss = criterion(output, batch_y.float())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_losses = []
            net.eval()
            # no gradients are needed for validation, so do not build graphs
            with torch.no_grad():
                for val_x, val_y in valid_loader:

                    if(train_on_gpu):
                        val_x, val_y = val_x.cuda(), val_y.cuda()

                    # fresh state sized to this batch (the last one is smaller)
                    val_h = net.init_hidden(val_x.size(0))

                    output, val_h = net(val_x.long(), val_h)
                    val_loss = criterion(output, val_y.float())

                    val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

In [None]:
# SpamHamLSTM.__init__ signature, kept for reference:
# self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers,
#                  drop_lstm=0.2, drop_out = 0.3, train_on_gpu = False

In [None]:
# Note on choosing the embedding dimension:
# https://developers.googleblog.com/2017/11/introducing-tensorflow-feature-columns.html
#   "That is, the embedding vector dimension should be the 4th root of the number of categories. Since our vocabulary size in this example is 81, the recommended number of dimensions is 3."
#   "Note that this is just a general guideline; you can set the number of embedding dimensions as you please."
        
        
# Note on choosing the hidden layer size:
# https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw
#
#   "So what about size of the hidden layer(s)--how many neurons? There are some empirically-derived rules-of-thumb, of these, the most commonly relied on is 'the optimal size of the hidden layer is usually between the size of the input and size of the output layers'. Jeff Heaton, author of Introduction to Neural Networks in Java offers a few more."