In [0]:
!tar -xopf eron.tar

In [0]:
import glob
import numpy as np
import random
import torch


class FileReader(object):

  def read_file(self, path, minimum_word_count = 3, unnecessary =  ["-", ".", ",", "/", ":", "@"]):
    files  = glob.glob(path)
    content_list = []
    for file in files:
        with open(file, encoding="ISO-8859-1") as f:
            content = f.read()
            if len(content.split()) > minimum_word_count:      
              content = content.lower()
              if len(unnecessary) is not 0:
                  content = ''.join([c for c in content if c not in unnecessary])
              content_list.append(content)
    return content_list
  
  def truncate_data(self, data, maximum_length = 5000):
    if maximum_length is not 0:
      if len(data) > maximum_length:
        random.shuffle(data)
        data = data[:maximum_length]
    return data
  
  def run(self, ham_paths = ["enron1/ham/*.txt", "enron2/ham/*.txt", "enron3/ham/*.txt", "enron4/ham/*.txt", "enron5/ham/*.txt", "enron6/ham/*.txt"],\
          spam_paths = ["enron1/spam/*.txt", "enron2/spam/*.txt", "enron3/spam/*.txt", "enron4/spam/*.txt", "enron5/spam/*.txt", "enron6/spam/*.txt"]):
    
    ham = [ item for path in ham_paths for item in self.read_file(path) ]
    ham = self.truncate_data(ham)
    print("ham length ", len(ham))
    
    spam = [item for path in spam_paths for item in self.read_file(path) ]
    spam = self.truncate_data(spam)
    print("spam length ", len(spam))
    
    data = ham + spam
    
    ham_label = [0 for _ in range(len(ham))]
    spam_label = [1 for _ in range(len(spam))]
    
    label_tensor = torch.as_tensor(ham_label + spam_label, dtype = torch.int16)
    
    return data, label_tensor

In [5]:
  
reader = FileReader()

data, label = reader.run()

ham length  5000
spam length  5000


In [8]:
print(data[5002])
print(label[4998:5002])

subject lose your weight  new weightloss loses up to 19 % 
hello  i have a special offer for you   
want to lose weight ?
the most powerful weightloss is now available
without prescription  all natural adipren 720
100 % money back guarant?e !
 lose up to 19 % total body weight 
 up to 300 % more weight loss while dieting 
 loss of 20  35 % abdominal fat 
 reduction of 40  70 % overall fat under skin 
 increase metabolic rate by 76  9 % without exercise 
 burns calorized fat 
 suppresses appetite for sugar 
 boost your confidence level and self esteem 
get the facts about all  natural adipren 720  http    adiprenl 2  com 
    system information    
natural represents know and operation via part replaced
sender uses mistake resource native sends request specific
it development implemented simplified usage own because while
scenarios no marks include functionality disclose development sends
setting replaced create when forethought variation segments helpful

tensor([0, 0, 1, 1], dtype=tor

In [0]:
'''
arg: ham or spam data (numpy array)
return: int dictionary [ word_n: count_n, ... ]
'''
from collections import Counter

import torch
from torch.autograd import Variable



class Vectorizer(object):
  
  def __init__(self, seqs):
    self.vectorized_seqs, self.vocab_int = self.vectorize_seqs(seqs)
    self.seq_lengths = torch.LongTensor(list(map(len, self.vectorized_seqs)))
    self.seq_tensor = self.add_padding(self.vectorized_seqs, self.seq_lengths)
    
  def vectorize_seqs(self, seqs):
    # sequence of words
    vocabs = [vocab for seq in seqs for vocab in seq.split()]
      # a = [  word for seq in ["a d","b d","c d"] for word in seq.split() ]
      # ['a', 'd', 'b', 'd', 'c', 'd']

    # Count word frequency
    # Counter({'the': 39770, 'to': 32356, 'and': 22835, 'of': 19607, 'a': 17100, '_': 16955, 'you': 15593, 'in': 14481, .....
    vocab_count = Counter(vocabs)

    # Sort by Freq
    vocab_count = vocab_count.most_common(len(vocab_count))

    vocab_int = {word : index+1 for index, (word, count) in enumerate(vocab_count)}
    vocab_int.update({'__PADDING__': 0}) # index 0 for padding

    # Vectorize each sequence
    vectorized_seqs = []
    for seq in seqs: 
      vectorized_seqs.append([vocab_int[word] for word in seq.split()])

    return vectorized_seqs, vocab_int
  
  def add_padding(self, vectorized_seqs, seq_lengths):
    seq_tensor = Variable(torch.zeros((len(vectorized_seqs), seq_lengths.max()))).long()
    for idx, (seq, seqlen) in enumerate(zip(vectorized_seqs, seq_lengths)):
      seq_tensor[idx, :seqlen] = torch.LongTensor(seq)
    return seq_tensor
  
  def result(self):
    return self.seq_tensor, self.seq_lengths, self.vocab_int

  
  
  

# def padding(vectorized_seqs):
#     sorted_seqs = sorted(vectorized_seqs, key=lambda x:len(x))
#     size = len(sorted_seqs[-1]) # the longest one will be the size of input to the model
#     for i, x in enumerate(vectorized_seqs):
#         missing = size - len(x)
#         vectorized_seqs[i] = vectorized_seqs[i] + [0 for _ in range(missing)] # 0 is padding
        
#     return vectorized_seqs

In [10]:
vectorizer = Vectorizer(data)
seq_tensor, seq_lengths, vocab_int = vectorizer.result()

print(seq_tensor[:5])
print(seq_lengths[:5])
print(seq_tensor.shape)
print(len(vocab_int))

tensor([[  24, 4311, 4810,  ...,    0,    0,    0],
        [  24,   12, 3066,  ...,    0,    0,    0],
        [  24,   46,   46,  ...,    0,    0,    0],
        [  24,  607, 3066,  ...,    0,    0,    0],
        [  24,   88, 1503,  ...,    0,    0,    0]])
tensor([  83,   18,   63,  124, 1131])
torch.Size([10000, 39470])
87476


In [0]:
import torch.utils.data.sampler as splr


class CustomDataLoader(object):
  def __init__(self, seq_tensor, seq_lengths, label_tensor, batch_size):
    self.batch_size = batch_size
    self.seq_tensor = seq_tensor
    self.seq_lengths = seq_lengths
    self.label_tensor = label_tensor
    self.sampler = splr.BatchSampler(splr.RandomSampler(self.label_tensor), self.batch_size, False)
    self.sampler_iter = iter(self.sampler)
    
  def __iter__(self):
    self.sampler_iter = iter(self.sampler) # reset sampler iterator
    return self

  def _next_index(self):
    return next(self.sampler_iter) # may raise StopIteration

  def __next__(self):
    index = self._next_index()

    subset_seq_tensor = self.seq_tensor[index]
    subset_seq_lengths = self.seq_lengths[index]
    subset_label_tensor = self.label_tensor[index]

    subset_seq_lengths, perm_idx = subset_seq_lengths.sort(0, descending=True)
    subset_seq_tensor = subset_seq_tensor[perm_idx]
    subset_label_tensor = subset_label_tensor[perm_idx]

    return subset_seq_tensor, subset_seq_lengths, subset_label_tensor

  def __len__(self):
    return len(self.sampler)



In [12]:
shuffled_idx = torch.randperm(label.shape[0])

seq_tensor = seq_tensor[shuffled_idx]
seq_lenghts = seq_lengths[shuffled_idx]
label = label[shuffled_idx]

PCT_TRAIN = 0.7
PCT_VALID = 0.2

length = len(label)
train_seq_tensor = seq_tensor[:int(length*PCT_TRAIN)] 
train_seq_lengths = seq_lengths[:int(length*PCT_TRAIN)]
train_label = label[:int(length*PCT_TRAIN)]

valid_seq_tensor = seq_tensor[int(length*PCT_TRAIN):int(length*(PCT_TRAIN+PCT_VALID))] 
valid_seq_lengths = seq_lengths[int(length*PCT_TRAIN):int(length*(PCT_TRAIN+PCT_VALID))] 
valid_label = label[int(length*PCT_TRAIN):int(length*(PCT_TRAIN+PCT_VALID))]

test_seq_tensor = seq_tensor[int(length*(PCT_TRAIN+PCT_VALID)):]
test_seq_lengths = seq_lengths[int(length*(PCT_TRAIN+PCT_VALID)):]
test_label = label[int(length*(PCT_TRAIN+PCT_VALID)):]

print(train_seq_tensor.shape)
print(valid_seq_tensor.shape)
print(test_seq_tensor.shape)


torch.Size([7000, 39470])
torch.Size([2000, 39470])
torch.Size([1000, 39470])


In [0]:
# set shuffle = False since data is already shuffled
batch_size = 50
train_loader = CustomDataLoader(train_seq_tensor, train_seq_lengths, train_label, batch_size)
valid_loader = CustomDataLoader(valid_seq_tensor, valid_seq_lengths, valid_label, batch_size)
test_loader = CustomDataLoader(test_seq_tensor, test_seq_lengths, test_label, batch_size)

In [15]:
print(next(train_loader))

(tensor([[  24,  243,  100,  ...,    0,    0,    0],
        [  24,    6,    1,  ...,    0,    0,    0],
        [  24, 1984,  568,  ...,    0,    0,    0],
        ...,
        [  24,   88,  147,  ...,    0,    0,    0],
        [  24,   54,   18,  ...,    0,    0,    0],
        [  24, 8299,  200,  ...,    0,    0,    0]]), tensor([2353, 1262, 1088,  783,  463,  428,  417,  413,  401,  300,  282,  282,
         278,  272,  270,  226,  193,  192,  174,  173,  173,  162,  152,  151,
         149,  144,  143,  142,  119,  108,   88,   85,   84,   82,   81,   74,
          69,   65,   62,   54,   50,   49,   44,   44,   40,   34,   34,   26,
          14,    9]), tensor([1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0,
        1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0,
        1, 0], dtype=torch.int16))


In [0]:
# Define Model
'''
1) Embedding Layer
2) LSTM
3) Fully Connected Layer
4) Sigmoid Activation
'''
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class SpamHamLSTM(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers,\
                 drop_lstm=0.2, drop_out = 0.3):

        super().__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=drop_lstm, batch_first=True)
        

        # dropout layer
        self.dropout = nn.Dropout(drop_out)
        
        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()
        

    def forward(self, x, seq_lengths):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        
        
        batch_size = x.size(0)
        #h, c  = self.init_hidden(x.size(0))
        
        # embeddings and lstm_out
        embedded_seq_tensor = self.embedding(x)
                
        packed_input = pack_padded_sequence(embedded_seq_tensor, seq_lengths.cpu().numpy(), batch_first=True)
        
        packed_output, (ht, ct) = self.lstm(packed_input, None)
        
        output, input_sizes = pad_packed_sequence(packed_output, batch_first=True)
        # output : batch_size X max_seq_len X hidden_dim
        
        
        # stack up lstm outputs
        output = output.contiguous().view(-1, self.hidden_dim)
        
        # dropout and fully-connected layer
        output = self.dropout(output)
        output = self.fc(output)
              
        
        # reshape to be batch_size first
        output = output.view(batch_size, -1)
        output = output[:, -1] # get last batch of labels
        
        # sigmoid function
        output = self.sig(output)
        
        # return last sigmoid output and hidden state
        return output
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),\
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        return hidden
    

In [28]:
# Instantiate the model w/ hyperparams

vocab_size = len(vocab_int)
output_size = 1
embedding_dim = 100 # int(vocab_size ** 0.25) # 15
hidden_dim = 20
n_layers = 5
device = "cuda" if torch.cuda.is_available() else "cpu" 
net = SpamHamLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, \
                 0.2, 0.2)
net = net.to(device)
print(net)

SpamHamLSTM(
  (embedding): Embedding(85782, 100)
  (lstm): LSTM(100, 20, num_layers=5, batch_first=True, dropout=0.2)
  (dropout): Dropout(p=0.2)
  (fc): Linear(in_features=20, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [31]:
# loss and optimization functions
lr=0.03

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)


# training params

epochs = 10 # 3-4 is approx where I noticed the validation loss stop decreasing

counter = 0
print_every = 10
clip=5 # gradient clipping


net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    # h = net.init_hidden(batch_size)

    net.zero_grad()
    # batch loop
    
    for seq_tensor, seq_tensor_lengths, label in iter(train_loader):
        counter += 1
               
        # https://github.com/pytorch/pytorch/issues/7236
        # The error states that it expects a torch.cuda.LongTensor, 
        # while you gave it a torch.LongTensor (a CPU array). 
        # Move the inputs to the GPU using the .cuda() method and the error should go away.
        seq_tensor = seq_tensor.to(device)
        seq_tensor_lengths = seq_tensor_lengths.to(device)
        label = label.to(device)
 
        # get the output from the model
        output = net(seq_tensor, seq_tensor_lengths)

        # calculate the loss and perform backprop
        loss = criterion(output, label.float())
        optimizer.zero_grad() 
        loss.backward()
        
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            #val_h = net.init_hidden(batch_size)
            
            val_losses = []
            net.eval()
            
            for seq_tensor, seq_tensor_lengths, label in iter(valid_loader):

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                # val_h = tuple([each.data for each in val_h])
                
                
                seq_tensor = seq_tensor.to(device)
                seq_tensor_lengths = seq_tensor_lengths.to(device)
                label = label.to(device)
            
                
                output = net(seq_tensor, seq_tensor_lengths)
                val_loss = criterion(output, label.float())

                val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

Epoch: 1/10... Step: 10... Loss: 0.694397... Val Loss: 0.693086
Epoch: 1/10... Step: 20... Loss: 0.693789... Val Loss: 0.693048
Epoch: 1/10... Step: 30... Loss: 0.695010... Val Loss: 0.693013
Epoch: 1/10... Step: 40... Loss: 0.694909... Val Loss: 0.693150
Epoch: 1/10... Step: 50... Loss: 0.700169... Val Loss: 0.693950
Epoch: 1/10... Step: 60... Loss: 0.694288... Val Loss: 0.694061
Epoch: 1/10... Step: 70... Loss: 0.693994... Val Loss: 0.693066
Epoch: 1/10... Step: 80... Loss: 0.691942... Val Loss: 0.693144
Epoch: 1/10... Step: 90... Loss: 0.686288... Val Loss: 0.693547
Epoch: 1/10... Step: 100... Loss: 0.690337... Val Loss: 0.694826
Epoch: 1/10... Step: 110... Loss: 0.695417... Val Loss: 0.694423
Epoch: 1/10... Step: 120... Loss: 0.697981... Val Loss: 0.693249
Epoch: 1/10... Step: 130... Loss: 0.698370... Val Loss: 0.694492
Epoch: 1/10... Step: 140... Loss: 0.690111... Val Loss: 0.695615
Epoch: 2/10... Step: 150... Loss: 0.704378... Val Loss: 0.694325
Epoch: 2/10... Step: 160... Loss: 

In [0]:
self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers,\
                 drop_lstm=0.2, drop_out = 0.3, train_on_gpu = False

In [0]:
https://developers.googleblog.com/2017/11/introducing-tensorflow-feature-columns.html
    That is, the embedding vector dimension should be the 4th root of the number of categories. Since our vocabulary size in this example is 81, the recommended number of dimensions is 3:
        Note that this is just a general guideline; you can set the number of embedding dimensions as you please.
        
        
        https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw
            
            So what about size of the hidden layer(s)--how many neurons? There are some empirically-derived rules-of-thumb, of these, the most commonly relied on is 'the optimal size of the hidden layer is usually between the size of the input and size of the output layers'. Jeff Heaton, author of Introduction to Neural Networks in Java offers a few more.