In [0]:
#!tar -xopf eron.tar
!tar -zxvf data.tar.gz

In [0]:
import glob
import numpy as np
import random
import torch


class FileReader(object):
  def __init__(self):
    self.ham = []
    self.spam = []
    self.ham_paths = ["enron1/ham/*.txt", "enron2/ham/*.txt", "enron3/ham/*.txt", "enron4/ham/*.txt", "enron5/ham/*.txt", "enron6/ham/*.txt"]
    self.spam_paths = ["enron1/spam/*.txt", "enron2/spam/*.txt", "enron3/spam/*.txt", "enron4/spam/*.txt", "enron5/spam/*.txt", "enron6/spam/*.txt"]
  
  def read_file(self, path, minimum_word_count = 3, unnecessary =  ["-", ".", ",", "/", ":", "@"]):
    files  = glob.glob(path)
    content_list = []
    for file in files:
        with open(file, encoding="ISO-8859-1") as f:
            content = f.read()
            if len(content.split()) > minimum_word_count:      
              content = content.lower()
              if len(unnecessary) is not 0:
                  content = ''.join([c for c in content if c not in unnecessary])
              content_list.append(content)
    return content_list
  
  def truncate_before_combine(self, data, maximum_length = 5000):
    if maximum_length is not 0:
      if len(data) > maximum_length:
        random.shuffle(data)
        data = data[:maximum_length]
    return data
  
  def load_ham_and_spam(self, ham_paths = "default", spam_paths = "default", truncation_length = 5000): # 0 for no truncation
    
    if ham_paths == "default":
      ham_paths = self.ham_paths
    if spam_paths == "default":
      spam_paths = self.spam_paths
    
    self.ham = [ item for path in ham_paths for item in self.read_file(path) ]
    if truncation_length != 0:
      self.ham = self.truncate_before_combine(self.ham, truncation_length)
    print("ham length ", len(self.ham))
    
    self.spam = [item for path in spam_paths for item in self.read_file(path) ]
    if truncation_length != 0:
      self.spam = self.truncate_before_combine(self.spam, truncation_length)
    print("spam length ", len(self.spam))
    
    data = self.ham + self.spam
    
    ham_label = [0 for _ in range(len(self.ham))]
    spam_label = [1 for _ in range(len(self.spam))]
    
    label_tensor = torch.as_tensor(ham_label + spam_label, dtype = torch.int16)
    
    return data, label_tensor
  
  def print_sample(self, which ="both"): # ham, spam or both
    if which == "ham" or which == "both":
      idx = random.randint(0, len(self.ham))
      print("----------- ham sample -------------")
      print(self.ham[idx])
    if which == "spam" or which == "both":
      idx = random.randint(0, len(self.spam))
      print("----------- spam sample -------------")
      print(self.spam[idx])

In [30]:
reader = FileReader()

data, label = reader.load_ham_and_spam("default", "default", truncation_length = 0)

ham length  16540
spam length  17108


In [6]:
reader.print_sample()

----------- ham sample -------------
subject tw outage
operations is digging out 2000 feet of pipe to begin the hydro test today or
thursday  if the test results are good  they ' ll recoat the pipe and put it
back in service  we could be at 80 % volume on friday afternoon  but this is
tentative 
also  the smart pig test run several months ago identified 4 potential areas
that need inspections west of station 5  while east of thoreau is down 
these 4 areas will also be inspected 
do not pass this info onto customers at this point till we have further
information  we will make a posting hopefully this afternoon  operations is
also trying to combine some additional compressor work to this outage ( i  e 
grouting another unit and replacing some valves )  this work will limit our
west deliveries to 875  000  this notice will also be posted shortly 
see me if you have questions
kh
----------- spam sample -------------
subject paliourg no pre scription f e e s
your easy  to  use r x solution 

In [0]:
import os
import csv
import pandas as pd
from collections import Counter

class Vocab_to_int(object):

  def __init__(self, saved_dir='./', file_name="vocab_to_int.csv"):
      os.makedirs(saved_dir, exist_ok=True)
      self.path = os.path.join(saved_dir, file_name)

  def save_file(self, vocab_to_int):
      df = pd.DataFrame(list(vocab_to_int.items()))
      df.dropna(inplace=True)
      df = df.T
      df.to_csv(self.path, index=False, header=False)

  def open_file(self, path = "Default"):
      if path == "Default":
        path = self.path
      df = pd.read_csv(path)
      df.dropna(inplace=True)
      dict = df.to_dict('records')[0]
      return dict

  def generate(self, seqs, save_file=True):
      vocabs = [vocab for seq in seqs for vocab in seq.split()]
      # a = [  word for seq in ["a d","b d","c d"] for word in seq.split() ]
      # ['a', 'd', 'b', 'd', 'c', 'd']

      # Count word frequency
      # Counter({'the': 39770, 'to': 32356, 'and': 22835, 'of': 19607, 'a': 17100, '_': 16955, 'you': 15593, 'in': 14481, .....
      vocab_count = Counter(vocabs)

      vocab_count = vocab_count.most_common(len(vocab_count))

      vocab_to_int = {word : index+1 for index, (word, count) in enumerate(vocab_count)}
      vocab_to_int.update({'__PADDING__': 0}) # index 0 for padding

      if save_file:
        self.save_file(vocab_to_int)

      return vocab_to_int

In [0]:
vti = Vocab_to_int()
vocab_to_int = vti.generate(data, save_file=True)

In [0]:
'''
arg: ham or spam data (numpy array)
return: int dictionary [ word_n: count_n, ... ]
'''


import torch
from torch.autograd import Variable

class Vectorizer(object):
  
  def __init__(self, vocab_to_int):
    self.vocab_to_int = vocab_to_int
    
  def vectorize_seqs(self, seqs):
    # Vectorize each sequence
    vectorized_seqs = []
    for seq in seqs: 
      vectorized_seqs.append([self.vocab_to_int[word] for word in seq.split()])
    return vectorized_seqs
  
  def add_padding(self, vectorized_seqs, seq_lengths):
    seq_tensor = Variable(torch.zeros((len(vectorized_seqs), seq_lengths.max()))).long()
    for idx, (seq, seqlen) in enumerate(zip(vectorized_seqs, seq_lengths)):
      seq_tensor[idx, :seqlen] = torch.LongTensor(seq)
    return seq_tensor
  
  def vectorize(self, seqs):
    vectorized_seqs = self.vectorize_seqs(seqs)
    seq_lengths = torch.LongTensor(list(map(len, vectorized_seqs)))
    seq_tensor = self.add_padding(vectorized_seqs, seq_lengths)
    
    return seq_tensor, seq_lengths

In [17]:
print(len(vocab_to_int))
v = Vectorizer(vocab_to_int)
seq_tensor, seq_lengths = v.vectorize(data)

159199


In [0]:
import torch.utils.data.sampler as splr


class CustomDataLoader(object):
  def __init__(self, seq_tensor, seq_lengths, label_tensor, batch_size):
    self.batch_size = batch_size
    self.seq_tensor = seq_tensor
    self.seq_lengths = seq_lengths
    self.label_tensor = label_tensor
    self.sampler = splr.BatchSampler(splr.RandomSampler(self.label_tensor), self.batch_size, False)
    self.sampler_iter = iter(self.sampler)
    
  def __iter__(self):
    self.sampler_iter = iter(self.sampler) # reset sampler iterator
    return self

  def _next_index(self):
    return next(self.sampler_iter) # may raise StopIteration

  def __next__(self):
    index = self._next_index()

    subset_seq_tensor = self.seq_tensor[index]
    subset_seq_lengths = self.seq_lengths[index]
    subset_label_tensor = self.label_tensor[index]

    subset_seq_lengths, perm_idx = subset_seq_lengths.sort(0, descending=True)
    subset_seq_tensor = subset_seq_tensor[perm_idx]
    subset_label_tensor = subset_label_tensor[perm_idx]

    return subset_seq_tensor, subset_seq_lengths, subset_label_tensor

  def __len__(self):
    return len(self.sampler)



In [18]:
shuffled_idx = torch.randperm(label.shape[0])

seq_tensor = seq_tensor[shuffled_idx]
seq_lenghts = seq_lengths[shuffled_idx]
label = label[shuffled_idx]

PCT_TRAIN = 0.7
PCT_VALID = 0.2

length = len(label)
train_seq_tensor = seq_tensor[:int(length*PCT_TRAIN)] 
train_seq_lengths = seq_lengths[:int(length*PCT_TRAIN)]
train_label = label[:int(length*PCT_TRAIN)]

valid_seq_tensor = seq_tensor[int(length*PCT_TRAIN):int(length*(PCT_TRAIN+PCT_VALID))] 
valid_seq_lengths = seq_lengths[int(length*PCT_TRAIN):int(length*(PCT_TRAIN+PCT_VALID))] 
valid_label = label[int(length*PCT_TRAIN):int(length*(PCT_TRAIN+PCT_VALID))]

test_seq_tensor = seq_tensor[int(length*(PCT_TRAIN+PCT_VALID)):]
test_seq_lengths = seq_lengths[int(length*(PCT_TRAIN+PCT_VALID)):]
test_label = label[int(length*(PCT_TRAIN+PCT_VALID)):]

print(train_seq_tensor.shape)
print(valid_seq_tensor.shape)
print(test_seq_tensor.shape)


torch.Size([23553, 39470])
torch.Size([6730, 39470])
torch.Size([3365, 39470])


In [0]:
# set shuffle = False since data is already shuffled
batch_size = 200
train_loader = CustomDataLoader(train_seq_tensor, train_seq_lengths, train_label, batch_size)
valid_loader = CustomDataLoader(valid_seq_tensor, valid_seq_lengths, valid_label, batch_size)
test_loader = CustomDataLoader(test_seq_tensor, test_seq_lengths, test_label, batch_size)

In [0]:
# Define Model
'''
1) Embedding Layer
2) LSTM
3) Fully Connected Layer
4) Sigmoid Activation
'''

DEBUG = False

import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class SpamHamLSTM(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers,\
                 drop_lstm=0.1, drop_out = 0.1):

        super().__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # embedding 
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # LSTM layers
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=drop_lstm, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(drop_out)
        
        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()
        

    def forward(self, x, seq_lengths):

        # embeddings
        embedded_seq_tensor = self.embedding(x)
        if DEBUG:
          print("embedded_seq_tensor = self.embedding(x)", embedded_seq_tensor.shape)
                
        # pack, remove pads
        packed_input = pack_padded_sequence(embedded_seq_tensor, seq_lengths.cpu().numpy(), batch_first=True)
        if DEBUG:
          print("packed_input = pack_padded_sequence(embedded_seq_tensor, seq_lengths.cpu().numpy(), batch_first=True)")
          print(packed_input.data.shape)
          print(packed_input.batch_sizes.shape)
        
        # lstm
        packed_output, (ht, ct) = self.lstm(packed_input, None)
        if DEBUG:
          print("packed_output, (ht, ct) = self.lstm(packed_input, None)")
          print(packed_output.data.shape)
          print(packed_output.batch_sizes.shape)
          print("ht")
          print(ht.shape)
        
        # unpack, recover padded sequence
        output, input_sizes = pad_packed_sequence(packed_output, batch_first=True)
        # output : batch_size X max_seq_len X hidden_dim
        if DEBUG:
          print("output, input_sizes = pad_packed_sequence(packed_output, batch_first=True)")
          print(output.shape)
          print(input_sizes)
       
        # gather the last output in each batch
        last_idxs = (input_sizes - 1).to(device) # last_idxs = input_sizes - torch.ones_like(input_sizes)
        output = torch.gather(output, 1, last_idxs.view(-1, 1).unsqueeze(2).repeat(1, 1, self.hidden_dim)).squeeze() # [batch_size, hidden_dim]
        if DEBUG:
          print(output.shape) 
        
        # dropout and fully-connected layer
        output = self.dropout(output)
        output = self.fc(output).squeeze()
        if DEBUG:
          print("output = self.fc(output)", output.shape)
               
        # sigmoid function
        output = self.sig(output)
        
        return output


In [21]:
# Instantiate the model w/ hyperparams

vocab_size = len(vocab_to_int)
output_size = 1
embedding_dim = 100 # int(vocab_size ** 0.25) # 15
hidden_dim = 15
n_layers = 2
device = "cuda" if torch.cuda.is_available() else "cpu" 
net = SpamHamLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, \
                 0.2, 0.2)
net = net.to(device)
print(net)

SpamHamLSTM(
  (embedding): Embedding(159199, 100)
  (lstm): LSTM(100, 15, num_layers=2, batch_first=True, dropout=0.2)
  (dropout): Dropout(p=0.2)
  (fc): Linear(in_features=15, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [16]:
# Debug Purpose - Test Run

lr=0.0001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

net.eval()
seq_tensor, seq_tensor_lengths, label = next(train_loader)

seq_tensor = seq_tensor.to(device)
seq_tensor_lengths = seq_tensor_lengths.to(device)

label = label.to(device)
output = net(seq_tensor, seq_tensor_lengths)
loss = criterion(output, label.float())

binary_output = (output >= 0.5).short() # short(): torch.int16
right_or_not = torch.eq(binary_output, label)
accuracy = torch.sum(right_or_not).float().item()/right_or_not.shape[0]
print("{:2.3f}".format(accuracy*100))

51.500


In [0]:
import os
def save_model(model, saved_dir='./', file_name='saved_model.pth'):
    os.makedirs(saved_dir, exist_ok=True)
    check_point = {
        'net': model.state_dict()
    }
    output_path = os.path.join(saved_dir, file_name)
    torch.save(check_point, output_path)

In [0]:
# loss and optimization functions
criterion = nn.BCELoss()

lr=0.03
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,\
                                                       mode = 'min', \
                                                      factor = 0.5,\
                                                      patience = 2)

In [19]:


# training params

epochs = 6 

counter = 0
print_every = 10
clip=5 # gradient clipping


net.train()
# train for some number of epochs
val_losses = []
for e in range(epochs):
  
    scheduler.step(e)

    for seq_tensor, seq_tensor_lengths, label in iter(train_loader):
        counter += 1
               
        seq_tensor = seq_tensor.to(device)
        seq_tensor_lengths = seq_tensor_lengths.to(device)
        label = label.to(device)
 
        # get the output from the model
        output = net(seq_tensor, seq_tensor_lengths)
    
        # calculate the loss and perform backprop
        loss = criterion(output, label.float())
        optimizer.zero_grad() 
        loss.backward()
        
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            
            val_losses_in_itr = []
            sums = []
            sizes = []
            
            net.eval()
            
            for seq_tensor, seq_tensor_lengths, label in iter(valid_loader):

                seq_tensor = seq_tensor.to(device)
                seq_tensor_lengths = seq_tensor_lengths.to(device)
                label = label.to(device)
                output = net(seq_tensor, seq_tensor_lengths)
                
                # losses
                val_loss = criterion(output, label.float())     
                val_losses_in_itr.append(val_loss.item())
                
                # accuracy
                binary_output = (output >= 0.5).short() # short(): torch.int16
                right_or_not = torch.eq(binary_output, label)
                sums.append(torch.sum(right_or_not).float().item())
                sizes.append(right_or_not.shape[0])
                
            val_losses.append(np.mean(val_losses_in_itr))
            if len(val_losses) > 3:
              if val_losses[-2] > val_losses[-1]:
                print("Save Model...")
                save_model(net, './', 'lstm_model_saved_at_{}.pth'.format(counter))
            
            accuracy = sum(sums) / sum(sizes)
            
            net.train()
            print("Epoch: {:2d}/{:2d}\t".format(e+1, epochs),
                  "Steps: {:3d}\t".format(counter),
                  "Loss: {:.6f}\t".format(loss.item()),
                  "Val Loss: {:.6f}\t".format(np.mean(val_losses_in_itr)),
                  "Accuracy: {:.3f}".format(accuracy))

Epoch:  1/ 6	 Steps:  10	 Loss: 0.477550	 Val Loss: 0.652788	 Accuracy: 0.776
Epoch:  1/ 6	 Steps:  20	 Loss: 0.442478	 Val Loss: 0.439474	 Accuracy: 0.837
Epoch:  1/ 6	 Steps:  30	 Loss: 0.304971	 Val Loss: 0.391255	 Accuracy: 0.876
Save Model...
Epoch:  1/ 6	 Steps:  40	 Loss: 0.387431	 Val Loss: 0.297274	 Accuracy: 0.912
Epoch:  1/ 6	 Steps:  50	 Loss: 0.300540	 Val Loss: 0.849961	 Accuracy: 0.735
Save Model...
Epoch:  1/ 6	 Steps:  60	 Loss: 0.352602	 Val Loss: 0.301840	 Accuracy: 0.912
Save Model...
Epoch:  1/ 6	 Steps:  70	 Loss: 0.279997	 Val Loss: 0.220553	 Accuracy: 0.936
Epoch:  1/ 6	 Steps:  80	 Loss: 0.257140	 Val Loss: 0.279984	 Accuracy: 0.916
Save Model...
Epoch:  1/ 6	 Steps:  90	 Loss: 0.195319	 Val Loss: 0.214466	 Accuracy: 0.936
Epoch:  1/ 6	 Steps: 100	 Loss: 0.532466	 Val Loss: 0.406762	 Accuracy: 0.835
Save Model...
Epoch:  1/ 6	 Steps: 110	 Loss: 0.374303	 Val Loss: 0.228697	 Accuracy: 0.950
Save Model...
Epoch:  2/ 6	 Steps: 120	 Loss: 0.211804	 Val Loss: 0.1642

KeyboardInterrupt: ignored

It seemed starting over-fitting at step 250



In [9]:
vg = Vocab_to_int()
vocab_to_int = vg.open_file()
vr = Vectorizer(vocab_to_int)

vocab_size = len(vocab_to_int)
output_size = 1
embedding_dim = 100 # int(vocab_size ** 0.25) # 15
hidden_dim = 15
n_layers = 2
device = "cuda" if torch.cuda.is_available() else "cpu" 
model = SpamHamLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, \
                 0.2, 0.2)
model = model.to(device)
print(model)

model_path = './lstm_model_saved_at_240.pth'
checkpoint = torch.load(model_path)    
state_dict = checkpoint['net']   
model.load_state_dict(state_dict) 
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data.shape)

SpamHamLSTM(
  (embedding): Embedding(159199, 100)
  (lstm): LSTM(100, 15, num_layers=2, batch_first=True, dropout=0.2)
  (dropout): Dropout(p=0.2)
  (fc): Linear(in_features=15, out_features=1, bias=True)
  (sig): Sigmoid()
)
embedding.weight torch.Size([159199, 100])
lstm.weight_ih_l0 torch.Size([60, 100])
lstm.weight_hh_l0 torch.Size([60, 15])
lstm.bias_ih_l0 torch.Size([60])
lstm.bias_hh_l0 torch.Size([60])
lstm.weight_ih_l1 torch.Size([60, 15])
lstm.weight_hh_l1 torch.Size([60, 15])
lstm.bias_ih_l1 torch.Size([60])
lstm.bias_hh_l1 torch.Size([60])
fc.weight torch.Size([1, 15])
fc.bias torch.Size([1])


In [29]:
test_losses = []
sums = []
sizes = []

model.eval()

test_losses = []
for seq_tensor, seq_tensor_lengths, label in iter(test_loader):

    seq_tensor = seq_tensor.to(device)
    seq_tensor_lengths = seq_tensor_lengths.to(device)
    label = label.to(device)
    output = model(seq_tensor, seq_tensor_lengths)

    # losses
    test_loss = criterion(output, label.float())     
    test_losses.append(test_loss.item())

    # accuracy
    binary_output = (output >= 0.5).short() # short(): torch.int16
    right_or_not = torch.eq(binary_output, label)
    sums.append(torch.sum(right_or_not).float().item())
    sizes.append(right_or_not.shape[0])

accuracy = np.sum(sums) / np.sum(sizes)
print("Test Loss: {:.6f}\t".format(np.mean(test_losses)),
      "Accuracy: {:.3f}".format(accuracy))

Test Loss: 0.102844	 Accuracy: 0.971


In [31]:
sum = 0
count = 0
model.eval()

for idx in range(16500,16600):
  count = count + 1
  sample = data[idx]
  sample_label = label[idx].item()
  temp = [ sample ]
  seq_tensor, seq_tensor_lengths = vr.vectorize(temp)
  
  seq_tensor = seq_tensor.to(device)
  seq_tensor_lengths = seq_tensor_lengths.to(device)
  output = model(seq_tensor, seq_tensor_lengths)

  # accuracy
  binary_output = int(output.item() >= 0.5)
  acc = int(binary_output == sample_label)
  sum = sum + acc
  
  pred = "SPAM" if binary_output == 1 else "HAM"
  actual = "SPAM" if sample_label == 1 else "HAM"

  if count % 30 == 0:
    print("Sample", idx)
    print("Prediction:", pred)
    print("Actual", actual)
    print("-------------------------------------")
  
print(sum)
print("Accuracy(%)", float(sum) / count * 100)
  




Sample 16529
Prediction: HAM
Actual HAM
-------------------------------------
Sample 16559
Prediction: SPAM
Actual SPAM
-------------------------------------
Sample 16589
Prediction: SPAM
Actual SPAM
-------------------------------------
98
Accuracy(%) 98.0


In [36]:
myString = "This is my best offer. You can't take this chance away!"


unnecessary =  ["-", ".", ",", "/", ":", "@", "'", "!"]
content = myString.lower()
content = ''.join([c for c in content if c not in unnecessary])


input = [content]
seq_tensor, seq_tensor_lengths = vr.vectorize(input)

seq_tensor = seq_tensor.to(device)
seq_tensor_lengths = seq_tensor_lengths.to(device)
output = model(seq_tensor, seq_tensor_lengths)

print(output.item()) # 0.98 means strongly SPAM


0.9833419322967529
