In [0]:
#!tar -xopf eron.tar
!tar -zxvf data.tar.gz

In [0]:
import glob
import numpy as np
import random
import torch


class FileReader(object):
  def __init__(self):
    self.ham = []
    self.spam = []
    self.ham_paths = ["enron1/ham/*.txt", "enron2/ham/*.txt", "enron3/ham/*.txt", "enron4/ham/*.txt", "enron5/ham/*.txt", "enron6/ham/*.txt"]
    self.spam_paths = ["enron1/spam/*.txt", "enron2/spam/*.txt", "enron3/spam/*.txt", "enron4/spam/*.txt", "enron5/spam/*.txt", "enron6/spam/*.txt"]
  
  def read_file(self, path, minimum_word_count = 3, unnecessary =  ["-", ".", ",", "/", ":", "@", "'", "!"]):
    files  = glob.glob(path)
    content_list = []
    for file in files:
        with open(file, encoding="ISO-8859-1") as f:
            content = f.read()
            if len(content.split()) > minimum_word_count:      
              content = content.lower()
              if len(unnecessary) is not 0:
                  content = ''.join([c for c in content if c not in unnecessary])
              content_list.append(content)
    return content_list
  
  def truncate_before_combine(self, data, maximum_length = 5000):
    if maximum_length is not 0:
      if len(data) > maximum_length:
        random.shuffle(data)
        data = data[:maximum_length]
    return data
  
  def load_ham_and_spam(self, ham_paths = "default", spam_paths = "default", truncation_length = 5000): # 0 for no truncation
    
    if ham_paths == "default":
      ham_paths = self.ham_paths
    if spam_paths == "default":
      spam_paths = self.spam_paths
    
    self.ham = [ item for path in ham_paths for item in self.read_file(path) ]
    if truncation_length != 0:
      self.ham = self.truncate_before_combine(self.ham, truncation_length)
    print("ham length ", len(self.ham))
    
    self.spam = [item for path in spam_paths for item in self.read_file(path) ]
    if truncation_length != 0:
      self.spam = self.truncate_before_combine(self.spam, truncation_length)
    print("spam length ", len(self.spam))
    
    data = self.ham + self.spam
    
    ham_label = [0 for _ in range(len(self.ham))]
    spam_label = [1 for _ in range(len(self.spam))]
    
    label_tensor = torch.as_tensor(ham_label + spam_label, dtype = torch.int16)
    
    return data, label_tensor
  
  def print_sample(self, which ="both"): # ham, spam or both
    if which == "ham" or which == "both":
      idx = random.randint(0, len(self.ham))
      print("----------- ham sample -------------")
      print(self.ham[idx])
    if which == "spam" or which == "both":
      idx = random.randint(0, len(self.spam))
      print("----------- spam sample -------------")
      print(self.spam[idx])

In [2]:
reader = FileReader()

data, label = reader.load_ham_and_spam("default", "default", truncation_length = 0)

ham length  16540
spam length  17108


In [3]:
reader.print_sample()

----------- ham sample -------------
subject on  call notes
please find attached the on  call notes for the weekend of 3  10 & 11  01 
bob
----------- spam sample -------------
subject greater than some not
r e finance before election when the r a tes will rise 
it is your last chance
http    www  aonmate  com 
you are already approv e d with 3  0 point
thank you 
beard
          
we fastidious  at of emerald
compost an or the nigger quonset
cantle duchess is it doodle
counselor headlight  in gripe


In [0]:
import os
import csv
import pandas as pd
from collections import Counter

class Vocab_to_int(object):

  def __init__(self, saved_dir='./', file_name="vocab_to_int.csv"):
      os.makedirs(saved_dir, exist_ok=True)
      self.path = os.path.join(saved_dir, file_name)

  def save_file(self, vocab_to_int):
      df = pd.DataFrame(list(vocab_to_int.items()))
      df.dropna(inplace=True)
      df = df.T
      df.to_csv(self.path, index=False, header=False)
      print("saved as", self.path)

  def open_file(self, path = "Default"):
      if path == "Default":
        path = self.path
      df = pd.read_csv(path)
      df.dropna(inplace=True)
      dict = df.to_dict('records')[0]
      return dict

  def generate(self, seqs, save_file=True):
      vocabs = [vocab for seq in seqs for vocab in seq.split()]
      # a = [  word for seq in ["a d","b d","c d"] for word in seq.split() ]
      # ['a', 'd', 'b', 'd', 'c', 'd']

      # Count word frequency
      # Counter({'the': 39770, 'to': 32356, 'and': 22835, .....
      vocab_count = Counter(vocabs)

      vocab_count = vocab_count.most_common(len(vocab_count))

      vocab_to_int = {word : index+2 for index, (word, count) in enumerate(vocab_count)}
      vocab_to_int.update({'__PADDING__': 0}) # index 0 for padding
      vocab_to_int.update({'__UNKNOWN__': 1}) # index 1 for unknown word such as broken character

      if save_file:
        self.save_file(vocab_to_int)

      return vocab_to_int

In [4]:
vti = Vocab_to_int()
vocab_to_int = vti.generate(data, save_file=True)

saved as ./vocab_to_int.csv


In [0]:
import torch
from torch.autograd import Variable

class Vectorizer(object):
  ''' 
  Using vocab_to_int dict, 
  Change words into integers in string data
  '''
  
  def __init__(self, vocab_to_int):
    self.vocab_to_int = vocab_to_int
    
  def vectorize_seqs(self, seqs):
    # Vectorize each sequence
    vectorized_seqs = []
    for seq in seqs: 
      vectorized_seqs.append([self.vocab_to_int.get(word, 1) for word in seq.split()])
      # self.vocab_to_int.get(word, 1) ; mean if no value for key, it will return 1 (unknown)
    return vectorized_seqs
  
  def add_padding(self, vectorized_seqs, seq_lengths):
    '''
    The length of the seq_tensor is the length of the longest sentence in the data
    The shorter sentences will have padding(zero) at the their ends
    '''
    seq_tensor = Variable(torch.zeros((len(vectorized_seqs), seq_lengths.max()))).long()
    for idx, (seq, seqlen) in enumerate(zip(vectorized_seqs, seq_lengths)):
      seq_tensor[idx, :seqlen] = torch.LongTensor(seq)
    return seq_tensor
  
  def vectorize(self, seqs):
    vectorized_seqs = self.vectorize_seqs(seqs)
    seq_lengths = torch.LongTensor(list(map(len, vectorized_seqs)))
    seq_tensor = self.add_padding(vectorized_seqs, seq_lengths)
    
    return seq_tensor, seq_lengths

In [0]:
v = Vectorizer(vocab_to_int)
seq_tensor, seq_lengths = v.vectorize(data)

In [0]:
del v

In [0]:
class DataDivider(object):

    def __init__(self):
        '''
        init with seq_tensor and label, both of them are torch.tensor
        '''        
    def check_length(self, seq_tensor, seq_lengths, label):
        length = len(label)
        if len(seq_tensor) != length or len(seq_lengths) != length:
            print("The lengths doesn't match with each other")
            print("seq_tensor:", len(seq_tensor))
            print("seq_length:", len(seq_lengths))
            print("label:", length)
            return False
          
        return True
    
    def shuffle(self, seq_tensor, seq_lengths, label):
        shuffled_idx = torch.randperm(label.shape[0])
        seq_tensor = seq_tensor[shuffled_idx]
        seq_lenghts = seq_lengths[shuffled_idx]
        label = label[shuffled_idx]
        return seq_tensor, seq_lengths, label
        
    def divide_train_valid_test(self, seq_tensor, seq_lengths, label, PCT_TRAIN = 0.7, PCT_VALID = 0.2, do_shuffle= True):
        '''
        PCT_TRAIN: the percent of train set
        PCT_VALID: the percent of validation set
        The rest part will be the test set
        '''
        assert self.check_length(seq_tensor, seq_lengths, label)
        
        length = len(label)
        
        if do_shuffle:
            seq_tensor, seq_lengths, label = self.shuffle(seq_tensor, seq_lengths, label)

        train_seq_tensor = seq_tensor[:int(length*PCT_TRAIN)] 
        train_seq_lengths = seq_lengths[:int(length*PCT_TRAIN)]
        train_label = label[:int(length*PCT_TRAIN)]

        valid_seq_tensor = seq_tensor[int(length*PCT_TRAIN):int(length*(PCT_TRAIN+PCT_VALID))] 
        valid_seq_lengths = seq_lengths[int(length*PCT_TRAIN):int(length*(PCT_TRAIN+PCT_VALID))] 
        valid_label = label[int(length*PCT_TRAIN):int(length*(PCT_TRAIN+PCT_VALID))]

        test_seq_tensor = seq_tensor[int(length*(PCT_TRAIN+PCT_VALID)):]
        test_seq_lengths = seq_lengths[int(length*(PCT_TRAIN+PCT_VALID)):]
        test_label = label[int(length*(PCT_TRAIN+PCT_VALID)):]

        print("train:", train_seq_tensor.shape)
        print("valid:", valid_seq_tensor.shape)
        print("test:", test_seq_tensor.shape)
        
        return train_seq_tensor, train_seq_lengths, train_label, \
                valid_seq_tensor, valid_seq_lengths, valid_label, \
                test_seq_tensor, test_seq_lengths, test_label

In [9]:
dd = DataDivider()
train_seq_tensor, train_seq_lengths, train_label, \
valid_seq_tensor, valid_seq_lengths, valid_label, \
test_seq_tensor, test_seq_lengths, test_label = dd.divide_train_valid_test(seq_tensor, seq_lengths, label)

train: torch.Size([23553, 38538])
valid: torch.Size([6730, 38538])
test: torch.Size([3365, 38538])


In [0]:
import torch.utils.data.sampler as splr


class CustomDataLoader(object):
  def __init__(self, seq_tensor, seq_lengths, label_tensor, batch_size):
    self.batch_size = batch_size
    self.seq_tensor = seq_tensor
    self.seq_lengths = seq_lengths
    self.label_tensor = label_tensor
    self.sampler = splr.BatchSampler(splr.RandomSampler(self.label_tensor), self.batch_size, False)
    self.sampler_iter = iter(self.sampler)
    
  def __iter__(self):
    self.sampler_iter = iter(self.sampler) # reset sampler iterator
    return self

  def _next_index(self):
    return next(self.sampler_iter) # may raise StopIteration

  def __next__(self):
    index = self._next_index()

    subset_seq_tensor = self.seq_tensor[index]
    subset_seq_lengths = self.seq_lengths[index]
    subset_label_tensor = self.label_tensor[index]

    subset_seq_lengths, perm_idx = subset_seq_lengths.sort(0, descending=True)
    subset_seq_tensor = subset_seq_tensor[perm_idx]
    subset_label_tensor = subset_label_tensor[perm_idx]

    return subset_seq_tensor, subset_seq_lengths, subset_label_tensor

  def __len__(self):
    return len(self.sampler)



In [0]:
batch_size = 200
train_loader = CustomDataLoader(train_seq_tensor, train_seq_lengths, train_label, batch_size)
valid_loader = CustomDataLoader(valid_seq_tensor, valid_seq_lengths, valid_label, batch_size)
test_loader = CustomDataLoader(test_seq_tensor, test_seq_lengths, test_label, batch_size)

In [0]:
# Define Model
'''
1) Embedding Layer
2) LSTM
3) Fully Connected Layer
4) Sigmoid Activation
'''

DEBUG = False

import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class SpamHamLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers,\
                 drop_out_in_lstm, drop_out, output_size, device):

        super().__init__()
        self.device = device
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # embedding 
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # LSTM layers
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=drop_out_in_lstm, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(drop_out)
        
        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()
        

    def forward(self, x, seq_lengths):

        # embeddings
        embedded_seq_tensor = self.embedding(x)
        if DEBUG:
          print("embedded_seq_tensor = self.embedding(x)", embedded_seq_tensor.shape)
                
        # pack, remove pads
        packed_input = pack_padded_sequence(embedded_seq_tensor, seq_lengths.cpu().numpy(), batch_first=True)
        if DEBUG:
          print("packed_input = pack_padded_sequence(embedded_seq_tensor, seq_lengths.cpu().numpy(), batch_first=True)")
          print(packed_input.data.shape)
          print(packed_input.batch_sizes.shape)
        
        # lstm
        packed_output, (ht, ct) = self.lstm(packed_input, None)
        if DEBUG:
          print("packed_output, (ht, ct) = self.lstm(packed_input, None)")
          print(packed_output.data.shape)
          print(packed_output.batch_sizes.shape)
          print("ht")
          print(ht.shape)
        
        # unpack, recover padded sequence
        output, input_sizes = pad_packed_sequence(packed_output, batch_first=True)
        # output : batch_size X max_seq_len X hidden_dim
        if DEBUG:
          print("output, input_sizes = pad_packed_sequence(packed_output, batch_first=True)")
          print(output.shape)
          print(input_sizes)
       
        # gather the last output in each batch
        last_idxs = (input_sizes - 1).to(self.device) # last_idxs = input_sizes - torch.ones_like(input_sizes)
        output = torch.gather(output, 1, last_idxs.view(-1, 1).unsqueeze(2).repeat(1, 1, self.hidden_dim)).squeeze() # [batch_size, hidden_dim]
        if DEBUG:
          print(output.shape) 
        
        # dropout and fully-connected layer
        output = self.dropout(output)
        output = self.fc(output).squeeze()
        if DEBUG:
          print("output = self.fc(output)", output.shape)
               
        # sigmoid function
        output = self.sig(output)
        
        return output


In [0]:
import os
import numpy as np

class Model_wrapper(object):
  
	def set_params(self, vocab_size, \
					   embedding_dim = 100, \
					   hidden_dim = 15, \
					   n_layers = 2, \
					   drop_out_in_lstm = 0.2, \
					   drop_out = 0.2, \
					   output_size = 1, \
					   train_on_gpu = True):
    
		self.vocab_size = vocab_size
		self.embedding_dim = embedding_dim
		self.hidden_dim = hidden_dim
		self.n_layers = 2
		self.drop_out_in_lstm = drop_out_in_lstm
		self.drop_out = drop_out
		self.output_size = output_size
		self.train_on_gpu = train_on_gpu
		self.device = "cuda" if torch.cuda.is_available() and train_on_gpu else "cpu" 

	def set_model(self, do_print = True):
		self.model = SpamHamLSTM(self.vocab_size, self.embedding_dim, self.hidden_dim, self.n_layers, \
					 self.drop_out_in_lstm, self.drop_out, self.output_size, self.device)
		self.model = self.model.to(self.device)
		if do_print:
			print(self.model)

	def train(self, train_loader, valid_loader, criterion = "default", optimizer="default", learning_rate = 0.03, use_scheduler = True, \
         epochs = 6, validate_every = 10, gradient_clip = 5):

		if criterion == "default" :
			criterion = nn.BCELoss()
		print(criterion)
     

		if optimizer == "default" :
			optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate)
		print(optimizer)

		if use_scheduler :
			scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode = 'min', factor = 0.5, patience = 2)

		counter = 0

		self.model.train()
	
		val_losses = []
		val_min_loss = 1000000

		for e in range(epochs):

			if use_scheduler :
				scheduler.step(e)

			for seq_tensor, seq_tensor_lengths, label in iter(train_loader):
				counter += 1

				seq_tensor = seq_tensor.to(self.device)
				seq_tensor_lengths = seq_tensor_lengths.to(self.device)
				label = label.to(self.device)

				# get the output from the model
				output = self.model(seq_tensor, seq_tensor_lengths)

				# calculate the loss and perform backprop
				loss = criterion(output, label.float())
				optimizer.zero_grad() 
				loss.backward()

				# `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
				nn.utils.clip_grad_norm_(net.parameters(), gradient_clip)
				optimizer.step()

				# loss stats
				if counter % validate_every == 0:
					# Get validation loss
					val_losses_in_itr = []
					sums = []
					sizes = []

					self.model.eval()

					for seq_tensor, seq_tensor_lengths, label in iter(valid_loader):

						seq_tensor = seq_tensor.to(device)
						seq_tensor_lengths = seq_tensor_lengths.to(device)
						label = label.to(device)
						output = self.model(seq_tensor, seq_tensor_lengths)

						# losses
						val_loss = criterion(output, label.float())     
						val_losses_in_itr.append(val_loss.item())

						# accuracy
						binary_output = (output >= 0.5).short() # short(): torch.int16
						right_or_not = torch.eq(binary_output, label)
						sums.append(torch.sum(right_or_not).float().item())
						sizes.append(right_or_not.shape[0])

					val_losses.append(np.mean(val_losses_in_itr))
					if val_min_loss > val_losses[-1]:
						val_min_loss = val_losses[-1]
						self.save_state_dict('./', 'lstm_model_saved_at_{}.pth'.format(counter))

					accuracy = np.sum(sums) / np.sum(sizes)

					self.model.train()
					print("Epoch: {:2d}/{:2d}\t".format(e+1, epochs),
						  "Steps: {:3d}\t".format(counter),
						  "Loss: {:.5f}\t".format(loss.item()),
						  "Val Loss: {:.5f}\t".format(np.mean(val_losses_in_itr)),
						  "Accuracy: {:.3f}".format(accuracy))    

	def test(self, test_loader, criterion = "default"):
		if criterion == "default":
			criterion = nn.BCELoss()

		test_losses = []
		sums = []
		sizes = []

		self.model.eval()

		test_losses = []
		
		for seq_tensor, seq_tensor_lengths, label in iter(test_loader):
			seq_tensor = seq_tensor.to(self.device)
			seq_tensor_lengths = seq_tensor_lengths.to(self.device)
			label = label.to(self.device)
			output = self.model(seq_tensor, seq_tensor_lengths)

			# losses
			test_loss = criterion(output, label.float())     
			test_losses.append(test_loss.item())

			# accuracy
			binary_output = (output >= 0.5).short() # short(): torch.int16
			right_or_not = torch.eq(binary_output, label)
			sums.append(torch.sum(right_or_not).float().item())
			sizes.append(right_or_not.shape[0])

		accuracy = np.sum(sums) / np.sum(sizes)
		print("Test Loss: {:.6f}\t".format(np.mean(test_losses)),
		"Accuracy: {:.3f}".format(accuracy))
    
	def load_state_dict(self, saved_dir='./', file_name='saved_model.pth', do_print = True):
		output_path = os.path.join(saved_dir, file_name)
		checkpoint = torch.load(output_path)
		state_dict = checkpoint['net']
		self.model.load_state_dict(state_dict)
		if do_print:
			for name, param in self.model.named_parameters():
				if param.requires_grad:
					print(name, param.data.shape)

	def save_state_dict(self, saved_dir='./', file_name='saved_model.pth', do_print = True):
		os.makedirs(saved_dir, exist_ok=True)
		check_point = {
			'net': self.model.state_dict()
		}
		output_path = os.path.join(saved_dir, file_name)
		torch.save(check_point, output_path)
		if do_print:
			print("saved as", output_path)

In [179]:
vocab_size = len(vocab_to_int)
print(vocab_size)
mw = Model_wrapper()
mw.set_params(vocab_size)
mw.set_model()

159198
SpamHamLSTM(
  (embedding): Embedding(159198, 100)
  (lstm): LSTM(100, 15, num_layers=2, batch_first=True, dropout=0.2)
  (dropout): Dropout(p=0.2)
  (fc): Linear(in_features=15, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [180]:
mw.train(train_loader, valid_loader)

BCELoss()
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.03
    weight_decay: 0
)
saved as ./lstm_model_saved_at_10.pth
Epoch:  1/ 6	 Steps:  10	 Loss: 0.69778	 Val Loss: 0.68049	 Accuracy: 0.558
Epoch:  1/ 6	 Steps:  20	 Loss: 0.68155	 Val Loss: 0.69174	 Accuracy: 0.562
Epoch:  1/ 6	 Steps:  30	 Loss: 0.86717	 Val Loss: 0.96266	 Accuracy: 0.654
saved as ./lstm_model_saved_at_40.pth
Epoch:  1/ 6	 Steps:  40	 Loss: 0.65479	 Val Loss: 0.63954	 Accuracy: 0.675
saved as ./lstm_model_saved_at_50.pth
Epoch:  1/ 6	 Steps:  50	 Loss: 0.43597	 Val Loss: 0.40465	 Accuracy: 0.881
saved as ./lstm_model_saved_at_60.pth
Epoch:  1/ 6	 Steps:  60	 Loss: 0.43143	 Val Loss: 0.37936	 Accuracy: 0.858
saved as ./lstm_model_saved_at_70.pth
Epoch:  1/ 6	 Steps:  70	 Loss: 0.37240	 Val Loss: 0.35781	 Accuracy: 0.888
saved as ./lstm_model_saved_at_80.pth
Epoch:  1/ 6	 Steps:  80	 Loss: 0.19122	 Val Loss: 0.22209	 Accuracy: 0.929
Epoch:  1/ 6	 Steps:  90	 Loss: 0.22

KeyboardInterrupt: ignored

Interrupt the training, model at step 380 will be used

In [198]:
mw.load_state_dict(saved_dir='./', file_name='lstm_model_saved_at_440.pth', do_print = False)
mw.test(test_loader)

Test Loss: 0.072314	 Accuracy: 0.977


In [199]:
mw.load_state_dict(saved_dir='./', file_name='lstm_model_saved_at_310.pth', do_print = False)
mw.test(test_loader)

Test Loss: 0.088483	 Accuracy: 0.976


In [183]:
mw.load_state_dict(saved_dir='./', file_name='lstm_model_saved_at_80.pth', do_print = False)
mw.test(test_loader)

Test Loss: 0.214243	 Accuracy: 0.926


In [0]:
class Predictor(Model_wrapper):
	def __init__(self, saved_dir='./', file_name = 'lstm_model_saved_at_380.pth'):
		vg = Vocab_to_int()
		self.vocab_to_int = vg.open_file()
		vocab_size = len(self.vocab_to_int)
		self.vr = Vectorizer(self.vocab_to_int)
		self.set_params(vocab_size, train_on_gpu = False)
		self.set_model(do_print = False)
		self.load_state_dict(saved_dir, file_name, do_print = False)

	def predict(self, text, unnecessary = ["-", ".", ",", "/", ":", "@", "'", "!"]):	
		text = text.lower()
		text = ''.join([c for c in text if c not in unnecessary])
		text = [text]
		seq_tensor, seq_tensor_lengths = self.vr.vectorize(text)
		seq_tensor_lengths = seq_tensor_lengths
		self.model.eval()
		output = self.model(seq_tensor, seq_tensor_lengths)
		return output.item()

In [193]:

p = Predictor(file_name = 'lstm_model_saved_at_440.pth')

myString = "This is the greatest offer. You can't take this chance away! We offer the best product in the world"
result = p.predict(myString)
print(result)

myString = "Hello, we have meeting with boss at 1:00 pm. Please prepare the document. I'll be there earlier, we need to discuss before the meeting"
result = p.predict(myString)
print(result)

0.9464383721351624
0.023171260952949524


In [197]:

p = Predictor(file_name = 'lstm_model_saved_at_310.pth')

myString = "This is the greatest offer. You can't take this chance away! We offer the best product in the world"
result = p.predict(myString)
print(result)

myString = "Hello, we have meeting with boss at 1:00 pm. Please prepare the document. I'll be there earlier, we need to discuss before the meeting"
result = p.predict(myString)
print(result)

0.9846702814102173
0.003224465297535062


In [194]:

p = Predictor(file_name = 'lstm_model_saved_at_80.pth')

myString = "This is the greatest offer. You can't take this chance away! We offer the best product in the world"
result = p.predict(myString)
print(result)

myString = "Hello, we have meeting with boss at 1:00 pm. Please prepare the document. I'll be there earlier, we need to discuss before the meeting"
result = p.predict(myString)
print(result)

0.9792095422744751
0.03566703572869301
