In [4]:
# Mount Google Drive inside the Colab VM so project files under
# /content/drive/ are reachable from the cells below.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [5]:
cd /content/drive/MyDrive/NLP_Proj/

/content/drive/.shortcut-targets-by-id/1N_RbAQL8cE4O8RGgJnoCl82t6V9afaB-/NLP_Proj


In [6]:
!pip install jsonlines

Collecting jsonlines
  Downloading https://files.pythonhosted.org/packages/4f/9a/ab96291470e305504aa4b7a2e0ec132e930da89eb3ca7a82fbe03167c131/jsonlines-1.2.0-py2.py3-none-any.whl
Installing collected packages: jsonlines
Successfully installed jsonlines-1.2.0


In [7]:
!pip install tqdm



In [8]:
import os
import jsonlines
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import LSTM
from torch.nn import Embedding
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import numpy as np
from numpy import inf
import matplotlib.pyplot as plt
import pickle
import time

In [9]:
class Dictionary(object):
    """Vocabulary that maps tokens to integer ids and back.

    Attributes:
        tokens: list of tokens; the position of a token is its id.
        ids: dict mapping token -> id.
        counts: dict mapping token -> number of times it was added.
    """

    def __init__(self, datasets, include_valid=False):
        """Build the vocabulary from tokenized data.

        Args:
            datasets: mapping with a 'train' entry (and a 'valid' entry when
                ``include_valid`` is True); each entry is an iterable of
                token sequences.
            include_valid: when exactly ``True``, tokens of the 'valid' split
                are added to the vocabulary as well.
        """
        self.tokens = []
        self.ids = {}
        self.counts = {}

        # Special tokens are added first, so their ids are fixed:
        # <bos>=0, <eos>=1, <pad>=2, <unk>=3.
        self.add_token('<bos>')  # beginning of sentence
        self.add_token('<eos>')  # end of sentence
        self.add_token('<pad>')
        self.add_token('<unk>')  # stands in for words missing from the vocab

        for line in tqdm(datasets['train']):
            for w in line:
                self.add_token(w)

        if include_valid is True:
            for line in tqdm(datasets['valid']):
                for w in line:
                    self.add_token(w)

    def add_token(self, w):
        """Register one occurrence of ``w``, assigning the next id if it is new."""
        # Membership is tested on the dict (constant time) rather than on the
        # token list (linear scan); both are always updated together.
        if w not in self.ids:
            self.tokens.append(w)
            self.ids[w] = len(self.tokens) - 1
            self.counts[w] = 1
        else:
            self.counts[w] += 1

    def get_id(self, w):
        """Return the id of token ``w``; raises KeyError for unknown tokens."""
        return self.ids[w]

    def get_token(self, idx):
        """Return the token stored at position ``idx``."""
        return self.tokens[idx]

    def decode_idx_seq(self, l):
        """Translate a sequence of ids into the list of their tokens."""
        return [self.tokens[i] for i in l]

    def encode_token_seq(self, l):
        """Translate tokens into ids, using the '<unk>' id for unknown tokens."""
        return [self.ids[i] if i in self.ids else self.ids['<unk>'] for i in l]

    def __len__(self):
        """Number of distinct tokens, special tokens included."""
        return len(self.tokens)


In [10]:
def load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    Unpickling can execute arbitrary code, so only load files you trust.
    """
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)

def pad_strings(minibatch):
    """Pad space-separated strings with '<pad>' words to a common word count.

    Every returned string is the original followed by one space and then one
    '<pad> ' (note the trailing space) per word it is short of the longest
    string in the batch.
    """
    word_counts = [len(text.split(' ')) for text in minibatch]
    longest = max(word_counts)
    return [
        text + ' ' + '<pad> ' * (longest - n_words)
        for text, n_words in zip(minibatch, word_counts)
    ]

class TensoredDataset():
    """Next-token prediction pairs built from sequences of token ids.

    For each sequence the input is everything except the last id and the
    target is everything except the first id. Both are stored as long
    tensors of shape (1, len(sequence) - 1).
    """

    def __init__(self, list_of_lists_of_tokens):
        self.input_tensors = []
        self.target_tensors = []

        for token_ids in list_of_lists_of_tokens:
            shifted_in = torch.tensor([token_ids[:-1]], dtype=torch.long)
            shifted_out = torch.tensor([token_ids[1:]], dtype=torch.long)
            self.input_tensors.append(shifted_in)
            self.target_tensors.append(shifted_out)

    def __len__(self):
        """Number of stored (input, target) pairs."""
        return len(self.input_tensors)

    def __getitem__(self, idx):
        """Return the (input, target) tensor pair at position ``idx``."""
        return self.input_tensors[idx], self.target_tensors[idx]

def pad_list_of_tensors(list_of_tensors, pad_token):
    """Right-pad tensors along the last dimension and stack them on dim 0.

    Args:
        list_of_tensors: tensors whose last dimension is the sequence length
            (the dataset in this notebook produces shape (1, length)).
        pad_token: id used to fill the missing positions.

    Returns:
        A single tensor made by concatenating the padded tensors along dim 0.
    """
    longest = max(seq.size(-1) for seq in list_of_tensors)

    rows = []
    for seq in list_of_tensors:
        missing = longest - seq.size(-1)
        filler = torch.full((1, missing), pad_token, dtype=torch.long)
        rows.append(torch.cat((seq, filler), dim=-1))

    return torch.cat(rows, dim=0)

def pad_collate_fn(batch, pad_token=2):
    """Collate (input, target) samples into two padded batch tensors.

    Args:
        batch: sequence of (input_tensor, target_tensor) pairs, as yielded by
            the dataset.
        pad_token: id used for padding. The default 2 is the id this notebook
            uses for '<pad>'; pass another value (e.g. through
            ``functools.partial``) if the vocabulary differs.

    Returns:
        (input_tensor, target_tensor), each produced by
        ``pad_list_of_tensors`` from the corresponding halves of the samples.
    """
    input_list = [sample[0] for sample in batch]
    target_list = [sample[1] for sample in batch]
    input_tensor = pad_list_of_tensors(input_list, pad_token)
    target_tensor = pad_list_of_tensors(target_list, pad_token)
    return input_tensor, target_tensor

class LSTMLanguageModel(nn.Module):
    """
    Language model made of an embedding table, a batch-first LSTM and a
    linear projection back onto the vocabulary.
    """
    def __init__(self, options):
        """
        ``options`` is a dict with the keys 'num_embeddings', 'embedding_dim',
        'padding_idx', 'input_size', 'hidden_size', 'num_layers' and
        'lstm_dropout'.
        """
        super().__init__()

        vocab_size = options['num_embeddings']
        hidden_dim = options['hidden_size']

        self.lookup = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=options['embedding_dim'],
            padding_idx=options['padding_idx'],
        )
        self.lstm = nn.LSTM(
            options['input_size'],
            hidden_dim,
            options['num_layers'],
            dropout=options['lstm_dropout'],
            batch_first=True,
        )
        self.projection = nn.Linear(hidden_dim, vocab_size)

    def forward(self, encoded_input_sequence):
        """
        Map a batch of token ids to one vector of vocabulary logits per position.
        """
        embedded = self.lookup(encoded_input_sequence)
        # nn.LSTM returns (outputs, (h_n, c_n)); only the per-step outputs are used.
        per_step_outputs, _ = self.lstm(embedded)
        return self.projection(per_step_outputs)

def model_training(model, optimizer, num_epochs, loaders=None, loss_fn=None,
                   device=None, patience=5):
    """Train ``model`` with validation after every epoch and early stopping.

    Args:
        model: module mapping a batch of inputs to logits.
        optimizer: optimizer already bound to the model parameters.
        num_epochs: maximum number of epochs to run.
        loaders: mapping with 'train' and 'valid' iterables of (input, target)
            batches. Defaults to the notebook-level ``wiki_loaders``.
        loss_fn: callable ``loss_fn(flat_logits, flat_targets)`` returning a
            scalar loss. Defaults to the notebook-level ``criterion``.
        device: device the batches are moved to. Defaults to the
            notebook-level ``current_device``.
        patience: number of consecutive epochs without a new best validation
            loss after which training stops.

    Returns:
        (plot_cache, best_loss): ``plot_cache`` holds one
        (avg_train_loss, avg_valid_loss) tuple per completed epoch, including
        the epoch on which early stopping triggered; ``best_loss`` is the
        lowest average validation loss seen.
    """
    # Fall back to the notebook globals so existing three-argument calls work.
    if loaders is None:
        loaders = wiki_loaders
    if loss_fn is None:
        loss_fn = criterion
    if device is None:
        device = current_device

    plot_cache = []
    best_loss = float('inf')
    no_improvement = 0

    for epoch_number in range(num_epochs):
        torch.cuda.empty_cache()
        model.train()
        train_log_cache = []
        start_time = time.time()
        for inp, target in loaders['train']:
            optimizer.zero_grad()
            inp = inp.to(device)
            target = target.to(device)
            logits = model(inp)
            # Flatten so every position is one classification example.
            loss = loss_fn(logits.view(-1, logits.size(-1)), target.view(-1))
            loss.backward()
            optimizer.step()
            train_log_cache.append(loss.item())
            torch.cuda.empty_cache()
        if device == 'cuda':
            print(torch.cuda.get_device_name(0))
            print('Memory Usage:')
            print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
            print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')
        avg_loss = sum(train_log_cache)/len(train_log_cache)
        torch.cuda.empty_cache()
        print('Training loss after {} epoch = {:.{prec}f}'.format(epoch_number+1, avg_loss, prec=4))
        print(time.time()-start_time)

        valid_losses = []
        model.eval()
        with torch.no_grad():
            for inp, target in loaders['valid']:
                inp = inp.to(device)
                target = target.to(device)
                logits = model(inp)
                loss = loss_fn(logits.view(-1, logits.size(-1)), target.view(-1))
                valid_losses.append(loss.item())
        avg_val_loss = sum(valid_losses) / len(valid_losses)
        torch.cuda.empty_cache()
        print('Validation loss after {} epoch = {:.{prec}f}'.format(epoch_number+1, avg_val_loss, prec=4))

        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            # A new best resets the counter: patience counts consecutive
            # stagnant epochs, not stagnant epochs over the whole run.
            no_improvement = 0
        else:
            no_improvement += 1

        # Record the epoch before a possible break so the last point is kept.
        plot_cache.append((avg_loss, avg_val_loss))

        if no_improvement >= patience:
            print('Early stopping at epoch: %d' % (epoch_number+1))
            break

    return plot_cache, best_loss


In [11]:
# LANG (str): ar, en, it, hi
# USE_CHARS (bool): True selects the 'char' data files, False the 'word' ones
# NUM_EPOCHS (int): number of epochs to train for
# BATCH_SIZE (int): batch size

LANG = 'it'
USE_CHARS = False
NUM_EPOCHS = 30 #10
BATCH_SIZE = 64


In [12]:
# Token granularity label; it is also part of the data file names below.
# NOTE: the name shadows the builtin `type`, but later cells read it.
type = 'char' if USE_CHARS == True else 'word'
print('model language and type:', LANG, type)

batch_size = BATCH_SIZE
print('batch size:', batch_size)

# Use the GPU whenever at least one CUDA device is visible.
num_gpus = torch.cuda.device_count()
current_device = 'cuda' if num_gpus > 0 else 'cpu'
print('device:', current_device)
if current_device == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0) / 1024 ** 3, 1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0) / 1024 ** 3, 1), 'GB')

# The tokenized splits and the vocabulary are read from pickles that must
# already exist in the working directory.
PATH = '{}_{}_tokenized.pickle'.format(LANG, type)

wiki_loaders = {}

print('start loading')
wiki_tokenized_datasets = load_pickle(path=PATH)
print('done loading')
# Filtered vocabulary; the unfiltered one is '<LANG>_<type>_wiki_dict.pickle'.
wiki_path = '{}_{}_wiki_dict_filtered.pickle'.format(LANG, type)
wiki_dict = load_pickle(wiki_path)
print(len(wiki_dict.ids))

# One tensor dataset and one shuffled, padded loader per split.
wiki_tensor_dataset = {
    split: TensoredDataset(listoflists)
    for split, listoflists in wiki_tokenized_datasets.items()
}

for split, wiki_dataset in wiki_tensor_dataset.items():
    wiki_loaders[split] = DataLoader(
        wiki_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=pad_collate_fn,
    )

# Model hyper-parameters.
embedding_size = 256
hidden_size = 1024 // 2
num_layers = 3
lstm_dropout = 0.3
# The vocabulary size is read the same way for char and word models.
num_embeddings = len(wiki_dict.ids)

options = {
    'num_embeddings': num_embeddings,  # number of characters/words + eos, bos, unk, pad
    'embedding_dim': embedding_size,
    'padding_idx': 2,  # wiki_dict.get_id('<pad>')
    'input_size': embedding_size,
    'hidden_size': hidden_size,
    'num_layers': num_layers,
    'lstm_dropout': lstm_dropout,
}
print(options)

model language and type: it word
batch size: 64
device: cuda
Tesla P100-PCIE-16GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
start loading
done loading
70508
{'num_embeddings': 70508, 'embedding_dim': 256, 'padding_idx': 2, 'input_size': 256, 'hidden_size': 512, 'num_layers': 3, 'lstm_dropout': 0.3}


In [19]:
# Check training data sample: sizes of the input and target tensors of the
# first training example (the trailing comma makes the cell display a tuple).
wiki_tensor_dataset['train'][0][0].size(),wiki_tensor_dataset['train'][0][1].size(),

(torch.Size([1, 17]), torch.Size([1, 17]))

In [None]:
# Release unused cached GPU memory held by PyTorch before the model is built.
torch.cuda.empty_cache()

In [None]:
LSTM_model = LSTMLanguageModel(options).to(current_device)

# Targets equal to 2 (the padding id used by the collate function) are
# excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=2) #wiki_dict.get_id('<pad>')

model_parameters = [p for p in LSTM_model.parameters() if p.requires_grad]
optimizer = optim.SGD(model_parameters, lr=0.001, momentum=0.999)
filename = './saved_models/LSTM_'+LANG+'_'+type+'_'+str(BATCH_SIZE)+'bsize_'+str(NUM_EPOCHS)+'ep.pt'
# Create the checkpoint folder before training, so a missing directory cannot
# make torch.save fail after the whole run has finished.
os.makedirs(os.path.dirname(filename), exist_ok=True)
plot, loss = model_training(model=LSTM_model, optimizer=optimizer, num_epochs=NUM_EPOCHS)
torch.save({'model_state_dict': LSTM_model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'plot_cache': plot,
            'loss': loss,
            }, filename)
print(filename)

Tesla V100-SXM2-16GB
Memory Usage:
Allocated: 0.3 GB
Cached:    0.7 GB
Training loss after 1 epoch = 7.7298
52.250049114227295
Validation loss after 1 epoch = 6.7575
Tesla V100-SXM2-16GB
Memory Usage:
Allocated: 0.3 GB
Cached:    0.7 GB
Training loss after 2 epoch = 6.5283
52.13889837265015
Validation loss after 2 epoch = 6.3487
Tesla V100-SXM2-16GB
Memory Usage:
Allocated: 0.4 GB
Cached:    0.7 GB
Training loss after 3 epoch = 6.2697
52.46939516067505
Validation loss after 3 epoch = 6.1619
Tesla V100-SXM2-16GB
Memory Usage:
Allocated: 0.3 GB
Cached:    0.7 GB
Training loss after 4 epoch = 6.0941
52.53784251213074
Validation loss after 4 epoch = 6.0076
Tesla V100-SXM2-16GB
Memory Usage:
Allocated: 0.3 GB
Cached:    0.7 GB
Training loss after 5 epoch = 5.9587
52.486321687698364
Validation loss after 5 epoch = 5.8963
Tesla V100-SXM2-16GB
Memory Usage:
Allocated: 0.3 GB
Cached:    0.7 GB
Training loss after 6 epoch = 5.8526
52.501412868499756
Validation loss after 6 epoch = 5.8062
Tesla V

In [None]:
# Save the trained model again under a name that also records the embedding
# size, hidden size and number of layers.
filename = './saved_models/LSTM_'+LANG+'_'+type+'_'+str(BATCH_SIZE)+'bsize_'+str(embedding_size)+'emb_'+str(hidden_size)+'hdim_'+str(num_layers)+'lyrs_'+str(NUM_EPOCHS)+'ep.pt'
# Make sure the target folder exists; torch.save does not create it.
os.makedirs(os.path.dirname(filename), exist_ok=True)
torch.save({'model_state_dict': LSTM_model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'plot_cache': plot,
            'loss': loss,
            }, filename)
print(filename)

./saved_models/LSTM_it_word_64bsize_256emb_512hdim_3lyrs_30ep.pt
