
# **LSTM for Penn Treebank**


## **Initialization**

In [None]:
import torch
import torch.nn as nn
from torch.optim import SGD, lr_scheduler
import numpy as np
import matplotlib.pyplot as plt
import os

In [None]:
# Fix the PyTorch RNG seed so that weight initialisation and dropout masks are
# reproducible from run to run.
# Google Drive is mounted only once, at /content/drive (the mount point PATH is
# built on); the former extra mount at /content/gdrive was never read from.
torch.manual_seed(101)

In [None]:
# Mount Google Drive at /content/drive; PATH (defined in the next cell) points
# inside this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root folder on the mounted Drive: data is read from {PATH}/Data, checkpoints
# and plots are written under {PATH}/Models and {PATH}/Results.
PATH = "/content/drive/MyDrive/ex2_313581803_314882861"
# Run on the first CUDA device when one is available, otherwise on the CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
EMBEDDING_SIZE = 200  # width of the word-embedding vectors
HIDDEN_SIZE = 200  # presumably the recurrent hidden-state width -- TODO confirm it is wired into the model
NUM_OF_LAYERS = 2  # number of stacked recurrent layers
BATCH_SIZE = 20  # number of parallel token streams (rows) each data set is split into
SEQUENCE_SIZE = 20  # number of time steps processed per forward/backward pass
INIT_WEIGHT = 0.1  # embedding / output weights are drawn from U(-INIT_WEIGHT, INIT_WEIGHT)

### **Load Data**

In [None]:
def word_to_ix(word):
  """Return the integer id of `word`, registering it in `vocab_map` on first sight.

  A word that is not yet known receives the next free id, i.e. the size of
  `vocab_map` at the moment it is added.
  """
  return vocab_map.setdefault(word, len(vocab_map))
   
def split_minibatch(ix_list, batch_size):
  """Lay a flat id sequence out as `batch_size` equally long rows.

  Row i holds the i-th contiguous chunk of `ix_list`. Trailing items that do
  not fill a complete column across all rows are dropped.

  Returns:
    np.ndarray of shape (batch_size, len(ix_list) // batch_size).
  """
  row_len = len(ix_list) // batch_size
  kept = np.array(ix_list[:row_len * batch_size])
  return kept.reshape(batch_size, row_len)
 
def load_data(set_type, batch_size=BATCH_SIZE):
  """Read one PTB split and return it as a (batch_size, n) array of word ids.

  Args:
    set_type: split name; the file read is {PATH}/Data/ptb.{set_type}.txt.
    batch_size: number of rows (parallel token streams) in the result.

  Returns:
    The array produced by split_minibatch from the ids of all tokens in the
    file, with one "<eos>" token between consecutive non-empty lines.

  Side effects:
    Every previously unseen word is added to the global vocab_map (through
    word_to_ix).
  """
  words = []
  with open(f"{PATH}/Data/ptb.{set_type}.txt", "r", encoding="utf-8") as f:
    for line in f:
      # Tokenise each line on its own so the "<eos>" marker can never fuse
      # with a neighbouring word when a line lacks leading/trailing spaces.
      tokens = line.split()
      if not tokens:
        continue  # blank line: no sentence, no separator
      if words:
        words.append("<eos>")  # separator between sentences (none after the last)
      words.extend(tokens)
  ix_list = [word_to_ix(word) for word in words]
  return split_minibatch(ix_list, batch_size)



In [None]:
# Shared word -> id table; it is filled lazily by word_to_ix while the splits are read.
vocab_map = {}

# Ids are handed out on first appearance, so the load order below fixes the
# word -> id assignment. Words first seen in the validation or test split are
# added to vocab_map as well.
train_set = load_data("train")
valid_set = load_data("valid")
test_set = load_data("test")
print(f"Vocabulary size: {len(vocab_map)}")

Vocabulary size: 10000


### **Network Architecture**

Implement an LSTM (optionally GRU) language model for the Penn Treebank dataset.

In [None]:
class RNN(nn.Module):
    """Word-level recurrent language model: embedding -> LSTM or GRU -> vocabulary logits.

    The recurrent state is stored on the module (h_0 / c_0) and carried from one
    forward call to the next; it is detached after every call so gradients do not
    flow across calls. init_state() must be called before the first forward().

    Args:
        vocab_size: number of distinct word ids (embedding rows / output logits).
        p: dropout probability, applied to the embeddings, between the stacked
            recurrent layers and to the recurrent output.
        use_gru: run the GRU instead of the LSTM in forward().
        init_whight: half-width of the uniform range used to initialise the
            embedding and output weights; None selects the global INIT_WEIGHT.
            (The spelling matches the keyword used by find_hyper_parameters.)
    """

    def __init__(self, vocab_size, p=0, use_gru=False, init_whight=None):
        super(RNN, self).__init__()
        self.hidden_size = HIDDEN_SIZE
        self.num_layers = NUM_OF_LAYERS
        self.use_gru = use_gru
        self.init_weight = INIT_WEIGHT if init_whight is None else init_whight
        self.word2Embedding = nn.Embedding(vocab_size, EMBEDDING_SIZE)
        self.dropout = nn.Dropout(p)
        # Both recurrent cores are always created, whichever one forward() uses;
        # this keeps the state_dict layout identical for LSTM and GRU checkpoints.
        self.lstm = nn.LSTM(input_size=EMBEDDING_SIZE, hidden_size=self.hidden_size,
                            num_layers=self.num_layers, dropout=p, batch_first=True)
        self.gru = nn.GRU(EMBEDDING_SIZE, self.hidden_size, self.num_layers, dropout=p, batch_first=True)
        # The output layer consumes the recurrent output, so its input width is
        # the hidden size (not the embedding size).
        self.hidden2word = nn.Linear(self.hidden_size, vocab_size)
        # Summed (not averaged) cross-entropy; callers divide by the token count.
        self.criterion = nn.CrossEntropyLoss(reduction='sum')
        self.init_weights()

    def forward(self, sentence):
        """Return logits of shape (batch, seq, vocab_size) for a (batch, seq) id tensor."""
        # as_tensor reuses an incoming tensor instead of copy-constructing it
        # (torch.tensor(tensor) copies and emits a UserWarning on every call).
        embeds = self.word2Embedding(torch.as_tensor(sentence))
        embeds = self.dropout(embeds)
        if self.use_gru:
            rnn_out, h_n = self.gru(embeds, self.h_0)
            self.h_0 = h_n.detach()
        else:
            rnn_out, (h_n, c_n) = self.lstm(embeds, (self.h_0, self.c_0))
            self.h_0, self.c_0 = h_n.detach(), c_n.detach()
        rnn_out = self.dropout(rnn_out)
        words_pred = self.hidden2word(rnn_out)
        return words_pred

    def init_state(self):
        """Reset the carried recurrent state to zeros for BATCH_SIZE streams on DEVICE."""
        self.h_0 = torch.zeros(self.num_layers, BATCH_SIZE, self.hidden_size, device=DEVICE)
        self.c_0 = torch.zeros(self.num_layers, BATCH_SIZE, self.hidden_size, device=DEVICE)

    def init_weights(self):
        """Initialise embedding and output weights uniformly in [-init_weight, init_weight].

        The output bias is zeroed; the LSTM/GRU parameters are not touched here.
        """
        self.word2Embedding.weight.data.uniform_(-self.init_weight, self.init_weight)
        self.hidden2word.bias.data.fill_(0)
        self.hidden2word.weight.data.uniform_(-self.init_weight, self.init_weight)
  

In [None]:
def fit(model, train_data, val_data, optimizer, scheduler, max_norm, file_name,
        max_epochs=50, save_model=False):
    """Train `model` on `train_data` and track its loss on `val_data` after each epoch.

    Each epoch resets the model state, walks over `train_data` in windows of
    SEQUENCE_SIZE columns (targets are the inputs shifted by one column), clips
    the gradient norm to `max_norm`, and steps `scheduler` once at the end.

    Args:
        model: network exposing init_state(), criterion and a forward pass.
        train_data: 2-D id array used for the parameter updates.
        val_data: 2-D id array used for monitoring and best-checkpoint selection.
        optimizer: optimizer over the model parameters.
        scheduler: learning-rate scheduler, stepped once per epoch.
        max_norm: threshold passed to clip_grad_norm_.
        file_name: path stem (relative to {PATH}/Models and {PATH}/Results) for
            the checkpoints and the plot.
        max_epochs: number of epochs to run.
        save_model: when true, write "<file_name>_best.pt" whenever the
            validation loss improves and "<file_name>_final.pt" at the end.

    Returns:
        (train_loss, val_loss) of the last epoch, as returned by evaluation().
    """
    train_results = []
    val_results = []
    best_loss = float("inf")
    for epoch in range(max_epochs):
        model.init_state()
        model.train()
        for i in range(0, train_data.shape[1], SEQUENCE_SIZE):
            # The last column has no successor, so it can only serve as a target.
            end_of_seq = min(i+SEQUENCE_SIZE, train_data.shape[1]-1)
            if i == end_of_seq:
                break  # nothing left to predict (empty window)

            # Forward pass on the data that was passed in (not the global train_set)
            seq = torch.tensor(train_data[:, i:end_of_seq]).to(DEVICE)
            labels = torch.tensor(train_data[:, i+1:end_of_seq+1]).to(DEVICE)
            y_pred = model(seq)
            # CrossEntropyLoss expects the class dimension second: (batch, vocab, seq)
            loss = model.criterion(y_pred.transpose(dim0=1, dim1=2), labels)

            # Propagating the loss backward and optimizing the parameters
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_norm)
            optimizer.step()
        scheduler.step()

        train_results.append(evaluation(model, train_data))
        val_results.append(evaluation(model, val_data))
        if val_results[-1] < best_loss:
            # Remember the best validation loss so that "_best.pt" is only
            # overwritten by a genuinely better epoch.
            best_loss = val_results[-1]
            if save_model:
                torch.save(model.state_dict(), f"{PATH}/Models/{file_name}_best.pt")
        print(f"epoch: {epoch}, train loss: {train_results[-1]}, validation loss: {val_results[-1]} "
              f"            train perplexity: {np.exp(train_results[-1]):10.8f}, validation perplexity: {np.exp(val_results[-1]):10.8f}.")
    if save_model:
        torch.save(model.state_dict(), f"{PATH}/Models/{file_name}_final.pt")
    generate_plots(train_results, val_results, file_name)
    return train_results[-1], val_results[-1]


def evaluation(model, data):
    """Return the loss per predicted token of `model` on `data`.

    The data is consumed in windows of SEQUENCE_SIZE columns; for every window
    the targets are the inputs shifted by one column. The values returned by
    model.criterion are accumulated and divided by the number of targets.

    Side effects:
        Puts the model in eval mode and resets its recurrent state.

    Raises:
        ValueError: if `data` has fewer than two columns, i.e. there is no
            token to predict.
    """
    model.eval()
    total_loss = 0.0
    total_pred = 0
    model.init_state()
    with torch.no_grad():
        for i in range(0, data.shape[1], SEQUENCE_SIZE):
            # The last column has no successor, so it can only serve as a target.
            end_of_seq = min(i+SEQUENCE_SIZE, data.shape[1]-1)
            if i == end_of_seq:
                break
            seq = torch.tensor(data[:, i:end_of_seq]).to(DEVICE)
            labels = torch.tensor(data[:, i+1:end_of_seq+1]).to(DEVICE)
            y_pred = model(seq)
            loss = model.criterion(y_pred.transpose(dim0=1, dim1=2), labels)
            total_loss += float(loss)
            # Count the targets that were actually scored instead of assuming
            # the global BATCH_SIZE.
            total_pred += labels.numel()
    if total_pred == 0:
        raise ValueError("evaluation() needs data with at least two columns")
    return total_loss / total_pred


def generate_plots(train_results, val_results, file_name):
    """Save the per-epoch training/validation perplexity curves as a PNG.

    Args:
        train_results: per-epoch training losses; exp() of them is plotted.
        val_results: per-epoch validation losses; exp() of them is plotted.
        file_name: path stem relative to {PATH}/Results; ".png" is appended.
    """
    out_path = f'{PATH}/Results/{file_name}.png'
    # file_name may contain sub-folders, so create the whole parent chain.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    # Draw on a dedicated figure so leftovers on the current pyplot figure
    # cannot leak into the saved image.
    fig, ax = plt.subplots()
    ax.set_xlabel("epoch")
    ax.set_ylabel("perplexity")  # the curves are exp(loss), not the raw loss
    ax.plot(np.exp(np.array(train_results)), label='training Perplexity', linewidth=5.0)
    ax.plot(np.exp(np.array(val_results)), label='validation Perplexity', linewidth=5.0)
    ax.set_title('Perplexity per epoch')
    ax.legend()
    ax.grid()

    fig.savefig(out_path, bbox_inches='tight')
    plt.close(fig)

## **Find Hyperparameters**

In [None]:
def find_hyper_parameters(mode, lr_list, decay_threshold_list, decay_list, max_norm_list,
                          p_lists, use_gru, init_whight, max_epochs):
  """Grid-search the training hyper-parameters for one model family.

  One model is trained with fit() for every combination of learning rate,
  decay threshold, decay factor, gradient-clipping norm and dropout
  probability. The learning rate is kept constant while epoch < decay_threshold
  and is afterwards multiplied by decay**(decay_threshold - epoch).

  Args:
    mode: sub-folder name used under {PATH}/Results and {PATH}/Models.
    lr_list, decay_threshold_list, decay_list, max_norm_list, p_lists:
      candidate values of each hyper-parameter.
    use_gru: forwarded to RNN.
    init_whight: forwarded to RNN as the `init_whight` keyword.
    max_epochs: number of epochs per run.

  Returns:
    dict mapping (lr, decay_threshold, decay, max_norm, p) to the value
    returned by fit(). The same dict is saved to {PATH}/Results/{mode}/res_dic.
  """
  from itertools import product

  results = {}
  os.makedirs(f"{PATH}/Results/{mode}", exist_ok=True)
  os.makedirs(f"{PATH}/Models/{mode}", exist_ok=True)
  # product() iterates in the same order as the equivalent nested loops
  # (left-most list varies slowest).
  for lr, decay_threshold, decay, max_norm, p in product(
      lr_list, decay_threshold_list, decay_list, max_norm_list, p_lists):
    model = RNN(len(vocab_map), p=p, use_gru=use_gru, init_whight=init_whight).to(DEVICE)
    file_name = f"{mode}/{lr}_{decay_threshold}_{decay}_{max_norm}_{p}"
    optimizer = SGD(model.parameters(), lr=lr)
    # Bind the current values as defaults so the schedule cannot be affected
    # by later rebinding of the loop variables.
    scheduler = lr_scheduler.LambdaLR(
        optimizer,
        lambda epoch, threshold=decay_threshold, factor=decay:
            1 if epoch < threshold else factor**(-epoch+threshold))
    results[(lr, decay_threshold, decay, max_norm, p)] = fit(
        model, train_set, valid_set, optimizer, scheduler, max_norm, file_name=file_name, save_model=True, max_epochs=max_epochs)

  np.save(f"{PATH}/Results/{mode}/res_dic", results, allow_pickle=True)
  return results

In [None]:
# ---------------------------------------------
# Hyper-parameter search: LSTM without dropout
# ---------------------------------------------
mode = "LSTM"
lr_list = [1.5, 1.2, 1, 0.8, 0.5]
decay_threshold_list = [1, 3, 5]
decay_list = [3, 2, 1.5]
max_norm_list = [5]
p_list = [0]  # dropout disabled
find_hyper_parameters(
    mode=mode,
    lr_list=lr_list,
    decay_threshold_list=decay_threshold_list,
    decay_list=decay_list,
    max_norm_list=max_norm_list,
    p_lists=p_list,
    use_gru=False,
    init_whight=0.1,
    max_epochs=20,
)


In [None]:
# ------------------------------------------
# Hyper-parameter search: LSTM with dropout
# ------------------------------------------
mode = "LSTM_Dropout"
lr_list = [1.2, 1, 0.8]
decay_threshold_list = [5, 7, 10]
decay_list = [2, 1.5, 1.05]
max_norm_list = [5]
p_list = [0.35, 0.5, 0.65]
find_hyper_parameters(
    mode=mode,
    lr_list=lr_list,
    decay_threshold_list=decay_threshold_list,
    decay_list=decay_list,
    max_norm_list=max_norm_list,
    p_lists=p_list,
    use_gru=False,
    init_whight=0.05,
    max_epochs=50,
)


In [None]:
# --------------------------------------------
# Hyper-parameter search: GRU without dropout
# --------------------------------------------
mode = "GRU"
lr_list = [1, 1.2, 1.5]
decay_threshold_list = [1, 3, 5]
decay_list = [3, 2, 1.5]
max_norm_list = [2]
p_list = [0]  # dropout disabled
find_hyper_parameters(
    mode=mode,
    lr_list=lr_list,
    decay_threshold_list=decay_threshold_list,
    decay_list=decay_list,
    max_norm_list=max_norm_list,
    p_lists=p_list,
    use_gru=True,
    init_whight=0.1,
    max_epochs=20,
)



In [None]:

# -----------------------------------------
# Hyper-parameter search: GRU with dropout
# -----------------------------------------
mode = "GRU_Dropout"
lr_list = [1, 1.2, 1.5]
decay_threshold_list = [5, 7, 10]
decay_list = [2, 1.3, 1.08]
max_norm_list = [2]
p_list = [0.35, 0.5, 0.65]
find_hyper_parameters(
    mode=mode,
    lr_list=lr_list,
    decay_threshold_list=decay_threshold_list,
    decay_list=decay_list,
    max_norm_list=max_norm_list,
    p_lists=p_list,
    use_gru=True,
    init_whight=0.1,
    max_epochs=50,
)

## **Train Models**

In [None]:
# Training configuration per mode id.
# Tuple layout (in the order unpacked by fit_model_for_mode):
#   (mode_name, dropout p, use_gru, lr, decay_threshold, decay, max_norm, max_epochs)
MODES_PARAMS = {0: ("LSTM", 0.0, False, 1.0, 1, 2.0, 5.0, 15),
                1: ("LSTM_Dropout", 0.35, False, 1.0, 7, 1.05, 5.0, 50),
                2: ("GRU", 0.0, True, 1.2, 1, 2.0, 2.0, 15),
                3: ("GRU_Dropout", 0.35, True, 1.2, 7, 1.08, 2.0, 50)}


def fit_model_for_mode(mode, file_name):
  """Train the model configured under MODES_PARAMS[mode] and save its outputs.

  Checkpoints go to {PATH}/Models/<mode_name>/<file_name>_*.pt and the
  perplexity plot to {PATH}/Results/<mode_name>/<file_name>.png.

  Args:
    mode: key into MODES_PARAMS.
    file_name: file stem used for the checkpoints and the plot.

  Returns:
    The (train_loss, validation_loss) pair returned by fit().
  """
  (mode_name, p, use_gru, lr, decay_threshold, decay, max_norm, max_epochs) = MODES_PARAMS[mode]

  os.makedirs(f"{PATH}/Results/{mode_name}", exist_ok=True)
  os.makedirs(f"{PATH}/Models/{mode_name}", exist_ok=True)

  model = RNN(len(vocab_map), p=p, use_gru=use_gru).to(DEVICE)
  optimizer = SGD(model.parameters(), lr=lr)
  # Constant learning rate until decay_threshold, then divided by `decay` every epoch.
  scheduler = lr_scheduler.LambdaLR(optimizer, lambda epoch: 1 if epoch < decay_threshold else decay**(-epoch+decay_threshold))
  # Monitor (and pick the "_best" checkpoint) on the validation split; the test
  # split is kept untouched for the final report in evaluate_trained_model.
  return fit(model, train_set, valid_set, optimizer, scheduler, max_norm, max_epochs=max_epochs,
             file_name=f"{mode_name}/{file_name}", save_model=True)


def evaluate_trained_model(mode, file_name):
  """Load the "<file_name>_best.pt" checkpoint of `mode` and print its perplexities.

  Args:
    mode: key into MODES_PARAMS.
    file_name: file stem of the checkpoint under {PATH}/Models/<mode_name>/.
  """
  # Take the architecture flag from the configuration table instead of
  # inferring it from the numeric mode id.
  mode_name, _, use_gru = MODES_PARAMS[mode][:3]
  # Dropout is left at its default: evaluation() switches the model to eval mode.
  model = RNN(len(vocab_map), use_gru=use_gru).to(DEVICE)
  model.load_state_dict(torch.load(f"{PATH}/Models/{mode_name}/{file_name}_best.pt", map_location=DEVICE))
  train_loss = evaluation(model, train_set)
  valid_loss = evaluation(model, valid_set)
  test_loss = evaluation(model, test_set)
  print(f"Model: {mode}, Train Perplexity: {np.exp(train_loss)}, Validation Perplexity: {np.exp(valid_loss)}, Test Perplexity: {np.exp(test_loss)}")

In [None]:
# Modes to train (keys of MODES_PARAMS):
# 0: LSTM without dropout
# 1: LSTM with dropout
# 2: GRU without dropout
# 3: GRU with dropout
modes_to_run = [0, 1, 2, 3] 
# File stem shared by the checkpoints and plots written for every mode.
file_name = "test_train"

# Train each selected mode, then report the perplexities of its best checkpoint.
for mode in modes_to_run:
  fit_model_for_mode(mode, file_name)
  evaluate_trained_model(mode, file_name)



## **Evaluations**
Report the train, validation, and test perplexity of previously trained models.

In [None]:
# Evaluate the stored "trained_model" checkpoint of every configured mode.
file_name = "trained_model"
for mode in MODES_PARAMS:
  evaluate_trained_model(mode, file_name)



Model: 0, Train Perplexity: 58.06171528545132, Validation Perplexity: 116.45095664102497, Test Perplexity: 112.0217984910468
Model: 1, Train Perplexity: 44.161830158557144, Validation Perplexity: 99.4926158715422, Test Perplexity: 94.49093709095733
Model: 2, Train Perplexity: 60.71044630672884, Validation Perplexity: 118.8719360926827, Test Perplexity: 114.92523756232262
Model: 3, Train Perplexity: 39.54581419538675, Validation Perplexity: 99.32161731855646, Test Perplexity: 95.04166000448642
