#Install datasets

In [None]:
!pip install datasets

#Import all libs

In [None]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import torch.nn as nn
import re
import numpy as np

#device

In [None]:
# Prefer the GPU whenever PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
print(device)

#Loading the Penn Treebank dataset

In [None]:
# Load the 'ptb_text_only' dataset through the Hugging Face `datasets` library.
# Later cells index the result by split name ('train', 'validation', 'test')
# and read the 'sentence' field of each row.
from datasets import load_dataset
ptb = load_dataset('ptb_text_only')

In [None]:
ptb #the loaded Penn Treebank dataset object; as the last expression of the cell it is displayed by the notebook

#Tokenization

In [None]:
#tokenization
def _tokenize(sentence):
  """Split one PTB sentence into a list of tokens.

  Hyphens are replaced by spaces so hyphenated words become separate tokens,
  and a '<sep>' marker is placed in front of the sentence. The split is on a
  single space, so consecutive spaces yield empty-string tokens (same as the
  previous re.split(' ', ...) call).
  """
  return ('<sep> ' + sentence.replace('-', ' ')).split(' ')


# Train split: keep the per-sentence token lists and one flat token stream.
train_tokens = [_tokenize(row['sentence']) for row in ptb['train']]
all_train_tokens = [token for sentence in train_tokens for token in sentence]

# Validation split: only the flat token stream is needed.
all_val_tokens = [token for row in ptb['validation'] for token in _tokenize(row['sentence'])]

# Test split: tokenized exactly like train/validation. Previously it skipped
# the '<sep>' marker and kept hyphenated words whole, so those words could
# never match a vocabulary entry built from hyphen-split text.
all_test_tokens = [token for row in ptb['test'] for token in _tokenize(row['sentence'])]

all_tokens = all_train_tokens + all_val_tokens  # vocabulary source: training + validation tokens

len(all_train_tokens), len(all_val_tokens), len(all_tokens), all_tokens[0], len(all_test_tokens)

#creating the Vocab for train and validation data

In [None]:
# Build the vocabulary: every distinct train/validation token gets an id equal
# to its frequency rank (0 = most frequent).
count = Counter(all_tokens)
vocab_size = len(all_tokens)  # upper bound on the number of distinct tokens, so most_common() returns them all
ranked = count.most_common(vocab_size)
tokens, counts = zip(*ranked)
vocab = dict(zip(tokens, range(len(tokens))))  # token -> id lookup table
len(vocab), vocab['<unk>'],  vocab['<sep>']

#encoding all tokens(train and validation) from the corpus

In [None]:
def encode_tokens(tokens, token_to_idx=None):
  """Map tokens to their integer vocabulary ids.

  Args:
    tokens: iterable of token strings.
    token_to_idx: optional token -> id mapping. Defaults to the module-level
      `vocab`. It must contain the '<unk>' token.

  Returns:
    A list with one id per token, in input order. A token missing from the
    mapping gets the id of '<unk>'. (The previous fallback was -1, which is
    not a valid index into an embedding table.)

  Raises:
    KeyError: if the mapping has no '<unk>' entry.
  """
  mapping = vocab if token_to_idx is None else token_to_idx
  unk_idx = mapping['<unk>']
  return [mapping.get(token, unk_idx) for token in tokens]

def encode_test_tokens(tokens, token_to_idx=None):
  """Map test-split tokens to vocabulary ids, sending unseen tokens to '<unk>'.

  Args:
    tokens: iterable of token strings.
    token_to_idx: optional token -> id mapping. Defaults to the module-level
      `vocab`. It must contain the '<unk>' token.

  Returns:
    A list with one id per token, in input order. Out-of-vocabulary tokens
    get the id looked up for '<unk>' rather than a hard-coded 1, which was
    only correct while '<unk>' happened to be the second most frequent token.

  Raises:
    KeyError: if the mapping has no '<unk>' entry.
  """
  mapping = vocab if token_to_idx is None else token_to_idx
  unk_idx = mapping['<unk>']
  return [mapping.get(token, unk_idx) for token in tokens]

# Turn each flat token stream into a flat list of vocabulary ids.
train_data, val_data = (encode_tokens(stream) for stream in (all_train_tokens, all_val_tokens))
test_data = encode_test_tokens(all_test_tokens)
len(train_data), len(val_data), len(test_data)

#Creating the data chunks with some fixed lengths

In [None]:
chunk_size = 50


def _make_chunks(data, size):
  """Cut a flat id list into aligned (input, target) chunks of exactly `size` ids.

  The target chunk is the input chunk shifted one position to the right
  (next-token prediction). Only positions where both chunks are full are
  kept, so the two returned lists always have the same length and every
  chunk has the same size. The previous code built the lists separately and
  dropped the last entry of each, which left them with different lengths
  whenever len(data) % size == 1.

  Args:
    data: list of token ids.
    size: number of ids per chunk.

  Returns:
    (inputs, targets): two equally long lists of `size`-long id lists.
    Both are empty when `data` has fewer than size + 1 ids.
  """
  # A start i is valid only if the shifted target slice data[i+1 : i+1+size]
  # still fits, i.e. i < len(data) - size.
  starts = range(0, len(data) - size, size)
  inputs = [data[i:i + size] for i in starts]
  targets = [data[i + 1:i + 1 + size] for i in starts]
  return inputs, targets


x_train, y_train = _make_chunks(train_data, chunk_size)
x_val, y_val = _make_chunks(val_data, chunk_size)
x_test, y_test = _make_chunks(test_data, chunk_size)

# Sanity check: list lengths plus the sizes of the first and last chunks
# (index -1 instead of hard-coded positions that only fit one chunk_size).
len(x_train), len(y_train), len(x_val), len(y_val), len(x_train[0]), len(y_train[0]), len(x_val[0]), len(y_val[0]), len(x_train[-1]), len(y_train[-1]), len(x_val[-1]), len(y_val[-1]), len(x_test), len(y_test)


#Creating the PyTorch Dataset wrapper (token ids -> tensors)

In [None]:
class PTBDataset(Dataset):
  """Paired input/target id sequences, served as tensors on the global `device`."""

  def __init__(self, x: list, y: list):
    # x[n] is the n-th input sequence, y[n] the target sequence paired with it.
    self.x, self.y = x, y

  def __len__(self):
    # The dataset size is defined by the number of input sequences.
    return len(self.x)

  def __getitem__(self, n: int):
    # Fetch the raw id lists first, then convert both to tensors.
    pair = (self.x[n], self.y[n])
    return tuple(map(self.encode_tokens, pair))

  def encode_tokens(self, tokens):
    """Wrap a list of token ids in a tensor created on `device`."""
    return torch.tensor(tokens, device=device)

In [None]:
# Wrap each split's chunked (input, target) lists in a PTBDataset.
train_ds, val_ds, test_ds = (
  PTBDataset(inputs, targets)
  for inputs, targets in ((x_train, y_train), (x_val, y_val), (x_test, y_test))
)

In [None]:
#Hyperparameters
# -- model --
vocab_size = len(vocab)  # one embedding row / output class per vocabulary entry
embedding_size = 100
hidden_size = 256
num_layers = 2
# -- optimisation --
batch_size = 1
n_epochs = 20
learning_rate = 0.02

In [None]:
# One shuffling DataLoader per split (training, validation, test), all with the same batch size.
train_loader, val_loader, test_loader = (
  DataLoader(split_ds, batch_size=batch_size, shuffle=True)
  for split_ds in (train_ds, val_ds, test_ds)
)

In [None]:
next(iter(test_loader)) #sanity check: pull one (input, target) batch from the test loader so the notebook displays it

#Model Definition

In [None]:
class LMLSTM(nn.Module):
    """LSTM language model: embedding -> stacked LSTM -> linear projection.

    Args:
        vocab_size: number of distinct token ids; also the size of the output
            distribution.
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        batch_size: stored on the instance only; forward() sizes the initial
            state from the batch dimension of the actual input.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, batch_size):
        super(LMLSTM, self).__init__()
        # Kept on the instance so forward() does not depend on a global of the same name.
        self.vocab_size = vocab_size
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input):
        """Return next-token log-probabilities for every position of `input`.

        Args:
            input: integer tensor of token ids with the batch dimension first
                (the LSTM is built with batch_first=True), i.e. (batch, seq_len).

        Returns:
            Tensor of shape (batch * seq_len, vocab_size) holding
            log-probabilities; each row sums to 1 after exponentiation.
        """
        input_embedding = self.embedding(input)
        # A fresh zero state per call: no state is carried across batches.
        hidden = self.init_hidden(input_embedding.size(0))
        rnn_out, hidden = self.lstm(input_embedding, hidden)
        # Flatten batch and time so each row is one prediction.
        affine_out = self.linear(rnn_out).reshape(-1, self.vocab_size)
        # Explicit dim: the implicit-dim form of log_softmax is deprecated.
        return F.log_softmax(affine_out, dim=-1)

    def init_hidden(self, batch_size):
        """Return zero-filled (hidden, cell) states for `batch_size` sequences.

        Both tensors have shape (num_layers, batch_size, hidden_dim) and are
        created with the dtype and device of the model's own weights, so they
        always match wherever the model has been moved.
        """
        reference = self.embedding.weight
        shape = (self.num_layers, batch_size, self.hidden_dim)
        return reference.new_zeros(shape), reference.new_zeros(shape)

model = LMLSTM(vocab_size, embedding_size, hidden_size, num_layers, batch_size)
# Move the model to the detected device; model.cuda() raised on CPU-only
# machines even though `device` had already fallen back to 'cpu'.
model.to(device)

In [None]:
# class TagRNN(nn.Module):
#     def __init__(self, vocab_size, hidden_size, vocab_n, num_layers):
#         super(TagRNN, self).__init__()
#         self.vocab_size = vocab_size
#         self.hidden_size = hidden_size
#         self.vocab_n = vocab_n
#         self.batch_size = 64
#         self.num_layers = num_layers
#         self.embedding = nn.Embedding(self.vocab_size, self.hidden_size) #embedding layer
#         self.dropout = nn.Dropout(p=0.2)

#         self.rnn = nn.GRU(self.hidden_size, hidden_size=self.hidden_size,
#                           bidirectional=False,
#                           num_layers=num_layers, batch_first=True) #Applying a multi-layer gated recurrent unit (GRU) RNN to an input sequence.

#         self.linear = nn.Linear(hidden_size, vocab_size) #output layer

#     def forward(self, input):
#         input_embedding = self.embedding(input)
#         #print(input_embedding.shape)
#         batch_size = input_embedding.size(0)
#         #print(batch_size)
#         hidden = self.init_hidden(batch_size)
#         #print(hidden.shape)
#         rnn_out, _ = self.rnn(input_embedding, hidden)
#         #print(rnn_out.shape)
#         affine_out = self.linear(torch.squeeze(rnn_out, 0)) #affine transformation
#         #print(affine_out.shape)

#         return F.log_softmax(affine_out)

#     def init_hidden(self, batch_size):
#         # This method generates the first hidden state of zeros which we'll use in the forward pass
#         hidden = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)
#          # We'll send the tensor holding the hidden state to the device we specified earlier as well
#         return hidden

# model = TagRNN(vocab_size, hidden_size, vocab_n, 2)
# model.cuda() #-> model to devise

In [None]:
def train(loader, model, optimizer, loss_fn):
  """Run one training epoch and return the mean per-batch loss.

  Args:
    loader: iterable of (input, target) batches.
    model: module called on each input batch.
    optimizer: optimizer stepped once per batch.
    loss_fn: callable taking the model output and the flattened targets.

  Returns:
    The arithmetic mean of the per-batch loss values.
  """
  model.train()
  progress = tqdm(loader)
  batch_losses = []
  for inputs, targets in progress:
    optimizer.zero_grad()
    loss = loss_fn(model(inputs), targets.view(-1))
    # Report and record the loss before back-propagating it.
    progress.set_postfix({'loss': loss.item()})
    batch_losses.append(loss.item())
    loss.backward()   # gradients for all weights and biases
    optimizer.step()  # parameter update according to the optimizer's rule
  return sum(batch_losses) / len(batch_losses)


def evaluate(loader, model, loss_fn):
  """Compute the mean per-batch loss over `loader` without updating the model.

  Args:
    loader: iterable of (input, target) batches.
    model: module called on each input batch; switched to eval mode.
    loss_fn: callable taking the model output and the flattened targets.

  Returns:
    The arithmetic mean of the per-batch loss values.
  """
  model.eval()
  losses = []
  pbar = tqdm(loader)
  # No gradients are needed for evaluation; disabling them avoids building
  # the autograd graph for every batch.
  with torch.no_grad():
    for x, y in pbar:
      logits = model(x)
      loss = loss_fn(logits, y.view(-1))
      pbar.set_postfix({'loss': loss.item()})
      losses.append(loss.item())
  return sum(losses) / len(losses)

def test_fn(loader, model, loss_fn):
  """Compute the mean per-batch loss over the test `loader`.

  The test pass is identical to a validation pass, so this delegates to
  evaluate() instead of repeating its loop.

  Args:
    loader: iterable of (input, target) batches.
    model: module called on each input batch; switched to eval mode.
    loss_fn: callable taking the model output and the flattened targets.

  Returns:
    The arithmetic mean of the per-batch loss values.
  """
  # Gradients are disabled here so the test pass never builds an autograd
  # graph, whatever evaluate() does internally.
  with torch.no_grad():
    return evaluate(loader, model, loss_fn)

In [None]:
import matplotlib.pyplot as plt

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
# Multiply the learning rate by 0.1 at epochs 10 and 15.
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10, 15], gamma=0.1)
loss_fn = nn.CrossEntropyLoss()

# Per-epoch history. The test lists are filled by the test-evaluation cell
# that runs after training.
avg_loss_list = []
evalloss_list = []
testloss_list = []
train_pp = []
val_pp = []
test_pp = []

for epoch in range(n_epochs):
  # One pass over the training data, then advance the LR schedule.
  avg_loss = train(train_loader, model, optimizer, loss_fn)
  avg_loss_list.append(avg_loss)
  print('Train Loss: ', avg_loss)
  # Perplexity = exp(mean loss); stored as a plain float so it prints and plots cleanly.
  train_perplexity = torch.exp(torch.tensor(avg_loss)).item()
  train_pp.append(train_perplexity)
  print('Train Perplexity:', train_perplexity)
  scheduler.step()

  eval_loss = evaluate(val_loader, model, loss_fn)
  evalloss_list.append(eval_loss)
  print('Validation Loss: ', eval_loss)
  valid_perplexity = torch.exp(torch.tensor(eval_loss)).item()
  print('Validation Perplexity:', valid_perplexity)
  val_pp.append(valid_perplexity)


fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12,6))

# Left panel: loss values over the epochs
ax1 = ax[0]
ax1.set_title("Train, Validation Losses")
ax1.plot(avg_loss_list, color="red", label="Train Loss")
ax1.plot(evalloss_list, color="green", label="Validation Loss")
ax1.legend()

# Right panel: perplexities over the epochs
ax2 = ax[1]
ax2.set_title("Train, Validation Perplexities")
ax2.plot(train_pp, color="red", label="Train Perplexity")
ax2.plot(val_pp, color="green", label="Validation Perplexity")
ax2.legend()

plt.show()

In [None]:
  # Final evaluation on the test split, run once after training has finished.
test_loss = test_fn(test_loader, model, loss_fn)
testloss_list.append(test_loss)
print('Test Loss: ', test_loss)
# Perplexity = exp(mean loss); stored as a plain float.
test_perplexity = torch.exp(torch.tensor(test_loss)).item()
print('Test Perplexity:', test_perplexity)
test_pp.append(test_perplexity)

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12,6))

# Left panel: test loss. A marker is required because a single evaluation
# yields a single point, which a marker-less line plot would not draw.
ax1 = ax[0]
ax1.set_title("Test Loss")
ax1.plot(testloss_list, color="blue", marker="o", label="Test Loss")
ax1.legend()

# Right panel: test perplexity
ax2 = ax[1]
ax2.set_title("Test Perplexity")
ax2.plot(test_pp, color="blue", marker="o", label="Test Perplexity")
ax2.legend()

plt.show()