<a href="https://colab.research.google.com/github/sukhmancs/TextWizards/blob/main/smallGPT_pg_book_corpus_encoder_decoder_with_attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#✋ **_Note:_**
- **MAX_LENGTH must be equal to the target_len.**
- **The cells that are testing the code will going to throw `runtime` error while using GPU. (Because the tensors are not on the same device). Stick with CPU to test the code 🌝.**

# Install LangChain

In [94]:
%matplotlib inline

In [95]:
#!pip install chromadb==0.4.10 tiktoken==0.3.3 sqlalchemy==2.0.15
!pip install langchain==0.0.249
#!pip install --force-reinstall pydantic==1.10.6
#!pip install sentence_transformers



# Import modules and Download data

In [96]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain, ConversationalRetrievalChain, ConversationChain
from langchain.memory import ConversationBufferMemory
from langchain.schema import messages_from_dict, messages_to_dict
from langchain.memory.chat_message_histories.in_memory import ChatMessageHistory
from langchain.agents import Tool
from langchain.agents import initialize_agent
from langchain.agents import AgentType

In [97]:
from langchain.document_loaders import GutenbergLoader

loader = GutenbergLoader(
    "https://www.gutenberg.org/cache/epub/100/pg100.txt"
)

document = loader.load()

extrait = ' '.join(document[0].page_content.split()[:100])
display(extrait + " .......")



'The Project Gutenberg eBook of The Complete Works of William Shakespeare This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this ebook or online at www.gutenberg.org. If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook. Title: The Complete Works of William Shakespeare .......'

In [130]:
data = ' '.join(document[0].page_content.split())

In [99]:
import copy
import numpy as np

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, random_split, TensorDataset, RandomSampler
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

# Prepare Data

In [100]:
SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self):
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        # To Do: Add normalizeString() function in here to to normalize the sentence (i.e. data)
        for word in sentence.split(' '):
            self.addWord(word)

    # To Do: Define normalizeString() function in here
    def normalizeString(s):
        # Start code
        pass
        # End code

    # To Do: Define unicodeToAscii(s) function in here
    def unicodeToAscii(s):
        # Start code
        pass
        # End code

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [101]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z!?]+", r" ", s)
    return s.strip()

In [102]:
MAX_LENGTH = 10

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [103]:
lang = Lang()
lang.addSentence(data)
encode = lambda s: [lang.word2index[word] for word in s.split(' ')] # encoder: take a string, output a list of integers
decode = lambda l: ' '.join([lang.index2word[i] for i in l]) # decoder: take a list of integers, output a string

In [104]:
print(encode("what values are available"))
print(decode(encode("what values are available")))

[508, 20726, 52, 71246]
what values are available


In [131]:
# let's now encode the entire text dataset and store it into a torch.Tensor
data = torch.tensor(encode(data), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:300]) # the 300 characters we looked at earier will to the GPT look like this

torch.Size([966501]) torch.int64
tensor([  2,   3,   4,   5,   6,   2,   7,   8,   6,   9,  10,  11,  12,  13,
         14,  15,  16,   6,  17,  18,  19,  15,  20,  21,  22,  23,  24,  25,
          6,  15,  26,  27,  28,  29,  22,  30,  31,  28,  32,  33,  34,  35,
         36,  37,  38,  39,  40,  41,  42,  39,  43,  15,  44,   6,  15,   3,
          4,  45,  46,  30,  47,  12,  41,  48,  27,  49,  50,  51,  52,  53,
         54,  19,  15,  20,  55,  51,  56,  57,  58,  59,  15,  60,   6,  15,
         61,  62,  51,  52,  54,  63,  64,  47,  65,  66,   2,   7,   8,   6,
          9,  10,  67,   9,  10,  68,  69,  70,  71,  72,  73,  74,  75,  76,
         77,  70,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  85,
         89,  90,  84,  91,  92,  82,   2,   7,   8,   6,   9,  10,  93,   9,
         10,  94,  85,  95,  96,  97,  98,  99,  97,  85, 100,  84, 101, 102,
        103, 104, 105, 106, 107,  85, 108,  84, 109,  85, 100,  84, 110, 111,
         85, 100,  84, 112, 113

In [106]:
# Let's now split up the data into train and validation sets
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [107]:
block_size = 8
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([2]) the target: 3
when input is tensor([2, 3]) the target: 4
when input is tensor([2, 3, 4]) the target: 5
when input is tensor([2, 3, 4, 5]) the target: 6
when input is tensor([2, 3, 4, 5, 6]) the target: 2
when input is tensor([2, 3, 4, 5, 6, 2]) the target: 7
when input is tensor([2, 3, 4, 5, 6, 2, 7]) the target: 8
when input is tensor([2, 3, 4, 5, 6, 2, 7, 8]) the target: 6


In [108]:
SOS_token = 0
EOS_token = 1
MAX_LENGTH = 10

In [109]:
lang.n_words * 0.05

3579.8

In [110]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [111]:
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?
n_words = lang.n_words

def get_batch(split, batch_size):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    # How many sentences i want to make out of ~71k words
    # Increase this value to train on a larger data (i.e. more number of sentences)
    ix = torch.randint(len(data) - MAX_LENGTH, (int(n_words * 0.05),))
    x = torch.stack([data[i:i+MAX_LENGTH] for i in ix])
    y = torch.stack([data[i+1:i+MAX_LENGTH+1] for i in ix])
    return x, y

def get_dataloader(batch_size, split='train'):
    print("Parsing data...")
    input_ids, target_ids = get_batch(split, batch_size)
    print(f"input_ids length: {len(input_ids)}, target_ids length: {len(target_ids)}")
    train_data = TensorDataset(torch.LongTensor(input_ids).to(device),
                               torch.LongTensor(target_ids).to(device))

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
    return train_dataloader

xb, yb = get_batch('train', batch_size)
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([3579, 10])
tensor([[ 2093,   244,   272,  ...,    58,   227, 41029],
        [11783,    30, 15195,  ...,  3257,    58,   907],
        [ 5177,    15, 44665,  ...,   288,   283, 12861],
        ...,
        [ 1871, 35235,   117,  ..., 28416,  7386,   764],
        [61548,   371,  1080,  ...,  3316, 18563,    93],
        [  272,  1061,   932,  ...,    58,    15,  2396]])
targets:
torch.Size([3579, 10])
tensor([[  244,   272,   238,  ...,   227, 41029,  1162],
        [   30, 15195, 53804,  ...,    58,   907,   908],
        [   15, 44665, 44359,  ...,   283, 12861, 44640],
        ...,
        [35235,   117, 30761,  ...,  7386,   764,   570],
        [  371,  1080,   297,  ..., 18563,    93,    22],
        [ 1061,   932,  4853,  ...,    15,  2396,     6]])
----
when input is [2093] the target: 244
when input is [2093, 244] the target: 272
when input is [2093, 244, 272] the target: 238
when input is [2093, 244, 272, 238] the target: 1300
when input is [2093, 244, 272

In [112]:
train_dataloader = get_dataloader(batch_size=32, split='train')

for data in train_dataloader:
    inputs, targets = data
    print(inputs)
    break


Parsing data...
input_ids length: 3579, target_ids length: 3579
tensor([[ 5160,   455,   259,  4808, 26845,  4466,   753, 10491,   483,    63],
        [27855,    22, 57200,   115, 36115,   312,   434,   214,  4776,  5812],
        [ 2587,   527,  4291,   238,   326,  1383, 26233, 10022,   288,    15],
        [  326, 14112,    22,  3019, 62922, 45744,   202,  2485,   764,   233],
        [49849,    22,    53,    58,  1305,   227, 49848,  1840,   866, 49850],
        [ 2570, 18064, 17498,  5103,   372,    13, 18065,   753,  3063,    15],
        [ 1901,   195,  7263,     6, 38577,    19, 12251,  1659,  1305,   212],
        [  212,  8786,   334, 16593,    93,   221, 16821,   244,  3069,  3833],
        [21737,   669, 24916,  6520,  4948, 39759, 15980,    51,  1342,  1897],
        [11765,  4549, 23012,   694,   227,  5175,    58,    15,  1273,   362],
        [  764, 11879,   753,   200,  7200,   227, 25015,   506,  5713,   227],
        [15210, 51565,  1340,   516,   481,  7498,   719

# Encoder

In [113]:
class Encoder(nn.Module):
    def __init__(self, n_features, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_features = n_features
        self.hidden = None
        #self.embedding_dimension = 256
        self.embd = nn.Embedding(self.n_features, self.hidden_dim)
        self.basic_rnn = nn.GRU(self.hidden_dim, self.hidden_dim, batch_first=True)

    def forward(self, X):
        X_embd = self.embd(X) # N, F -> N, F, H
        rnn_out, self.hidden = self.basic_rnn(X_embd) # N, F, H x N, H, H  ->  N, F, H

        return rnn_out, self.hidden # N, F, H

## Testing Encoder

In [114]:
### Uncomment to test the code (make sure you are using CPU otherwise this code will going to throw runtime error)

#torch.manual_seed(21)
# full_seq = torch.full((2, 3), 1)
# encoder = Encoder(n_features=3, hidden_dim=5)
# hidden_seq, hidden_final = encoder(full_seq) # output is N, L, F
# hidden_final.size()

#Decoder

In [115]:
class Decoder(nn.Module):
    def __init__(self, output_size, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.hidden = None
        self.embedding = nn.Embedding(output_size, self.hidden_dim)
        self.basic_rnn = nn.GRU(self.hidden_dim, self.hidden_dim, batch_first=True)
        self.regression = nn.Linear(self.hidden_dim, output_size)

    def init_hidden(self, hidden_seq):
        # We only need the final hidden state from encoder for each sentence
        hidden_final = hidden_seq[:, -1:] # N, F
        #self.hidden = hidden_final
        self.hidden = hidden_final.permute(1, 0, 2) # 1, N, H   Because output of encoder is sequence first but GRU expects batch first

    def forward(self, X):
        X = self.embedding(X) # N, F -> N, F, H
        batch_first_output, self.hidden = self.basic_rnn(X, self.hidden) # N, F, H x N, H, H -> N, F, H
        out = self.regression(batch_first_output) # N, F, output_size

        # N, F, output_size
        return out

## Testing Decoder

In [116]:
### Uncomment to test the code (make sure you are using CPU otherwise this code will going to throw runtime error)

# torch.manual_seed(21)
# decoder = Decoder(output_size=5, hidden_dim=5)
# batch_size = 16

# # Initial hidden state will be encoder's final hidden state
# decoder.init_hidden(hidden_seq)
# # Initial data point is the last element of source sequence
# #inputs = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
# inputs = torch.empty(2, 1, dtype=torch.long, device=device).fill_(SOS_token)  # remove me

# decoder_outputs = []
# target_len = 2
# for i in range(target_len):
#     print(f'Hidden: {decoder.hidden}')
#     decoder_output = decoder(inputs)   # Predicts coordinates
#     decoder_outputs.append(decoder_output)
#     _, topi = decoder_output.topk(1)
#     inputs = topi.squeeze(-1).detach()  # detach from history as input
#     print(f'Output: {decoder_output}\n')
# decoder_outputs = torch.cat(decoder_outputs, dim=1)
# print(f'combinet_outputs: {decoder_outputs}')

# Decoder with attention

In [117]:
class Attention(nn.Module):
    def __init__(self, hidden_dim, input_dim=None, proj_values=False):
        super().__init__()
        self.d_k = hidden_dim
        self.input_dim = hidden_dim if input_dim is None else input_dim
        self.proj_values = proj_values
        # Affine transformations for Q, K, and V
        self.linear_query = nn.Linear(self.input_dim, hidden_dim)
        self.linear_key = nn.Linear(self.input_dim, hidden_dim)
        self.linear_value = nn.Linear(self.input_dim, hidden_dim)
        self.alphas = None

    def init_keys(self, keys):
        self.keys = keys
        self.proj_keys = self.linear_key(self.keys) # N, F, H x N, H, H -> N, F, H
        self.values = self.linear_value(self.keys) if self.proj_values else self.keys  # N, F, H x N, H, H -> N, F, H

    def score_function(self, query):
        proj_query = self.linear_query(query) # N, 1, H x N, H, H -> N, 1, H
        # scaled dot product
        # N, 1, H x N, H, F -> N, 1, F
        dot_products = torch.bmm(proj_query, self.proj_keys.permute(0, 2, 1))
        scores =  dot_products / np.sqrt(self.d_k)
        return scores

    def forward(self, query, mask=None):
        # Query is batch-first N, 1, H
        # L or F means sequence length
        scores = self.score_function(query) # N, 1, F
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        alphas = F.softmax(scores, dim=-1) # N, 1, F
        self.alphas = alphas.detach()

        # N, 1, F x N, F, H -> N, 1, H
        context = torch.bmm(alphas, self.values)
        return context

### Decoder with rnn and attention

In [118]:
class DecoderAttn(nn.Module):
    def __init__(self, output_size, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.hidden = None
        self.output_size = output_size
        self.embedding = nn.Embedding(output_size, self.hidden_dim)
        self.basic_rnn = nn.GRU(self.hidden_dim, self.hidden_dim, batch_first=True)
        self.attn = Attention(self.hidden_dim)
        self.regression = nn.Linear(2 * self.hidden_dim, self.output_size)

    def init_hidden(self, hidden_seq):
        # the output of the encoder is N, F, H
        # and init_keys expects batch-first as well
        self.attn.init_keys(hidden_seq)
        hidden_final = hidden_seq[:, -1:]
        self.hidden = hidden_final.permute(1, 0, 2)   # F, N, H

    def forward(self, X, mask=None):
        # X is N, 1
        # N is batch size, H is hidden dimensions
        X = self.embedding(X) # N, 1 -> N, 1, H
        batch_first_output, self.hidden = self.basic_rnn(X, self.hidden) # N, 1, H x N, H, H -> N, 1, H

        query = batch_first_output # N, 1, H
        # Attention
        context = self.attn(query, mask=mask) # N, 1, H
        concatenated = torch.cat([context, query], axis=-1) # N, 1, 2*H
        out = self.regression(concatenated)  # N, 1, 2*H x N, 2*H, 1 -> N, 1, 1

        # N, 1, F
        return out.view(-1, 1, self.output_size)

#### Test Decoder with RNN and Attention

In [119]:
### Uncomment to test the code (make sure you are using CPU otherwise this code will going to throw runtime error)

# torch.manual_seed(21)
# decoder = DecoderAttn(output_size=5, hidden_dim=5)

# # Initial hidden state will be encoder's final hidden state
# decoder.init_hidden(hidden_seq)
# # Initial data point is the last element of source sequence
# #inputs = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
# inputs = torch.empty(2, 1, dtype=torch.long, device=device).fill_(SOS_token)  # remove me

# decoder_outputs = []
# target_len = 2
# for i in range(target_len):
#     print(f'Hidden: {decoder.hidden}')
#     decoder_output = decoder(inputs)   # Predicts coordinates
#     decoder_outputs.append(decoder_output)
#     _, topi = decoder_output.topk(1)
#     inputs = topi.squeeze(-1).detach()  # detach from history as input
#     print(f'Output: {decoder_output}\n')
# decoder_outputs = torch.cat(decoder_outputs, dim=1)
# print(f'combinet_outputs: {decoder_outputs}')

# Encoder-Decoder Architecture

In [120]:
class EncoderDecoder(nn.Module):
    def __init__(self, encoder, decoder, target_len, teacher_forcing_prob=0.5):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.target_len = target_len
        self.teacher_forcing_prob = teacher_forcing_prob
        self.outputs = None

    def init_outputs(self, batch_size):
        device = next(self.parameters()).device
        # N, L (target), F
        self.outputs = torch.zeros(batch_size,
                              self.target_len,
                              self.encoder.n_features).to(device)

    def store_output(self, i, out):
        # Stores the output
        self.outputs[:, i:i+1, :] = out

    def forward(self, X, target_tensor=None):
        # X is batch of sentences -> N, F
        # splits the data in source and target sequences
        # the target seq will be empty in testing mode
        # N, L, F

        # Encoder expected N, F
        hidden_seq, hidden_final = self.encoder(X)
        # Output is N, F, hidden_dim
        self.decoder.init_hidden(hidden_seq)

        # The last input of the encoder is also
        # the first input of the decoder
        #dec_inputs = source_seq[:, -1:, :]
        batch_size = hidden_seq.size(0)

        dec_inputs = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
        decoder_outputs = []
        # Generates as many outputs as the target length
        for i in range(self.target_len):
            # Output of decoder is N, 1, F
            decoder_output = self.decoder(dec_inputs)
            decoder_outputs.append(decoder_output)

            prob = self.teacher_forcing_prob

            # In evaluation/test the target sequence is
            # unknown, so we cannot use teacher forcing
            #if not self.training:
            #   prob = 0

            if torch.rand(1) <= prob and target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input
            #_, topi = decoder_output.topk(1)
            #dec_inputs = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs

In [121]:
### Uncomment to test the code (make sure you are using CPU otherwise this code will going to throw runtime error)

# hidden_size = 128
# batch_size = 32
# encoder = Encoder(lang.n_words, hidden_size).to(device)
# decoder = Decoder(lang.n_words, hidden_size).to(device)
# encdec = EncoderDecoder(encoder, decoder, target_len=5)

# outputs = encdec(full_seq)
# _, topi = outputs.topk(1)
# topi.squeeze(-1).detach()

# Train

In [122]:
def train_epoch(dataloader, model, optimizer, criterion):

    total_loss = 0
    for data in dataloader:
        input_tensor, target_tensor = data

        optimizer.zero_grad()
        decoder_outputs = model(input_tensor, target_tensor)

        loss = F.cross_entropy(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
        )
        loss.backward()

        # Step 4 - Updates parameters using gradients and the learning rate
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [123]:
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [124]:
def train(train_dataloader, model, n_epochs, learning_rate=0.001,
               print_every=100, plot_every=100):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()
    model.train()

    for epoch in range(1, n_epochs + 1):
        loss = train_epoch(train_dataloader, model, optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_epochs),
                                        epoch, epoch / n_epochs * 100, print_loss_avg))

        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [125]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [55]:
hidden_size = 128
batch_size = 32

train_dataloader = get_dataloader(batch_size)

# Encoder-Decoder with RNN
#encoder = Encoder(lang.n_words, hidden_size).to(device)
#decoder = Decoder(lang.n_words, hidden_size).to(device)
#model = EncoderDecoder(encoder, decoder, target_len=10)

# Encoder-Decoder with Attention and RNN (The best model so far)
encoder = Encoder(lang.n_words, hidden_size).to(device)
decoder = DecoderAttn(lang.n_words, hidden_size).to(device)
model = EncoderDecoder(encoder, decoder, target_len=10)

# Encoder-Decoder Self-Attention
#encoder = EncoderSelfAttn(n_features=input_lang.n_words, d_model=hidden_size, n_heads=4, ff_units=4).to(device)
#decoder = DecoderSelfAttn(n_features=output_lang.n_words, d_model=hidden_size, n_heads=4, ff_units=4).to(device)
#model = EncoderDecoderSelfAttention(encoder, decoder, target_len=10)

train(train_dataloader, model, 80, print_every=5, plot_every=5)

Parsing data...
input_ids length: 3579, target_ids length: 3579
0m 30s (- 7m 34s) (5 6%) 7.0192
1m 0s (- 7m 4s) (10 12%) 4.6284
1m 30s (- 6m 34s) (15 18%) 2.9824
2m 1s (- 6m 4s) (20 25%) 2.1110
2m 33s (- 5m 38s) (25 31%) 1.5964
3m 4s (- 5m 6s) (30 37%) 1.2386
3m 34s (- 4m 35s) (35 43%) 0.9799
4m 4s (- 4m 4s) (40 50%) 0.7878
4m 35s (- 3m 33s) (45 56%) 0.6306
5m 5s (- 3m 3s) (50 62%) 0.5243
5m 35s (- 2m 32s) (55 68%) 0.4384
6m 6s (- 2m 2s) (60 75%) 0.3357
6m 37s (- 1m 31s) (65 81%) 0.3117
7m 8s (- 1m 1s) (70 87%) 0.2562
7m 40s (- 0m 30s) (75 93%) 0.1876
8m 11s (- 0m 0s) (80 100%) 0.1884


In [56]:
def evaluate(model, input_tensor, lang):
    with torch.no_grad():
        #input_tensor = encode(sentence)

        decoder_outputs = model(input_tensor)
        #print(decoder_outputs)

        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()

        decoded_words = []
        for idx in decoded_ids:
            if idx.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(lang.index2word[idx.item()])
    return decoded_words

In [57]:
model.eval() # Set to evaluation mode

EncoderDecoder(
  (encoder): Encoder(
    (embd): Embedding(71596, 128)
    (basic_rnn): GRU(128, 128, batch_first=True)
  )
  (decoder): DecoderAttn(
    (embedding): Embedding(71596, 128)
    (basic_rnn): GRU(128, 128, batch_first=True)
    (attn): Attention(
      (linear_query): Linear(in_features=128, out_features=128, bias=True)
      (linear_key): Linear(in_features=128, out_features=128, bias=True)
      (linear_value): Linear(in_features=128, out_features=128, bias=True)
    )
    (regression): Linear(in_features=256, out_features=71596, bias=True)
  )
)

In [58]:
count = 10 # number of sentences we want to predict
number_of_predictions = 0

for input_tensor, target_tensor in train_dataloader:
    # Randomly choose an index
    index = random.choice(range(len(input_tensor)))

    # Choose both the input and target tensors at the selected index
    input_tensor = input_tensor[index].view(1, -1)
    target_tensor = target_tensor[index].view(1, -1)

    #print("Input Tensor Size:", input_tensor.size())
    #print("Target Tensor Size:", target_tensor.size())

    # Prepare components for printing
    input_sentence = decode(input_tensor.view(-1).tolist())
    predicted_sentence = ' '.join(evaluate(model, input_tensor, lang))
    correct_target = decode(target_tensor.view(-1).tolist())

    #print(f"{decode(input_tensor.view(-1).tolist())} -> {' '.join(evaluate(model, input_tensor, lang))}, CORRECT TARGET: {decode(target_tensor.view(-1).tolist())}")
    #print(f"Input Sentence: {input_sentence:80}, Predicted Output: {predicted_sentence:20}, Correct Target: {correct_target:20}")

    print(f"Input Sentence:     {input_sentence}")
    print(f"Predicted Sentence: {predicted_sentence}")
    print(f"Correct Target:     {correct_target}")
    print()

    number_of_predictions += 1
    if number_of_predictions > count:
        break

Input Sentence:     Tarsus. A room in Cleon’s house. Enter Pericles, Cleon, Dionyza
Predicted Sentence: A room in Cleon’s house. Enter Pericles, Cleon, Dionyza and
Correct Target:     A room in Cleon’s house. Enter Pericles, Cleon, Dionyza and

Input Sentence:     With reservation of an hundred knights, By you to be
Predicted Sentence: reservation of an hundred knights, By you to be sustain’d,
Correct Target:     reservation of an hundred knights, By you to be sustain’d,

Input Sentence:     my arms before the legs of this sweet lass of
Predicted Sentence: arms before the legs this of sweet lass of France._
Correct Target:     arms before the legs of this sweet lass of France._

Input Sentence:     you should safe my going, Is Fulvia’s death. CLEOPATRA. Though
Predicted Sentence: should safe my going, Is Fulvia’s death. CLEOPATRA. Though age
Correct Target:     should safe my going, Is Fulvia’s death. CLEOPATRA. Though age

Input Sentence:     hath put on nature’s power, Fairing the fo

In [61]:
#@title Enter custom text
input_text = input("Enter a sentence (default: 'For, though before his face I speak the words, Your'): ") or "For, though before his face I speak the words, Your"
input_tensor = encode(input_text) # "though before his face I speak the words, Your"
input_tensor = torch.tensor(input_tensor, dtype=torch.long).view(1, -1).to(device)
' '.join(evaluate(model, input_tensor, lang))

Enter a sentence (default: 'For, though before his face I speak the words, Your'): 


'his his his I I the the Your Your Your'

# Save and Load the Trained Model

In [62]:
PATH = '/content/smallGPT.pth'

In [63]:
torch.save(model.state_dict(), PATH)

In [None]:
model.load_state_dict(torch.load(PATH))
model.eval()