In [None]:
import pandas as pd
# Load the question/answer pairs into a DataFrame.
# NOTE(review): hard-coded absolute path; presumably a Colab upload location — confirm.
df = pd.read_csv('/content/100_Unique_QA_Dataset.csv')
# Preview the first rows (the rendered output shows `question` and `answer` columns).
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [None]:
# tokenize
def tokenize(text):
  """Lowercase `text`, delete question marks and apostrophes, and split on whitespace.

  Returns the list of resulting tokens (empty for an empty or all-whitespace string).
  """
  lowered = text.lower()
  # Only these two characters are removed; any other punctuation stays in the token.
  for unwanted in ('?', "'"):
    lowered = lowered.replace(unwanted, '')
  return lowered.split()

In [None]:
# Sanity check: the question is lowercased and loses its '?' before being split.
tokenize('What is the capital of France?')

['what', 'is', 'the', 'capital', 'of', 'france']

In [None]:
# Token -> integer id mapping; id 0 is reserved for the unknown-token marker.
vocab = {'<UNK>':0}

In [None]:
def build_vocab(row):
  """Register every unseen token from a row's question and answer in `vocab`.

  Each new token receives the next free id, i.e. the vocabulary size at the
  moment it is added. The module-level `vocab` dict is updated in place and
  nothing is returned.
  """
  # Tokenize both fields up front, question first, before touching the vocabulary.
  row_tokens = [*tokenize(row['question']), *tokenize(row['answer'])]

  for word in row_tokens:
    if word in vocab:
      continue  # already has an id
    vocab[word] = len(vocab)


In [None]:
# Run build_vocab on every row purely for its side effect on `vocab`;
# build_vocab has no return value, so the displayed result holds nothing useful.
df.apply(build_vocab, axis=1)

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [None]:
# Display the DataFrame (the rendered output elides the middle rows).
df

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100
...,...,...
85,Who directed the movie 'Titanic'?,JamesCameron
86,Which superhero is also known as the Dark Knight?,Batman
87,What is the capital of Brazil?,Brasilia
88,Which fruit is known as the king of fruits?,Mango


In [None]:
# Number of vocabulary entries, counting the '<UNK>' placeholder.
len(vocab)

324

In [None]:
# convert words to numerical indices
def text_to_indices(text, vocab):
  """Translate `text` into a list of vocabulary ids.

  The text is split with tokenize(); each token found in `vocab` contributes
  its id, and every other token contributes the id stored under '<UNK>'.
  """
  return [
    # '<UNK>' is only looked up when a token is actually missing
    vocab[token] if token in vocab else vocab['<UNK>']
    for token in tokenize(text)
  ]

In [None]:
# 'cdac' is not in the vocabulary, so it maps to the '<UNK>' id (0).
text_to_indices("What is cdac", vocab)

[1, 2, 0]

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
class QADataset(Dataset):
  """Dataset that serves (question ids, answer ids) tensor pairs from a DataFrame.

  `df` must expose 'question' and 'answer' columns; `vocab` is the token -> id
  mapping handed to text_to_indices for both columns.
  """

  def __init__(self, df, vocab):
    self.df, self.vocab = df, vocab

  def __len__(self):
    # One sample per DataFrame row.
    return len(self.df)

  def __getitem__(self, index):
    row = self.df.iloc[index]
    question_ids, answer_ids = (
      text_to_indices(row[column], self.vocab) for column in ('question', 'answer')
    )
    return torch.tensor(question_ids), torch.tensor(answer_ids)

In [None]:
# Wrap the DataFrame and vocabulary so samples can be fetched by integer index.
dataset = QADataset(df, vocab)

In [None]:
# First sample: a (question ids, answer ids) pair of tensors.
dataset[0]

(tensor([1, 2, 3, 4, 5, 6]), tensor([7]))

In [None]:
# One question per step, in a fresh random order on every pass (shuffle=True).
# NOTE(review): the printed batches show questions of different lengths, so a larger
# batch_size would presumably need padding or a custom collate_fn — confirm.
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

In [None]:
# Print every question batch next to the answer tensor of its only sample.
for question, answer in dataloader:
  print(question, answer[0])

tensor([[ 10,  96,   3, 104, 239]]) tensor([240])
tensor([[ 10, 140,   3, 141, 270,  93, 271,   5,   3, 272]]) tensor([273])
tensor([[10, 29,  3, 30, 31]]) tensor([32])
tensor([[ 10,  11, 157, 158, 159]]) tensor([160])
tensor([[10, 75, 76]]) tensor([77])
tensor([[ 42, 137,   2, 226,  12,   3, 227, 228]]) tensor([155])
tensor([[ 42,  18, 118,   3, 186, 187]]) tensor([188])
tensor([[42, 18,  2, 62, 63,  3, 64, 18]]) tensor([65])
tensor([[ 42, 107,   2, 108,  19, 109]]) tensor([110])
tensor([[ 10,   2,  62,  63,   3, 283,   5, 284]]) tensor([285])
tensor([[  1,   2,   3,   4,   5, 109]]) tensor([317])
tensor([[ 1,  2,  3, 33, 34,  5, 35]]) tensor([36])
tensor([[ 1,  2,  3, 92, 93, 94]]) tensor([95])
tensor([[ 42, 299, 300, 118,  14, 301, 302, 158, 303, 304, 305, 306]]) tensor([307])
tensor([[  1,   2,   3, 146, 147,  19, 148]]) tensor([149])
tensor([[  1,   2,   3,  17, 115,  83,  84]]) tensor([116])
tensor([[ 42, 312,   2, 313,  62,  63,   3, 314, 315]]) tensor([316])
tensor([[1, 2, 3, 4

In [None]:
import torch.nn as nn

In [None]:
class SimpleRNN(nn.Module):
  """Embedding -> single-layer RNN -> linear classifier over the vocabulary.

  Args:
    vocab_size: number of embedding rows and number of output logits.
    embedding_dim: size of each token embedding (defaults to the original 50).
    hidden_size: size of the RNN hidden state (defaults to the original 64).
  """

  def __init__(self, vocab_size, embedding_dim=50, hidden_size=64):
    super().__init__()
    # Layers are created in the same order as before so seeded initialisation is unchanged.
    self.embedding = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
    self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
    self.fc = nn.Linear(hidden_size, vocab_size)

  def forward(self, question):
    """Return vocabulary logits for a batch of token-id sequences.

    Args:
      question: integer tensor of token ids, shaped (batch, seq_len).

    Returns:
      Tensor of shape (batch, vocab_size) computed from the final hidden state.
    """
    embedded_question = self.embedding(question)
    # nn.RNN returns (per-step outputs, final hidden state); only the latter is used.
    _, final = self.rnn(embedded_question)
    # final is (num_layers=1, batch, hidden_size); drop the leading layer axis.
    output = self.fc(final.squeeze(0))

    return output

In [None]:
 dataset[0][0]  # token ids of the first question in the dataset

tensor([1, 2, 3, 4, 5, 6])

In [None]:
# Stand-alone embedding layer for shape experiments; sized from the vocabulary
# rather than a hard-coded 324 so it stays valid if the dataset changes.
x = nn.Embedding(len(vocab), embedding_dim=50)

In [None]:
# One 50-dimensional embedding vector per token of the first question.
x(dataset[0][0]).shape

torch.Size([6, 50])

In [None]:
# Same lookup, displaying the embedding values themselves.
x(dataset[0][0])

tensor([[-1.3367e+00, -5.9956e-01, -1.0386e+00,  5.5767e-01, -1.0379e-01,
         -9.1471e-01, -2.8720e-01, -1.5431e-01,  1.5720e-01, -5.9365e-02,
          8.9778e-02,  1.1453e-01, -4.5630e-02,  1.0110e+00,  2.0567e-01,
          4.2692e-02,  6.0190e-01,  7.8239e-01,  4.7163e-01,  5.9728e-01,
         -3.7741e-02, -6.0997e-01,  6.0706e-01,  2.3860e-01, -1.3432e+00,
         -1.6339e+00,  8.4532e-01, -2.6685e-01, -1.1701e+00, -1.6580e+00,
         -8.8145e-01,  1.1625e+00,  5.0063e-01, -1.1332e-01,  1.3549e+00,
         -1.9262e+00, -1.0720e+00, -6.6041e-01,  4.2870e-01,  2.6545e+00,
         -2.4052e-01, -1.2221e+00,  1.9607e-01, -1.5283e-01, -4.2967e-01,
         -1.9272e+00, -1.6405e+00, -1.5368e-01, -1.4094e+00, -1.2217e+00],
        [ 2.2553e+00, -7.6744e-01, -3.3912e-01, -1.6554e+00,  1.5016e-01,
          1.3992e+00,  1.2099e+00,  7.6269e-01,  2.9692e-01,  2.3935e+00,
         -2.1426e+00, -3.1006e-01,  5.7872e-01, -3.2635e-01,  7.2219e-01,
         -1.2808e-01, -1.5710e+00,  9

In [None]:
# Stand-alone RNN layer (sequence-first, i.e. no batch_first) for shape experiments.
y = nn.RNN(50, 64)
# The following cells feed `a` to this layer, but no earlier cell assigns it, so a
# fresh Restart & Run All would stop with a NameError. Their recorded shapes
# ([6, 64] and [1, 64]) correspond to the unbatched embedding of the first question.
a = x(dataset[0][0])

In [None]:
# nn.RNN returns (output, h_n); this is the shape of `output`, the per-step hidden states.
y(a)[0].shape

torch.Size([6, 64])

In [None]:
# Shape of `h_n`, the final hidden state returned as the second element.
y(a)[1].shape

torch.Size([1, 64])

In [None]:
# Walk one question through the same layer stack SimpleRNN uses, printing each shape.
# Layer sizes come from the vocabulary instead of a hard-coded 324.
x = nn.Embedding(len(vocab), embedding_dim=50)
y = nn.RNN(50, 64, batch_first=True)
z = nn.Linear(64, len(vocab))

# Add a leading batch axis without hard-coding the question length (was reshape(1,6)).
a = dataset[0][0].unsqueeze(0)
print("shape of a:", a.shape)
b = x(a)
print("shape of b:", b.shape)
# c: hidden state at every step, d: final hidden state
c, d = y(b)
print("shape of c:", c.shape)
print("shape of d:", d.shape)

# Drop the leading layer axis of d before the linear projection to vocabulary logits.
e = z(d.squeeze(0))

print("shape of e:", e.shape)

shape of a: torch.Size([1, 6])
shape of b: torch.Size([1, 6, 50])
shape of c: torch.Size([1, 6, 64])
shape of d: torch.Size([1, 1, 64])
shape of e: torch.Size([1, 324])


In [None]:
# Hyperparameters: learning_rate is passed to the Adam optimizer,
# epochs is the number of full passes made by the training loop.
learning_rate = 0.001
epochs = 20

In [None]:
# The vocabulary size fixes both the embedding table rows and the number of output logits.
model = SimpleRNN(len(vocab))

In [None]:
# Cross-entropy over the vocabulary-sized logits; Adam updates every model parameter.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# training loop

for epoch in range(epochs):

  # Sum of the per-sample losses seen during this epoch (not an average).
  total_loss = 0.0

  for question, answer in dataloader:

    # Clear the gradients left over from the previous step.
    optimizer.zero_grad()

    # forward pass -> logits of shape (1, vocab_size)
    output = model(question)

    # The batch of one answer arrives with a leading batch axis; answer[0] is the
    # 1-D target tensor (one class id for single-token answers) matching the logits.
    loss = criterion(output, answer[0])

    # gradients
    loss.backward()

    # update
    optimizer.step()

    total_loss += loss.item()

  # `.4f` sets the precision to four decimals; the earlier `4f` only set a minimum
  # field width of 4 and therefore printed the default six decimals.
  print(f"Epoch: {epoch+1}, Loss: {total_loss:.4f}")

Epoch: 1, Loss: 525.369936
Epoch: 2, Loss: 458.863825
Epoch: 3, Loss: 378.748274
Epoch: 4, Loss: 317.538428
Epoch: 5, Loss: 265.916567
Epoch: 6, Loss: 218.239040
Epoch: 7, Loss: 173.364845
Epoch: 8, Loss: 134.957571
Epoch: 9, Loss: 102.615172
Epoch: 10, Loss: 77.455173
Epoch: 11, Loss: 59.663612
Epoch: 12, Loss: 46.428435
Epoch: 13, Loss: 36.644876
Epoch: 14, Loss: 29.502051
Epoch: 15, Loss: 24.216189
Epoch: 16, Loss: 19.951079
Epoch: 17, Loss: 16.763408
Epoch: 18, Loss: 14.260459
Epoch: 19, Loss: 12.294842
Epoch: 20, Loss: 10.627344


In [None]:
def predict(model, question, threshold=0.5):
  """Print the model's answer to `question`, or "I don't know" when it is unsure.

  Args:
    model: callable mapping a (1, seq_len) tensor of token ids to (1, vocab_size) logits.
    question: raw question text; converted to ids with text_to_indices and the
      module-level `vocab`.
    threshold: minimum softmax probability the top-scoring token must reach
      for it to be reported.

  Returns:
    None. The outcome is printed.
  """
  # convert question to numbers
  numerical_question = text_to_indices(question, vocab)

  # A question with no tokens cannot be fed to the model; treat it as unanswerable.
  if not numerical_question:
    print("I don't know")
    return

  # add the batch axis the model expects
  question_tensor = torch.tensor(numerical_question).unsqueeze(0)

  # Inference only: skip autograd bookkeeping.
  with torch.no_grad():
    output = model(question_tensor)

  # convert logits to probs
  probs = torch.nn.functional.softmax(output, dim=1)

  # highest probability and the id that achieved it
  value, index = torch.max(probs, dim=1)

  # Stop here when confidence is too low; without this return the low-confidence
  # guess was printed right after the "I don't know" message.
  if value.item() < threshold:
    print("I don't know")
    return

  # ids were handed out in insertion order, so the key position equals the id
  print(list(vocab.keys())[index.item()])

In [None]:
# Ask a question; predict prints its answer instead of returning it.
predict(model, "What is the largest planet in our solar system?")

jupiter


In [None]:
# Reverse lookup: ids were assigned in insertion order, so key position 7 is the token with id 7.
list(vocab.keys())[7]

'paris'