<a href="https://colab.research.google.com/github/saurav3k2/Intro-to-PyTorch/blob/main/PyTorch_RNN_based_Q_A_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
df = pd.read_csv("/content/100_Unique_QA_Dataset.csv")
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [3]:
## tokenize
def tokenize(text):
  text = text.lower()
  text = text.replace('?' , '')
  text = text.replace('!' , '')
  text = text.replace('.' , '')
  return text.split()

In [4]:
tokenize("What is the Capital of France?")

['what', 'is', 'the', 'capital', 'of', 'france']

In [5]:
# vocab
vocab = {'<UNKK>':0}

In [12]:
def build_vocab(row):
  tokenized_question = tokenize(row['question'])
  tokenized_answer = tokenize(row['answer'])

  merged_tokens = tokenized_question + tokenized_answer

  for token in merged_tokens:

    if token not in vocab:
      vocab[token] = len(vocab)


In [13]:
df.apply(build_vocab, axis = 1)

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [14]:
len(vocab)

326

In [15]:
# Convert words to numerical indices

def text_to_indices(text , vocab):
  indexed_text = []
  for token in tokenize(text):
    if token in  vocab:
      indexed_text.append(vocab[token])

    else:
      indexed_text.append(vocab['<UNKK>'])
  return indexed_text

In [16]:
text_to_indices("What is compusx" , vocab)

[1, 2, 0]

In [17]:
import torch
from torch.utils.data import Dataset , DataLoader

In [18]:
class QADataset(Dataset):

  def __init__(self, df , vocab):
    self.df = df
    self.vocab = vocab

  def __len__(self):
    return self.df.shape[0]

  def __getitem__(self , index):
    numerical_question = text_to_indices(self.df.iloc[index]['question'], self.vocab)
    numerical_answer = text_to_indices(self.df.iloc[index]['answer'] , self.vocab)
    return torch.tensor(numerical_question) , torch.tensor(numerical_answer)



In [19]:
dataset = QADataset(df,vocab)

In [20]:
dataloader = DataLoader(dataset, batch_size=1 , shuffle = True)

In [21]:
for question , answer in dataloader:
  print(question , answer[0])


tensor([[1, 2, 3, 4, 5, 8]]) tensor([9])
tensor([[ 42,  86,  87, 243, 244,  19,  39, 245]]) tensor([246])
tensor([[ 1,  2,  3, 50, 51, 19,  3, 45]]) tensor([52])
tensor([[10, 75, 76]]) tensor([77])
tensor([[  1,   2,   3,  33,  34,   5, 247]]) tensor([248])
tensor([[10, 96,  3, 97]]) tensor([98])
tensor([[10,  2,  3, 66,  5, 67]]) tensor([68])
tensor([[  1,   2,   3, 141, 117,  83,   3, 279, 280]]) tensor([121])
tensor([[ 42, 101,   2,   3,  17]]) tensor([102])
tensor([[  1,   2,   3,  37,  38,  39, 162]]) tensor([163])
tensor([[ 10,  11, 190, 159, 191]]) tensor([192])
tensor([[ 42, 265, 266,  14, 267, 268, 159, 269]]) tensor([270])
tensor([[ 10,  75, 209]]) tensor([210])
tensor([[42, 18,  2, 62, 63,  3, 64, 18]]) tensor([65])
tensor([[ 42, 137,   2,  62,  39,   3, 324, 325]]) tensor([6])
tensor([[  1,   2,   3,   4,   5, 135]]) tensor([136])
tensor([[  1,   2,   3, 122, 123,  19,   3,  45]]) tensor([124])
tensor([[ 42, 257,   2, 258,  83, 259, 260]]) tensor([261])
tensor([[  1,   2,  

In [22]:
import torch.nn as nn

In [24]:
class SimpleRNN(nn.Module):
  def __init__(self , vocab_size):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size , embedding_dim=50)
    self.rnn = nn.RNN(50,64, batch_first= True)
    self.fc = nn.Linear(64, vocab_size)

  def forward(self, question):
    embedded_question = self.embedding(question)
    hidden, final = self.rnn(embedded_question)
    output = self.fc(final.squeeze(0))

    return output

In [25]:
x  = nn.Embedding(324 , embedding_dim = 50)
y = nn.RNN(50,64 , batch_first= True)
z = nn.Linear(64 , 324)


a = dataset[0][0].reshape(1,6)
print("shape of a ", a.shape)
b = x(a)
print("shape of b ", b.shape)
c , d = y(b)
print("shape of c ", c.shape)
print("shape of d ", d.shape)

e  = z(d.squeeze(0))
print("shape of e ", e.shape)



shape of a  torch.Size([1, 6])
shape of b  torch.Size([1, 6, 50])
shape of c  torch.Size([1, 6, 64])
shape of d  torch.Size([1, 1, 64])
shape of e  torch.Size([1, 324])


In [26]:
learning_rate = 0.001
epochs = 50


In [28]:
model = SimpleRNN(len(vocab))

In [29]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters() , lr=learning_rate)

In [31]:
## training loops
for epoch in range(epochs):
  total_loss   = 0
  for question , answer in dataloader:

    optimizer.zero_grad()

    ## foorward pass
    output  = model(question)

    ##loss --> output sjape (1,324 - (1))
    loss  = criterion(output , answer[0])

    ## gradient
    loss.backward()

    ##update
    optimizer.step()

    total_loss = total_loss + loss.item()

  print(f"Epoch : {epoch+1} , loss {total_loss:4f}")

Epoch : 1 , loss 0.917262
Epoch : 2 , loss 0.866162
Epoch : 3 , loss 0.819345
Epoch : 4 , loss 0.774879
Epoch : 5 , loss 0.733690
Epoch : 6 , loss 0.694686
Epoch : 7 , loss 0.659396
Epoch : 8 , loss 0.623727
Epoch : 9 , loss 0.591853
Epoch : 10 , loss 0.560862
Epoch : 11 , loss 0.532275
Epoch : 12 , loss 0.505502
Epoch : 13 , loss 0.479447
Epoch : 14 , loss 0.455474
Epoch : 15 , loss 0.432870
Epoch : 16 , loss 0.411451
Epoch : 17 , loss 0.391105
Epoch : 18 , loss 0.372174
Epoch : 19 , loss 0.353938
Epoch : 20 , loss 0.336522
Epoch : 21 , loss 0.320177
Epoch : 22 , loss 0.304869
Epoch : 23 , loss 0.290119
Epoch : 24 , loss 0.276410
Epoch : 25 , loss 0.262907
Epoch : 26 , loss 0.250627
Epoch : 27 , loss 0.238621
Epoch : 28 , loss 0.227290
Epoch : 29 , loss 0.216777
Epoch : 30 , loss 0.206612
Epoch : 31 , loss 0.196965
Epoch : 32 , loss 0.188059
Epoch : 33 , loss 0.178822
Epoch : 34 , loss 0.170570
Epoch : 35 , loss 0.162945
Epoch : 36 , loss 0.155499
Epoch : 37 , loss 0.147978
Epoch : 38

In [34]:
def predict(model, question, threshold=0.5):

  # convert question to numbers
  numerical_question = text_to_indices(question, vocab)

  # tensor
  question_tensor = torch.tensor(numerical_question).unsqueeze(0)

  # send to model
  output = model(question_tensor)

  # convert logits to probs
  probs = torch.nn.functional.softmax(output, dim=1)

  # find index of max prob
  value, index = torch.max(probs, dim=1)

  if value < threshold:
    print("I don't know")

  print(list(vocab.keys())[index])

In [35]:
predict(model, "What is the largest planet in our solar system?")

jupiter


In [37]:
predict(model, "Who wrote 'To Kill a Mockingbird'?")

harper-lee


In [38]:
predict(model, "What is the boiling point of water in Celsius?")

100


In [39]:
predict(model , "What is the capital of Germany?")

berlin
