<a href="https://colab.research.google.com/github/sudip234-source/PyTorch-Tutorial/blob/main/Quention_Answering_using_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [166]:
import pandas as pd

df = pd.read_csv('/content/100_Unique_QA_Dataset.csv')
df.head(), df.shape

(                                          question      answer
 0                   What is the capital of France?       Paris
 1                  What is the capital of Germany?      Berlin
 2               Who wrote 'To Kill a Mockingbird'?  Harper-Lee
 3  What is the largest planet in our solar system?     Jupiter
 4   What is the boiling point of water in Celsius?         100,
 (90, 2))

In [167]:
# tokenize
def tokenize(text):
  text = text.lower()
  text = text.replace('?','')
  text = text.replace("'","")
  return text.split()

In [168]:
vocab = {'<UNK>':0}

In [169]:
def build_vocab(row):
  tokenized_question = tokenize(row['question'])
  tokenized_answer = tokenize(row['answer'])

  merged_tokens = tokenized_question + tokenized_answer

  for token in merged_tokens:

    if token not in vocab:
      vocab[token] = len(vocab)


In [170]:
df.apply(build_vocab,axis=1)

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [171]:
vocab

{'<UNK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 'to': 12,
 'kill': 13,
 'a': 14,
 'mockingbird': 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 '1984': 67,
 'george-orwell': 68,
 'currency': 69,
 'unit

In [172]:
def text_to_indeces(text,vocab):
  indexed_text = []
  for token in tokenize(text):
    if token in vocab:
      indexed_text.append(vocab[token])
    else:
      indexed_text.append(vocab['<UNK>'])
  return indexed_text

In [173]:
text_to_indeces("what is campusx",vocab)

[1, 2, 0]

In [174]:
import torch
from torch.utils.data import Dataset, DataLoader

In [175]:
class QADataset(Dataset):
  def __init__(self,df,vocab):
    self.df = df
    self.vocab = vocab

  def __len__(self):
    return len(self.df)

  def __getitem__(self,idx):
    numeric_qn = text_to_indeces(self.df.iloc[idx]['question'],self.vocab)
    numeric_ans = text_to_indeces(self.df.iloc[idx]['answer'],self.vocab)
    return torch.tensor(numeric_qn), torch.tensor(numeric_ans)

In [176]:
dataset = QADataset(df,vocab)

In [177]:
dataset[10]

(tensor([ 1,  2,  3,  4,  5, 53]), tensor([54]))

In [178]:
dataloader = DataLoader(dataset,batch_size=1,shuffle=True)

In [179]:
for qn, ans in dataloader:
  print(qn,ans)
  break

tensor([[ 42, 200,   2,  14, 201, 202, 203, 204]]) tensor([[205]])


In [180]:
len(vocab)

324

In [181]:
import torch.nn as nn
import torch.optim as optim

In [182]:
class myRNN(nn.Module):
  def __init__(self,vocab_size):
    super(myRNN,self).__init__()
    self.embedding = nn.Embedding(vocab_size,embedding_dim=50)
    self.rnn = nn.RNN(50,64,batch_first=True)
    self.fc = nn.Linear(64,vocab_size)

  def forward(self,question):
    embedded_qn = self.embedding(question)
    hidden,final_out = self.rnn(embedded_qn)
    output = self.fc(final_out)
    output = output.squeeze(0)
    return output

In [183]:
dataset[9][0]

tensor([ 1,  2,  3, 50, 51, 19,  3, 45])

In [184]:
x = nn.Embedding(326,50)
x(dataset[9][0]).shape

torch.Size([8, 50])

In [185]:
a = x(dataset[9][0])
a

tensor([[-4.2636e-01, -2.5281e-01, -1.2102e+00,  8.7221e-01, -1.5227e+00,
         -1.2963e-01, -8.4204e-01, -7.7837e-02, -6.8812e-01, -2.7064e-02,
         -1.6626e-01, -8.3027e-02, -1.6697e-01,  1.4456e-01, -3.1724e-01,
         -1.2725e+00,  1.9381e+00,  2.5518e-01,  8.4667e-01, -1.5931e+00,
         -5.6445e-02, -2.2874e+00,  5.6804e-02,  7.0182e-01,  3.6714e-01,
         -1.3882e-02,  5.0319e-03, -3.4336e-01,  5.8829e-01, -1.8760e-01,
          3.5859e-02,  1.3459e-01,  9.1129e-01, -3.7251e-01, -1.2534e+00,
         -2.8209e-01,  4.8890e-01,  7.4557e-01,  7.6829e-01, -3.0932e-01,
         -2.0636e+00, -3.3635e-01,  1.1414e+00, -1.3551e+00,  6.2582e-01,
          1.8467e+00,  4.0536e-01, -1.6026e+00, -8.7996e-01,  5.1704e-01],
        [-5.1227e-01, -1.6821e-01,  6.7456e-01,  8.3289e-02,  9.9972e-02,
         -3.7696e-01,  1.3546e+00, -5.2545e-01, -9.3383e-01, -6.1769e-01,
         -1.2006e+00,  2.3678e-01,  4.9380e-01,  1.3077e+00,  1.6348e-01,
         -3.1635e-01,  1.2430e+00,  2

In [186]:
y = nn.RNN(50,64)
y(a)[0].shape #hidden state

torch.Size([8, 64])

In [187]:
b = y(a)[1] # final output of RNN
b.shape

torch.Size([1, 64])

In [188]:
z = nn.Linear(64,326)
z(b).shape

torch.Size([1, 326])

In [189]:
learning_rate = 0.001
epochs = 20

In [190]:
model = myRNN(len(vocab))

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr=learning_rate)

In [191]:
for epoch in range(epochs):
  total_loss = 0
  for qn, ans in dataloader:
    pred = model(qn)

    loss = criterion(pred,ans[0])

    optimizer.zero_grad
    loss.backward()

    optimizer.step()

    total_loss += loss.item()

  print(f"Epoch: {epoch+1}, Loss: {total_loss/len(dataloader)}")

Epoch: 1, Loss: 5.900683466593424
Epoch: 2, Loss: 2.8149271097448136
Epoch: 3, Loss: 0.6558932097680453
Epoch: 4, Loss: 0.49696841312850465
Epoch: 5, Loss: 0.4139556535026511
Epoch: 6, Loss: 0.18219266300834533
Epoch: 7, Loss: 0.30068978174939875
Epoch: 8, Loss: 0.32678163155315093
Epoch: 9, Loss: 0.4830453135975797
Epoch: 10, Loss: 0.4392198399315027
Epoch: 11, Loss: 0.5600269342296855
Epoch: 12, Loss: 0.5974400406864052
Epoch: 13, Loss: 0.603079562828261
Epoch: 14, Loss: 0.7217928600128324
Epoch: 15, Loss: 0.5265322950871005
Epoch: 16, Loss: 1.3459121903385105
Epoch: 17, Loss: 0.8847922889086157
Epoch: 18, Loss: 0.7708152518590997
Epoch: 19, Loss: 0.6675441623151415
Epoch: 20, Loss: 0.906639703731077


In [192]:
def predict_qn(model,question,threshold=0.5):
  numerix_qn = text_to_indeces(question,vocab)
  qn_tensor = torch.tensor(numerix_qn)
  qn_tensor = qn_tensor.unsqueeze(0)
  pred = model(qn_tensor)

  probs = torch.nn.functional.softmax(pred,dim=1)
  value , index = torch.max(probs,dim=1)
  if value < threshold :
    print("I don't know")

  print(list(vocab.keys())[index])

In [193]:
predict_qn(model,'What is the capital of France?')

paris
