In [117]:
import pandas as pd

In [118]:
df= pd.read_csv('/content/100_Unique_QA_Dataset.csv')
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [119]:
def tokenize(text):
  text= text.lower()
  text= text.replace('?','')
  text= text.replace("'","")
  return text.split()


In [120]:
tokenize('what is the capital of france?')

['what', 'is', 'the', 'capital', 'of', 'france']

In [121]:
vocab={'<UNK>':0}

In [122]:
def build_vocab(row):
  print(row['question'],row['answer'])
  tokenized_question=tokenize(row['question'])
  tokenized_answer=tokenize(row['answer'])
  merged_tokens= tokenized_question+tokenized_answer

  for token in merged_tokens:
    if token not in vocab:
      vocab[token]= len(vocab)

In [123]:
vocab

{'<UNK>': 0}

In [124]:
len(vocab)

1

In [125]:
df.apply(build_vocab,axis=1)

What is the capital of France? Paris
What is the capital of Germany? Berlin
Who wrote 'To Kill a Mockingbird'? Harper-Lee
What is the largest planet in our solar system? Jupiter
What is the boiling point of water in Celsius? 100
Who painted the Mona Lisa? Leonardo-da-Vinci
What is the square root of 64? 8
What is the chemical symbol for gold? Au
Which year did World War II end? 1945
What is the longest river in the world? Nile
What is the capital of Japan? Tokyo
Who developed the theory of relativity? Albert-Einstein
What is the freezing point of water in Fahrenheit? 32
Which planet is known as the Red Planet? Mars
Who is the author of '1984'? George-Orwell
What is the currency of the United Kingdom? Pound
What is the capital of India? Delhi
Who discovered gravity? Newton
How many continents are there on Earth? 7
Which gas do plants use for photosynthesis? CO2
What is the smallest prime number? 2
Who invented the telephone? Alexander-Graham-Bell
What is the capital of Australia? Canber

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [126]:
def text_to_indices(text, vocab):
  indexed_text= []
  for token in tokenize(text):
    if token in vocab:
      indexed_text.append(vocab[token])
    else:
      indexed_text.append(vocab['<UNK>'])

  return indexed_text

In [127]:
text_to_indices('what is pytorch', vocab)

[1, 2, 0]

In [128]:
import torch
from torch.utils.data import Dataset, DataLoader

In [129]:
class QADataset(Dataset):
  def __init__(self,df,vocab):
    self.df=df
    self.vocab=vocab

  def __len__(self):
    return self.df.shape[0]

  def __getitem__(self, index):
    numerical_question= text_to_indices(self.df.iloc[index]['question'], self.vocab)
    numerical_answer= text_to_indices(self.df.iloc[index]['answer'], self.vocab)

    return torch.tensor(numerical_question), torch.tensor(numerical_answer)


In [130]:
dataset= QADataset(df,vocab)

In [131]:
dataset[1]

(tensor([1, 2, 3, 4, 5, 8]), tensor([9]))

In [132]:
dataloader= DataLoader(dataset, batch_size=1, shuffle=True)

In [133]:
for question, answer in dataloader:
  print(question, answer[0])

tensor([[  1,   2,   3, 163, 164, 165,  83,  84]]) tensor([166])
tensor([[ 78,  79, 195,  81,  19,   3, 196, 197, 198]]) tensor([199])
tensor([[  1,   2,   3, 212,   5,  14, 213, 214]]) tensor([215])
tensor([[ 10,  75,   3, 296,  19, 297]]) tensor([298])
tensor([[ 42,   2,   3, 210, 137, 168, 211, 169]]) tensor([113])
tensor([[  1,   2,   3,  92, 137,  19,   3,  45]]) tensor([185])
tensor([[  1,   2,   3, 141, 117,  83,   3, 277, 278]]) tensor([121])
tensor([[10, 96,  3, 97]]) tensor([98])
tensor([[78, 79, 80, 81, 82, 83, 84]]) tensor([85])
tensor([[ 42, 174,   2,  62,  39, 175, 176,  12, 177, 178]]) tensor([179])
tensor([[ 1,  2,  3,  4,  5, 73]]) tensor([74])
tensor([[ 42, 137,   2, 138,  39, 175, 269]]) tensor([99])
tensor([[ 1,  2,  3,  4,  5, 53]]) tensor([54])
tensor([[ 42, 200,   2,  14, 201, 202, 203, 204]]) tensor([205])
tensor([[ 42, 299, 300, 118,  14, 301, 302, 158, 303, 304, 305, 306]]) tensor([307])
tensor([[  1,   2,   3, 234,   5, 235]]) tensor([131])
tensor([[  1,   2,

In [134]:
import torch.nn as nn

In [135]:
class SimpleRNN(nn.Module):
  def __init__(self, vocab_size):
    super().__init__()
    self.embedding= nn.Embedding(vocab_size, embedding_dim= 50)
    self.rnn= nn.RNN(50,64, batch_first= True)
    self.fc= nn.Linear(64, vocab_size)
  def forward(self, quetion):
    embedded_question= self.embedding(question)
    hidden, final= self.rnn(embedded_question)
    output= self.fc(final.squeeze(0))

    return output


In [136]:
x= nn.Embedding(324, embedding_dim=50)
y= nn.RNN(50,64, batch_first=True)
z= nn.Linear(64, 324)

a = dataset[0][0].reshape(1,6)
print("shape of a:", a.shape)
b = x(a)
print("shape of b:", b.shape)
c, d = y(b)
print("shape of c:", c.shape)
print("shape of d:", d.shape)

e = z(d.squeeze(0))

print("shape of e:", e.shape)

shape of a: torch.Size([1, 6])
shape of b: torch.Size([1, 6, 50])
shape of c: torch.Size([1, 6, 64])
shape of d: torch.Size([1, 1, 64])
shape of e: torch.Size([1, 324])


In [137]:
learning_rate= 0.001
epochs=20

In [138]:
model= SimpleRNN(len(vocab))

In [139]:
criterion= nn.CrossEntropyLoss()
optimizer= torch.optim.Adam(model.parameters(), lr= learning_rate)

In [140]:
for epoch in range(epochs):
  total_loss=0
  for question, answer in dataloader:
    optimizer.zero_grad()
    output= model(question)
    loss= criterion(output,answer[0])
    loss.backward()
    optimizer.step()
    total_loss= total_loss + loss.item()
  print(f'Epoch:{epoch+1}, Loss:{total_loss:4f}')

Epoch:1, Loss:524.325831
Epoch:2, Loss:447.263715
Epoch:3, Loss:369.366449
Epoch:4, Loss:313.209911
Epoch:5, Loss:263.419087
Epoch:6, Loss:217.264544
Epoch:7, Loss:173.980086
Epoch:8, Loss:136.262513
Epoch:9, Loss:104.563533
Epoch:10, Loss:79.891196
Epoch:11, Loss:61.655737
Epoch:12, Loss:48.232607
Epoch:13, Loss:38.080654
Epoch:14, Loss:30.648947
Epoch:15, Loss:25.127285
Epoch:16, Loss:20.836035
Epoch:17, Loss:17.692934
Epoch:18, Loss:15.088722
Epoch:19, Loss:12.969803
Epoch:20, Loss:11.254351


In [144]:
def predict(model, question, threshold=0.5):

  numerical_question= text_to_indices(question, vocab)
  quetion_tensor= torch.tensor(numerical_question).unsqueeze(0)

  output= model(quetion_tensor)

  probs= torch.nn.functional.softmax(output,dim=1)

  value, index= torch.max(probs, dim=1)
  if value< threshold:
    print("I don't know")
  print(list(vocab.keys())[index])


In [145]:
predict(model, "what is the largest planet in our solar system?")

jupiter


In [146]:
list(vocab.keys())[7]

'paris'