In [19]:
import pandas as pd

# Load the question/answer pairs; the relative path is resolved against the current working directory.
df=pd.read_csv('100_Unique_QA_Dataset.csv')

In [20]:
# Preview the first rows to check that the 'question' and 'answer' columns loaded.
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [21]:
#tokenize
def tokenize(text):
  """Split `text` into lower-case, whitespace-separated tokens.

  Question marks and apostrophes are removed before splitting; every other
  character (including hyphens) is kept, so 'Harper-Lee' stays one token.
  """
  cleaned=text.lower()
  for char in ('?',"'"):
    cleaned=cleaned.replace(char,'')
  return cleaned.split()


In [22]:
# Token -> integer index mapping. Index 0 is reserved for '<UNK>', the
# fallback used for tokens that are not in the vocabulary.
vocab={'<UNK>':0}



In [23]:
def build_vocab(row):
  """Register the tokens of one row's question and answer in the global `vocab`.

  The raw question and answer are echoed to stdout, then every token that is
  not yet in `vocab` is assigned the next free index (the current size of
  `vocab`). Returns None; `vocab` is mutated in place.
  """
  question_text,answer_text=row['question'],row['answer']
  print(question_text,answer_text)

  tokens=tokenize(question_text)+tokenize(answer_text)
  for tok in tokens:
    # setdefault only inserts when the token is new, so existing indices are kept.
    vocab.setdefault(tok,len(vocab))

In [24]:
# Call build_vocab once per row (axis=1) to fill the global vocab. build_vocab
# returns nothing, so the Series displayed below carries no useful values.
df.apply(build_vocab,axis=1)

What is the capital of France? Paris
What is the capital of Germany? Berlin
Who wrote 'To Kill a Mockingbird'? Harper-Lee
What is the largest planet in our solar system? Jupiter
What is the boiling point of water in Celsius? 100
Who painted the Mona Lisa? Leonardo-da-Vinci
What is the square root of 64? 8
What is the chemical symbol for gold? Au
Which year did World War II end? 1945
What is the longest river in the world? Nile
What is the capital of Japan? Tokyo
Who developed the theory of relativity? Albert-Einstein
What is the freezing point of water in Fahrenheit? 32
Which planet is known as the Red Planet? Mars
Who is the author of '1984'? George-Orwell
What is the currency of the United Kingdom? Pound
What is the capital of India? Delhi
Who discovered gravity? Newton
How many continents are there on Earth? 7
Which gas do plants use for photosynthesis? CO2
What is the smallest prime number? 2
Who invented the telephone? Alexander-Graham-Bell
What is the capital of Australia? Canber

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [25]:
# Vocabulary size, including the '<UNK>' entry.
len(vocab)

324

In [26]:
# convert words to numerical indexes
def text_to_indices(text,vocab):
  """Return the list of vocabulary indices for the tokens of `text`.

  Tokens missing from `vocab` are mapped to the index stored under '<UNK>'.
  """
  return [vocab[tok] if tok in vocab else vocab['<UNK>'] for tok in tokenize(text)]

In [27]:
import torch
from torch.utils.data import Dataset,DataLoader

In [28]:
class QADataset(Dataset):
  """Dataset that serves (question, answer) pairs as tensors of vocabulary indices.

  `df` is indexed positionally and must expose 'question' and 'answer' fields
  per row; `vocab` is the token -> index mapping used to encode both.
  """

  def __init__(self,df,vocab):
    self.df,self.vocab=df,vocab

  def __len__(self):
    # One sample per row.
    return len(self.df)

  def __getitem__(self,idx):
    record=self.df.iloc[idx]
    question_ids,answer_ids=(
      text_to_indices(record[column],self.vocab) for column in ('question','answer')
    )
    return torch.tensor(question_ids),torch.tensor(answer_ids)

In [29]:
# Wrap the DataFrame and vocabulary so samples are encoded on access.
dataset=QADataset(df,vocab)


In [30]:
# Inspect one encoded sample: a (question indices, answer indices) tensor pair.
dataset[1]

(tensor([1, 2, 3, 4, 5, 8]), tensor([9]))

In [31]:
# One sample per batch (no padding/collate logic is defined for variable-length
# questions); the sample order is reshuffled every epoch.
dataloader=DataLoader(dataset,batch_size=1,shuffle=True)

In [32]:
import torch.nn as nn

In [33]:
class SimpleRNN(nn.Module):
  """Single-layer RNN that scores every vocabulary token as the answer to a question.

  Args:
    vocab_size: number of vocabulary entries; sizes both the embedding table
      and the output layer.
    embedding_dim: width of each token embedding. Defaults to 50.
    hidden_size: width of the RNN hidden state. Defaults to 64.
  """

  def __init__(self,vocab_size,embedding_dim=50,hidden_size=64):
    super().__init__()
    # The embedding width and the RNN input width must agree, so both are
    # derived from the same argument instead of repeating a literal.
    self.embedding=nn.Embedding(vocab_size,embedding_dim=embedding_dim)
    self.rnn=nn.RNN(embedding_dim,hidden_size,batch_first=True)
    self.fc=nn.Linear(hidden_size,vocab_size)

  def forward(self,question):
    """Return unnormalised scores of shape (batch, vocab_size).

    `question` holds token indices laid out as (batch, seq_len), because the
    RNN is built with batch_first=True.
    """
    embedded_question=self.embedding(question)
    # Only the final hidden state is used; the per-step outputs are discarded.
    _,final=self.rnn(embedded_question)
    # final is (num_layers=1, batch, hidden); drop the layer axis before the classifier.
    output=self.fc(final.squeeze(0))
    return output



In [34]:
# Training hyperparameters.
learning_rate=0.001
epochs=20

# One output score per vocabulary entry.
model=SimpleRNN(len(vocab))

# CrossEntropyLoss takes the model's raw (un-softmaxed) scores.
criterion=nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(model.parameters(),lr=learning_rate)

In [35]:
# Train for `epochs` passes over the dataloader, one optimizer step per batch.
for epoch in range(epochs):

  # Sum (not mean) of the per-batch losses for this epoch.
  total_loss=0
  for question,answer in dataloader:

    # Clear gradients left over from the previous step.
    optimizer.zero_grad()

    output=model(question)

    # answer[0] drops the batch axis added by the DataLoader (batch_size=1).
    # NOTE(review): this assumes every answer tokenizes to exactly one token,
    # so the target length matches output's batch dimension -- confirm.
    loss=criterion(output,answer[0])

    loss.backward()

    optimizer.step()

    total_loss+=loss.item()

  print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss}')


Epoch 1/20, Loss: 526.4741020202637
Epoch 2/20, Loss: 457.98770332336426
Epoch 3/20, Loss: 376.93569135665894
Epoch 4/20, Loss: 316.3291087150574
Epoch 5/20, Loss: 266.70095229148865
Epoch 6/20, Loss: 218.7904553413391
Epoch 7/20, Loss: 175.09483295679092
Epoch 8/20, Loss: 137.9098019003868
Epoch 9/20, Loss: 104.9531601369381
Epoch 10/20, Loss: 80.5190059542656
Epoch 11/20, Loss: 61.72047743201256
Epoch 12/20, Loss: 48.6232038885355
Epoch 13/20, Loss: 38.36487337946892
Epoch 14/20, Loss: 31.709351062774658
Epoch 15/20, Loss: 25.750378251075745
Epoch 16/20, Loss: 21.67113096266985
Epoch 17/20, Loss: 18.269705533981323
Epoch 18/20, Loss: 15.574383042752743
Epoch 19/20, Loss: 13.528826266527176
Epoch 20/20, Loss: 11.734381783753633


In [36]:
def predict(model,question,threshold=0.5):
  """Print the model's answer to `question`, or 'I dont know' if it is unsure.

  Args:
    model: callable mapping a (1, seq_len) tensor of token indices to a
      (1, vocab_size) tensor of scores.
    question: raw question text; it is encoded with the global `vocab`.
    threshold: minimum softmax probability the best token must reach for it
      to be printed as the answer.

  Returns:
    None. The result is printed.
  """
  numerical_question=text_to_indices(question,vocab)

  # An empty or punctuation-only question yields no tokens; an empty list would
  # become a float tensor and fail in the embedding lookup.
  if not numerical_question:
    print('I dont know')
    return

  # Add the batch axis the model expects.
  question_tensor=torch.tensor(numerical_question).unsqueeze(0)

  # Inference only: no need to build the autograd graph.
  with torch.no_grad():
    output=model(question_tensor)

  probs=torch.nn.functional.softmax(output,dim=1)

  value,index=torch.max(probs,dim=1)

  if value.item()<threshold:
    print('I dont know')
    # Stop here so the low-confidence token is not printed as well.
    return

  # vocab indices are assigned in insertion order, so the position in the key
  # list equals the token's index.
  print(list(vocab.keys())[index.item()])



In [38]:
# Sanity check with a question that appears in the training data.
predict(model,'What is the largest planet in our solar system?')

jupiter
