In [128]:
import pandas as pd

In [129]:
df = pd.read_csv('100_Unique_QA_Dataset.csv')
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [130]:
# Converting our data to numbers
# tokenization function

def tokenizer(text):
    text = text.lower() # convert to lowercase
    text = text.replace('?', '') # replace ? with nothing
    text = text.replace("'", '') # replace ' with nothing 
    return text.split() # this will return a list of words

In [131]:
# testing the tokenization function
print(tokenizer('What is the capital of France?'))
print(tokenizer('What is the capital of Germany?'))
print(tokenizer('Who wrote "To Kill a Mockingbird"?'))
print(tokenizer('What is the largest planet in our solar system?'))
print(tokenizer('What is the boiling point of water in Celsius?'))

['what', 'is', 'the', 'capital', 'of', 'france']
['what', 'is', 'the', 'capital', 'of', 'germany']
['who', 'wrote', '"to', 'kill', 'a', 'mockingbird"']
['what', 'is', 'the', 'largest', 'planet', 'in', 'our', 'solar', 'system']
['what', 'is', 'the', 'boiling', 'point', 'of', 'water', 'in', 'celsius']


In [132]:
# creating a vocabulary for unique words
vocab = {'<UNK>':0}

In [133]:
# creating a function to check if there is a unique word
def build_vocab(row):
  print(row['question'], row['answer'])

  tokenized_question = tokenizer(row['question']) # this will return a list of words
  tokenized_answer = tokenizer(row['answer'])

  # merging the question and answer into one list
  tokenized_text = tokenized_question + tokenized_answer

  # looping through the list of words
  for word in tokenized_text:
    if word not in vocab:
      vocab[word] = len(vocab) # this will add the word to the vocabulary


  #print(tokenized_text)

In [134]:
# applying pandas apply function for each row
df.apply(build_vocab, axis=1) # axis=1 means it will apply the function to each row

What is the capital of France? Paris
What is the capital of Germany? Berlin
Who wrote 'To Kill a Mockingbird'? Harper-Lee
What is the largest planet in our solar system? Jupiter
What is the boiling point of water in Celsius? 100
Who painted the Mona Lisa? Leonardo-da-Vinci
What is the square root of 64? 8
What is the chemical symbol for gold? Au
Which year did World War II end? 1945
What is the longest river in the world? Nile
What is the capital of Japan? Tokyo
Who developed the theory of relativity? Albert-Einstein
What is the freezing point of water in Fahrenheit? 32
Which planet is known as the Red Planet? Mars
Who is the author of '1984'? George-Orwell
What is the currency of the United Kingdom? Pound
What is the capital of India? Delhi
Who discovered gravity? Newton
How many continents are there on Earth? 7
Which gas do plants use for photosynthesis? CO2
What is the smallest prime number? 2
Who invented the telephone? Alexander-Graham-Bell
What is the capital of Australia? Canber

0     None
1     None
2     None
3     None
4     None
      ... 
85    None
86    None
87    None
88    None
89    None
Length: 90, dtype: object

In [135]:
# printing the vocabulary
vocab

{'<UNK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 'to': 12,
 'kill': 13,
 'a': 14,
 'mockingbird': 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 '1984': 67,
 'george-orwell': 68,
 'currency': 69,
 'unit

In [136]:
# length of the vocabulary
len(vocab)

324

In [137]:
# converting words to numerical indices
def convert_words_to_indices(text, vocab):

  indexed_text = []

  for word in tokenizer(text):
    if word in vocab:
      indexed_text.append(vocab[word])
    else:
      indexed_text.append(vocab['<UNK>'])

  return indexed_text

In [138]:
# testing the function
convert_words_to_indices('What is the capital of Pakistan?', vocab) # Pakistan is not in the vocabulary so it will be replaced with <UNK> = 0

[1, 2, 3, 4, 5, 0]

In [139]:
import torch
from torch.utils.data import Dataset, DataLoader

In [140]:
# creating a custom dataset class
class CustomDataset(Dataset):
  # constructor
  def __init__(self, df, vocab):
    self.df = df # this will store the dataframe
    self.vocab = vocab

  # __len__ method
  def __len__(self):
    return self.df.shape[0] # this will return the number of rows in the dataframe
  
  # __getitem__ method
  def __getitem__(self, index):
    num_question = convert_words_to_indices(self.df.iloc[index]['question'], self.vocab) # this will return a list of indices
    num_answer = convert_words_to_indices(self.df.iloc[index]['answer'], self.vocab)

    return torch.tensor(num_question), torch.tensor(num_answer) # this will return a tuple of tensors

In [141]:
# creating a custom dataset object
dataset = CustomDataset(df, vocab) # passing the dataframe and vocabulary

In [142]:
# creating a dataloader
dataloader = DataLoader(dataset, batch_size=1, shuffle=True) # passing the dataset and batch size

In [143]:
# testing the dataloader
for question, answer in dataloader:
  print(question, answer)

tensor([[ 10,  75, 208]]) tensor([[209]])
tensor([[10,  2,  3, 66,  5, 67]]) tensor([[68]])
tensor([[ 42,  86,  87, 241, 242,  19,  39, 243]]) tensor([[244]])
tensor([[ 78,  79, 150, 151,  14, 152, 153]]) tensor([[154]])
tensor([[10, 29,  3, 30, 31]]) tensor([[32]])
tensor([[ 42, 125,   2,  62,  63,   3, 126, 127]]) tensor([[128]])
tensor([[ 42, 318,   2,  62,  63,   3, 319,   5, 320]]) tensor([[321]])
tensor([[ 1,  2,  3, 24, 25,  5, 26, 19, 27]]) tensor([[28]])
tensor([[ 10,  11, 189, 158, 190]]) tensor([[191]])
tensor([[  1,   2,   3, 103,   5, 104,  19, 105]]) tensor([[106]])
tensor([[ 10,  29, 130, 131]]) tensor([[132]])
tensor([[ 42, 137,   2, 138,  39, 175, 269]]) tensor([[99]])
tensor([[  1,   2,   3,  69,   5, 155]]) tensor([[156]])
tensor([[ 1,  2,  3, 33, 34,  5, 35]]) tensor([[36]])
tensor([[ 10,  11, 157, 158, 159]]) tensor([[160]])
tensor([[ 42,  18,   2,   3, 281,  12,   3, 282]]) tensor([[205]])
tensor([[  1,   2,   3,   4,   5, 113]]) tensor([[114]])
tensor([[ 42,  18,

In [144]:
import torch.nn as nn

In [145]:
# creating a RNN model class
class RNN(nn.Module):

  # constructor
  def __init__(self, vocab_size):
    super().__init__() # calling the constructor of the parent class
    self.embedding = nn.Embedding(vocab_size, embedding_dim=50) # creating an embedding layer
    self.rnn = nn.RNN(50, 64, batch_first=True)  # creating a recurrent neural network, adding batch_first=True
    self.linear = nn.Linear(64, vocab_size) # creating a linear layer

  # forward pass
  def forward(self, question):
    embedded_question = self.embedding(question) # passing the question to the embedding layer
    hidden, final = self.rnn(embedded_question) # passing the embedded question to the recurrent neural network
    output = self.linear(final.squeeze(0)) # passing the final output to the linear layer and squeezing the batch dimension

    return output

In [146]:
# learning rate and number of epochs
learning_rate = 0.001
epochs = 20

In [147]:
# model object
model = RNN(len(vocab)) # passing the vocabulary size

In [148]:
# loss and optimizer
criterion = nn.CrossEntropyLoss() # this will be used for training
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) # this will be used for training

In [149]:
# for debugging the training loop
x = nn.Embedding(324, embedding_dim=50)
y = nn.RNN(50, 64, batch_first=True) # batch_first=True means that the batch dimension is the first dimension.
z = nn.Linear(64, 324)

a = dataset[0][0].reshape(1, 6) # reshaping the question to 1x6
print("shape of question:", a.shape)

b = x(a) # passing the question to the embedding layer
print("shape of embedded question:", b.shape)

c, d = y(b) # passing the embedded question to the recurrent neural network
print("shape of hidden:", c.shape)
print("shape of final:", d.shape)

e = z(d.squeeze(0)) # passing the final output to the linear layer, squeeze(0) means that the batch dimension is removed.
print("shape of output:", e.shape)

shape of question: torch.Size([1, 6])
shape of embedded question: torch.Size([1, 6, 50])
shape of hidden: torch.Size([1, 6, 64])
shape of final: torch.Size([1, 1, 64])
shape of output: torch.Size([1, 324])


In [150]:
# training loop

for epoch in range(epochs):

  total_loss = 0 # variable to store the total loss of the epoch

  for question, answer in dataloader: # iterating over the dataloader

    optimizer.zero_grad() # zero the gradients

    # forward pass
    output = model(question) # passing the question to the model

    # loss
    loss = criterion(output, answer[0]) # passing the output and the answer [0]

    # backward pass
    loss.backward() # this will calculate the gradients

    # update weights
    optimizer.step() # this will update the weights

    total_loss += loss.item() # this will store the total loss of the epoch

  print(f'Epoch: {epoch + 1}, Loss: {total_loss / len(dataloader)}')

Epoch: 1, Loss: 5.800900014241536
Epoch: 2, Loss: 5.006183179219564
Epoch: 3, Loss: 4.110608008172777
Epoch: 4, Loss: 3.448542594909668
Epoch: 5, Loss: 2.891661432054308
Epoch: 6, Loss: 2.3753606107499863
Epoch: 7, Loss: 1.905985372596317
Epoch: 8, Loss: 1.4882905814382765
Epoch: 9, Loss: 1.1534574574894376
Epoch: 10, Loss: 0.8875230507718193
Epoch: 11, Loss: 0.6879603597852919
Epoch: 12, Loss: 0.5393921674953567
Epoch: 13, Loss: 0.4292439606454637
Epoch: 14, Loss: 0.34695204099019367
Epoch: 15, Loss: 0.28532734190424286
Epoch: 16, Loss: 0.2372521021299892
Epoch: 17, Loss: 0.19985258844163684
Epoch: 18, Loss: 0.16833765490187538
Epoch: 19, Loss: 0.1458765564693345
Epoch: 20, Loss: 0.12476225102113353


In [151]:
# defining a predict function
def predict(model, question, threshold=0.5): # threshold is the probability threshold

  # converting the question to numerical indices
  num_question = convert_words_to_indices(question, vocab) # this will return a list of indices

  # converting the list of indices to a tensor
  num_question = torch.tensor(num_question).unsqueeze(0) # this will return a tensor of shape (6,)

  # sending the question to the model
  output = model(num_question)

  # converting logits to probabilities
  probs = torch.nn.functional.softmax(output, dim=1)

  # find index of max probability
  value, index = torch.max(probs, dim=1)

  # check if the probability is greater than the threshold
  if value < threshold:
    return "I don't know that one"

  print(list(vocab.keys())[index])

In [152]:
# testing the predict function
predict(model, 'What is the capital of Pakistan?')

"I don't know that one"

In [153]:
# testing another question
predict(model, 'What is the boiling point of water in Celsius?')

100


In [154]:
# testing another question
predict(model, 'Who invented the telephone?')

alexander-graham-bell
