## Question Answering system using RNN

### Steps to be performed

1. Load the dataset and apply preprocessing (tokenization, removing special characters, lowercasing, etc.)

2. Create a vocabulary (i.e. a dictionary that consists of all the unique words in the entire dataset, with a corresponding unique index for every word)

3. Convert the words/tokens to indices using this vocab

4. Create embeddings

5. Build and Train RNN

6. Make prediction

In [167]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader


In [168]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [169]:
df = pd.read_csv("/content/drive/MyDrive/100_Unique_QA_Dataset.csv")

In [170]:
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


## Step -1 Data Preprocessing

In [171]:
# Preprocessing: lowercase the text, remove special characters ("?" and "'"), then tokenize
def tokenize(text):
  """Normalize ``text`` and split it into whitespace-separated tokens.

  The text is lowercased and every "?" and "'" character is removed. All other
  characters (hyphens, digits, ...) are kept unchanged.

  Args:
    text: Raw question or answer string.

  Returns:
    A list of token strings; an empty list when ``text`` contains no
    non-whitespace characters after cleaning.
  """
  text = text.lower()
  # Each unwanted character is stripped exactly once (the "?" removal used to
  # be performed twice, the second call being a no-op).
  for char in ("?", "'"):
    text = text.replace(char, "")

  return text.split()


In [172]:
tokenize("Hey how are you?")

['hey', 'how', 'are', 'you']

In [173]:
# Apply preprocessing steps(tokenization) to question and answer
df['tokenized_question'] = df['question'].apply(tokenize)
df['tokenized_answer'] = df['answer'].apply(tokenize)
df['merged_tokens'] = df['tokenized_question'] + df['tokenized_answer']
# df.drop(columns=['tokenized_question', 'tokenized_answer','question','answer'], inplace=True)

In [174]:
df.head(3)

Unnamed: 0,question,answer,tokenized_question,tokenized_answer,merged_tokens
0,What is the capital of France?,Paris,"[what, is, the, capital, of, france]",[paris],"[what, is, the, capital, of, france, paris]"
1,What is the capital of Germany?,Berlin,"[what, is, the, capital, of, germany]",[berlin],"[what, is, the, capital, of, germany, berlin]"
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee,"[who, wrote, to, kill, a, mockingbird]",[harper-lee],"[who, wrote, to, kill, a, mockingbird, harper-..."


## Step-2 Creating Vocabulary

In [175]:
# Initialize the vocab dictionary with a single entry: the '<UNK>' (unknown) token at index 0.
# If a word that is not in the vocab is received during prediction, it is given the index of '<UNK>', i.e. 0.
vocab = {'<UNK>':0}

In [176]:
vocab

{'<UNK>': 0}

### Adding words/tokens to the vocab

In [177]:
def build_vocab(row):
  """Register every token of ``row`` in the module-level ``vocab`` dictionary.

  A token that is not yet present receives the next free index (the size of
  ``vocab`` at the moment it is added); tokens already present keep their
  index. ``vocab`` is mutated in place and nothing is returned.
  """
  for word in row:
    # The default argument is evaluated before any insertion happens, so it
    # equals the next free index; existing entries are left untouched.
    vocab.setdefault(word, len(vocab))

In [178]:
df['merged_tokens'].apply(build_vocab)

Unnamed: 0,merged_tokens
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [179]:
vocab

{'<UNK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 'to': 12,
 'kill': 13,
 'a': 14,
 'mockingbird': 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 '1984': 67,
 'george-orwell': 68,
 'currency': 69,
 'unit

In [180]:
len(vocab)  # 324 unique words in the vocab

324

# Convert the words/tokens to indices using this vocab

In [181]:
def tokens_to_indices(row,vocab):
  """Translate a sequence of tokens into their integer indices.

  Args:
    row: Iterable of token strings.
    vocab: Mapping from token to integer index; it must contain '<UNK>'
      whenever ``row`` holds a token that is not in the mapping.

  Returns:
    A new list with one index per token, in the same order. Tokens missing
    from ``vocab`` are replaced by the index stored under '<UNK>'.
  """
  # The '<UNK>' entry is only looked up for tokens that are actually missing.
  return [vocab[word] if word in vocab else vocab['<UNK>'] for word in row]



In [182]:
tokens_to_indices(['when','my','name','is'],vocab)

[305, 0, 0, 2]

## Creating a custom dataset class

In [183]:
class customdataset(Dataset):
  """Dataset that yields one (question, answer) pair of index tensors per row.

  Args:
    df: DataFrame with 'tokenized_question' and 'tokenized_answer' columns,
      each cell holding a list of tokens.
    vocab: Mapping from token to integer index, passed to ``tokens_to_indices``.
  """

  def __init__(self,df,vocab):
    self.df = df
    self.vocab = vocab

  def __len__(self):
    """Return the number of rows (question/answer pairs) in the DataFrame."""
    return self.df.shape[0]

  def __getitem__(self,index):
    """Return the pair at position ``index`` as two 1-D ``torch.long`` tensors."""
    # Positional (.iloc) access: samplers hand out positions 0..len-1, which
    # match the index *labels* only when the DataFrame has a default
    # RangeIndex. Label access breaks after filtering or re-indexing.
    question_tokens = self.df['tokenized_question'].iloc[index]
    answer_tokens = self.df['tokenized_answer'].iloc[index]

    tokenized_question = tokens_to_indices(question_tokens,self.vocab)
    tokenized_answer = tokens_to_indices(answer_tokens,self.vocab)

    # An explicit integer dtype keeps an empty token list from turning into a
    # float tensor; for non-empty lists this is already the inferred dtype.
    return (torch.tensor(tokenized_question,dtype=torch.long),
            torch.tensor(tokenized_answer,dtype=torch.long))




In [184]:
# Custom data class object
train_data = customdataset(df,vocab)

In [185]:
train_data[1]

(tensor([1, 2, 3, 4, 5, 8]), tensor([9]))

## Dataloader object

In [186]:
# With batch_size = 1 no padding is required, since each batch holds a single question.
# With batch_size > 1 padding would be required, because the number of words differs from question to question.
train_loader = DataLoader(train_data,batch_size=1,shuffle=True)

In [187]:
for question , answer in train_loader:
  print(question ,answer)

tensor([[ 42, 137,   2, 226,  12,   3, 227, 228]]) tensor([[155]])
tensor([[ 42, 250, 251, 118, 252, 253]]) tensor([[254]])
tensor([[ 42, 174,   2,  62,  39, 175, 176,  12, 177, 178]]) tensor([[179]])
tensor([[  1,   2,   3, 234,   5, 235]]) tensor([[131]])
tensor([[ 42, 137,   2, 138,  39, 175, 269]]) tensor([[99]])
tensor([[ 10, 140,   3, 141, 142,  12, 143,  83,   3, 144]]) tensor([[145]])
tensor([[ 1,  2,  3, 37, 38, 39, 40]]) tensor([[41]])
tensor([[42, 86, 87, 88, 89, 39, 90]]) tensor([[91]])
tensor([[78, 79, 80, 81, 82, 83, 84]]) tensor([[85]])
tensor([[ 10, 308,   3, 309, 310]]) tensor([[311]])
tensor([[ 42, 137,   2, 138,  39, 139]]) tensor([[53]])
tensor([[ 10, 140,   3, 141, 270,  93, 271,   5,   3, 272]]) tensor([[273]])
tensor([[  1,   2,   3, 146, 147,  19, 148]]) tensor([[149]])
tensor([[ 42,  86,  87, 241, 242,  19,  39, 243]]) tensor([[244]])
tensor([[ 42, 167,   2,   3,  17, 168, 169]]) tensor([[170]])
tensor([[  1,   2,   3,   4,   5, 113]]) tensor([[114]])
tensor([[

# Define the RNN network architecture

## Architecture:
1. Embedding layer: its input size is the size of the vocab (i.e. the total number of words in the vocab) and its output size is 50 (each word index is represented as an embedding vector of dimension 50)

2. One hidden (RNN) layer: input = 50, output/neurons = 64 (chosen arbitrarily)

3. Output layer: input = 64, output/neurons = size of the vocab (one score for every word in the vocab)

In [188]:
import torch.nn as nn
class RnnNetwork(nn.Module):
  """Embedding -> single-layer RNN -> linear classifier over the vocabulary.

  Args:
    vocab_size: Number of entries in the vocabulary; used both as the number
      of embeddings and as the number of output scores.
    embedding_dim: Dimension of each word embedding vector (default 50).
    hidden_size: Size of the RNN hidden state (default 64).
  """

  def __init__(self,vocab_size,embedding_dim=50,hidden_size=64):

    super().__init__()
    # Each word index is represented as a vector of dimension `embedding_dim`
    self.embedding = nn.Embedding(vocab_size,embedding_dim=embedding_dim)
    # Consumes one embedding vector per word; batch_first => input is (batch, seq, feature)
    self.rnn= nn.RNN(input_size=embedding_dim,hidden_size=hidden_size,batch_first=True)
    # One output score per vocab word; the highest-scoring word is the predicted answer
    self.fc = nn.Linear(hidden_size,vocab_size)

  def forward(self,question):
    """Return one row of vocabulary logits per question in ``question``.

    Args:
      question: Integer tensor of token indices, shape (batch, seq_len).

    Returns:
      Tensor of raw (un-normalized) scores, shape (batch, vocab_size).
    """
    embedded_questions = self.embedding(question)
    # nn.RNN returns (output, h_n): the hidden state at every time step, and
    # the final hidden state with shape (num_layers, batch, hidden_size).
    _all_step_outputs, final_hidden = self.rnn(embedded_questions)
    # Drop the leading num_layers dimension (size 1 here) and classify from
    # the final hidden state only.
    output = self.fc(final_hidden.squeeze(0))
    return output


# Define the Training Loop

In [189]:
learning_rate = 0.001
epochs = 20

In [190]:
model = RnnNetwork(len(vocab))

In [191]:
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)

In [192]:
for epoch in range(epochs):
  # Running sum of the per-batch losses seen during this epoch
  total_epoch_loss = 0
  for question, answer in train_loader:
    # Clear the gradients accumulated by the previous step
    optimizer.zero_grad()

    # Forward pass, then the loss against the answer with dimension 0
    # squeezed out (removed when it has size 1)
    y_pred = model(question)
    loss = loss_function(y_pred, answer.squeeze(0))

    # Backward pass followed by the parameter update
    loss.backward()
    optimizer.step()

    total_epoch_loss += loss.item()

  # Mean loss per batch for the epoch
  avg_loss = total_epoch_loss / len(train_loader)
  print(f"Epoch: {epoch} Loss: {avg_loss}")

Epoch: 0 Loss: 5.826352140638563
Epoch: 1 Loss: 5.052907477484809
Epoch: 2 Loss: 4.1619210296207
Epoch: 3 Loss: 3.4856611331303915
Epoch: 4 Loss: 2.9019050929281445
Epoch: 5 Loss: 2.366697461075253
Epoch: 6 Loss: 1.869975072145462
Epoch: 7 Loss: 1.4538810465070937
Epoch: 8 Loss: 1.1101935582028495
Epoch: 9 Loss: 0.8483569555812411
Epoch: 10 Loss: 0.6517090754376518
Epoch: 11 Loss: 0.5030296300848325
Epoch: 12 Loss: 0.39766413105858694
Epoch: 13 Loss: 0.318228752248817
Epoch: 14 Loss: 0.26010398459103373
Epoch: 15 Loss: 0.2163213525381353
Epoch: 16 Loss: 0.18078552078869609
Epoch: 17 Loss: 0.15408574235108163
Epoch: 18 Loss: 0.13248613253235816
Epoch: 19 Loss: 0.11594372577965259


## Making Prediction

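Case 1: If the question asked by the user has been seen by the model (i.e. the question was used in training), the model returns the corresponding answer.

Case 2: If the question asked by the user was not used while training, the model responds with 'I dont know the answer'. This happens whenever the highest predicted word probability is below the threshold (0.5 by default).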

In [277]:
def make_prediction(model,question,thresthreshold=0.5):
  """Print the model's answer to ``question``, or "I dont know the answer".

  The question is tokenized, mapped to indices with the module-level
  ``vocab`` and fed to ``model`` as a batch of one. The highest softmax
  probability and its index are printed; the matching vocab word is printed
  as the answer unless that probability is below ``thresthreshold``.

  Args:
    model: Callable mapping a (1, seq_len) tensor of token indices to a
      (1, vocab_size) tensor of logits.
    question: Raw question string.
    thresthreshold: Minimum probability required to report an answer.

  Returns:
    None; all results are printed.
  """
  # Tokenize and remove special characters
  tokens = tokenize(question)

  # An empty question yields an empty (float) tensor that the embedding layer
  # rejects, so answer directly instead of crashing.
  if not tokens:
    print("I dont know the answer")
    return

  # Convert tokens to indices using vocab and add the batch dimension
  indexed_question = torch.tensor(tokens_to_indices(tokens,vocab)).unsqueeze(0)

  # Inference only: no autograd graph is needed
  with torch.no_grad():
    y_pred = model(indexed_question)
    # Convert the raw logits into probabilities for each word
    probs = torch.nn.functional.softmax(y_pred, dim=1)

  # Probability and index of the most likely word, as plain Python numbers
  max_prob,index = torch.max(probs,dim=1)
  max_prob = max_prob[0].item()
  index = index[0].item()
  print(f'max probability: {max_prob}')
  print(f'max probability index:  {index}')

  # Only answer when the model is confident enough
  if max_prob<thresthreshold:
    print("I dont know the answer")
  else:
    # Explicit index -> word map instead of relying on the position of the
    # word within the dictionary's key order.
    index_to_word = {word_index: word for word, word_index in vocab.items()}
    print(f'Answer: {index_to_word[index]}')


In [279]:
question = 'What is the capital of Spain'
make_prediction(model,question)

max probability: 0.8531249761581421
max probability index:  280
Answer: madrid


# Checking the Training accuracy



In [304]:
total = 0
correct = 0
with torch.no_grad():
  for question , answer in train_loader:
    # Forward pass
    y_pred = model(question)

    # Index of the highest-scoring word for each question in the batch.
    # Softmax is monotonic, so the argmax of the raw logits is the same word.
    index = y_pred.argmax(dim=1)

    total = total + question.shape[0]
    # Flatten the answers to shape (batch,) so the comparison is element-wise;
    # comparing (batch,) with (batch, 1) would broadcast to (batch, batch) and
    # over-count matches for any batch size above 1.
    correct += (index == answer.reshape(-1)).sum().item()

  accuracy = (correct/total) *100
  print(f'Accuracy: {accuracy}')


Accuracy: 100.0


## The accuracy of the model is 100% on the training data

---

