# QA system rnn

In [210]:
import numpy as np 
import pandas as pd

In [211]:
df=pd.read_csv("../dataset/qn.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151 entries, 0 to 150
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  151 non-null    object
 1   answer    151 non-null    object
dtypes: object(2)
memory usage: 2.5+ KB


In [212]:
df.sample(15)

Unnamed: 0,question,answer
17,What is underfitting?,Underfit
6,What is ML?,Learning
27,Who wrote 1984?,Orwell
0,Who wrote Hamlet?,Shakespeare
112,Triangle sides?,3
82,Hydrogen symbol?,H
24,Who wrote Macbeth?,Shakespeare
139,Currency in UK?,Pound
44,What is the capital of India?,Delhi
69,What galaxy do we live in?,MilkyWay


In [213]:
def tokenize_text(input_text):
    the_text=input_text.lower()
    the_text=the_text.replace("?","")
    the_text=the_text.split()
    return the_text
    # print(the_text)

In [214]:
tokenize_text("Who wrote Hamlet?")

['who', 'wrote', 'hamlet']

-----

In [215]:
vocab={"<UNK>":0}

In [216]:
df.columns

Index(['question', 'answer'], dtype='object')

In [217]:
def build_vocab(input_row):
    tokenized_question=tokenize_text(input_row["question"])
    tokenized_answer=tokenize_text(input_row["answer"])
    merged_tokens=tokenized_question+tokenized_answer
    # print(merged_tokens)

    for token in merged_tokens:
        if token not in vocab:
            vocab[token]=len(vocab)

In [218]:
print(df.iloc[0:2,:])
print("------")
build_vocab(df.iloc[0])
build_vocab(df.iloc[1])

               question       answer
0     Who wrote Hamlet?  Shakespeare
1  What color is grass?        Green
------


In [219]:
df.apply(build_vocab,axis=1)

0      None
1      None
2      None
3      None
4      None
       ... 
146    None
147    None
148    None
149    None
150    None
Length: 151, dtype: object

In [220]:
list(vocab.items())[:15]

[('<UNK>', 0),
 ('who', 1),
 ('wrote', 2),
 ('hamlet', 3),
 ('shakespeare', 4),
 ('what', 5),
 ('color', 6),
 ('is', 7),
 ('grass', 8),
 ('green', 9),
 ('are', 10),
 ('bananas', 11),
 ('yellow', 12),
 ('coal', 13),
 ('black', 14)]

In [221]:
len(vocab)

321

In [222]:
df["question"].str.split().explode().nunique(),df["answer"].str.split().explode().nunique()


(234, 133)

-----

In [223]:
# word to numerical indices

In [224]:
def text_to_indices(input_text,vocab):
    indexed_text=[]
    for token in tokenize_text(input_text):
        if token in vocab:
            indexed_text.append(vocab[token])
        else:
            indexed_text.append(vocab["<UNK>"])
    return indexed_text


In [225]:
df.iloc[10,0]

'What is reinforcement learning?'

In [226]:
list(vocab.items())[:9]

[('<UNK>', 0),
 ('who', 1),
 ('wrote', 2),
 ('hamlet', 3),
 ('shakespeare', 4),
 ('what', 5),
 ('color', 6),
 ('is', 7),
 ('grass', 8)]

In [227]:
text_to_indices(df.iloc[10,0],vocab)

[5, 7, 27, 20]

In [228]:
text_to_indices("who is steve apple",vocab)

[1, 7, 0, 0]

------

In [229]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


In [230]:
class QADataset(Dataset):

  def __init__(self, df, vocab):
    self.df = df
    self.vocab = vocab

  def __len__(self):
    return self.df.shape[0]

  def __getitem__(self, index):

    numerical_question = text_to_indices(self.df.iloc[index]['question'], self.vocab)
    numerical_answer = text_to_indices(self.df.iloc[index]['answer'], self.vocab)

    return torch.tensor(numerical_question), torch.tensor(numerical_answer)

In [231]:
dataset=QADataset(df,vocab)

In [232]:
dataloader=DataLoader(dataset,batch_size=1,shuffle=True) #batch size 1 ,taking one qn in a batch 

In [233]:
for question, answer in dataloader:
  print(question, answer[0])

tensor([[186, 194]]) tensor([198])
tensor([[ 5,  7, 60, 71, 72, 85]]) tensor([86])
tensor([[303,  35, 207]]) tensor([309])
tensor([[ 5,  7, 60, 71, 72, 98]]) tensor([99])
tensor([[263,  72, 266]]) tensor([267])
tensor([[  5,   7,  60, 146, 147]]) tensor([148])
tensor([[317, 116, 124]]) tensor([136])
tensor([[ 5,  7, 60, 71, 72, 81]]) tensor([82])
tensor([[  1,   2,  60, 142]]) tensor([143])
tensor([[263,  72, 272]]) tensor([273])
tensor([[  5,   7, 121, 122, 123]]) tensor([119])
tensor([[146, 205]]) tensor([206])
tensor([[ 1,  2, 63]]) tensor([64])
tensor([[235, 249, 250]]) tensor([248])
tensor([[199, 194]]) tensor([200])
tensor([[262, 260]]) tensor([124])
tensor([[ 65, 274, 277]]) tensor([278])
tensor([[181, 178,  72, 179]]) tensor([182])
tensor([[ 65, 274, 279]]) tensor([226])
tensor([[ 5,  7, 27, 20]]) tensor([28])
tensor([[  1,   2,  60, 144]]) tensor([145])
tensor([[263,  72, 268]]) tensor([269])
tensor([[303,  35, 307]]) tensor([308])
tensor([[123, 125, 134]]) tensor([131])
tenso

In [234]:
class MySimpleRNN(nn.Module):

  def __init__(self, vocab_size):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim=50)
    self.rnn = nn.RNN(50, 64, batch_first=True)
    self.fc = nn.Linear(64, vocab_size)

  def forward(self, question):
    embedded_question = self.embedding(question)
    hidden, final = self.rnn(embedded_question)
    last_hidden = final[-1]  # last layer, shape [batch, hidden_size]
    output = self.fc(last_hidden)
    print(output.shape)

    return output

-----

-----

In [235]:
dataset[0][0]


tensor([1, 2, 3])

In [236]:
x = nn.Embedding(324, embedding_dim=50)
y = nn.RNN(50, 64)
z = nn.Linear(64, 324)

a = dataset[0][0].reshape(1,3)
print("shape of a:", a.shape)
b = x(a)
print("shape of b:", b.shape)
c, d = y(b)
print("shape of c:", c.shape)
print("shape of d:", d.shape)

e = z(d.squeeze(0))

print("shape of e:", e.shape)

shape of a: torch.Size([1, 3])
shape of b: torch.Size([1, 3, 50])
shape of c: torch.Size([1, 3, 64])
shape of d: torch.Size([1, 3, 64])
shape of e: torch.Size([3, 324])


In [237]:
x = nn.Embedding(324, embedding_dim=50)
y = nn.RNN(50, 64, batch_first=True)
z = nn.Linear(64, 324)

a = dataset[0][0].reshape(1,3)
print("shape of a:", a.shape)
b = x(a)
print("shape of b:", b.shape)
c, d = y(b)
print("shape of c:", c.shape)
print("shape of d:", d.shape)

e = z(d.squeeze(0))

print("shape of e:", e.shape)

shape of a: torch.Size([1, 3])
shape of b: torch.Size([1, 3, 50])
shape of c: torch.Size([1, 3, 64])
shape of d: torch.Size([1, 1, 64])
shape of e: torch.Size([1, 324])


-----

In [238]:
learning_rate = 0.001
epochs = 20

In [239]:
model = MySimpleRNN(len(vocab))

In [240]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [241]:
# training loop

for epoch in range(epochs):

  total_loss = 0

  for question, answer in dataloader:

    optimizer.zero_grad()

    # forward pass
    output = model(question)
    print(output.shape)

    # loss -> output shape (1,324) - (1)
    loss = criterion(output, answer[0])

    # gradients
    loss.backward()

    # update
    optimizer.step()

    total_loss = total_loss + loss.item()

  print(f"Epoch: {epoch+1}, Loss: {total_loss:4f}")

torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1, 321])
torch.Size([1

In [243]:
def predict(model, question, threshold=0.5):

  # convert question to numbers
  numerical_question = text_to_indices(question, vocab)

  # tensor
  question_tensor = torch.tensor(numerical_question).unsqueeze(0)

  # send to model
  output = model(question_tensor)

  # convert logits to probs
  probs = torch.nn.functional.softmax(output, dim=1)

  # find index of max prob
  value, index = torch.max(probs, dim=1)

  if value < threshold:
    print("This qn is not my knowledge dataset")

  print(list(vocab.keys())[index])

In [254]:
predict(model,"what is the capital of nepal")

torch.Size([1, 321])
kathmandu


In [260]:
df.sample(3)

Unnamed: 0,question,answer
8,What is supervised learning?,Labeled
108,Day before Sunday?,Saturday
114,Pentagon sides?,5


In [261]:
predict(model,"What is supervised learning?")

torch.Size([1, 321])
labeled


In [265]:
df.sample(3)

Unnamed: 0,question,answer
76,Gas plants absorb?,CO2
28,Who wrote Animal Farm?,Orwell
60,Who wrote Pride and Prejudice?,Austen


In [266]:
predict(model,"Who wrote Pride and Prejudice?")

torch.Size([1, 321])
austen


In [267]:
predict(model,"home minister of usa")

torch.Size([1, 321])
This qn is not my knowledge dataset
washington
