In [None]:
# Import libraries
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

In [23]:
df = pd.read_csv('100_Unique_QA_Dataset.csv')
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [None]:
# Simple tokenization function: lowercases text, removes '?' and "'", and splits by space.
def tokenize(text):

    text = text.lower()
    text = text.replace('?',"")
    text = text.replace("'","")
    return text.split()

In [None]:
# Initialize vocabulary dictionary with an unknown token '<UNK>'
vocab ={'<UNK>':0}

In [None]:
# Function to build the vocabulary from questions and answers in a dataframe row
def build_vocab(row):
    
    tokenized_ques = tokenize(row['question'])
    tokenized_ans = tokenize(row['answer'])
    merged_tokens = tokenized_ques + tokenized_ans

    for token in merged_tokens:

        if token not in vocab:
            vocab[token] = len(vocab)

In [None]:
# Apply the build_vocab function to each row of the dataframe
df.apply(build_vocab, axis = 1)

['what', 'is', 'the', 'capital', 'of', 'france', 'paris']
['what', 'is', 'the', 'capital', 'of', 'germany', 'berlin']
['who', 'wrote', 'to', 'kill', 'a', 'mockingbird', 'harper-lee']
['what', 'is', 'the', 'largest', 'planet', 'in', 'our', 'solar', 'system', 'jupiter']
['what', 'is', 'the', 'boiling', 'point', 'of', 'water', 'in', 'celsius', '100']
['who', 'painted', 'the', 'mona', 'lisa', 'leonardo-da-vinci']
['what', 'is', 'the', 'square', 'root', 'of', '64', '8']
['what', 'is', 'the', 'chemical', 'symbol', 'for', 'gold', 'au']
['which', 'year', 'did', 'world', 'war', 'ii', 'end', '1945']
['what', 'is', 'the', 'longest', 'river', 'in', 'the', 'world', 'nile']
['what', 'is', 'the', 'capital', 'of', 'japan', 'tokyo']
['who', 'developed', 'the', 'theory', 'of', 'relativity', 'albert-einstein']
['what', 'is', 'the', 'freezing', 'point', 'of', 'water', 'in', 'fahrenheit', '32']
['which', 'planet', 'is', 'known', 'as', 'the', 'red', 'planet', 'mars']
['who', 'is', 'the', 'author', 'of', '19

0     None
1     None
2     None
3     None
4     None
      ... 
85    None
86    None
87    None
88    None
89    None
Length: 90, dtype: object

In [28]:
len(vocab)

324

In [None]:
# Function to convert a tokenized text back into a list of numerical indices

def text_to_indices(text, vocab):

    indexed_text = []
    for token in tokenize(text):

        if token in vocab:
            indexed_text.append(vocab[token])

        else:
            indexed_text.append(vocab['<UNK>'])

    return indexed_text        


In [None]:
# Custom Dataset class for handling question-answer pairs
class QADataset(Dataset):
    def __init__(self, df, vocab):
        super().__init__()
        self.df = df
        self.vocab = vocab

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, index):

        numerical_ques = text_to_indices(self.df.iloc[index]['question'], self.vocab)
        numerical_ans = text_to_indices(self.df.iloc[index]['answer'], self.vocab)

        return torch.tensor(numerical_ques), torch.tensor(numerical_ans)
           

In [31]:
dataset = QADataset(df, vocab)

In [None]:
# Create a DataLoader
dataloader = DataLoader(dataset, batch_size = 1, shuffle = True)

In [None]:
# Simple RNN Model Definition
class SimpleRNN(nn.Module):
    def __init__(self,vocab_size):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim = 50)
        self.rnn = nn.RNN(50, 64, batch_first = True)
        self.fc = nn.Linear(64,vocab_size)

    def forward(self, question):
        embedded_ques = self.embedding(question)
        hidden, final = self.rnn(embedded_ques)
        output = self.fc(final).squeeze(0)

        return output


In [35]:
model = SimpleRNN(len(vocab))

In [36]:
learning_rate = 0.001
epochs = 20

In [None]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

In [None]:
# --- Training Loop ---
for epoch in range(epochs):
    total_loss = 0

    for question, answer in dataloader:

        optimizer.zero_grad()
        output = model(question)

        loss = loss_fn(output, answer.squeeze(1)) 
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

In [None]:
# --- Prediction Function ---
def predict(model, question, threshold = 0.5):
    #convert ques to numbers
    numerical_ques = text_to_indices(question, vocab)

    # tensor
    ques_tensor = torch.tensor(numerical_ques).unsqueeze(0)

    # send to model
    output = model(ques_tensor)

    #convert logits to probability
    prob = torch.nn.functional.softmax(output, dim = 1)

    #find index of max probability
    value, index = torch.max(prob, dim = 1)

    if value<threshold:
        print("I don't know")
    
    return list(vocab.keys())[index]

In [40]:
predict(model, "what is capital of germany")

'berlin'