In [1]:
import pandas as pd

df = pd.read_csv('100_Unique_QA_Dataset.csv')

In [2]:
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [3]:
## tokenising the dataset (using split)
def tokenize(text):
    """Lower-case ``text``, strip ``?`` and ``'`` characters, and split into tokens.

    ``str.split()`` without arguments splits on any run of whitespace
    (spaces, tabs, newlines), so no empty tokens are produced.
    """
    cleaned = text.lower()
    for unwanted in ('?', "'"):
        # str.replace returns a new string; the result must be re-bound.
        cleaned = cleaned.replace(unwanted, '')
    return cleaned.split()

In [4]:
tokenize('Is Alcaraz better than Sinner?')

['is', 'alcaraz', 'better', 'than', 'sinner']

In [5]:
'?'.replace('?', '') ## not in-place!!

''

In [6]:
## forming the vocabulary
vocab = {'<UNK>' : 0}

In [7]:
def build_vocab(row):
    """Add every unseen token of one row's question and answer to the global ``vocab``.

    New tokens receive the next free index (the current size of ``vocab``).
    Tokens already present keep their index. Returns ``None``.
    """
    # Tokenize both fields first, then register tokens in question-then-answer order.
    row_tokens = [*tokenize(row['question']), *tokenize(row['answer'])]
    for tok in row_tokens:
        # setdefault inserts only when the key is missing; len(vocab) is
        # evaluated before the insertion, i.e. it is the next free index.
        vocab.setdefault(tok, len(vocab))



In [8]:
df.apply(build_vocab, axis = 1)

0     None
1     None
2     None
3     None
4     None
      ... 
85    None
86    None
87    None
88    None
89    None
Length: 90, dtype: object

In [9]:
len(vocab)

324

In [10]:
## words -> numerical indices
def text_to_indices(text, vocab):
    """Tokenize ``text`` and return the list of vocabulary indices for its tokens.

    Tokens that are not keys of ``vocab`` are mapped to ``vocab['<UNK>']``.
    """
    return [
        # The '<UNK>' lookup happens only for tokens missing from vocab.
        vocab[tok] if tok in vocab else vocab['<UNK>']
        for tok in tokenize(text)
    ]

In [11]:
text_to_indices('What is CampusX?', vocab)

[1, 2, 0]

In [12]:
import torch
from torch.utils.data import Dataset, DataLoader

In [55]:
class QADataset(Dataset):
    """Map-style dataset that yields (question indices, answer indices) tensors.

    Rows are read from ``df`` (columns ``'question'`` and ``'answer'``) and
    encoded on access with ``text_to_indices`` and the given ``vocab``.
    """

    def __init__(self, df, vocab):
        # Only references are stored; encoding happens lazily in __getitem__.
        self.vocab = vocab
        self.df = df

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the encoded question and answer of the row at position ``index``."""
        row = self.df.iloc[index]
        question_ids = text_to_indices(row['question'], self.vocab)
        answer_ids = text_to_indices(row['answer'], self.vocab)
        # squeeze(0) removes the leading axis only when it has size 1, so a
        # single-token answer becomes a 0-dim tensor.
        return torch.tensor(question_ids), torch.tensor(answer_ids).squeeze(0)


In [56]:
dataset = QADataset(df, vocab)

In [57]:
dataset[10]

(tensor([ 1,  2,  3,  4,  5, 53]), tensor(54))

In [58]:
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

In [None]:
for question, ans in dataloader:
    print(question, ans) ## notice that dim of each q is 2!!!

tensor([[ 10,  29, 130, 131]]) tensor([132])
tensor([[ 10,  75, 111]]) tensor([112])
tensor([[10,  2,  3, 66,  5, 67]]) tensor([68])
tensor([[  1,   2,   3,   4,   5, 286]]) tensor([287])
tensor([[  1,   2,   3,  33,  34,   5, 245]]) tensor([246])
tensor([[ 10,  11, 189, 158, 190]]) tensor([191])
tensor([[ 42, 216, 118, 217, 218,  19,  14, 219,  43]]) tensor([220])
tensor([[ 42, 200,   2,  14, 201, 202, 203, 204]]) tensor([205])
tensor([[  1,   2,   3,  37,  38,  39, 161]]) tensor([162])
tensor([[ 42, 107,   2, 108,  19, 109]]) tensor([110])
tensor([[ 78,  79, 288,  81,  19,  14, 289]]) tensor([85])
tensor([[78, 79, 80, 81, 82, 83, 84]]) tensor([85])
tensor([[10, 55,  3, 56,  5, 57]]) tensor([58])
tensor([[ 42, 101,   2,   3,  17]]) tensor([102])
tensor([[42, 86, 87, 88, 89, 39, 90]]) tensor([91])
tensor([[ 42, 174,   2,  62,  39, 175, 176,  12, 177, 178]]) tensor([179])
tensor([[  1,   2,   3, 234,   5, 235]]) tensor([131])
tensor([[ 78,  79, 195,  81,  19,   3, 196, 197, 198]]) tenso

In [60]:
import torch.nn as nn

In [138]:
class SimpleRNN(nn.Module):
    """Embedding -> single-layer RNN -> linear layer producing vocabulary logits.

    Args:
        vocab_size: number of embedding rows and number of output logits.
        embedding_dim: size of each token embedding (default 50).
        hidden_size: size of the RNN hidden state (default 64).
    """

    def __init__(self, vocab_size, embedding_dim=50, hidden_size=64):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, q):
        """Return logits computed from the RNN's final hidden state.

        ``q`` is a tensor of token indices; the notebook passes an unbatched
        1-D sequence, for which the result has shape ``(1, vocab_size)``.
        """
        embed_q = self.emb(q)
        # nn.RNN returns a tuple: (outputs for every time step, final hidden state).
        # Only the final hidden state feeds the classifier.
        _step_outputs, final_state = self.rnn(embed_q)
        return self.fc(final_state)

In [139]:
dataset[0][0]

tensor([1, 2, 3, 4, 5, 6])

In [140]:
x = nn.Embedding(324, embedding_dim=50)

In [141]:
x(dataset[0][1]).shape ## Every word converted to a 50 dim vector!

torch.Size([50])

In [143]:
a = x(dataset[0][0])

In [149]:
a.shape

torch.Size([6, 50])

In [150]:
a.shape

torch.Size([6, 50])

In [151]:
y = nn.RNN(50, 64)

In [152]:
y(a) ## A tuple! That's why we can't use Sequential!!

(tensor([[ 7.0196e-01, -6.9564e-01, -3.6728e-01,  6.5588e-01, -3.5761e-01,
           2.5379e-01, -2.9545e-01,  8.4917e-02, -3.8642e-02,  5.1495e-01,
           2.6000e-01,  4.6536e-01, -6.0811e-01, -3.7547e-01,  2.6443e-01,
           3.7483e-02,  1.8018e-01, -1.0811e-01,  1.6788e-01,  2.5754e-01,
           2.2548e-01, -3.5617e-01, -3.9842e-01, -3.7144e-01, -2.4607e-01,
           2.7526e-01,  5.0866e-01,  5.6276e-01,  7.6535e-02,  9.1755e-02,
           2.2196e-01,  5.1086e-01, -6.6129e-01,  7.8560e-03,  5.8002e-01,
           2.7302e-01, -3.7445e-01, -6.3570e-02,  1.7417e-01,  2.4752e-01,
          -1.3555e-01, -5.2452e-02,  5.5346e-01, -6.6101e-01,  4.2013e-01,
           4.2597e-01,  2.1429e-01,  6.3348e-01, -4.7691e-01,  7.6595e-02,
           1.8219e-01,  1.4340e-01, -4.8715e-02,  2.7063e-01,  1.4113e-01,
           1.9320e-01,  4.5166e-01,  9.6882e-03,  5.6700e-01,  4.0551e-01,
          -3.2973e-01,  2.2357e-01, -2.9693e-01,  3.3199e-01],
         [-7.4074e-03,  2.1195e-01,  

In [153]:
y(a)[0].shape ## All hidden outputs(hidden states) stacked together! o1-o6

torch.Size([6, 64])

In [154]:
y(a)[1].shape ## Final hidden state (second element of the tuple nn.RNN returns), not a separate "output"

torch.Size([1, 64])

In [155]:
t = y(a)

In [156]:
z = nn.Linear(64, 324)

In [157]:
z(t[1]).shape

torch.Size([1, 324])

In [158]:
learning_rate = 0.001
epochs = 20

In [159]:
nn.functional.one_hot(torch.tensor([1]), 5)

tensor([[0, 1, 0, 0, 0]])

In [168]:
model = SimpleRNN(len(vocab))

In [169]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

In [170]:
for epoch in range(epochs):

    # Sum of per-sample losses over the epoch (the dataloader uses batch_size=1).
    total_loss = 0.0

    for q, a in dataloader:
        optimizer.zero_grad()
        # Drop the batch axis: the model is fed one unbatched 1-D sequence and
        # returns logits of shape (1, vocab_size).
        out = model(q.squeeze(0))
        # CrossEntropyLoss accepts integer class indices as targets, so the
        # batched answer index `a` (shape (1,)) is used directly instead of
        # building a one-hot float64 vector against float32 logits.
        loss = loss_fn(out, a)

        loss.backward()

        optimizer.step()

        total_loss += loss.item()
    # `.4f` = four decimal places (the former `4f` meant "minimum width 4").
    print(f'Epoch: {epoch + 1}, Loss = {total_loss:.4f}')

Epoch: 1, Loss = 524.390729
Epoch: 2, Loss = 460.442861
Epoch: 3, Loss = 383.409234
Epoch: 4, Loss = 318.014286
Epoch: 5, Loss = 264.671083
Epoch: 6, Loss = 216.002807
Epoch: 7, Loss = 170.529908
Epoch: 8, Loss = 132.718000
Epoch: 9, Loss = 101.663375
Epoch: 10, Loss = 78.004884
Epoch: 11, Loss = 59.742183
Epoch: 12, Loss = 46.410664
Epoch: 13, Loss = 36.689010
Epoch: 14, Loss = 29.633395
Epoch: 15, Loss = 24.232887
Epoch: 16, Loss = 20.165634
Epoch: 17, Loss = 16.951263
Epoch: 18, Loss = 14.384564
Epoch: 19, Loss = 12.323876
Epoch: 20, Loss = 10.709019


In [194]:
def predict(model, question, threshold = 0.5):
    """Return the vocabulary token the model considers most likely for ``question``.

    Args:
        model: module mapping a 1-D tensor of token indices to logits of
            shape ``(1, vocab_size)``.
        question: raw question text; encoded with ``text_to_indices`` and the
            notebook-level ``vocab``.
        threshold: minimum softmax probability required to answer.

    Returns:
        The predicted token (a key of ``vocab``), or ``None`` after printing
        ``'Idk'`` when the question has no tokens or the top probability is
        below ``threshold``.
    """
    numerical_q = text_to_indices(question, vocab)
    if not numerical_q:
        # An empty list would become an empty float tensor, which the
        # embedding layer cannot index with; treat it as "don't know".
        print('Idk')
        return None
    q_tensor = torch.tensor(numerical_q, dtype=torch.long)
    with torch.no_grad():
        output = model(q_tensor)
    probs = nn.functional.softmax(output, dim = 1)
    val, ind = torch.max(probs, dim = 1)
    # .item() turns the one-element tensors into plain Python numbers instead
    # of relying on implicit tensor-to-bool / tensor-to-index conversion.
    if val.item() < threshold:
        print('Idk')
        return None
    # vocab indices follow insertion order, so position == index.
    return list(vocab.keys())[ind.item()]

In [195]:
torch.argmax(model(dataset[3][0]).flatten())

tensor(23)

In [200]:
ans = predict(model , 'Capital of Germany?')

In [201]:
ans

'berlin'

In [176]:
df.iloc[2]

question    Who wrote 'To Kill a Mockingbird'?
answer                              Harper-Lee
Name: 2, dtype: object

In [177]:
list(vocab.keys())[ans] ## NOTE(review): needs `ans` to be an integer index, but the cell above that assigns `ans` stores predict()'s return value (a token string or None) — looks like this relied on earlier kernel state; confirm before re-running

'harper-lee'

In [None]:
# predict() requires a model and a question; calling it with no arguments raises TypeError.
predict(model, 'What is the capital of France?')