In [4]:
import pandas as pd
import numpy as np

In [5]:
df = pd.read_csv("100_Unique_QA_Dataset.csv")

In [6]:
df

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100
...,...,...
85,Who directed the movie 'Titanic'?,JamesCameron
86,Which superhero is also known as the Dark Knight?,Batman
87,What is the capital of Brazil?,Brasilia
88,Which fruit is known as the king of fruits?,Mango


## Tokennize

In [7]:
def tokenize(text):
    text = text.lower()
    text = text.replace('?','')
    text = text.replace("'",'')
    return text.split()

In [8]:
tokenize("What is the capital of France?")

['what', 'is', 'the', 'capital', 'of', 'france']

## Vocab

In [9]:
vocab = {'<UNK>':0}

In [14]:
def bulid_vocab(row):
    tokenize_question = tokenize(row['question'])
    tokenzie_answer = tokenize(row['answer'])
    merged_tokens = tokenize_question + tokenzie_answer
    print(merged_tokens)

    for token in merged_tokens:
        if token not in vocab:
            vocab[token] = len(vocab)
    # print(tokenize_question,tokenzie_answer)

In [15]:
df.apply(bulid_vocab,axis=1)

['what', 'is', 'the', 'capital', 'of', 'france', 'paris']
['what', 'is', 'the', 'capital', 'of', 'germany', 'berlin']
['who', 'wrote', 'to', 'kill', 'a', 'mockingbird', 'harper-lee']
['what', 'is', 'the', 'largest', 'planet', 'in', 'our', 'solar', 'system', 'jupiter']
['what', 'is', 'the', 'boiling', 'point', 'of', 'water', 'in', 'celsius', '100']
['who', 'painted', 'the', 'mona', 'lisa', 'leonardo-da-vinci']
['what', 'is', 'the', 'square', 'root', 'of', '64', '8']
['what', 'is', 'the', 'chemical', 'symbol', 'for', 'gold', 'au']
['which', 'year', 'did', 'world', 'war', 'ii', 'end', '1945']
['what', 'is', 'the', 'longest', 'river', 'in', 'the', 'world', 'nile']
['what', 'is', 'the', 'capital', 'of', 'japan', 'tokyo']
['who', 'developed', 'the', 'theory', 'of', 'relativity', 'albert-einstein']
['what', 'is', 'the', 'freezing', 'point', 'of', 'water', 'in', 'fahrenheit', '32']
['which', 'planet', 'is', 'known', 'as', 'the', 'red', 'planet', 'mars']
['who', 'is', 'the', 'author', 'of', '19

0     None
1     None
2     None
3     None
4     None
      ... 
85    None
86    None
87    None
88    None
89    None
Length: 90, dtype: object

In [18]:
print(vocab)

{'<UNK>': 0, 'what': 1, 'is': 2, 'the': 3, 'capital': 4, 'of': 5, 'france': 6, 'paris': 7, 'germany': 8, 'berlin': 9, 'who': 10, 'wrote': 11, 'to': 12, 'kill': 13, 'a': 14, 'mockingbird': 15, 'harper-lee': 16, 'largest': 17, 'planet': 18, 'in': 19, 'our': 20, 'solar': 21, 'system': 22, 'jupiter': 23, 'boiling': 24, 'point': 25, 'water': 26, 'celsius': 27, '100': 28, 'painted': 29, 'mona': 30, 'lisa': 31, 'leonardo-da-vinci': 32, 'square': 33, 'root': 34, '64': 35, '8': 36, 'chemical': 37, 'symbol': 38, 'for': 39, 'gold': 40, 'au': 41, 'which': 42, 'year': 43, 'did': 44, 'world': 45, 'war': 46, 'ii': 47, 'end': 48, '1945': 49, 'longest': 50, 'river': 51, 'nile': 52, 'japan': 53, 'tokyo': 54, 'developed': 55, 'theory': 56, 'relativity': 57, 'albert-einstein': 58, 'freezing': 59, 'fahrenheit': 60, '32': 61, 'known': 62, 'as': 63, 'red': 64, 'mars': 65, 'author': 66, '1984': 67, 'george-orwell': 68, 'currency': 69, 'united': 70, 'kingdom': 71, 'pound': 72, 'india': 73, 'delhi': 74, 'discov

## Convert the words to numerical index

In [24]:
# convert the words to numerical indices

def text_to_indices(text,vocab):
    indexed_text = []
    for token in tokenize(text):
        if token in vocab:
            indexed_text.append(vocab[token])
        else:
            indexed_text.append(vocab['<UNK>'])
    return indexed_text

In [23]:
text_to_indices("What is this",vocab)

[1, 2, 0]

In [25]:
import torch
from torch.utils.data import DataLoader,Dataset

In [27]:
class QADataset(Dataset):
    def __init__(self,df,vocab):
        self.df = df
        self.vocab = vocab

    def __len__(self):
        return df.shape[0]
    
    def __getitem__(self, index):
        numerical_question = text_to_indices(self.df.iloc[index]['question'],self.vocab)
        numerical_answer = text_to_indices(self.df.iloc[index]['answer'],self.vocab)

        return torch.tensor(numerical_question),torch.tensor(numerical_answer)

In [None]:
dataset = QADataset(df,vocab)

In [31]:
dataset[0]

(tensor([1, 2, 3, 4, 5, 6]), tensor([7]))

In [32]:
DataLoader = DataLoader(dataset,batch_size=1,shuffle=True)


In [33]:
for question,answer in DataLoader:
    print(question,answer)

tensor([[ 42, 255,   2, 256,  83, 257, 258]]) tensor([[259]])
tensor([[ 1,  2,  3, 33, 34,  5, 35]]) tensor([[36]])
tensor([[  1,   2,   3, 212,   5,  14, 213, 214]]) tensor([[215]])
tensor([[  1,   2,   3, 146,  86,  19, 192, 193]]) tensor([[194]])
tensor([[ 42,  18,   2,   3, 281,  12,   3, 282]]) tensor([[205]])
tensor([[ 42, 250, 251, 118, 252, 253]]) tensor([[254]])
tensor([[ 42,   2,   3, 210, 137, 168, 211, 169]]) tensor([[113]])
tensor([[  1,   2,   3, 103,   5, 104,  19, 105]]) tensor([[106]])
tensor([[10, 96,  3, 97]]) tensor([[98]])
tensor([[ 42,  86,  87, 241, 242,  19,  39, 243]]) tensor([[244]])
tensor([[ 1,  2,  3,  4,  5, 73]]) tensor([[74]])
tensor([[ 10, 140,   3, 141, 270,  93, 271,   5,   3, 272]]) tensor([[273]])
tensor([[ 10,  75,   3, 296,  19, 297]]) tensor([[298]])
tensor([[ 42, 137,   2,  62,  39,   3, 322, 323]]) tensor([[6]])
tensor([[ 1,  2,  3, 24, 25,  5, 26, 19, 27]]) tensor([[28]])
tensor([[ 78,  79, 150, 151,  14, 152, 153]]) tensor([[154]])
tensor([[ 

In [34]:
import torch.nn as nn

In [84]:
class SimpleRNN(nn.Module):
    def __init__(self,vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,embedding_dim=50)
        self.rnn = nn.RNN(50,64,batch_first=True)
        self.fc = nn.Linear(64,vocab_size)

    def forward(self,question):
        embadded_question =  self.embedding(question)
        hideen_state ,final_output = self.rnn(embadded_question)
        output = self.fc(final_output.squeeze(0))
        return output

In [85]:
learing_rate = 0.001
epochs = 20

In [86]:
model = SimpleRNN(len(vocab))

In [87]:
cirtertion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=learing_rate)

In [88]:
# training loop
for epoch in range(epochs):
    total_loss = 0
    for question , answer in DataLoader:
        optimizer.zero_grad()

        # Forward pass
        output = model(question)
        # print(output.shape)
        # loss  
        loss = cirtertion(output,answer[0])

        # gradients 
        loss.backward()

        # update
        optimizer.step()

        total_loss = total_loss + loss.item()

    print(f"Epoch :{epoch+1} , Loss : {total_loss}")

Epoch :1 , Loss : 521.7672834396362
Epoch :2 , Loss : 455.27131748199463
Epoch :3 , Loss : 379.1168894767761
Epoch :4 , Loss : 316.01873087882996
Epoch :5 , Loss : 263.05425548553467
Epoch :6 , Loss : 214.00405550003052
Epoch :7 , Loss : 169.62807834148407
Epoch :8 , Loss : 131.17900997400284
Epoch :9 , Loss : 99.82066175341606
Epoch :10 , Loss : 75.81815725564957
Epoch :11 , Loss : 57.6555519849062
Epoch :12 , Loss : 44.60003152489662
Epoch :13 , Loss : 34.8943462818861
Epoch :14 , Loss : 28.190353855490685
Epoch :15 , Loss : 23.010728985071182
Epoch :16 , Loss : 19.223744496703148
Epoch :17 , Loss : 16.15590975433588
Epoch :18 , Loss : 13.84206361323595
Epoch :19 , Loss : 11.855203241109848
Epoch :20 , Loss : 10.266066789627075


In [114]:
def predict(model,question,threshhold=0.5):

    # convert question to numeric
    numerical_question = text_to_indices(question,vocab)
    # print(numerical_question)
    # tensor 
    question_tensor = torch.tensor(numerical_question).unsqueeze(0)
    # print(question_tensor)

    # send to model
    output = model(question_tensor)

    # convet logits to probs
    probs = torch.nn.functional.softmax(output,dim=1)

    # find the max probs 
    value , index = torch.max(probs,dim=1)
    print(value)
    if value<threshhold:
        print("I don't know")
    
    print(list(vocab.keys())[index])

In [115]:
predict(model , "what is the captital of france")

tensor([0.5477], grad_fn=<MaxBackward0>)
paris


## Model Working

In [35]:
dataset[0][0]

tensor([1, 2, 3, 4, 5, 6])

In [41]:
x = nn.Embedding(324,embedding_dim=50)
x

Embedding(324, 50)

In [43]:
a = x(dataset[0][0])
a

tensor([[ 8.5539e-01,  8.0644e-01, -1.3124e+00, -1.9226e-01,  4.1745e-01,
          4.4776e-01, -1.2109e-01, -1.9486e-01,  6.4431e-01,  1.5087e+00,
          2.7660e-01, -5.9878e-01,  1.1756e+00, -6.4748e-01, -2.0304e+00,
         -1.2889e-01, -1.2045e+00,  1.1808e+00,  2.3235e-01, -7.2151e-01,
          2.0419e+00, -2.8482e+00,  1.3036e+00, -1.3826e+00, -1.1541e+00,
          4.2362e-02, -1.0344e-02, -1.3030e+00,  2.5148e+00, -5.7832e-01,
         -5.1541e-01, -3.3687e-01,  7.4102e-01, -1.0479e+00, -1.1568e-01,
         -3.5308e-01,  8.7438e-01, -2.0649e+00, -1.2401e+00, -1.7648e+00,
          3.4438e-01,  4.0180e-01,  4.5518e-01,  1.6867e-04,  9.5165e-01,
          6.8536e-01,  8.6587e-01, -8.5315e-01,  1.6526e+00, -2.9678e-01],
        [ 1.1449e-01,  1.1872e+00, -6.2184e-01, -2.3275e+00, -1.2257e-01,
         -2.4952e+00,  1.0248e+00,  1.4591e+00,  1.0074e+00,  2.6644e-01,
          1.9978e-01,  1.1186e+00, -8.1435e-01, -1.0156e+00, -3.1636e-01,
          2.4951e-01,  6.6935e-02, -1

In [63]:
b = nn.RNN(50,64)

In [64]:
b(a)

(tensor([[ 0.5713, -0.1468, -0.0578,  0.1772, -0.4891,  0.3963,  0.1013, -0.3394,
          -0.3546,  0.6813, -0.5481,  0.3044, -0.3667,  0.0095,  0.4669, -0.1087,
           0.1114, -0.5379, -0.0802, -0.4273,  0.4638,  0.4010,  0.3501,  0.0519,
          -0.5569,  0.3172, -0.3344, -0.3617, -0.0439,  0.1049,  0.1354,  0.3751,
          -0.3015,  0.8012, -0.5431, -0.1486,  0.1363, -0.5006,  0.5863,  0.7942,
          -0.0355,  0.4526,  0.1344, -0.2949,  0.5729, -0.3690,  0.0108, -0.0689,
           0.1197,  0.9092,  0.0289,  0.4153,  0.2494, -0.1558, -0.6656,  0.3849,
          -0.2223, -0.3677, -0.6679, -0.8771, -0.5918, -0.2253,  0.5065,  0.4253],
         [-0.5796,  0.5715,  0.4406,  0.7288,  0.0516,  0.3404,  0.5540,  0.1703,
          -0.3477, -0.5640,  0.5952, -0.6653, -0.4771,  0.6180, -0.0175,  0.8407,
           0.2950, -0.6804, -0.2169,  0.4037,  0.7082,  0.7359, -0.0832, -0.0047,
           0.6454,  0.6861, -0.5036,  0.6848, -0.4112, -0.5860,  0.6941,  0.3136,
          -0.20

In [65]:
b(a)[0].shape ## Output Layers  ## Hiddent state layer

torch.Size([6, 64])

In [66]:
b = b(a)[1].shape # Final Output  # final State Layer

In [67]:
b

torch.Size([1, 64])

In [68]:
z = nn.Linear(64,324)

In [69]:
z(b)

TypeError: linear(): argument 'input' (position 1) must be Tensor, not torch.Size