In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset,DataLoader
import torch.optim as optim
import matplotlib.pyplot as plt

In [2]:
# Load the question/answer pairs; the relative path is resolved against the current working directory.
df = pd.read_csv('100_Unique_QA_Dataset.csv')
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [3]:
# Pull the question and answer columns out as separate Series.
# NOTE(review): X and Y are only previewed below; the later cells visible here read from df directly.
X = df['question']
Y = df["answer"]

In [4]:
# Preview the first few questions.
X.head()

0                     What is the capital of France?
1                    What is the capital of Germany?
2                 Who wrote 'To Kill a Mockingbird'?
3    What is the largest planet in our solar system?
4     What is the boiling point of water in Celsius?
Name: question, dtype: str

In [5]:
# Preview the first few answers.
Y.head()

0         Paris
1        Berlin
2    Harper-Lee
3       Jupiter
4           100
Name: answer, dtype: str

In [6]:
# tokenize
def tokenize(text):
    """Split ``text`` into lowercase, whitespace-separated tokens.

    Question marks and apostrophes are removed before splitting; every other
    character (hyphens, digits, commas, ...) is kept as-is.
    """
    cleaned = text.lower()
    for unwanted in ("?", "'"):
        cleaned = cleaned.replace(unwanted, "")
    return cleaned.split()

In [7]:
# Sanity check: the question mark is dropped and all tokens are lowercased.
tokenize("What is the capital of France?")

['what', 'is', 'the', 'capital', 'of', 'france']

In [8]:
# Token -> integer id mapping; id 0 is taken by the "<UNK>" placeholder entry.
vocab = {"<UNK>":0}
def build_vocab(row):
    """Add every unseen token from one row's question and answer to ``vocab``.

    Each new token receives the next free id (the current size of ``vocab``).
    Returns the row's question tokens followed by its answer tokens.
    """
    row_tokens = tokenize(row["question"]) + tokenize(row["answer"])
    for tok in row_tokens:
        # setdefault leaves tokens that already have an id untouched.
        vocab.setdefault(tok, len(vocab))
    return row_tokens

In [9]:
# Run build_vocab over every row to populate the module-level vocab as a side effect;
# the returned Series of per-row token lists is only displayed, not stored.
df.apply(build_vocab,axis =1)

0           [what, is, the, capital, of, france, paris]
1         [what, is, the, capital, of, germany, berlin]
2     [who, wrote, to, kill, a, mockingbird, harper-...
3     [what, is, the, largest, planet, in, our, sola...
4     [what, is, the, boiling, point, of, water, in,...
                            ...                        
85    [who, directed, the, movie, titanic, jamescame...
86    [which, superhero, is, also, known, as, the, d...
87       [what, is, the, capital, of, brazil, brasilia]
88    [which, fruit, is, known, as, the, king, of, f...
89    [which, country, is, known, for, the, eiffel, ...
Length: 90, dtype: object

In [10]:
# Inspect the token -> id mapping that was just populated.
vocab

{'<UNK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 'to': 12,
 'kill': 13,
 'a': 14,
 'mockingbird': 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 '1984': 67,
 'george-orwell': 68,
 'currency': 69,
 'unit

In [11]:
def text_to_indices(text):
    """Convert ``text`` into a list of vocabulary ids, one per token.

    Tokens that are missing from ``vocab`` are mapped to the id of ``"<UNK>"``.
    """
    return [
        vocab[tok] if tok in vocab else vocab["<UNK>"]
        for tok in tokenize(text)
    ]


In [12]:
# Sanity check: every token of a known question maps to its vocab id.
text_to_indices("What is the boiling point of water in Celsius")

[1, 2, 3, 24, 25, 5, 26, 19, 27]

In [13]:
class CustomDataset(Dataset):
    """Dataset yielding (question ids, answer id) tensor pairs from a DataFrame.

    The frame must provide "question" and "answer" columns.
    """

    def __init__(self,df):
        self.df = df

    def __len__(self):
        # One sample per DataFrame row.
        return len(self.df)

    def __getitem__(self,index):
        """Return the ``index``-th row as (question id tensor, answer id tensor)."""
        row = self.df.iloc[index]
        question_ids = text_to_indices(row["question"])
        # Only the first token of the answer is used as the target.
        answer_id = text_to_indices(row["answer"])[0]
        return torch.tensor(question_ids) , torch.tensor(answer_id)

# Wrap the full DataFrame; the cells visible here make no train/validation split.
train_dataset = CustomDataset(df)
# batch_size=1 yields one question per batch, so questions with different token counts need no padding.
train_loder = DataLoader(dataset=train_dataset,batch_size=1,shuffle=True)

In [14]:
# Spot-check one sample: (question token ids, first answer token id).
train_dataset[10]

(tensor([ 1,  2,  3,  4,  5, 53]), tensor(54))

In [15]:
# Iterate the loader once to eyeball the batches it yields (the loader was built with shuffle=True).
for question , answer in train_loder:
    print(question , answer)

tensor([[ 42, 167,   2,   3,  17, 168, 169]]) tensor([170])
tensor([[  1,   2,   3,   4,   5, 113]]) tensor([114])
tensor([[ 42, 117, 118,   3, 119,  94, 120]]) tensor([121])
tensor([[1, 2, 3, 4, 5, 6]]) tensor([7])
tensor([[ 10, 308,   3, 309, 310]]) tensor([311])
tensor([[42, 18,  2, 62, 63,  3, 64, 18]]) tensor([65])
tensor([[10,  2,  3, 66,  5, 67]]) tensor([68])
tensor([[ 1,  2,  3, 69,  5, 53]]) tensor([260])
tensor([[  1,   2,   3, 163, 164, 165,  83,  84]]) tensor([166])
tensor([[ 10,  29, 130, 131]]) tensor([132])
tensor([[ 10,  75,   3, 296,  19, 297]]) tensor([298])
tensor([[ 78,  79, 261, 151,  14, 262, 153]]) tensor([36])
tensor([[ 10,  96,   3, 104, 239]]) tensor([240])
tensor([[ 42, 125,   2,  62,  63,   3, 126, 127]]) tensor([128])
tensor([[  1,   2,   3,  37,  38,  39, 161]]) tensor([162])
tensor([[42, 43, 44, 45, 46, 47, 48]]) tensor([49])
tensor([[ 42, 299, 300, 118,  14, 301, 302, 158, 303, 304, 305, 306]]) tensor([307])
tensor([[ 42, 290, 291, 118, 292, 158, 293, 2

In [16]:
class MySimpleRNN(nn.Module):
    """Embedding -> single-layer RNN -> linear classifier over the vocabulary.

    Args:
        vocab_size: Number of distinct token ids; used both as the embedding
            table size and as the number of output classes.
        embedding_dim: Width of each token embedding (default 50).
        hidden_size: Width of the RNN hidden state (default 64).
    """

    def __init__(self,vocab_size,embedding_dim=50,hidden_size=64):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,embedding_dim=embedding_dim)
        self.rnn = nn.RNN(embedding_dim,hidden_size,batch_first=True)
        self.out = nn.Linear(hidden_size,vocab_size)

    def forward(self, X):
        """Return unnormalised class scores for a batch of token-id sequences.

        Args:
            X: Integer tensor of token ids shaped (batch, seq_len).

        Returns:
            Tensor of logits shaped (batch, vocab_size).
        """
        embedded_question = self.embedding(X)
        # nn.RNN returns (per-step outputs, final hidden state); only the final
        # state feeds the classifier.
        _, final_hidden = self.rnn(embedded_question)
        # The final state carries a leading num_layers axis (size 1 here);
        # take the last layer explicitly to drop it.
        return self.out(final_hidden[-1])

In [22]:
# Training hyperparameters: optimizer learning rate and number of passes over the data.
lr = 0.001
epochs = 100

In [23]:
# The vocabulary size sets both the embedding table size and the number of output classes.
model = MySimpleRNN(len(vocab))
# CrossEntropyLoss is fed the raw logits returned by the model plus the answer token id.
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr=lr)

In [24]:
for epoch_number in range(1, epochs + 1):
    # Running sum of the per-batch losses for this epoch (a total, not an average).
    epoch_loss = 0
    for batch_features , batch_labels in train_loder:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass and loss against the answer token ids.
        y_pred = model(batch_features)
        loss = loss_function(y_pred,batch_labels.long())

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
    print('Epoch: ', epoch_number," ---> ","loss :",epoch_loss)

Epoch:  1  --->  loss : 518.0644617080688
Epoch:  2  --->  loss : 447.4632263183594
Epoch:  3  --->  loss : 367.93790435791016
Epoch:  4  --->  loss : 308.1329391002655
Epoch:  5  --->  loss : 256.96036410331726
Epoch:  6  --->  loss : 209.11036431789398
Epoch:  7  --->  loss : 165.1399273276329
Epoch:  8  --->  loss : 128.4382963180542
Epoch:  9  --->  loss : 97.67245382070541
Epoch:  10  --->  loss : 74.2771055996418
Epoch:  11  --->  loss : 57.02625694870949
Epoch:  12  --->  loss : 44.40392832458019
Epoch:  13  --->  loss : 35.346337616443634
Epoch:  14  --->  loss : 28.588118866086006
Epoch:  15  --->  loss : 23.47801499068737
Epoch:  16  --->  loss : 19.758972741663456
Epoch:  17  --->  loss : 16.736809514462948
Epoch:  18  --->  loss : 14.169082626700401
Epoch:  19  --->  loss : 12.296500772237778
Epoch:  20  --->  loss : 10.691181667149067
Epoch:  21  --->  loss : 9.3989700935781
Epoch:  22  --->  loss : 8.250853691250086
Epoch:  23  --->  loss : 7.3054195791482925
Epoch:  24  

In [25]:
def predict(model, question, threshold=0.5):
    """Print the model's answer to ``question``, or "I don't know" when unsure.

    Args:
        model: Network mapping a (1, seq_len) tensor of token ids to
            (1, vocab_size) logits.
        question: Raw question text; converted to ids with ``text_to_indices``.
        threshold: Minimum softmax probability the best class must reach for
            its token to be printed.

    Returns:
        None. The result is printed.
    """
    # convert question to numbers
    numerical_question = text_to_indices(question)

    # An empty or punctuation-only question yields no tokens; torch.tensor([])
    # would be an empty float tensor that cannot be fed to the model.
    if not numerical_question:
        print("I don't know")
        return

    # tensor with a leading batch dimension of 1
    question_tensor = torch.tensor(numerical_question).unsqueeze(0)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        # send to model
        output = model(question_tensor)

        # convert logits to probs
        probs = torch.nn.functional.softmax(output, dim=1)

        # find index of max prob
        value, index = torch.max(probs, dim=1)

    if value.item() < threshold:
        print("I don't know")
    else:
        # Ids were handed out in insertion order, so position in the dict == id.
        print(list(vocab)[index.item()])

In [26]:
# Try a shortened, reworded question to see whether the model still recovers the answer.
predict(model, "largest planet our solar system?")


jupiter
