In [None]:
# Colab-only: mount Google Drive at /content/drive so files stored there
# can be opened with ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Data pre-processing

In [None]:
from collections import Counter, defaultdict
import pandas as pd

%matplotlib inline

import numpy as np
import matplotlib
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import torchtext
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence

import json
import spacy



In [None]:
# Nesting path inside the JSON file: data -> paragraphs -> qas -> answers.
items = ['data','paragraphs','qas','answers']

# Read through a context manager so the file handle is closed as soon as
# parsing finishes (a bare `open(...).read()` leaves it to the garbage
# collector).  JSON is UTF-8 by specification, so the encoding is explicit.
with open('/content/drive/MyDrive/train-v2.0.json', encoding='utf-8') as fh:
    file = json.load(fh)

# Flatten the same document at three nesting depths.
all_data = pd.json_normalize(file, items)       # one row per record under .../answers
x = pd.json_normalize(file, items[:-1])         # one row per record under .../qas
y = pd.json_normalize(file, items[:-2])         # one row per record under .../paragraphs

# Raw spaCy pipeline plus the torchtext tokenizer wrapper around the same model.
spacy_tokenizer = spacy.load("en_core_web_sm")
tokenizer = get_tokenizer("spacy", language='en_core_web_sm')

In [None]:
# Repeat each paragraph's context once per question it contains, and each
# question id once per answer it has, so the flattened tables line up row by row.
idx = np.repeat(y['context'].values, y.qas.str.len())
ndx = np.repeat(x['id'].values, x['answers'].str.len())
x['context'] = idx
all_data['q_idx'] = ndx

# Join questions and answers side by side on the question id.
# `axis` is passed by keyword: every argument of `pd.concat` other than the
# object list is keyword-only in current pandas, and the positional form
# emits a FutureWarning on older releases.
required_data_format = pd.concat(
    [
        x[['id','question','context']].set_index('id'),
        all_data.set_index('q_idx'),
    ],
    axis=1,
    sort=False,
).reset_index()

# Integer id per distinct context string (same paragraph -> same c_id).
required_data_format['c_id'] = required_data_format['context'].factorize()[0]

  required_data_format = pd.concat([ x[['id','question','context']].set_index('id'),all_data.set_index('q_idx')],1,sort=False).reset_index()


In [None]:
# Preview the first rows of the joined question/context/answer table.
required_data_format.head()

Unnamed: 0,index,question,context,text,answer_start,c_id
0,56be85543aeaaa14008c9063,When did Beyonce start becoming popular?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,in the late 1990s,269.0,0
1,56be85543aeaaa14008c9065,What areas did Beyonce compete in when she was...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,singing and dancing,207.0,0
2,56be85543aeaaa14008c9066,When did Beyonce leave Destiny's Child and bec...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,2003,526.0,0
3,56bf6b0f3aeaaa14008c9601,In what city and state did Beyonce grow up?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,"Houston, Texas",166.0,0
4,56bf6b0f3aeaaa14008c9602,In which decade did Beyonce become famous?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,late 1990s,276.0,0


In [None]:
# Count missing values per column.
required_data_format.isnull().sum()

index               0
question            0
context             0
text            43498
answer_start    43498
c_id                0
dtype: int64

In [None]:
# Keep only rows that have no missing value in any column.
required_data_format=required_data_format.dropna()

In [None]:
# Number of rows in the table.
len(required_data_format)

86821

## LIST OF QUESTIONS

In [None]:
# Plain Python list with one question string per table row.
questions = required_data_format['question'].tolist()


In [None]:
# Whitespace-tokenise every question.  A comprehension is used so that no
# loop variable leaks to module level: the previous loop rebound `x`, which
# the data-loading cells use for the per-question DataFrame, so those cells
# could not be re-run after this one.
questions_word_list = [question.split() for question in questions]
print(questions_word_list[1:10])

[['What', 'areas', 'did', 'Beyonce', 'compete', 'in', 'when', 'she', 'was', 'growing', 'up?'], ['When', 'did', 'Beyonce', 'leave', "Destiny's", 'Child', 'and', 'become', 'a', 'solo', 'singer?'], ['In', 'what', 'city', 'and', 'state', 'did', 'Beyonce', 'grow', 'up?'], ['In', 'which', 'decade', 'did', 'Beyonce', 'become', 'famous?'], ['In', 'what', 'R&B', 'group', 'was', 'she', 'the', 'lead', 'singer?'], ['What', 'album', 'made', 'her', 'a', 'worldwide', 'known', 'artist?'], ['Who', 'managed', 'the', "Destiny's", 'Child', 'group?'], ['When', 'did', 'Beyoncé', 'rise', 'to', 'fame?'], ['What', 'role', 'did', 'Beyoncé', 'have', 'in', "Destiny's", 'Child?']]


In [None]:
# Flatten the per-question token lists into a single list of tokens.
flat_list = [token for tokens in questions_word_list for token in tokens]

## VOCABULARY OF QUESTIONS

In [None]:
# Token -> occurrence count over all question tokens.
vocabulary = dict(Counter(flat_list))

# Reserved padding / unknown entries go first so they occupy the first two
# positions of the merged dictionary; corpus tokens follow in first-seen order.
pad_unk = {"<pad>": 0, "<unk>": 1}
vocab = dict(pad_unk)
vocab.update(vocabulary)
print(vocab)



In [None]:
# Map every key of `vocab` to its position in the dictionary (0, 1, 2, ...).
# Looking up a key that is not present yields 1 through the default factory,
# which is the position held by "<unk>".
word2idx = defaultdict(lambda: 1, vocab)
for unq_idx, word in enumerate(vocab):
    word2idx[word] = unq_idx

print(word2idx)



## LIST OF PARAGRAPHS/CONTEXTS

In [None]:
# One context string per table row (the same paragraph text appears once for
# every question asked about it).
context=list(required_data_format['context'])
# Show only a short preview: echoing the whole list writes every paragraph
# into the cell output.
context[:2]

['Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead s

In [None]:
# Whitespace-tokenise every context paragraph.  A comprehension is used so
# that no loop variable leaks to module level: the previous loop rebound `x`,
# which the data-loading cells use for the per-question DataFrame.
context_word_list = [paragraph.split() for paragraph in context]
print(context_word_list[1:10])

[['Beyoncé', 'Giselle', 'Knowles-Carter', '(/biːˈjɒnseɪ/', 'bee-YON-say)', '(born', 'September', '4,', '1981)', 'is', 'an', 'American', 'singer,', 'songwriter,', 'record', 'producer', 'and', 'actress.', 'Born', 'and', 'raised', 'in', 'Houston,', 'Texas,', 'she', 'performed', 'in', 'various', 'singing', 'and', 'dancing', 'competitions', 'as', 'a', 'child,', 'and', 'rose', 'to', 'fame', 'in', 'the', 'late', '1990s', 'as', 'lead', 'singer', 'of', 'R&B', 'girl-group', "Destiny's", 'Child.', 'Managed', 'by', 'her', 'father,', 'Mathew', 'Knowles,', 'the', 'group', 'became', 'one', 'of', 'the', "world's", 'best-selling', 'girl', 'groups', 'of', 'all', 'time.', 'Their', 'hiatus', 'saw', 'the', 'release', 'of', "Beyoncé's", 'debut', 'album,', 'Dangerously', 'in', 'Love', '(2003),', 'which', 'established', 'her', 'as', 'a', 'solo', 'artist', 'worldwide,', 'earned', 'five', 'Grammy', 'Awards', 'and', 'featured', 'the', 'Billboard', 'Hot', '100', 'number-one', 'singles', '"Crazy', 'in', 'Love"', '

In [None]:
# Flatten the per-paragraph token lists into a single list of tokens.
flat_list_context = [token for tokens in context_word_list for token in tokens]

## VOCABULARY OF CONTEXT

In [None]:
# Token -> occurrence count over all context tokens.
vocabulary_context=dict(Counter(flat_list_context))
pad_unk={"<pad>":0,"<unk>":1}
# Merge the *context* counts.  This previously merged `vocabulary` (the
# question counts), so the context vocabulary was a copy of the question
# vocabulary and `vocabulary_context` was never used.
vocab_context={**pad_unk,**vocabulary_context}
print(vocab_context)



In [None]:
# Map every key of `vocab_context` to its position in the dictionary.
# Looking up a key that is not present yields 1 through the default factory,
# which is the position held by "<unk>".
word2idx_context = defaultdict(lambda: 1, vocab_context)
for unq_idx, word in enumerate(vocab_context):
    word2idx_context[word] = unq_idx

print(word2idx_context)



In [None]:
def padding(batch):
    """Collate ``(question, paragraph, start)`` samples into batch tensors.

    Args:
        batch: sequence of ``(question, paragraph, start)`` triples.
            ``question`` and ``paragraph`` are 1-D tensors of token ids and
            ``start`` is a scalar tensor holding the answer start.

    Returns:
        Tuple ``(seq_length_question, seq_length_paragraph, padded_questions,
        padded_paragraph, starts)``:

        * the two length tensors hold the unpadded length of every sample;
        * the padded tensors come from ``pad_sequence`` with its defaults,
          i.e. zero padding and a ``(max_len, batch)`` layout;
        * ``starts`` is a 1-D ``torch.long`` tensor, the dtype
          ``nn.CrossEntropyLoss`` requires for class-index targets.
    """
    question, paragraph, start = zip(*batch)

    seq_length_question = torch.tensor([len(q) for q in question])
    seq_length_paragraph = torch.tensor([len(p) for p in paragraph])

    padded_questions = pad_sequence(question)
    padded_paragraph = pad_sequence(paragraph)

    # `int(...)` truncates a float-valued start (e.g. 269.0) to an integer.
    starts = torch.tensor([int(s) for s in start], dtype=torch.long)
    return seq_length_question, seq_length_paragraph, padded_questions, padded_paragraph, starts

In [None]:
class get_Converted_Sentence_And_Labels(Dataset):
    """Dataset yielding ``(question_ids, context_ids, answer_start)`` per row.

    Args:
        required_data_format: DataFrame with ``question``, ``context`` and
            ``answer_start`` columns; rows are addressed positionally.
        tokenizer: callable turning a string into a sequence of tokens.
        word2idx: mapping from question token to integer id.
        word2idx_context: mapping from context token to integer id.

    The mappings are stored on the instance and used by ``__getitem__``, so
    the dataset no longer depends on module-level globals with those names.
    NOTE(review): a token missing from a mapping behaves however that mapping
    behaves (a ``defaultdict`` returns its default, a plain ``dict`` raises
    ``KeyError``), so the mappings should be built with the same tokenisation
    as ``tokenizer``.
    """

    def __init__(self, required_data_format, tokenizer, word2idx, word2idx_context):
        self.tokenizer = tokenizer
        self.data = required_data_format
        self.word2idx = word2idx
        self.word2idx_context = word2idx_context

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(question_ids, context_ids, answer_start)`` tensors for row ``idx``."""
        # Fetch the row once instead of re-indexing the frame for every field.
        row = self.data.iloc[idx]
        tokens_questions = self.tokenizer(row["question"])
        tokens_context = self.tokenizer(row["context"])
        question_data = torch.tensor([self.word2idx[tok] for tok in tokens_questions])
        context_data = torch.tensor([self.word2idx_context[tok] for tok in tokens_context])
        # Read the label from the instance's own frame, not a global one.
        start_idx = torch.tensor(row["answer_start"])
        return question_data, context_data, start_idx

In [None]:
# Wrap the table, the tokenizer and both token->id mappings in a Dataset.
train_data = get_Converted_Sentence_And_Labels(required_data_format, tokenizer, word2idx,word2idx_context)

In [None]:
# Batches of two samples in shuffled order; `padding` collates each batch.
# NOTE(review): no random seed is set in the visible cells, so the batch
# order differs between runs.
train_loader = DataLoader(train_data, batch_size=2, shuffle=True, collate_fn=padding)

In [None]:
# Cross-entropy criterion. NOTE(review): it is not called in any cell
# visible here.
loss_fn = nn.CrossEntropyLoss()

# ENCODER CLASS

In [None]:
class Encoder(nn.Module):
    """Encode a question/paragraph batch into two LSTM hidden-state tensors.

    Questions and paragraphs have separate embedding tables but are run
    through the same bidirectional LSTM (``self.rnn``).  The paragraph's
    final hidden states are scaled by their per-example dot product with the
    question's final hidden states and then fed to a second, unidirectional
    LSTM (``self.rnn2``).

    Args:
        input_size_question: number of rows in the question embedding table.
        input_size_paragraph: number of rows in the paragraph embedding table.
        embedding_size: embedding dimension for both tables.
        hidden_size: hidden size of both LSTMs.
        num_layers: number of layers of the bidirectional LSTM.
    """

    def __init__(self,input_size_question,input_size_paragraph,embedding_size,hidden_size,num_layers):
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding_q = nn.Embedding(input_size_question, embedding_size)
        self.embedding_p = nn.Embedding(input_size_paragraph, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, bidirectional=True)
        self.rnn2 = nn.LSTM(hidden_size, hidden_size)

    def forward(self,question,paragraph):
        """Run the encoder.

        Args:
            question: integer token ids, ``(question_len, batch)`` (the LSTMs
                are constructed without ``batch_first``).
            paragraph: integer token ids, ``(paragraph_len, batch)``.

        Returns:
            ``(hidden_c, hidden_p)`` where ``hidden_p`` is the shared LSTM's
            final hidden state for the paragraph,
            ``(2 * num_layers, batch, hidden_size)``, and ``hidden_c`` is the
            second LSTM's final hidden state, ``(1, batch, hidden_size)``.

        NOTE(review): sequences are not packed, so any padding positions are
        processed by the LSTM like ordinary tokens.
        """
        embedding_question = self.embedding_q(question)
        embedding_paragraph = self.embedding_p(paragraph)

        # Only the final hidden states are used; outputs and cell states are dropped.
        _, (hidden_q, _) = self.rnn(embedding_question)
        _, (hidden_p, _) = self.rnn(embedding_paragraph)

        # Dot product over the feature axis, one score per (direction/layer,
        # example).  Reducing over dim=1 would sum across the batch and make
        # each example's encoding depend on the other examples in the batch.
        Hq_Hp = (hidden_q * hidden_p).sum(dim=-1, keepdim=True)
        score_hp = hidden_p * Hq_Hp

        # The direction/layer axis of `score_hp` acts as the time axis here.
        _, (hidden_c, _) = self.rnn2(score_hp)

        return hidden_c, hidden_p


# DECODER CLASS

In [None]:
class Decoder(nn.Module):
    """Predict start and end position distributions from encoder states.

    Two independent bidirectional LSTMs read the same input; the final hidden
    state of each is projected to ``c_size`` scores and normalised with a
    softmax over those scores.

    Args:
        inp_size: feature size of the input sequence.
        op_size: hidden size of both LSTMs (and input size of both linears).
        c_size: number of candidate positions (width of each output).

    NOTE(review): the outputs are probabilities, not logits.
    ``nn.CrossEntropyLoss`` applies its own log-softmax, so it should be fed
    the pre-softmax scores if it is used for training.
    """

    def __init__(self,inp_size,op_size,c_size):
        super().__init__()
        self.rnn_start = nn.LSTM(inp_size, op_size, bidirectional=True)
        self.rnn_end = nn.LSTM(inp_size, op_size, bidirectional=True)
        self.fc1 = nn.Linear(op_size, c_size)
        # Normalise over the last axis (the c_size scores).  The hidden state
        # is (2, batch, op_size), so dim=1 would normalise across the batch.
        self.sm1 = nn.Softmax(dim=-1)
        self.fc2 = nn.Linear(op_size, c_size)
        self.sm2 = nn.Softmax(dim=-1)

    def forward(self,hc_hp_concat):
        """Return ``(out_start, out_end)``, each of shape ``(2, batch, c_size)``.

        Args:
            hc_hp_concat: float tensor ``(seq_len, batch, inp_size)``.

        The leading axis of size 2 holds one distribution per LSTM direction;
        every ``[direction, example, :]`` slice sums to 1.
        """
        _, (hidden_start, _) = self.rnn_start(hc_hp_concat)
        _, (hidden_end, _) = self.rnn_end(hc_hp_concat)
        out_start = self.sm1(self.fc1(hidden_start))
        out_end = self.sm2(self.fc2(hidden_end))
        return out_start, out_end

In [None]:
# Hyper-parameters for a single-batch smoke run of the encoder/decoder pair.
hidden_layer_neurons = 5
vocabulary_size_questions = len(vocab)
vocabulary_size_paragraphs = len(vocab_context)
embedding_dim = 24
num_layers = 1
decoder_op_size = 10

encoder_model = Encoder(
    vocabulary_size_questions,
    vocabulary_size_paragraphs,
    embedding_dim,
    hidden_layer_neurons,
    num_layers,
)

# Only the first batch is processed; the loop exits on its second iteration.
for idx_b, batch in enumerate(train_loader):
    if idx_b == 1:
        break

    # Encoder pass.
    (seq_length_question, seq_length_paragraph,
     padded_questions, padded_paragraph, starts) = batch
    hidden_c, hidden_p = encoder_model(padded_questions, padded_paragraph)

    print("This is hidden_c \n", hidden_c)
    print("This is hidden_p \n", hidden_p)

    # Decoder pass: stack both encoder states along the leading axis.
    hc_hp_concat = torch.cat((hidden_c, hidden_p), dim=0)
    print("The concatenated hc_hp_concat \n", hc_hp_concat)

    # The decoder's output width is this batch's padded paragraph length,
    # so a fresh decoder is constructed for the batch.
    decoder_model = Decoder(hidden_layer_neurons, decoder_op_size, padded_paragraph.shape[0])
    out_start, out_end = decoder_model(hc_hp_concat)

    print("The output of the decoder module: ")
    print("Predicted starting index:")

    start_index = out_start.argmax(dim=2)

    print(start_index)
    print("\n The Actual start indexes")
    print(starts)






This is hidden_c 
 tensor([[[ 0.1172,  0.0936,  0.0217, -0.0334, -0.1442],
         [ 0.0948,  0.0913,  0.0204, -0.0265, -0.1428]]],
       grad_fn=<StackBackward0>)
This is hidden_p 
 tensor([[[ 0.1068, -0.0342,  0.7005,  0.1099, -0.5368],
         [ 0.3174, -0.5578, -0.1458,  0.0173,  0.0420]],

        [[-0.0490,  0.3812,  0.4162,  0.1403,  0.2259],
         [-0.0437, -0.3002,  0.0314, -0.0181,  0.2052]]],
       grad_fn=<StackBackward0>)
The concatenated hc_hp_concat 
 tensor([[[ 0.1172,  0.0936,  0.0217, -0.0334, -0.1442],
         [ 0.0948,  0.0913,  0.0204, -0.0265, -0.1428]],

        [[ 0.1068, -0.0342,  0.7005,  0.1099, -0.5368],
         [ 0.3174, -0.5578, -0.1458,  0.0173,  0.0420]],

        [[-0.0490,  0.3812,  0.4162,  0.1403,  0.2259],
         [-0.0437, -0.3002,  0.0314, -0.0181,  0.2052]]],
       grad_fn=<CatBackward0>)
The output of the decoder module: 
Predicted starting index:
tensor([[123,  24],
        [ 52,  71]])

 The Actual start indexes
tensor([117.,  13.])