In [114]:
%load_ext autoreload
%autoreload 2

import numpy as np
import json
from tqdm import tqdm_notebook as tqdm
import string
import codecs

from constant import *
from model import *

USE GPU
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [34]:
def data_generator(filepath):
    with open(filepath,'r') as f:
        for line in f:
            yield json.loads(line)
        return

In [3]:
def create_mapping(filepath,min_count):
    gen = data_generator(filepath)
    word_count = {}
    word_map = {}
    sent_lengths = []
    para_lengths = []
    for data in tqdm(gen):
        ques = data['question'].split()
        paragraph = data['paragraph'].split()
        sentence = data['sentence'].split()
        para_lengths.append(len(paragraph))
        sent_lengths.append(len(sentence))
        text = ques + paragraph
        for item in text:
            if word_count.get(item):
                word_count[item]+=1
            else:
                word_count[item] = 1
    for k,v in word_count.iteritems():
        if v>min_count:
            word_map[k] = len(word_map)+1
    word_map['<pad>'] = 0
    word_map['<unk>'] = len(word_map)
    word_map['<start>'] = len(word_map)
    word_map['<end>'] = len(word_map)
    return word_map,para_lengths,sent_lengths

In [4]:
word_mapping,para_len,sent_len = create_mapping('../data/processed/train_data.json',5)

HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))




In [5]:
print "maximum sentence length : {}".format(max(sent_len))
print "minimum sentence length : {}".format(min(sent_len))
print "mean sentence length : {}".format(np.mean(sent_len))
print "std dev sentence length : {}".format(np.std(sent_len))
print "mean + 3*std_dev sentence length : {}".format(np.mean(sent_len)+3*np.std(sent_len))
print "\n"
print "maximum paragraph length : {}".format(max(para_len))
print "minimum paragraph length : {}".format(min(para_len))
print "mean paragraph length : {}".format(np.mean(para_len))
print "std dev paragraph length : {}".format(np.std(para_len))
print "mean + 3*std_dev paragraph length : {}".format(np.mean(para_len)+3*np.std(para_len))

maximum sentence length : 431
minimum sentence length : 4
mean sentence length : 32.8592020884
std dev sentence length : 17.3015877763
mean + 3*std_dev sentence length : 84.7639654174


maximum paragraph length : 767
minimum paragraph length : 22
mean paragraph length : 139.626156291
std dev paragraph length : 55.4859031596
mean + 3*std_dev paragraph length : 306.08386577


In [6]:
MAX_SENT_LEN = 90
MAX_PARA_LEN = 350

In [7]:
len(word_mapping)

48009

In [8]:
################################################
## pre trained embedding
#################################################
all_word_embeds = {}
for i, line in enumerate(codecs.open(PRE_TRAINED_EMBEDDING_PATH, 'r', 'utf-8')):
    s = line.strip().split()
    if len(s) == WORD_DIM + 1:
        all_word_embeds[s[0]] = np.array([float(i) for i in s[1:]])

#Intializing Word Embedding Matrix
pretrained_word_embeds = np.random.uniform(-np.sqrt(0.06), np.sqrt(0.06), (len(word_mapping), WORD_DIM))

for w in word_mapping:
    if w.lower() in all_word_embeds:
        pretrained_word_embeds[word_mapping[w]] = all_word_embeds[w.lower()]

print('Loaded %i pretrained embeddings.' % len(all_word_embeds))
## To save memory
del all_word_embeds

Loaded 2195875 pretrained embeddings.


In [115]:
####################################
### Dataloader
####################################
class CustomDataset():

    def __init__(self,file_path,length,word2idx):
        self.file_path = file_path
        self.length = length
        self.word2idx = word2idx
        self.gen = data_generator(self.file_path)

    def __getitem__(self,index):
        try:
            text = self.gen.next()
        except StopIteration:
            self.gen = data_generator(self.file_path)
            text = self.gen.next()
        paragraph = text['paragraph'].split()
        paragraph.insert(0, "<start>")
        paragraph.append('<end>')
        sentence = text['sentence'].split()
        sentence.insert(0, "<start>")
        sentence.append('<end>')
        question = text['question'].split()
        question.insert(0, "<start>")
        question.append('<end>')
        
        x_paragraph = [self.word2idx.get(word) if self.word2idx.get(word) else self.word2idx['<unk>'] for word in paragraph]
        x_sentence = [self.word2idx.get(word) if self.word2idx.get(word) else self.word2idx['<unk>'] for word in sentence]
        x_question = [self.word2idx.get(word) if self.word2idx.get(word) else self.word2idx['<unk>'] for word in question]
        paragraph_word_len = len(paragraph)
        sentence_word_len = len(sentence)
        question_word_len = len(question)
        return {'paragraph': paragraph,'sentence': sentence,'question': question,
                'paragraph_word_id':x_paragraph,'sentence_word_id':x_sentence,'question_word_id':x_question,
                "paragraph_word_len": paragraph_word_len,"sentence_word_len": sentence_word_len,"question_word_len":question_word_len}
    
    def __len__(self):
        return self.length

In [116]:
def collate_fn(batch):
    #paragraph_max_word = [item['paragraph_word_len'] for item in batch]
    #sentence_max_word = [item['sentence_word_len'] for item in batch]
    question_max_word = [item['question_word_len'] for item in batch]
    paragraph_max_len = MAX_PARA_LEN
    sentence_max_len = MAX_SENT_LEN
    question_max_len = max(question_max_word)

    paragraph_word_data = np.zeros((len(batch),paragraph_max_len))
    sentence_word_data = np.zeros((len(batch),sentence_max_len))
    question_word_data = np.zeros((len(batch),question_max_len))
    for i,item in enumerate(batch):
        paragraph_word_data[i,:len(item['paragraph_word_id'])] = item['paragraph_word_id']
        sentence_word_data[i,:len(item['sentence_word_id'])] = item['sentence_word_id']
        question_word_data[i,:len(item['question_word_id'])] = item['question_word_id']
    paragraph =[item['paragraph'] for item in batch]
    sentence =[item['sentence'] for item in batch]
    question =[item['question'] for item in batch]
    return torch.tensor(paragraph_word_data),torch.tensor(sentence_word_data),torch.tensor(question_word_data),paragraph,sentence,question

In [117]:
import torch
from torch.utils.data import DataLoader

In [118]:
### Load datas
train_dataloader = DataLoader(CustomDataset(TRAIN_DATA_PATH,TRAIN_DATA_LENGTH,word_mapping),
                              batch_size=BATCH_SIZE,collate_fn = collate_fn,shuffle=False)

valid_dataloader = DataLoader(CustomDataset(VALID_DATA_PATH,VALID_DATA_LENGTH,word_mapping),
                              batch_size=BATCH_SIZE,collate_fn = collate_fn,shuffle=False)

## Train

In [119]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src1 = batch[0]
        src2 = batch[1]
        trg = batch[2]
        src1 = Variable(src1.type(torch.LongTensor))
        src2 = Variable(src2.type(torch.LongTensor))
        trg = Variable(trg.type(torch.LongTensor))
        if USE_GPU:
            src1 = src1.cuda()
            src2 = src2.cuda()
            trg = trg.cuda()
        
        optimizer.zero_grad()
        
        output = model(src1,src2, trg)
        
        #trg = [batch size,sent len]
        #output = [batch size,sent len, output dim]
        
        #reshape to:
        #trg = [(sent len - 1) * batch size]
        #output = [(sent len - 1) * batch size, output dim]
        
        loss = criterion(output[:,1:,:].contiguous().view(-1, output.shape[2]), trg[:,1:].contiguous().view(-1))
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / iterator.dataset.length,optimizer

In [120]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src1 = batch[0]
            src2 = batch[1]
            trg = batch[2]
            src1 = Variable(src1.type(torch.LongTensor))
            src2 = Variable(src2.type(torch.LongTensor))
            trg = Variable(trg.type(torch.LongTensor))
            if USE_GPU:
                src1 = src1.cuda()
                src2 = src2.cuda()
                trg = trg.cuda()

            output = model(src1,src2, trg, 0) #turn off teacher forcing

            loss = criterion(output[:,1:,:].contiguous().view(-1, output.shape[2]), trg[:,1:].contiguous().view(-1))

            epoch_loss += loss.item()
        
    return epoch_loss / iterator.dataset.length

In [121]:
def save(filename):
    state = {'epoch': epoch + 1, 'state_dict': model.state_dict(),
             'optimizer': optimizer.state_dict(), 'valid_loss': valid_loss}
    torch.save(state, filename)

In [122]:
WORD_SIZE = len(word_mapping)
WORD_DIM = 100
HIDDEN_SIZE = 128

In [123]:
WORD_SIZE

48009

In [124]:
enc1 = EncoderParagraph(WORD_SIZE,WORD_DIM,HIDDEN_SIZE,pretrained_word_embeds)
enc2 = EncoderSentence(WORD_SIZE,WORD_DIM,HIDDEN_SIZE,pretrained_word_embeds)
dec = AttnDecoderLSTM(WORD_SIZE,WORD_DIM,HIDDEN_SIZE*4,MAX_SENT_LEN,pretrained_word_embeds)
model = QuestionGeneration(enc1,enc2, dec)

if USE_GPU:
    enc1 = enc1.cuda()
    enc2 = enc2.cuda()
    dec = dec.cuda()
    model = model.cuda()

In [125]:
model_parameters = filter(lambda p: p.requires_grad, model.parameters())
optimizer = optim.Adam(model_parameters)

In [126]:
pad_idx = word_mapping['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [127]:
N_EPOCHS = 10
CLIP = 10

best_valid_loss = float('inf')

#if not os.path.isdir(f'{SAVE_DIR}'):
#    os.makedirs(f'{SAVE_DIR}')

for epoch in range(N_EPOCHS):
    
    train_loss,opti = train(model, train_dataloader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_dataloader, criterion)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        save('../models/checkpoint_epoch_'+str(epoch)+'_valid_loss_'+str(valid_loss)+'_'+'.pth.tar')
    
    print('Epoch [{}/{}] Train Loss: {:.4f} | Val. Loss: {:.4f}'.format(epoch+1, N_EPOCHS, train_loss,valid_loss))

torch.Size([32, 256])
torch.Size([32, 356])
torch.Size([32, 100])
torch.Size([32, 256])
torch.Size([32, 356])
torch.Size([32, 100])
torch.Size([32, 256])
torch.Size([32, 356])
torch.Size([32, 100])
torch.Size([32, 256])
torch.Size([32, 356])
torch.Size([32, 100])
torch.Size([32, 256])
torch.Size([32, 356])
torch.Size([32, 100])
torch.Size([32, 256])
torch.Size([32, 356])
torch.Size([32, 100])
torch.Size([32, 256])
torch.Size([32, 356])
torch.Size([32, 100])
torch.Size([32, 256])
torch.Size([32, 356])
torch.Size([32, 100])
torch.Size([32, 256])
torch.Size([32, 356])
torch.Size([32, 100])
torch.Size([32, 256])
torch.Size([32, 356])
torch.Size([32, 100])
torch.Size([32, 256])
torch.Size([32, 356])
torch.Size([32, 100])
torch.Size([32, 256])
torch.Size([32, 356])
torch.Size([32, 100])
torch.Size([32, 256])
torch.Size([32, 356])
torch.Size([32, 100])
torch.Size([32, 256])
torch.Size([32, 356])
torch.Size([32, 100])
torch.Size([32, 256])
torch.Size([32, 356])
torch.Size([32, 100])
torch.Size

ValueError: cannot copy sequence with size 94 to array axis with dimension 90