<a href="https://colab.research.google.com/github/shivammehta007/QuestionGenerator/blob/master/BaseModel-RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Question Generation

Additional Dependencies

In [0]:
%%capture
# %%capture hides the (long) pip logs of this cell.
# fairseq supplies the fairseq-preprocess / -train / -generate / -score commands used below.
!pip install fairseq
# Not referenced directly in the visible cells — presumably optional fairseq
# tokenizer/BPE backends; TODO confirm they are still needed.
!pip install sacremoses subword_nmt
# Upgraded before the import cell, which needs `tqdm.notebook`
# (presumably missing from the preinstalled tqdm — TODO confirm).
!pip install -U tqdm

In [0]:
import os
import json
import logging
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import random

In [0]:
# Fixed seed for Python's global RNG so the random train/valid split is reproducible.
SEED = 1234
random.seed(SEED)

In [0]:
# INFO level so the split statistics logged further down are actually displayed.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

## DataSet

In [0]:
# Folder holding the SQuAD v2 JSON files. The default is a Google Drive path
# (no visible cell mounts Drive, so it has to be mounted beforehand); set the
# SQUAD_DIR environment variable to use another location without editing code.
SQUAD_DIR = os.environ.get('SQUAD_DIR', '/content/drive/My Drive/Colab Notebooks/SQuAD')
SQUAD_TRAIN = os.path.join(SQUAD_DIR, 'train_v2.json')
# SQUAD_DEV = os.path.join(SQUAD_DIR, 'dev.json')
SQUAD_TEST = os.path.join(SQUAD_DIR, 'test_v2.json')
print(SQUAD_TRAIN, SQUAD_TEST) # , SQUAD_DEV

/content/drive/My Drive/Colab Notebooks/SQuAD/train_v2.json /content/drive/My Drive/Colab Notebooks/SQuAD/test_v2.json


In [0]:
def load_squad_articles(path):
    """
    Reads a SQuAD json file and returns the value stored under its top-level 'data' key.
    Input:
    path : string -> Path of the json file
    """
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(path, encoding='utf-8') as squad_file:
        return json.load(squad_file)['data']


train_data = load_squad_articles(SQUAD_TRAIN)
test_data = load_squad_articles(SQUAD_TEST)

### PreProcessing Function

In [0]:
def convert_to_file_without_answers(dataset, dataset_type='train', get_impossible=False):
    """
    Writes a line-aligned parallel corpus: dataset_type.paragraphs and dataset_type.questions.

    Every kept question produces exactly one line in each file (the context of
    its paragraph and the question itself), so line N of both files belongs
    together. The number of written pairs is printed at the end.

    Input:
    dataset : list -> Articles of a SQuAD json (the value of its 'data' key); every
              article has 'paragraphs', each with a 'context' and a list of 'qas'
    dataset_type: string -> Prefix of the output files like (train, test, valid)
    get_impossible: boolean -> When True, questions flagged 'is_impossible' are
              written too; by default they are skipped
    """
    pair_count = 0
    # `with` closes both files even when a malformed entry raises midway.
    with open(dataset_type + '.paragraphs', 'w', encoding='utf-8') as para_output, \
            open(dataset_type + '.questions', 'w', encoding='utf-8') as question_output:
        for article in tqdm(dataset):
            for paragraph in article['paragraphs']:
                # Flatten embedded newlines once per paragraph: one example per line.
                para = paragraph['context'].replace('\n', ' ').strip()
                for questionanswers in paragraph['qas']:
                    # Entries without the flag (e.g. SQuAD v1.1 style data) count as answerable.
                    if questionanswers.get('is_impossible', False) and not get_impossible:
                        continue
                    # A newline inside a question would shift every following line
                    # of the .questions file against the .paragraphs file.
                    question = questionanswers['question'].replace('\n', ' ').strip()
                    para_output.write(para + '\n')
                    question_output.write(question + '\n')
                    pair_count += 1
    print(pair_count)

In [0]:
# Export the paragraph/question files for both SQuAD splits (train first, then test).
for split_name, split_articles in (('train', train_data), ('test', test_data)):
    convert_to_file_without_answers(split_articles, split_name)


HBox(children=(FloatProgress(value=0.0, max=442.0), HTML(value='')))


86821


HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))


5928


In [0]:
def split_train_valid(filename_paragraph='train.paragraphs', filename_questions='train.questions', split_ratio=0.8):
    """Splits the train set into a train set and a validation set.

    Each paragraph/question line pair stays in the train set when one draw of
    the global `random` generator is below split_ratio, otherwise it goes to
    the validation set. The resulting sizes are logged at INFO level.

    Input:
    filename_paragraph : string -> File with one paragraph per line
    filename_questions : string -> File with the matching question on the same line number
    split_ratio: float -> Probability of keeping a pair in the train set

    The outputs are always train.paragraphs / train.questions and
    valid.paragraphs / valid.questions in the working directory. With the
    default arguments the input files are therefore overwritten, and calling
    the function a second time splits the already reduced train set again.

    Raises ValueError if the two input files differ in their number of lines.
    """
    with open(filename_paragraph, encoding='utf-8') as paragraphs_file, \
            open(filename_questions, encoding='utf-8') as questions_file:
        data_paragraphs = paragraphs_file.readlines()
        data_questions = questions_file.readlines()

    # Validate before any output file is opened: opening with 'w' truncates,
    # and the default inputs are the very files that get rewritten.
    if len(data_paragraphs) != len(data_questions):
        raise ValueError(
            'Line count mismatch: {} has {} lines but {} has {}'.format(
                filename_paragraph, len(data_paragraphs),
                filename_questions, len(data_questions)))

    train_count, valid_count = 0, 0

    # `with` guarantees all four outputs are flushed and closed, even on error.
    with open('train.paragraphs', 'w', encoding='utf-8') as train_paragraphs_file, \
            open('valid.paragraphs', 'w', encoding='utf-8') as valid_paragraphs_file, \
            open('train.questions', 'w', encoding='utf-8') as train_questions_file, \
            open('valid.questions', 'w', encoding='utf-8') as valid_questions_file:
        for paragraph, question in zip(tqdm(data_paragraphs), data_questions):
            if random.random() < split_ratio:
                train_paragraphs_file.write(paragraph.strip() + '\n')
                train_questions_file.write(question.strip() + '\n')
                train_count += 1
            else:
                valid_paragraphs_file.write(paragraph.strip() + '\n')
                valid_questions_file.write(question.strip() + '\n')
                valid_count += 1

    logger.info('Total Trainset: {} | Total ValidSet: {}'.format(train_count, valid_count))



In [0]:
# NOTE: with the default arguments this rewrites train.paragraphs/train.questions
# in place, so re-running only this cell splits the already reduced train set
# again. Re-run the convert_to_file_without_answers cell first.
split_train_valid()

HBox(children=(FloatProgress(value=0.0, max=86821.0), HTML(value='')))

INFO:__main__:Total Trainset: 69284 | Total ValidSet: 17537





### Binarize the Dataset for fairseq

In [0]:
# Builds the dictionaries and binarized data in preprocessed_data/ from the
# {train,valid,test}.paragraphs (source side) and .questions (target side)
# files written by the cells above.
# --nwordssrc / --nwordstgt limit the source / target vocabulary size; the seed
# literal repeats the notebook's SEED value.
!fairseq-preprocess --source-lang paragraphs --target-lang questions \
     --trainpref train --testpref test --validpref valid\
     --destdir preprocessed_data --seed 1234 --nwordssrc 45002 --nwordstgt 28002

Namespace(align_suffix=None, alignfile=None, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='preprocessed_data', empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_format=None, log_interval=1000, lr_scheduler='fixed', memory_efficient_fp16=False, min_loss_scale=0.0001, no_progress_bar=False, nwordssrc=45002, nwordstgt=28002, only_source=False, optimizer='nag', padding_factor=8, seed=1234, source_lang='paragraphs', srcdict=None, target_lang='questions', task='translation', tensorboard_logdir='', testpref='test', tgtdict=None, threshold_loss_scale=None, thresholdsrc=0, thresholdtgt=0, tokenizer=None, trainpref='train', user_dir=None, validpref='valid', workers=1)
| [paragraphs] Dictionary: 45007 types
| [paragraphs] train.paragraphs: 69284 sents, 8365124 tokens, 7.53% replaced by <unk>
| [paragraphs] Dictionary: 45007 types
| [paragraphs] valid.paragraphs: 17537 sents, 2122362 t

### Training a default ConvSeq2Seq Model

In [0]:
# Leftover example command (IWSLT'14 de-en data path, not used by this notebook):
# !fairseq-generate data-bin/iwslt14.tokenized.de-en \
#     --path checkpoints/fconv/checkpoint20.pt \
#     --batch-size 128 --beam 5

# Trains the convolutional seq2seq architecture on the binarized data and
# saves checkpoints under checkpoints/fconv.
# NOTE(review): no --max-epoch is given, so the run was presumably stopped by
# hand; the generation cell below expects checkpoint43.pt to exist.
!CUDA_VISIBLE_DEVICES=0 fairseq-train preprocessed_data/ \
     --lr 0.01 --clip-norm 0.1 --dropout 0.2 --max-tokens 4000 \
     --arch fconv_iwslt_de_en --save-dir checkpoints/fconv

In [0]:
# Decodes with the fconv checkpoint and keeps a copy of the log in gen.out
# for the scoring cell below.
# NOTE(review): checkpoint43.pt is hard-coded; adjust it to a checkpoint that
# the training cell actually wrote into checkpoints/fconv.
!fairseq-generate preprocessed_data \
    --path checkpoints/fconv/checkpoint43.pt \
    --batch-size 128 | tee gen.out

In [0]:
# Lines starting with H go to gen.out.sys (tab-separated fields 3 onward) and
# lines starting with T go to gen.out.ref (fields 2 onward); presumably H lines
# are hypotheses with an extra score column and T lines are references.
# fairseq-score then compares the two files.
!grep ^H gen.out | cut -f3- > gen.out.sys
!grep ^T gen.out | cut -f2- > gen.out.ref
!fairseq-score --sys gen.out.sys --ref gen.out.ref

Namespace(ignore_case=False, order=4, ref='gen.out.ref', sacrebleu=False, sentence_bleu=False, sys='gen.out.sys')
BLEU4 = 2.43, 15.4/3.8/1.3/0.5 (BP=1.000, ratio=1.451, syslen=89115, reflen=61434)


In [0]:
# %cd fairseq/examples/translation/
# !bash prepare-iwslt14.sh
# !fairseq-preprocess --source-lang de --target-lang en \
#     --trainpref examples/translation/iwslt14.tokenized.de-en/train --validpref examples/translation/iwslt14.tokenized.de-en/valid --testpref examples/translation/iwslt14.tokenized.de-en/test \
#     --destdir data-bin/iwslt14.tokenized.de-en
# !mkdir -p checkpoints/fconv
# !fairseq-train data-bin/iwslt14.tokenized.de-en \
#     --lr 0.25 --clip-norm 0.1 --dropout 0.2 --max-tokens 4000 \
#     --arch fconv_iwslt_de_en --save-dir checkpoints/fconv

In [0]:
# !fairseq-generate data-bin/iwslt14.tokenized.de-en \
#     --path checkpoints/fconv/checkpoint20.pt \
#     --batch-size 128 --beam 5

### Trying Baseline LSTM Model

In [0]:
# ~2 GB download. -c resumes a partial file and does nothing when the archive
# is already complete, so re-running the cell neither starts over nor leaves a
# duplicate "glove.840B.300d.zip.1" next to the original.
!wget -c http://nlp.stanford.edu/data/glove.840B.300d.zip

--2020-03-05 13:00:02--  http://nlp.stanford.edu/data/glove.840B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.840B.300d.zip [following]
--2020-03-05 13:00:02--  https://nlp.stanford.edu/data/glove.840B.300d.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip [following]
--2020-03-05 13:00:03--  http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2176768927 (2.0G) [application/zip

In [0]:
# -n never overwrites an existing glove.840B.300d.txt, so a re-run does not
# stop at unzip's interactive "replace?" prompt.
!unzip -n glove.840B.300d.zip

Archive:  glove.840B.300d.zip
  inflating: glove.840B.300d.txt     


In [60]:
# Baseline LSTM: 2-layer bidirectional encoder and 2-layer decoder (hidden size
# 600), 300-d embeddings on both sides loaded from the GloVe text file.
# No --save-dir is passed; the generation cell below reads checkpoints/checkpoint_last.pt.
# Fixes relative to the previous command line:
#   * "--lr-shrink" was glued to the decoder embedding path without a space and
#     without a value, which turned the path into "glove.840B.300d.txt--lr-shrink".
#     It is dropped here; add "--lr-shrink <factor>" as its own option if LR
#     annealing is wanted.
#   * "--dropout 0.2" was overridden by the later "--dropout 0.3"; only the
#     effective value is kept.
!CUDA_VISIBLE_DEVICES=0 fairseq-train preprocessed_data/ \
     --lr 0.001 --clip-norm 5 --batch-size 64 \
     --arch lstm --max-epoch 15 --encoder-hidden-size 600 --encoder-layers 2 \
     --decoder-hidden-size 600 --decoder-layers 2 --optimizer adam --dropout 0.3 --encoder-embed-path glove.840B.300d.txt \
     --encoder-bidirectional --encoder-embed-dim 300 --decoder-embed-dim 300 --no-epoch-checkpoints --decoder-embed-path glove.840B.300d.txt

Namespace(adam_betas='(0.9, 0.999)', adam_eps=1e-08, adaptive_softmax_cutoff='10000,50000,200000', arch='lstm', best_checkpoint_metric='loss', bpe=None, bucket_cap_mb=25, clip_norm=5.0, cpu=False, criterion='cross_entropy', curriculum=0, data='preprocessed_data/', dataset_impl=None, ddp_backend='c10d', decoder_attention='1', decoder_dropout_in=0.3, decoder_dropout_out=0.3, decoder_embed_dim=300, decoder_embed_path='glove.840B.300d.txt', decoder_freeze_embed=False, decoder_hidden_size=600, decoder_layers=2, decoder_out_embed_dim=512, device_id=0, disable_validation=False, distributed_backend='nccl', distributed_init_method=None, distributed_no_spawn=False, distributed_port=-1, distributed_rank=0, distributed_world_size=1, dropout=0.3, empty_cache_freq=0, encoder_bidirectional=True, encoder_dropout_in=0.3, encoder_dropout_out=0.3, encoder_embed_dim=300, encoder_embed_path='glove.840B.300d.txt', encoder_freeze_embed=False, encoder_hidden_size=600, encoder_layers=2, fast_stat_sync=False, f

In [0]:
# Decodes with the last LSTM checkpoint. The LSTM training cell passes no
# --save-dir, so checkpoints/ is presumably fairseq's default output folder.
# NOTE: this overwrites the gen.out produced by the fconv generation cell.
!fairseq-generate preprocessed_data \
    --path checkpoints/checkpoint_last.pt \
    --batch-size 64 | tee gen.out

In [0]:
# Same scoring steps as for the fconv model, now applied to the LSTM output in
# gen.out: H lines -> gen.out.sys, T lines -> gen.out.ref, then fairseq-score.
!grep ^H gen.out | cut -f3- > gen.out.sys
!grep ^T gen.out | cut -f2- > gen.out.ref
!fairseq-score --sys gen.out.sys --ref gen.out.ref