<a href="https://colab.research.google.com/github/shivammehta007/QuestionGenerator/blob/master/QGenerator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Question Generation

Additional Dependencies

In [3]:
# Install fairseq (provides the fairseq-* command-line tools used in later cells) together with
# sacremoses and subword_nmt, then upgrade tqdm (imported below through tqdm.notebook).
# NOTE(review): no versions are pinned, so a fresh run may install different releases — consider pinning.
!pip install fairseq
!pip install sacremoses subword_nmt
!pip install -U tqdm

Collecting tqdm
[?25l  Downloading https://files.pythonhosted.org/packages/47/55/fd9170ba08a1a64a18a7f8a18f088037316f2a41be04d2fe6ece5a653e8f/tqdm-4.43.0-py2.py3-none-any.whl (59kB)
[K     |████████████████████████████████| 61kB 5.4MB/s 
[?25hInstalling collected packages: tqdm
  Found existing installation: tqdm 4.28.1
    Uninstalling tqdm-4.28.1:
      Successfully uninstalled tqdm-4.28.1
Successfully installed tqdm-4.43.0


In [2]:
# Mount Google Drive at /content/drive; the dataset and checkpoint copy commands in
# later cells use paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import os
import json
import logging
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import random

In [0]:
# Fixed seed for Python's `random` module so the random train/validation split is reproducible
SEED = 1234
random.seed(SEED)

In [0]:
# Configure the root logger at DEBUG level, then create the logger used by the helper functions
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

## DataSet

#### Using Author's Dataset

In [0]:
# Copy the `processed` directory from the mounted Drive into the working directory as ./processed
!cp -r /content/drive/My\ Drive/Data/data/processed processed

In [0]:
# Rename the copied files to the <split>.<side> names that the fairseq-preprocess cell expects
# (--trainpref train, --validpref valid, --testpref test): src-* files become *.paragraphs,
# tgt-* files become *.questions, and the "dev" split is renamed to "valid".
!mv processed/src-train.txt train.paragraphs
!mv processed/src-test.txt test.paragraphs
!mv processed/src-dev.txt valid.paragraphs
!mv processed/tgt-train.txt train.questions
!mv processed/tgt-test.txt test.questions
!mv processed/tgt-dev.txt valid.questions

Now I can skip preprocessing and go directly to generating the binary files for training.

#### Preprocessing SQuAD Myself

In [0]:
# Locations of the SQuAD JSON files on the mounted Drive
SQUAD_DIR = '/content/drive/My Drive/Colab Notebooks/SQuAD'
SQUAD_TRAIN, SQUAD_TEST = (
    os.path.join(SQUAD_DIR, json_name)
    for json_name in ('train_v2.json', 'test_v2.json')
)
# SQUAD_DEV = os.path.join(SQUAD_DIR, 'dev.json')
print(SQUAD_TRAIN, SQUAD_TEST)  # , SQUAD_DEV

In [0]:
# Load both SQuAD JSON files and keep only their top-level 'data' entry.
# The files are decoded explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open(SQUAD_TRAIN, encoding='utf-8') as train_file:
    train_data = json.load(train_file)['data']

with open(SQUAD_TEST, encoding='utf-8') as test_file:
    test_data = json.load(test_file)['data']

### PreProcessing Function

In [0]:
def convert_to_file_without_answers(dataset, dataset_type='train', get_impossible=False):
    """
    Writes aligned dataset_type.paragraphs and dataset_type.questions files.

    Every kept question produces one line in each file: the lower-cased
    paragraph context and the lower-cased question, both with newlines
    replaced by spaces so that line N of one file matches line N of the other.
    The number of written pairs is printed at the end.

    Input:
    dataset : list -> the 'data' entries of the parsed json; each entry has a
              'paragraphs' list whose items carry 'context' and 'qas'
    dataset_type: string -> output file prefix like (train, test, valid)
    get_impossible: boolean -> when True, questions flagged 'is_impossible'
              are written as well; when False (default) they are skipped
    """
    written = 0
    # `with` guarantees both files are flushed and closed even if a record is malformed
    with open(dataset_type + '.paragraphs', 'w', encoding='utf-8') as para_output, \
            open(dataset_type + '.questions', 'w', encoding='utf-8') as question_output:
        for article in tqdm(dataset):
            for paragraph in article['paragraphs']:
                # Normalise the context once per paragraph instead of once per question
                para_line = paragraph['context'].replace('\n', ' ').strip().lower() + '\n'
                for questionanswers in paragraph['qas']:
                    # A record without the 'is_impossible' key is treated as answerable
                    if questionanswers.get('is_impossible', False) and not get_impossible:
                        continue
                    # An embedded newline would otherwise split the question over two lines
                    question = questionanswers['question'].replace('\n', ' ')
                    para_output.write(para_line)
                    question_output.write(question.strip().lower() + '\n')
                    written += 1
    print(written)

In [0]:
# Write the paragraph/question text files for the train and test splits
convert_to_file_without_answers(dataset=train_data, dataset_type='train')
convert_to_file_without_answers(dataset=test_data, dataset_type='test')


In [0]:
def split_train_valid(filename_paragraph='train.paragraphs', filename_questions='train.questions', split_ratio=0.8):
    """
    Splits the train set into a train set and a validation set.

    Each (paragraph, question) line pair goes to the train files when
    random.random() < split_ratio and to the validation files otherwise.
    The outputs are always written to 'train.paragraphs', 'train.questions',
    'valid.paragraphs' and 'valid.questions' in the current directory, so with
    the default arguments the input train files are overwritten (they are read
    completely before any output file is opened).

    Input:
    filename_paragraph: string -> file with one paragraph per line
    filename_questions: string -> file with the matching question per line
    split_ratio: float -> threshold compared against random.random()

    Raises:
    ValueError -> when the two input files do not have the same number of lines
    """
    with open(filename_paragraph, encoding='utf-8') as paragraphs_file, \
            open(filename_questions, encoding='utf-8') as questions_file:
        data_paragraphs = paragraphs_file.readlines()
        data_questions = questions_file.readlines()

    # Check alignment before any output file is truncated
    if len(data_paragraphs) != len(data_questions):
        raise ValueError(
            'Paragraph and question files are not aligned: {} vs {} lines'.format(
                len(data_paragraphs), len(data_questions)))

    train_count, valid_count = 0, 0

    # `with` closes (and therefore flushes) all four output files, also on error
    with open('train.paragraphs', 'w', encoding='utf-8') as train_paragraphs_file, \
            open('valid.paragraphs', 'w', encoding='utf-8') as valid_paragraphs_file, \
            open('train.questions', 'w', encoding='utf-8') as train_questions_file, \
            open('valid.questions', 'w', encoding='utf-8') as valid_questions_file:
        pairs = zip(data_paragraphs, data_questions)
        for paragraph, question in tqdm(pairs, total=len(data_paragraphs)):
            # Exactly one random draw per pair, so a seeded run reproduces the same split
            if random.random() < split_ratio:
                train_paragraphs_file.write(paragraph.strip() + '\n')
                train_questions_file.write(question.strip() + '\n')
                train_count += 1
            else:
                valid_paragraphs_file.write(paragraph.strip() + '\n')
                valid_questions_file.write(question.strip() + '\n')
                valid_count += 1

    logger.info('Total Trainset: {} | Total ValidSet: {}'.format(train_count, valid_count))



In [0]:
# Carve a validation set out of the train files using the defaults
# (train.paragraphs / train.questions, split_ratio=0.8); the train files are rewritten in place.
split_train_valid()

### Generate Binary of Dataset for FairSeq to process

In [6]:
# Binarize the train/valid/test text files for fairseq: "paragraphs" is the source side and
# "questions" the target side, output goes to preprocessed_data/, and the source/target
# vocabulary sizes are set with --nwordssrc 45000 / --nwordstgt 28000.
!fairseq-preprocess --source-lang paragraphs --target-lang questions \
     --trainpref train --testpref test --validpref valid\
     --destdir preprocessed_data --seed 1234 --nwordssrc 45000 --nwordstgt 28000

Namespace(align_suffix=None, alignfile=None, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='preprocessed_data', empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_format=None, log_interval=1000, lr_scheduler='fixed', memory_efficient_fp16=False, min_loss_scale=0.0001, no_progress_bar=False, nwordssrc=45000, nwordstgt=28000, only_source=False, optimizer='nag', padding_factor=8, seed=1234, source_lang='paragraphs', srcdict=None, target_lang='questions', task='translation', tensorboard_logdir='', testpref='test', tgtdict=None, threshold_loss_scale=None, thresholdsrc=0, thresholdtgt=0, tokenizer=None, trainpref='train', user_dir=None, validpref='valid', workers=1)
| [paragraphs] Dictionary: 44999 types
| [paragraphs] train.paragraphs: 70484 sents, 2386532 tokens, 1.32% replaced by <unk>
| [paragraphs] Dictionary: 44999 types
| [paragraphs] valid.paragraphs: 10570 sents, 368586 to

### Training a default ConvSeq2Seq Model

In [7]:
# !fairseq-generate data-bin/iwslt14.tokenized.de-en \
#     --path checkpoints/fconv/checkpoint20.pt \
#     --batch-size 128 --beam 5

# Train the fconv_iwslt_de_en architecture on the binarized data for up to 15 epochs with
# Adam (lr 0.001), dropout 0.3 and --clip-norm 0.1; checkpoints are saved under
# checkpoints/fconv and per-epoch checkpoints are disabled (--no-epoch-checkpoints).
!CUDA_VISIBLE_DEVICES=0 fairseq-train preprocessed_data/ \
     --lr 0.001 --clip-norm 0.1 --dropout 0.3 --max-epoch 15 --optimizer adam\
     --arch fconv_iwslt_de_en --save-dir checkpoints/fconv --batch-size 128 --no-epoch-checkpoints

Namespace(adam_betas='(0.9, 0.999)', adam_eps=1e-08, arch='fconv_iwslt_de_en', best_checkpoint_metric='loss', bpe=None, bucket_cap_mb=25, clip_norm=0.1, cpu=False, criterion='cross_entropy', curriculum=0, data='preprocessed_data/', dataset_impl=None, ddp_backend='c10d', decoder_attention='True', decoder_embed_dim=256, decoder_embed_path=None, decoder_layers='[(256, 3)] * 3', decoder_out_embed_dim=256, device_id=0, disable_validation=False, distributed_backend='nccl', distributed_init_method=None, distributed_no_spawn=False, distributed_port=-1, distributed_rank=0, distributed_world_size=1, dropout=0.2, empty_cache_freq=0, encoder_embed_dim=256, encoder_embed_path=None, encoder_layers='[(256, 3)] * 4', fast_stat_sync=False, find_unused_parameters=False, fix_batches_to_gpus=False, fixed_validation_seed=None, force_anneal=None, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, keep_interval_updates=-1, keep_last_epochs=-1, lazy_load=False, left_pad_source=

In [8]:
# Generate questions with the best fconv checkpoint; `tee` shows the output in the
# notebook and also saves it to gen.out for the scoring cell.
!fairseq-generate preprocessed_data \
    --path checkpoints/fconv/checkpoint_best.pt \
    --batch-size 128 | tee gen.out

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
P-8821	-1.1683 -1.4110 -0.6346 -0.5053 -0.0080 -1.5803 -2.1670 -2.3217 -0.1067 -0.1860 -0.0001
S-7664	others focused on the personalities in the show ; <unk> <unk> of newsweek accused judge simon <unk> 's cruel critiques in the show of helping to establish in the wider world a culture of <unk> , that `` simon <unk> has dragged the rest of us in the mud with him . ''
T-7664	who was accused of helping <<unk>> thrive ?
H-7664	-1.0994532108306885	who was responsible for landscaping <unk> 's cruel 's hemline ?
P-7664	-0.9886 -1.4830 -0.8922 -0.0167 -3.3858 -1.6848 -0.1276 -1.2828 -0.7600 -2.3414 -0.2305 -0.0000
S-6426	in a series of encounters -- known in buddhist literature as the four sights -- he learned of the suffering of ordinary people , encountering an old man , a sick man , a corpse and , finally , an ascetic holy man , apparently content and at peace with the world .
T-6426	what were his first four encounters called 

In [0]:
# Back up the best fconv checkpoint to the mounted Drive (a later cell deletes checkpoints/)
!cp checkpoints/fconv/checkpoint_best.pt /content/drive/My\ Drive/Data

In [9]:
# Split gen.out into system output and references, then score them with fairseq-score:
# H-lines (hypotheses) keep the text from the 3rd tab-separated field on,
# T-lines (targets) keep the text from the 2nd tab-separated field on.
!grep ^H gen.out | cut -f3- > gen.out.sys
!grep ^T gen.out | cut -f2- > gen.out.ref
!fairseq-score --sys gen.out.sys --ref gen.out.ref

Namespace(ignore_case=False, order=4, ref='gen.out.ref', sacrebleu=False, sentence_bleu=False, sys='gen.out.sys')
BLEU4 = 6.78, 36.8/9.8/4.6/2.2 (BP=0.876, ratio=0.883, syslen=121774, reflen=137927)


In [0]:
# %cd fairseq/examples/translation/
# !bash prepare-iwslt14.sh
# !fairseq-preprocess --source-lang de --target-lang en \
#     --trainpref examples/translation/iwslt14.tokenized.de-en/train --validpref examples/translation/iwslt14.tokenized.de-en/valid --testpref examples/translation/iwslt14.tokenized.de-en/test \
#     --destdir data-bin/iwslt14.tokenized.de-en
# !mkdir -p checkpoints/fconv
# !fairseq-train data-bin/iwslt14.tokenized.de-en \
#     --lr 0.25 --clip-norm 0.1 --dropout 0.2 --max-tokens 4000 \
#     --arch fconv_iwslt_de_en --save-dir checkpoints/fconv

In [0]:
# !fairseq-generate data-bin/iwslt14.tokenized.de-en \
#     --path checkpoints/fconv/checkpoint20.pt \
#     --batch-size 128 --beam 5

### Trying Baseline LSTM Model

In [0]:
# Download the glove.840B.300d embeddings archive (its .txt file is passed to
# --encoder-embed-path / --decoder-embed-path in the LSTM training cell).
!wget http://nlp.stanford.edu/data/glove.840B.300d.zip

In [0]:
# Unpack the archive; the training cell below reads glove.840B.300d.txt
!unzip glove.840B.300d.zip

In [0]:
# --lr 0.001. --lr-shrink
# Delete the whole checkpoints/ directory (including checkpoints/fconv from the previous
# model) so the LSTM run below starts with an empty checkpoint directory.
! rm -rf checkpoints

In [0]:
# Train the LSTM baseline: 2-layer bidirectional encoder and 2-layer decoder with hidden size 600,
# 300-dimensional embeddings loaded from glove.840B.300d.txt on both sides, SGD, dropout 0.3, --clip-norm 5.
# --lr lists 15 values (1.0 eight times, then halved each step) for the 15 epochs —
# presumably one learning rate per epoch; confirm against the fairseq lr-scheduler in use.
# No --save-dir is given; the generate cell below reads checkpoints/checkpoint_best.pt.
!CUDA_VISIBLE_DEVICES=0 fairseq-train preprocessed_data/ \
     --clip-norm 5 --batch-size 64 \
     --arch lstm --max-epoch 15 --encoder-hidden-size 600 --encoder-layers 2 \
     --decoder-hidden-size 600 --decoder-layers 2 --optimizer sgd  --dropout 0.3 --encoder-embed-path glove.840B.300d.txt \
     --encoder-bidirectional --encoder-embed-dim 300 --decoder-embed-dim 300 --no-epoch-checkpoints --decoder-embed-path glove.840B.300d.txt --decoder-out-embed-dim 300 --num-workers 3 \
     --lr 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.5,0.25,0.125,0.0625,0.03125,0.015625,0.0078125

In [0]:
# Generate with the best LSTM checkpoint using beam size 3; the output is redirected
# to gen.out, so nothing is echoed in the notebook.
!fairseq-generate preprocessed_data \
    --path checkpoints/checkpoint_best.pt \
    --batch-size 64 --beam 3 > gen.out

In [0]:
# Extract hypotheses (H-lines, from the 3rd tab-separated field on) and references
# (T-lines, from the 2nd field on) from gen.out, then score them with fairseq-score.
!grep ^H gen.out | cut -f3- > gen.out.sys
!grep ^T gen.out | cut -f2- > gen.out.ref
!fairseq-score --sys gen.out.sys --ref gen.out.ref

In [0]:
# Eyeball the results: the beginning of the generated questions, a separator line,
# then the beginning of the reference questions.
!head gen.out.sys 
print('---------')
!head gen.out.ref

### Baseline LSTM with Sentence Filtering

#### Filtering the Squad Dataset

In [0]:
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell at the top.
from spacy.lang.en import English

# English pipeline object to which only a "sentencizer" component is added here;
# extract_filtered_sentences below calls it to split a paragraph into sentences.
# NOTE(review): create_pipe(...) + add_pipe(component) looks like the spaCy v2 calling
# convention — confirm against the installed spaCy version.
nlp_sentence = English()
nlp_sentence.add_pipe(nlp_sentence.create_pipe("sentencizer"))

In [0]:
def extract_filtered_sentences(questionanswers, para):
    """
    Method returns filtered sentences from the answers and para for SQUAD

    The paragraph is split into sentences with the module-level nlp_sentence
    pipeline. For every entry of questionanswers["answers"], the sentence
    whose character span contains the entry's "answer_start" offset is
    selected. Selected sentences have newlines replaced by spaces, are
    stripped, de-duplicated, kept in order of first selection and joined with
    a single space.

    Input:
    questionanswers: dict -> one 'qas' entry; its "answers" list holds dicts
                     with an integer "answer_start" character offset into para
    para: string -> the paragraph text the offsets refer to

    Returns:
    string -> the selected sentences ("" when the "answers" list is empty)

    Raises:
    ValueError -> when an answer offset does not fall inside any sentence
    """
    tokenized_paragraph = nlp_sentence(para)
    # text_with_ws keeps each sentence's trailing whitespace; the running sum of the
    # lengths is used as the sentence's start offset in `para`
    # (assumes the sentences tile the paragraph text without gaps — TODO confirm).
    sentences = [sent.text_with_ws for sent in tokenized_paragraph.sents]

    # A list (not a set) keeps the output order deterministic across runs
    filtered_sentences = []

    # This iterates over every answer in question
    for answer in questionanswers["answers"]:
        answer_index = answer["answer_start"]
        sentence_start = 0

        # find sentence that has answer and filter them
        for sentence in sentences:
            sentence_end = sentence_start + len(sentence)
            # Half-open span [start, end): an answer that starts exactly at
            # `sentence_end` belongs to the NEXT sentence, hence `<` and not `<=`
            if answer_index < sentence_end:
                cleaned = sentence.replace("\n", " ").strip()
                if cleaned not in filtered_sentences:
                    filtered_sentences.append(cleaned)
                break
            sentence_start = sentence_end
        else:
            # Checked per answer, so a bad offset is reported even when an
            # earlier answer of the same question was located successfully
            raise ValueError(
                "One of the Answers had no sentence please check the data "
                "(answer_start: {}, Length : {})".format(answer_index, sentence_start)
            )

    return " ".join(filtered_sentences)

In [0]:
def filter_sentences_on_answer(dataset, dataset_type="train", get_impossible=False):
    """
    Filter the paragraph with only sentences relevant to answer and generates files
    with sentences and questions instead of paragraphs and questions

    For every question that is not flagged "is_impossible", one line is written
    to dataset_type.paragraphs (the lower-cased result of
    extract_filtered_sentences) and one line to dataset_type.questions (the
    lower-cased question with newlines replaced by spaces).

    Input:
    dataset: list -> the 'data' entries of the parsed json; each entry has a
             "paragraphs" list whose items carry "context" and "qas"
    dataset_type: string -> output file prefix such as 'train' or 'test'
    get_impossible: boolean -> currently ignored: questions flagged
             "is_impossible" are always skipped
    """
    dataset_size = 0

    logger.debug("Starting to filter sentences on answer")

    # `with` guarantees both files are flushed and closed even if a record raises
    with open(dataset_type + '.paragraphs', 'w', encoding='utf-8') as para_output, \
            open(dataset_type + '.questions', 'w', encoding='utf-8') as question_output:
        # This loop iterates over every top-level entry of the dataset
        for article in tqdm(dataset):
            for paragraph in article["paragraphs"]:
                para = paragraph["context"]
                # This loop iterates over every question in para
                for questionanswers in paragraph["qas"]:
                    # A record without the "is_impossible" key is treated as answerable
                    if questionanswers.get("is_impossible", False):
                        continue
                    # An embedded newline would otherwise split the question over two lines
                    question = questionanswers["question"].replace("\n", " ")

                    filtered_sentences = extract_filtered_sentences(questionanswers, para)

                    para_output.write(filtered_sentences.strip().lower() + "\n")
                    question_output.write(question.strip().lower() + "\n")

                    dataset_size += 1

    logger.info("Size of the {} dataset: {}".format(dataset_type, dataset_size))
    logger.debug("Sentences Filtered on Answers")

In [0]:
# Rewrite the train and test text files using only the answer-bearing sentences
filter_sentences_on_answer(dataset=train_data, dataset_type='train')
filter_sentences_on_answer(dataset=test_data, dataset_type='test')

In [0]:
# Split the sentence-filtered train files into train/valid (overwrites the earlier split files).
# NOTE(review): `random` is not re-seeded here, so the split depends on how many random
# draws earlier cells made — re-seed first if the split must be reproducible on its own.
split_train_valid()

#### Training

In [0]:
# Fetch and unpack the glove.840B.300d embeddings again and delete the checkpoints/ directory
# so the next training run starts without earlier checkpoints.
# NOTE(review): this repeats the download/unzip cells from the previous section.
!wget http://nlp.stanford.edu/data/glove.840B.300d.zip
!unzip glove.840B.300d.zip
!rm -rf checkpoints

In [0]:
# NOTE(review): no path operand is given, so this command appears to delete nothing; it may be
# a truncated `rm -rf <dir>` — confirm the intended target or drop the cell.
!rm -rf

In [0]:
# Binarize the sentence-filtered train/valid/test files into preprocessed_data/ with the same
# vocabulary-size options as the earlier preprocess cell (this call passes no --seed).
# NOTE(review): preprocessed_data/ already holds the output of the earlier preprocess cell;
# fairseq-preprocess may refuse to overwrite existing dictionaries — confirm whether the
# directory has to be removed first.
!fairseq-preprocess --source-lang paragraphs --target-lang questions \
     --trainpref train --testpref test --validpref valid\
     --destdir preprocessed_data --nwordssrc 45000 --nwordstgt 28000

#--nwordssrc 45000 --nwordstgt 28000

In [0]:
# Train the same LSTM configuration as the previous section, now on the sentence-filtered data:
# 2-layer bidirectional encoder / 2-layer decoder with hidden size 600, 300-dimensional embeddings
# loaded from glove.840B.300d.txt, SGD, dropout 0.3, --clip-norm 5, and a 15-value --lr list
# (presumably one learning rate per epoch — confirm against the fairseq lr-scheduler in use).
!CUDA_VISIBLE_DEVICES=0 fairseq-train preprocessed_data/ \
     --clip-norm 5 --batch-size 64 \
     --arch lstm --max-epoch 15 --encoder-hidden-size 600 --encoder-layers 2 \
     --decoder-hidden-size 600 --decoder-layers 2 --optimizer sgd  --dropout 0.3 --encoder-embed-path glove.840B.300d.txt \
     --encoder-bidirectional --encoder-embed-dim 300 --decoder-embed-dim 300 --no-epoch-checkpoints --decoder-embed-path glove.840B.300d.txt --decoder-out-embed-dim 300 --num-workers 3 \
     --lr 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.5,0.25,0.125,0.0625,0.03125,0.015625,0.0078125

In [0]:
# Generate with the best checkpoint using beam size 3; `tee` shows the output in the
# notebook and also saves it to gen.out for scoring.
!fairseq-generate preprocessed_data \
    --path checkpoints/checkpoint_best.pt \
    --batch-size 64 --beam 3 | tee gen.out

In [0]:
# Extract hypotheses (H-lines, from the 3rd tab-separated field on) and references
# (T-lines, from the 2nd field on) from gen.out, then score them with fairseq-score.
!grep ^H gen.out | cut -f3- > gen.out.sys
!grep ^T gen.out | cut -f2- > gen.out.ref
!fairseq-score --sys gen.out.sys --ref gen.out.ref

In [0]:
# Show the first 100 generated questions for manual inspection
!head -n 100 gen.out.sys