# WikiQA Question Answering through Transfer Learning


## 0. Install necessary packages and restart the kernel

In [None]:
# Upgrade pip, then install/upgrade the Kubeflow Pipelines SDK (kfp) for the current user.
!python -m pip install --user --upgrade pip
!pip install --user --upgrade kfp

In [None]:
# Restart the notebook kernel by emitting a script that calls
# Jupyter.notebook.kernel.restart() in the browser.
# NOTE(review): presumably needed so the packages installed in the previous
# cell are importable; re-run the following cells after the restart.
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

In [None]:
# Import Kubeflow SDK
import kfp
import kfp.dsl as dsl
from kfp.components import OutputPath, InputPath, func_to_container_op, OutputBinaryFile, InputBinaryFile


## 1. Download data

In [None]:
def download(squad_url, dataset_path: OutputPath()):
    """Download and unpack every zip archive the pipeline needs.

    Three archives are streamed to a temporary file and extracted:

    * the WikiQA corpus, into ``dataset_path``;
    * the GloVe 6B vectors, into ``dataset_path + "/glove"``;
    * the archive at ``squad_url``, into ``dataset_path``.

    Args:
        squad_url: URL of a zip archive to extract into ``dataset_path``
            (the pipeline passes the SQuAD-pretrained model archive here).
        dataset_path: Output directory that receives the extracted files.

    Raises:
        requests.HTTPError: If a server answers with an error status.
        zipfile.BadZipFile: If a downloaded payload is not a valid zip file.
    """
    import os
    import tempfile
    import zipfile

    import requests

    from tqdm import tqdm

    # The helper is nested on purpose: func_to_container_op ships only this
    # function's source, so everything it needs must live inside it.
    def fetch_and_extract(url, target_dir, chunk_size=64 * 1024):
        """Stream the zip at ``url`` to a temp file and extract it into ``target_dir``."""
        response = requests.get(url, stream=True)
        try:
            # Fail early with a clear HTTP error instead of a confusing
            # BadZipFile raised on an HTML error page.
            response.raise_for_status()
            total_size_in_bytes = int(response.headers.get("content-length", 0))
            downloaded = 0
            with tempfile.TemporaryFile() as tmp:
                with tqdm(
                    total=total_size_in_bytes, unit="iB", unit_scale=True
                ) as progress_bar:
                    for chunk in response.iter_content(chunk_size=chunk_size):
                        progress_bar.update(len(chunk))
                        downloaded += len(chunk)
                        tmp.write(chunk)
                # ZipFile seeks within the file object itself, so no rewind
                # is needed after writing.
                with zipfile.ZipFile(tmp, "r") as archive:
                    archive.extractall(target_dir)
        finally:
            response.close()

        # Kept as a warning rather than an exception: the advertised
        # content-length can legitimately differ from the number of bytes
        # yielded when the transfer is content-encoded.
        if total_size_in_bytes != 0 and downloaded != total_size_in_bytes:
            print(
                "ERROR, something went wrong: got %d of %d bytes from %s"
                % (downloaded, total_size_in_bytes, url)
            )

    # Download WikiQA
    fetch_and_extract(
        "https://download.microsoft.com/download/E/5/F/E5FCFCEE-7005-4814-853D-DAA7C66507E0/WikiQACorpus.zip",
        dataset_path,
    )

    # Download GloVe
    GLOVE_DIR = dataset_path + "/glove"
    os.makedirs(GLOVE_DIR, exist_ok=True)
    fetch_and_extract("http://nlp.stanford.edu/data/glove.6B.zip", GLOVE_DIR)

    # Download Squad
    fetch_and_extract(squad_url, dataset_path)

    print(os.listdir(dataset_path))


## 2. Preprocess wikiqa

In [None]:
def prepro_class(dataset_path: InputPath(str), wikiqa_path: OutputPath(str)):
    """Run the ``wikiqa.prepro_class.prepro`` preprocessing step.

    Args:
        dataset_path: Input directory; ``/WikiQACorpus`` and ``/glove`` below
            it are handed to ``prepro`` as source and GloVe directories.
        wikiqa_path: Output directory; ``prepro`` is pointed at
            ``/wikiqa-class`` below it as its target directory.
    """
    import nltk

    from wikiqa.prepro_class import prepro

    # Fetch the NLTK "punkt" resource before preprocessing starts.
    nltk.download("punkt")

    from types import SimpleNamespace

    # prepro() takes an argparse-like namespace, so the options are built by hand.
    prepro_options = SimpleNamespace(
        source_dir="{}/WikiQACorpus".format(dataset_path),
        target_dir="{}/wikiqa-class".format(wikiqa_path),
        debug=False,
        glove_corpus="6B",
        glove_dir="{}/glove".format(dataset_path),
        glove_vec_size="100",
        tokenizer="PTB",
    )
    prepro(prepro_options)

## 3. WikiQA train

In [None]:
def wikiqa_train(
    dataset_path: InputPath(str),
    wikiqa_path: InputPath(str),
    load_path,
    shared_path,
    run_id,
    sent_size_th,
    ques_size_th,
    num_epochs,
    num_steps,
    eval_period,
    save_period,
    model_path: OutputPath(str),
):
    """Invoke ``basic.cli.main`` through ``tf.app.run`` with a hand-built argv.

    Args:
        dataset_path: Input directory; ``load_path`` and ``shared_path`` are
            appended to it to form ``--load_path`` / ``--shared_path``.
        wikiqa_path: Input directory; ``/wikiqa-class`` below it is ``--data_dir``.
        load_path: Suffix appended to ``dataset_path`` for ``--load_path``.
        shared_path: Suffix appended to ``dataset_path`` for ``--shared_path``.
        run_id: Value forwarded as ``--run_id``.
        sent_size_th: Value forwarded as ``--sent_size_th``.
        ques_size_th: Value forwarded as ``--ques_size_th``.
        num_epochs: Value forwarded as ``--num_epochs``.
        num_steps: Value forwarded as ``--num_steps``.
        eval_period: Value forwarded as ``--eval_period``.
        save_period: Value forwarded as ``--save_period``.
        model_path: Output directory; ``/out/wikiqa`` below it is ``--out_base_dir``.
    """
    import tensorflow as tf

    from basic.cli import main

    # Options that locate data, output and the model to load.
    location_options = (
        ("--data_dir", wikiqa_path + "/wikiqa-class"),
        ("--out_base_dir", model_path + "/out/wikiqa"),
        ("--load_path", dataset_path + load_path),
        ("--shared_path", dataset_path + shared_path),
    )
    # Options forwarded unchanged from the component inputs.
    tuning_options = (
        ("--run_id", run_id),
        ("--sent_size_th", sent_size_th),
        ("--ques_size_th", ques_size_th),
        ("--num_epochs", num_epochs),
        ("--num_steps", num_steps),
        ("--eval_period", eval_period),
        ("--save_period", save_period),
    )

    cli_argv = ["./basic/cli.py"]
    for option, value in location_options:
        cli_argv.extend((option, value))
    # Value-less switch; it sits between the two option groups.
    cli_argv.append("--load_trained_model")
    for option, value in tuning_options:
        cli_argv.extend((option, value))

    tf.app.run(main, argv=cli_argv)


## 4. WikiQA test

In [None]:
def wikiqa_test(
    dataset_path: InputPath(str),
    wikiqa_path: InputPath(str),
    prev_model_path: InputPath(str),
    shared_path,
    run_id,
    sent_size_th,
    ques_size_th,
    num_epochs,
    num_steps,
    eval_period,
    save_period,
    mlpipeline_metrics_path: OutputPath("Metrics"),
    model_path: OutputPath(str),
):
    """Run ``basic.main.main`` in ``test`` mode and write accuracy/loss metrics.

    The ``out/wikiqa`` tree of ``prev_model_path`` is first copied under
    ``model_path``; the configuration is then expressed as TensorFlow flag
    defaults, and the ``acc`` and ``loss`` attributes of the object returned
    by ``basic.main.main`` are dumped as JSON to ``mlpipeline_metrics_path``.

    Args:
        dataset_path: Input directory; ``shared_path`` is appended to it to
            form the ``shared_path`` flag default.
        wikiqa_path: Input directory; ``/wikiqa-class`` below it is the
            ``data_dir`` flag default.
        prev_model_path: Input directory whose ``/out/wikiqa`` subtree is
            copied to ``model_path + "/out/wikiqa"``.
        shared_path: Suffix appended to ``dataset_path``.
        run_id: Default of the ``run_id`` flag; also used (zero-filled to two
            characters) as the last component of ``config.out_dir``.
        sent_size_th: Converted with ``int()`` for the ``sent_size_th`` flag.
        ques_size_th: Converted with ``int()`` for the ``ques_size_th`` flag.
        num_epochs: Converted with ``int()`` for the ``num_epochs`` flag.
        num_steps: Converted with ``int()`` for the ``num_steps`` flag.
        eval_period: Converted with ``int()`` for the ``eval_period`` flag.
        save_period: Converted with ``int()`` for the ``save_period`` flag.
        mlpipeline_metrics_path: Output file receiving the metrics JSON.
        model_path: Output directory; ``/out/wikiqa`` below it is the
            ``out_base_dir`` flag default.
    """
    input_dir = wikiqa_path + "/wikiqa-class"
    output_dir = model_path + "/out/wikiqa"

    import shutil

    # Copy the previous step's output tree into this step's output location
    # (dst is the same path as output_dir above).
    src = prev_model_path + "/out/wikiqa"
    dst = model_path + "/out/wikiqa"
    shutil.copytree(src, dst)

    full_shared_path = dataset_path + shared_path
    import os

    import tensorflow as tf

    flags = tf.app.flags

    # NOTE(review): the configuration is carried by the flag *defaults* below;
    # presumably no command-line overrides are expected here — confirm.

    # Names and directories
    flags.DEFINE_string("model_name", "basic-class", "Model name [basic | basic-class]")
    flags.DEFINE_string("data_dir", input_dir, "Data dir [data/squad]")
    flags.DEFINE_string("run_id", run_id, "Run ID [0]")
    flags.DEFINE_string("out_base_dir", output_dir, "out base dir [out]")
    flags.DEFINE_string("forward_name", "single", "Forward name [single]")
    flags.DEFINE_string("answer_path", "", "Answer path []")
    flags.DEFINE_string("eval_path", "", "Eval path []")
    flags.DEFINE_string("load_path", "", "Load path []")
    flags.DEFINE_string("shared_path", full_shared_path, "Shared path []")

    # Device placement
    flags.DEFINE_string(
        "device", "/cpu:0", "default device for summing gradients. [/cpu:0]"
    )
    flags.DEFINE_string(
        "device_type",
        "gpu",
        "device for computing gradients (parallelization). cpu | gpu [gpu]",
    )
    flags.DEFINE_integer(
        "num_gpus", 1, "num of gpus or cpus for computing gradients [1]"
    )

    # Essential training and test options
    flags.DEFINE_string("mode", "test", "train | test | forward [test]")
    flags.DEFINE_boolean("load", True, "load saved data? [True]")
    flags.DEFINE_bool("single", False, "supervise only the answer sentence? [False]")
    flags.DEFINE_boolean("debug", False, "Debugging mode? [False]")
    flags.DEFINE_bool(
        "load_ema", True, "load exponential average of variables when testing?  [True]"
    )
    flags.DEFINE_bool("eval", True, "eval? [True]")
    flags.DEFINE_bool("train_only_output", False, "Train only output module?")
    flags.DEFINE_bool("load_trained_model", False, "Load SQUAD trained model")
    flags.DEFINE_bool("freeze_phrase_layer", False, "Freeze phrase layer")
    flags.DEFINE_bool("freeze_att_layer", False, "Freeze att layer")
    flags.DEFINE_bool(
        "freeze_span_modelling_layer", False, "Freeze modelling layer for span"
    )

    flags.DEFINE_bool("using_shared", False, "using pre-created shared.json")
    flags.DEFINE_bool("load_shared", False, "load shared.json for each batch")
    flags.DEFINE_string("dev_name", "test", "using dev or test?")
    flags.DEFINE_string("test_name", "test", "using test or dev?")

    # Training / test parameters
    flags.DEFINE_integer("batch_size", 60, "Batch size [60]")
    flags.DEFINE_integer("val_num_batches", 100, "validation num batches [100]")
    flags.DEFINE_integer("test_num_batches", 0, "test num batches [0]")
    flags.DEFINE_integer(
        "num_epochs", int(num_epochs), "Total number of epochs for training [12]"
    )
    flags.DEFINE_integer("num_steps", int(num_steps), "Number of steps [20000]")
    flags.DEFINE_integer("load_step", 0, "load step [0]")
    flags.DEFINE_float("init_lr", 0.5, "Initial learning rate [0.5]")
    flags.DEFINE_float(
        "input_keep_prob", 0.8, "Input keep prob for the dropout of LSTM weights [0.8]"
    )
    flags.DEFINE_float(
        "keep_prob", 0.8, "Keep prob for the dropout of Char-CNN weights [0.8]"
    )
    flags.DEFINE_float("wd", 0.0, "L2 weight decay for regularization [0.0]")
    flags.DEFINE_integer("hidden_size", 100, "Hidden size [100]")
    flags.DEFINE_integer("char_out_size", 100, "char-level word embedding size [100]")
    flags.DEFINE_integer("char_emb_size", 8, "Char emb size [8]")
    flags.DEFINE_string(
        "out_channel_dims",
        "100",
        "Out channel dims of Char-CNN, separated by commas [100]",
    )
    flags.DEFINE_string(
        "filter_heights", "5", "Filter heights of Char-CNN, separated by commas [5]"
    )
    flags.DEFINE_bool("finetune", False, "Finetune word embeddings? [False]")
    flags.DEFINE_bool("highway", True, "Use highway? [True]")
    flags.DEFINE_integer("highway_num_layers", 2, "highway num layers [2]")
    flags.DEFINE_bool("share_cnn_weights", True, "Share Char-CNN weights [True]")
    flags.DEFINE_bool(
        "share_lstm_weights",
        True,
        "Share pre-processing (phrase-level) LSTM weights [True]",
    )
    flags.DEFINE_float(
        "var_decay", 0.999, "Exponential moving average decay for variables [0.999]"
    )
    flags.DEFINE_string("classifier", "maxpool", "[maxpool, sumpool, default]")

    # Optimizations
    flags.DEFINE_bool("cluster", True, "Cluster data for faster training [False]")
    flags.DEFINE_bool("len_opt", True, "Length optimization? [False]")
    flags.DEFINE_bool(
        "cpu_opt", False, "CPU optimization? GPU computation can be slower [False]"
    )

    # Logging and saving options
    flags.DEFINE_boolean("progress", True, "Show progress? [True]")
    flags.DEFINE_integer("log_period", 100, "Log period [100]")
    flags.DEFINE_integer("eval_period", int(eval_period), "Eval period [1000]")
    flags.DEFINE_integer("save_period", int(save_period), "Save Period [1000]")
    flags.DEFINE_integer("max_to_keep", 20, "Max recent saves to keep [20]")
    flags.DEFINE_bool("dump_eval", True, "dump eval? [True]")
    flags.DEFINE_bool("dump_answer", False, "dump answer? [True]")
    flags.DEFINE_bool("vis", False, "output visualization numbers? [False]")
    flags.DEFINE_bool("dump_pickle", True, "Dump pickle instead of json? [True]")
    flags.DEFINE_float(
        "decay", 0.9, "Exponential moving average decay for logging values [0.9]"
    )

    # Thresholds for speed and less memory usage
    flags.DEFINE_integer("word_count_th", 10, "word count th [100]")
    flags.DEFINE_integer("char_count_th", 50, "char count th [500]")
    flags.DEFINE_integer("sent_size_th", int(sent_size_th), "sent size th [64]")
    flags.DEFINE_integer("num_sents_th", 1, "num sents th [8]")
    flags.DEFINE_integer("ques_size_th", int(ques_size_th), "ques size th [32]")
    flags.DEFINE_integer("word_size_th", 16, "word size th [16]")
    flags.DEFINE_integer("para_size_th", 256, "para size th [256]")

    # Advanced training options
    flags.DEFINE_bool("lower_word", True, "lower word [True]")
    flags.DEFINE_bool("squash", False, "squash the sentences into one? [False]")
    flags.DEFINE_bool("swap_memory", True, "swap memory? [True]")
    flags.DEFINE_string("data_filter", "max", "max | valid | semi [max]")
    flags.DEFINE_bool("use_glove_for_unk", True, "use glove for unk [False]")
    flags.DEFINE_bool(
        "known_if_glove", True, "consider as known if present in glove [False]"
    )
    flags.DEFINE_string("logit_func", "tri_linear", "logit func [tri_linear]")
    flags.DEFINE_string("answer_func", "linear", "answer logit func [linear]")
    flags.DEFINE_string("sh_logit_func", "tri_linear", "sh logit func [tri_linear]")

    # Ablation options
    flags.DEFINE_bool("use_char_emb", True, "use char emb? [True]")
    flags.DEFINE_bool("use_word_emb", True, "use word embedding? [True]")
    flags.DEFINE_bool("q2c_att", True, "question-to-context attention? [True]")
    flags.DEFINE_bool("c2q_att", True, "context-to-question attention? [True]")
    flags.DEFINE_bool("dynamic_att", False, "Dynamic attention [False]")

    # Entry point handed to tf.app.run below; closes over the flags and
    # mlpipeline_metrics_path.
    def main(_):
        from basic.main import main as m

        config = flags.FLAGS
        config.model_name = "basic-class"
        # <out_base_dir>/<model_name>/<run_id zero-filled to 2 characters>
        config.out_dir = os.path.join(
            config.out_base_dir, config.model_name, str(config.run_id).zfill(2)
        )

        print(config.out_dir)
        evaluator = m(config)

        # NOTE(review): the string below is a no-op expression statement, not a
        # docstring; the metrics come from this WikiQA test run.
        """Generating metrics for the squad model"""
        metrics = {
            "metrics": [
                {
                    "name": "accuracy-score",
                    "numberValue": str(evaluator.acc),
                    "format": "RAW",
                },
                {
                    "name": "loss",
                    "numberValue": str(evaluator.loss),
                    "format": "RAW",
                },
            ]
        }

        import json

        with open(mlpipeline_metrics_path, "w") as f:
            json.dump(metrics, f)

    tf.app.run(main)


## 5. WikiQA evaluate

In [None]:
def wikiqa_evaluate(
    wikiqa_path: InputPath(str),
    model_path: InputPath(str),
    start_step,
    end_step,
    eval_period,
    run_ids,
    mlpipeline_metrics_path: OutputPath("Metrics"),
):
    """Pick the best checkpoint step per run and export its scores as metrics.

    For every run id, ``wikiqa.result.evaluate`` is called for each global
    step in ``range(start_step, end_step + eval_period, eval_period)``. The
    step with the highest first score is kept, and its three scores are
    written as ``MAP``/``MRR``/``P1`` metrics to ``mlpipeline_metrics_path``.

    Args:
        wikiqa_path: Input directory; ``/wikiqa-class`` below it is the data dir.
        model_path: Input directory; ``/out/wikiqa/basic-class`` below it is
            the eval dir.
        start_step: First global step to evaluate (converted with ``int()``).
        end_step: Last global step to evaluate (converted with ``int()``).
        eval_period: Distance between evaluated steps (converted with
            ``int()``); must be non-zero because it is the ``range`` step.
        run_ids: Comma-separated run ids; surrounding whitespace and empty
            entries are ignored.
        mlpipeline_metrics_path: Output file receiving the metrics JSON.
    """
    import json
    from types import SimpleNamespace

    from wikiqa.result import evaluate
    from wikiqa.result import load

    end_step = int(end_step)
    start_step = int(start_step)
    eval_period = int(eval_period)

    # wikiqa.result expects an argparse-like namespace.
    args = SimpleNamespace(
        data_dir=wikiqa_path + "/wikiqa-class",
        eval_dir=model_path + "/out/wikiqa/basic-class",
        run_ids=run_ids,
        eval_name="test",
        eval_period=eval_period,
        start_step=start_step,
        end_step=end_step,
        steps="",
        ensemble=False,
    )
    data = load(args)

    metrics_list = []
    for raw_run_id in args.run_ids.split(","):
        run_id = raw_run_id.strip()
        if not run_id:
            # Tolerate "00, 01" and trailing commas.
            continue

        best_eval, best_global_step = (0, 0, 0), -1
        print("Evaluate run_id = %s..." % run_id)
        for global_step in range(
            args.start_step, args.end_step + args.eval_period, args.eval_period
        ):
            curr_eval = evaluate(args, [run_id], data, [global_step])
            # Only the first score decides which step is best.
            if curr_eval[0] > best_eval[0]:
                best_eval, best_global_step = curr_eval, global_step
        print(
            "Best MAP: %.2f\tMRR: %.2f\tP@1: %.2f in global step %d"
            % (best_eval[0], best_eval[1], best_eval[2], best_global_step)
        )

        # Metrics are keyed by run id (not by best step) so that several runs
        # can never produce colliding metric names.
        for label, score in zip(("MAP", "MRR", "P1"), best_eval):
            metrics_list.append(
                {
                    "name": "%s_for_run_%s" % (label, run_id),
                    "numberValue": str(score),
                    "format": "RAW",
                }
            )

    with open(mlpipeline_metrics_path, "w") as f:
        json.dump({"metrics": metrics_list}, f)


## 6. Transform python functions into components

In [None]:
def _qatransfer_component(func):
    """Wrap ``func`` as a component on the image and package the qatransfer steps share."""
    return func_to_container_op(
        func,
        base_image="sciling/tensorflow:0.12.0-py3",
        packages_to_install=[
            "https://github.com/sciling/qatransfer/archive/refs/heads/master.zip#egg=qatransfer"
        ],
    )


# The download step only needs tqdm on top of its base image.
download_op = func_to_container_op(
    download,
    base_image="tensorflow/tensorflow:latest-gpu-py3",
    packages_to_install=["tqdm"],
)

# Every other step runs on the same image with the qatransfer package installed.
wikiqa_prepro_op = _qatransfer_component(prepro_class)
train_op = _qatransfer_component(wikiqa_train)
evaluate_op = _qatransfer_component(wikiqa_evaluate)
test_op = _qatransfer_component(wikiqa_test)

## 7. Define pipeline and component connections

In [None]:
def qa_pipeline(
    squad_url: str = "https://github.com/sciling/qatransfer/releases/download/v0.1/save_class.zip",
    squad_load_path: str = "/save/out/squad/basic-class/00/save/basic-class-1",
    squad_shared_path: str = "/save/out/squad/basic-class/00/shared.json",
    run_id: str = "00",
    sent_size_th: str = "500",
    ques_size_th: str = "30",
    num_epochs: str = "12",
    num_steps: str = "5000",
    eval_period: str = "200",
    save_period: str = "200",
    start_step: int = 2001,
    end_step: int = 2201,
):
    """Wire the steps together: download, preprocess, train, then evaluate and test.

    The evaluate and test steps both consume the output of the train step.
    """
    # Settings that the train and test steps receive identically.
    common_settings = dict(
        run_id=run_id,
        sent_size_th=sent_size_th,
        ques_size_th=ques_size_th,
        num_epochs=num_epochs,
        num_steps=num_steps,
        eval_period=eval_period,
        save_period=save_period,
    )

    # Download
    download_task = download_op(squad_url)

    # Preprocess wikiqa
    prepro_task = wikiqa_prepro_op(download_task.output)

    # Train wikiqa with pretrained model SQUAD
    train_task = train_op(
        download_task.output,
        prepro_task.output,
        load_path=squad_load_path,
        shared_path=squad_shared_path,
        **common_settings
    )

    # Score the saved checkpoints of the trained model
    evaluate_op(
        prepro_task.output,
        train_task.output,
        start_step,
        end_step,
        eval_period,
        run_id,
    )

    # Run the trained model in test mode
    test_op(
        download_task.output,
        prepro_task.output,
        train_task.output,
        shared_path=squad_shared_path,
        **common_settings
    )


## 8. Connect with AWS Client

In [None]:
# Silence urllib3's InsecureRequestWarning, since certificate verification is
# switched off below.
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Disable ssl verification
from kfp_server_api.configuration import Configuration as Configuration
# Keep the original __init__ only once, so re-running this cell does not wrap
# the already-patched constructor again.
if 'old_init' not in globals():
    old_init = Configuration.__init__
# Replacement constructor: run the original one, then force verify_ssl off.
def new_init(self, *k, **kw):
    old_init(self, *k, **kw)
    self.verify_ssl = False
Configuration.__init__ = new_init
# Placeholder: replace with a real session cookie/token before running.
# NOTE(review): do not commit a real token to the notebook.
cookies = "YOUR TOKEN"
client = kfp.Client(host='http://istio-ingressgateway.istio-system.svc/pipeline', namespace='admin', cookies=cookies)
# Listing the experiments doubles as a quick connectivity check.
client.list_experiments(namespace="admin")

## 9. Compile the pipeline

In [None]:
# Names reused by the run-creation cell below.
pipeline_func = qa_pipeline
experiment_name = 'wikiqa'
run_name = '{} run'.format(pipeline_func.__name__)

# Compile the pipeline into "<experiment_name>.zip".
kfp.compiler.Compiler().compile(pipeline_func, experiment_name + '.zip')

## 10. Define arguments and create a run

In [None]:
# Pipeline parameter overrides for this run; parameters not listed here keep
# the defaults declared on the pipeline function.
arguments = dict(
    sent_size_th='10',
    ques_size_th='10',
    num_epochs='1',
    num_steps='3',
    eval_period='1',
    save_period='1',
    start_step=2,
    end_step=2,
)

# Submit the run straight from the pipeline function.
run_result = client.create_run_from_pipeline_func(
    pipeline_func,
    arguments=arguments,
    run_name=run_name,
    experiment_name=experiment_name,
    namespace='admin',
)