# Squad Pipeline for Question Answering


## 0. Install necessary packages and restart the kernel

In [None]:
# pip requirements: upgrade pip itself, then install/upgrade the Kubeflow
# Pipelines SDK (kfp). Both commands use --user, i.e. the per-user site-packages.
!python -m pip install --user --upgrade pip
!pip install --user --upgrade kfp

In [None]:
# Restart the kernel by rendering a <script> that calls
# Jupyter.notebook.kernel.restart() in the notebook front end
# (presumably so the packages installed above are picked up on import).
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

In [None]:
# Import Kubeflow SDK
import kfp
import kfp.dsl as dsl
from kfp.components import OutputPath, InputPath, func_to_container_op

## 1. Download datasets

In [None]:
def download(dataset_path: OutputPath(str)):
    """Download the GloVe 6B archive and the SQuAD v1.1 train/dev files.

    Everything is written below ``dataset_path``:

    * ``data/glove``  - contents of ``glove.6B.zip`` (extracted)
    * ``data/squad``  - ``train-v1.1.json`` and ``dev-v1.1.json``

    Args:
        dataset_path: Output directory path (string) the files are written to.

    Raises:
        requests.HTTPError: If any download answers with an HTTP error status.
            Failing here avoids handing an error page to ``zipfile`` or
            ``json.loads`` and getting a misleading parse error instead.
    """
    # All imports are local to the function, as in the original block.
    import json
    import os
    import tempfile
    import zipfile

    import requests

    from tqdm import tqdm

    # Download GloVe
    print("Downloading glove")
    glove_dir = os.path.join(dataset_path, "data", "glove")
    os.makedirs(glove_dir, exist_ok=True)
    # The context manager closes the streamed response even if the download
    # or the extraction fails.
    with requests.get(
        "http://nlp.stanford.edu/data/glove.6B.zip", stream=True
    ) as r:
        r.raise_for_status()
        total_size_in_bytes = int(r.headers.get("content-length", 0))
        # Spool the archive to an anonymous temp file so it never has to be
        # held in memory as a whole.
        with tempfile.TemporaryFile() as tf:
            with tqdm(
                total=total_size_in_bytes, unit="iB", unit_scale=True
            ) as progress_bar:
                # 1 MiB chunks keep the Python-level loop short.
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    progress_bar.update(len(chunk))
                    tf.write(chunk)
            with zipfile.ZipFile(tf, "r") as f:
                f.extractall(glove_dir)
    # Only a warning: the byte count is compared with the advertised
    # content-length, and a mismatch is reported without aborting.
    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
        print("ERROR, something went wrong")

    print("Finished downloading glove")

    # Download SQuAD
    squad_dir = os.path.join(dataset_path, "data", "squad")
    os.makedirs(squad_dir, exist_ok=True)
    squad_base_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/"
    for file_name in ("train-v1.1.json", "dev-v1.1.json"):
        response = requests.get(squad_base_url + file_name)
        response.raise_for_status()
        # Parse before writing so only well-formed JSON is stored.
        squad_json = json.loads(response.text)
        with open(os.path.join(squad_dir, file_name), "w") as f:
            json.dump(squad_json, f)
        print(os.listdir(squad_dir))


## 2. Preprocess squad dataset

In [None]:
def prepro_basic(
    dataset_path: InputPath(str),
    train_ratio,
    glove_vec_size,
    mode,
    tokenizer,
    url,
    port,
    prepro_squad_dir: OutputPath(str),
):
    """Run ``squad.prepro.prepro`` on the downloaded SQuAD and GloVe data.

    Reads ``<dataset_path>/data/squad`` and ``<dataset_path>/data/glove`` and
    targets ``<prepro_squad_dir>/squad``. ``train_ratio`` is converted with
    ``float()``; ``glove_vec_size`` and ``port`` with ``int()``. The remaining
    arguments are forwarded unchanged in the options namespace.
    """
    from types import SimpleNamespace

    import nltk

    from squad.prepro import prepro

    nltk.download("punkt")

    # Convert the numeric parameters before anything else is built.
    ratio = float(train_ratio)
    vec_size = int(glove_vec_size)
    port_number = int(port)

    options = SimpleNamespace(
        source_dir=dataset_path + "/data/squad",
        target_dir=prepro_squad_dir + "/squad",
        debug=False,
        train_ratio=ratio,
        glove_corpus="6B",
        glove_dir=dataset_path + "/data/glove",
        glove_vec_size=vec_size,
        mode=mode,
        single_path="",
        tokenizer=tokenizer,
        url=url,
        port=port_number,
        split=False,
    )
    # Log the effective options, then hand them to the preprocessing routine.
    print(options)
    prepro(options)


## 3. Train SQUAD model

In [None]:
def train(
    prepro_dir: InputPath(str),
    sent_size_th,
    ques_size_th,
    num_epochs,
    num_steps,
    eval_period,
    save_period,
    learning_rate,
    batch_size,
    hidden_size,
    var_decay,
    model_dir: OutputPath(str),
):
    """Launch ``basic.cli.main`` through ``tf.app.run`` with a CLI-style argv.

    Data is read from ``<prepro_dir>/squad`` and ``--out_base_dir`` is set to
    ``<model_dir>/out/squad``. The hyper-parameter arguments are placed in the
    argument vector exactly as received (``learning_rate`` is passed as
    ``--init_lr``).
    """
    import tensorflow as tf

    from basic.cli import main

    # Fixed part of the command line: program name, directories, switches.
    argv = [
        "./basic/cli.py",
        "--model_name",
        "basic",
        "--data_dir",
        prepro_dir + "/squad",
        "--out_base_dir",
        model_dir + "/out/squad",
        "--noload",
        "--dev_name",
        "dev",
    ]

    # Tunable part: one "--flag value" pair per pipeline parameter, in order.
    tunables = (
        ("--sent_size_th", sent_size_th),
        ("--ques_size_th", ques_size_th),
        ("--num_epochs", num_epochs),
        ("--num_steps", num_steps),
        ("--eval_period", eval_period),
        ("--save_period", save_period),
        ("--init_lr", learning_rate),
        ("--batch_size", batch_size),
        ("--hidden_size", hidden_size),
        ("--var_decay", var_decay),
    )
    for flag_name, flag_value in tunables:
        argv.extend((flag_name, flag_value))

    tf.app.run(main, argv)


## 4. Test SQUAD model and generate metrics

In [None]:
def test(
    prepro_dir: InputPath(str),
    prev_model_dir: InputPath(str),
    sent_size_th,
    ques_size_th,
    num_epochs,
    num_steps,
    eval_period,
    save_period,
    learning_rate,
    batch_size,
    hidden_size,
    var_decay,
    mlpipeline_metrics_path: OutputPath("Metrics"),
    model_dir: OutputPath(str),
):
    """Run the "basic" model in test mode and write accuracy/F1 metrics.

    Steps performed:

    1. Copy ``<prev_model_dir>/out/squad`` to ``<model_dir>/out/squad``.
    2. Register the configuration flags on ``tf.app.flags`` (``mode`` defaults
       to ``"test"`` and ``load`` to ``True``).
    3. Run ``basic.main.main`` with those flags via ``tf.app.run`` and write a
       metrics JSON document holding the ``acc`` and ``f1`` attributes of its
       return value to ``mlpipeline_metrics_path``.

    Args:
        prepro_dir: Directory whose ``squad`` sub-directory is used as data dir.
        prev_model_dir: Directory whose ``out/squad`` tree is copied.
        sent_size_th, ques_size_th, num_epochs, num_steps, eval_period,
        save_period, batch_size, hidden_size: Converted with ``int()`` and used
            as the defaults of the flags of the same name.
        learning_rate: Converted with ``float()``; default of ``init_lr``.
        var_decay: Converted with ``float()``; default of ``var_decay``.
        mlpipeline_metrics_path: File the metrics JSON is written to.
        model_dir: Directory that receives the copied ``out/squad`` tree.
    """

    import os
    import shutil

    import tensorflow as tf

    # Copy the previous step's output tree into this step's output directory.
    # NOTE: shutil.copytree is called with default arguments, so `dst` is
    # expected not to exist yet.
    src = prev_model_dir + "/out/squad"
    dst = model_dir + "/out/squad"
    shutil.copytree(src, dst)

    model_name = "basic"
    data_dir = prepro_dir + "/squad"
    # Same path as `dst` above.
    output_dir = model_dir + "/out/squad"

    flags = tf.app.flags

    # NOTE(review): in several help strings below the bracketed "default" does
    # not match the value actually registered (e.g. word_count_th,
    # char_count_th, num_sents_th, cluster, len_opt, use_glove_for_unk,
    # known_if_glove, dump_answer). The registered value is what takes effect.

    # Names and directories
    flags.DEFINE_string("model_name", model_name, "Model name [basic | basic-class]")
    flags.DEFINE_string("data_dir", data_dir, "Data dir [data/squad]")
    flags.DEFINE_string("run_id", "0", "Run ID [0]")
    flags.DEFINE_string("out_base_dir", output_dir, "out base dir [out]")
    flags.DEFINE_string("forward_name", "single", "Forward name [single]")
    flags.DEFINE_string("answer_path", "", "Answer path []")
    flags.DEFINE_string("eval_path", "", "Eval path []")
    flags.DEFINE_string("load_path", "", "Load path []")
    flags.DEFINE_string("shared_path", "", "Shared path []")

    # Device placement
    flags.DEFINE_string(
        "device", "/cpu:0", "default device for summing gradients. [/cpu:0]"
    )
    flags.DEFINE_string(
        "device_type",
        "gpu",
        "device for computing gradients (parallelization). cpu | gpu [gpu]",
    )
    flags.DEFINE_integer(
        "num_gpus", 1, "num of gpus or cpus for computing gradients [1]"
    )

    # Essential training and test options
    flags.DEFINE_string("mode", "test", "train | test | forward [test]")
    flags.DEFINE_boolean("load", True, "load saved data? [True]")
    flags.DEFINE_bool("single", False, "supervise only the answer sentence? [False]")
    flags.DEFINE_boolean("debug", False, "Debugging mode? [False]")
    flags.DEFINE_bool(
        "load_ema", True, "load exponential average of variables when testing?  [True]"
    )
    flags.DEFINE_bool("eval", True, "eval? [True]")
    flags.DEFINE_bool("train_only_output", False, "Train only output module?")
    flags.DEFINE_bool("load_trained_model", False, "Load SQUAD trained model")
    flags.DEFINE_bool("freeze_phrase_layer", False, "Freeze phrase layer")
    flags.DEFINE_bool("freeze_att_layer", False, "Freeze att layer")
    flags.DEFINE_bool(
        "freeze_span_modelling_layer", False, "Freeze modelling layer for span"
    )

    flags.DEFINE_bool("using_shared", False, "using pre-created shared.json")
    flags.DEFINE_bool("load_shared", False, "load shared.json for each batch")
    flags.DEFINE_string("dev_name", "test", "using dev or test?")
    flags.DEFINE_string("test_name", "dev", "using test or dev?")

    # Training / test parameters
    flags.DEFINE_integer("batch_size", int(batch_size), "Batch size [60]")
    flags.DEFINE_integer("val_num_batches", 100, "validation num batches [100]")
    flags.DEFINE_integer("test_num_batches", 0, "test num batches [0]")
    flags.DEFINE_integer(
        "num_epochs", int(num_epochs), "Total number of epochs for training [12]"
    )
    flags.DEFINE_integer("num_steps", int(num_steps), "Number of steps [20000]")
    flags.DEFINE_integer("load_step", 0, "load step [0]")
    flags.DEFINE_float("init_lr", float(learning_rate), "Initial learning rate [0.5]")
    flags.DEFINE_float(
        "input_keep_prob", 0.8, "Input keep prob for the dropout of LSTM weights [0.8]"
    )
    flags.DEFINE_float(
        "keep_prob", 0.8, "Keep prob for the dropout of Char-CNN weights [0.8]"
    )
    flags.DEFINE_float("wd", 0.0, "L2 weight decay for regularization [0.0]")
    flags.DEFINE_integer("hidden_size", int(hidden_size), "Hidden size [100]")
    flags.DEFINE_integer("char_out_size", 100, "char-level word embedding size [100]")
    flags.DEFINE_integer("char_emb_size", 8, "Char emb size [8]")
    flags.DEFINE_string(
        "out_channel_dims",
        "100",
        "Out channel dims of Char-CNN, separated by commas [100]",
    )
    flags.DEFINE_string(
        "filter_heights", "5", "Filter heights of Char-CNN, separated by commas [5]"
    )
    flags.DEFINE_bool("finetune", False, "Finetune word embeddings? [False]")
    flags.DEFINE_bool("highway", True, "Use highway? [True]")
    flags.DEFINE_integer("highway_num_layers", 2, "highway num layers [2]")
    flags.DEFINE_bool("share_cnn_weights", True, "Share Char-CNN weights [True]")
    flags.DEFINE_bool(
        "share_lstm_weights",
        True,
        "Share pre-processing (phrase-level) LSTM weights [True]",
    )
    flags.DEFINE_float(
        "var_decay",
        float(var_decay),
        "Exponential moving average decay for variables [0.999]",
    )
    flags.DEFINE_string("classifier", "maxpool", "[maxpool, sumpool, default]")

    # Optimizations
    flags.DEFINE_bool("cluster", True, "Cluster data for faster training [False]")
    flags.DEFINE_bool("len_opt", True, "Length optimization? [False]")
    flags.DEFINE_bool(
        "cpu_opt", False, "CPU optimization? GPU computation can be slower [False]"
    )

    # Logging and saving options
    flags.DEFINE_boolean("progress", True, "Show progress? [True]")
    flags.DEFINE_integer("log_period", 100, "Log period [100]")
    flags.DEFINE_integer("eval_period", int(eval_period), "Eval period [1000]")
    flags.DEFINE_integer("save_period", int(save_period), "Save Period [1000]")
    flags.DEFINE_integer("max_to_keep", 20, "Max recent saves to keep [20]")
    flags.DEFINE_bool("dump_eval", True, "dump eval? [True]")
    flags.DEFINE_bool("dump_answer", False, "dump answer? [True]")
    flags.DEFINE_bool("vis", False, "output visualization numbers? [False]")
    flags.DEFINE_bool("dump_pickle", True, "Dump pickle instead of json? [True]")
    flags.DEFINE_float(
        "decay", 0.9, "Exponential moving average decay for logging values [0.9]"
    )

    # Thresholds for speed and less memory usage
    flags.DEFINE_integer("word_count_th", 10, "word count th [100]")
    flags.DEFINE_integer("char_count_th", 50, "char count th [500]")
    flags.DEFINE_integer("sent_size_th", int(sent_size_th), "sent size th [64]")
    flags.DEFINE_integer("num_sents_th", 1, "num sents th [8]")
    flags.DEFINE_integer("ques_size_th", int(ques_size_th), "ques size th [32]")
    flags.DEFINE_integer("word_size_th", 16, "word size th [16]")
    flags.DEFINE_integer("para_size_th", 256, "para size th [256]")

    # Advanced training options
    flags.DEFINE_bool("lower_word", True, "lower word [True]")
    flags.DEFINE_bool("squash", False, "squash the sentences into one? [False]")
    flags.DEFINE_bool("swap_memory", True, "swap memory? [True]")
    flags.DEFINE_string("data_filter", "max", "max | valid | semi [max]")
    flags.DEFINE_bool("use_glove_for_unk", True, "use glove for unk [False]")
    flags.DEFINE_bool(
        "known_if_glove", True, "consider as known if present in glove [False]"
    )
    flags.DEFINE_string("logit_func", "tri_linear", "logit func [tri_linear]")
    flags.DEFINE_string("answer_func", "linear", "answer logit func [linear]")
    flags.DEFINE_string("sh_logit_func", "tri_linear", "sh logit func [tri_linear]")

    # Ablation options
    flags.DEFINE_bool("use_char_emb", True, "use char emb? [True]")
    flags.DEFINE_bool("use_word_emb", True, "use word embedding? [True]")
    flags.DEFINE_bool("q2c_att", True, "question-to-context attention? [True]")
    flags.DEFINE_bool("c2q_att", True, "context-to-question attention? [True]")
    flags.DEFINE_bool("dynamic_att", False, "Dynamic attention [False]")

    # Entry point handed to tf.app.run; the positional argument is ignored.
    def main(_):
        from basic.main import main as m

        config = flags.FLAGS
        # out_dir = <out_base_dir>/<model_name>/<run_id zero-padded to 2 chars>
        config.out_dir = os.path.join(
            config.out_base_dir, config.model_name, str(config.run_id).zfill(2)
        )
        # The return value is expected to expose `.acc` and `.f1` (read below).
        evaluator = m(config)

        """Generating metrics for the squad model"""
        # Both values are serialized as strings under "numberValue".
        metrics = {
            "metrics": [
                {
                    "name": "accuracy-score",
                    "numberValue": str(evaluator.acc),
                    "format": "RAW",
                },
                {
                    "name": "f1-score",
                    "numberValue": str(evaluator.f1),
                    "format": "RAW",
                },
            ]
        }
        import json

        with open(mlpipeline_metrics_path, "w") as f:
            json.dump(metrics, f)

    tf.app.run(main)


## 5. Transform python functions into components

In [None]:
# The download step only needs tqdm on top of the stock TensorFlow image.
download_op = func_to_container_op(
    download,
    base_image="tensorflow/tensorflow:latest-gpu-py3",
    packages_to_install=["tqdm"],
)


def _qatransfer_op(step_func):
    """Wrap ``step_func`` as a component using the TF 0.12 image plus qatransfer."""
    return func_to_container_op(
        step_func,
        base_image="sciling/tensorflow:0.12.0-py3",
        packages_to_install=[
            "https://github.com/sciling/qatransfer/archive/refs/heads/master.zip#egg=qatransfer",
        ],
    )


# Preprocessing, training and testing all share the same image and package.
squad_preprocess_op = _qatransfer_op(prepro_basic)
squad_span_pretrain_op = _qatransfer_op(train)
squad_test_op = _qatransfer_op(test)


## 6. Define pipeline and component connections

In [None]:
# Define the pipeline
@dsl.pipeline(name="pipeline_model_squad", description="")
def qa_pipeline(
    prepro_train_ratio: float = 0.9,
    prepro_glove_vec_size: int = 100,
    prepro_mode: str = "full",
    prepro_tokenizer: str = "PTB",
    prepro_url: str = "vision-server2.corp.ai2",
    prepro_port: int = 8000,
    train_sent_size_th: str = "500",
    train_ques_size_th: str = "30",
    train_num_epochs: str = "12",
    train_num_steps: str = "20000",
    train_eval_period: str = "500",
    train_save_period: str = "500",
    train_learning_rate: float = 0.5,
    train_batch_size: int = 60,
    train_hidden_size: int = 100,
    train_var_decay: float = 0.999,
):
    """Chain the four steps: download -> preprocess -> train -> test.

    The ``prepro_*`` parameters go to the preprocessing step. The ``train_*``
    parameters are passed, in the same order, to both the training step and
    the test step. The training step requests "4G" of memory.
    """
    download_task = download_op()

    prepro_task = squad_preprocess_op(
        download_task.output,
        prepro_train_ratio,
        prepro_glove_vec_size,
        prepro_mode,
        prepro_tokenizer,
        prepro_url,
        prepro_port,
    )

    # Training and testing receive the identical hyper-parameter list, so it
    # is assembled once and unpacked positionally into both calls.
    hyper_params = (
        train_sent_size_th,
        train_ques_size_th,
        train_num_epochs,
        train_num_steps,
        train_eval_period,
        train_save_period,
        train_learning_rate,
        train_batch_size,
        train_hidden_size,
        train_var_decay,
    )

    train_task = squad_span_pretrain_op(
        prepro_task.output, *hyper_params
    ).set_memory_request("4G")

    squad_test_op(prepro_task.output, train_task.output, *hyper_params)


## 7. Connect with AWS Client

In [None]:
import urllib3
# Silence the warnings urllib3 emits for unverified HTTPS requests.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Disable ssl verification by wrapping Configuration.__init__ so that every
# Configuration instance ends up with verify_ssl = False.
from kfp_server_api.configuration import Configuration as Configuration
# Capture the original constructor only once: if this cell is re-run,
# `old_init` already exists and is not replaced by the patched version.
if 'old_init' not in globals():
    old_init = Configuration.__init__
def new_init(self, *k, **kw):
    # Run the original constructor, then force SSL verification off.
    old_init(self, *k, **kw)
    self.verify_ssl = False
Configuration.__init__ = new_init
# Session cookie sent with client requests.
# NOTE(review): "YOUR_TOKEN" looks like a placeholder to be replaced — confirm.
cookies = "authservice_session=YOUR_TOKEN"
client = kfp.Client(host='http://istio-ingressgateway.istio-system.svc/pipeline', namespace='admin', cookies=cookies)
# List the experiments in the "admin" namespace.
client.list_experiments(namespace="admin")

## 8. Compile the pipeline

In [None]:
# Pipeline function plus the experiment/run names used when submitting it.
pipeline_func = qa_pipeline
experiment_name = "squad_experiment"
run_name = " ".join((pipeline_func.__name__, "run"))

# Compile the pipeline into an archive named after the experiment.
package_path = experiment_name + ".zip"
kfp.compiler.Compiler().compile(pipeline_func, package_path)

## 9. Define arguments and create a run

In [None]:
# Pipeline parameter overrides for this run; parameters not listed here keep
# the defaults declared on the pipeline function.
arguments = dict(
    train_sent_size_th="10",
    train_ques_size_th="10",
    train_num_epochs="1",
    train_num_steps="1",
    train_eval_period="1",
    train_save_period="1",
)

# Submit the pipeline straight from the pipeline function.
run_result = client.create_run_from_pipeline_func(
    pipeline_func,
    experiment_name=experiment_name,
    run_name=run_name,
    namespace="admin",
    arguments=arguments,
)