# Semeval Question Answering through Transfer Learning


## 0. Install necessary packages and restart the kernel

In [None]:
#pip requirements
!python -m pip install --user --upgrade pip
!pip install --user --upgrade kfp

In [None]:
# Restart kernel
# The HTML object must stay the last expression of the cell: rendering it in the
# cell output is what runs the script that asks the front end to restart the kernel.
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

In [None]:
# Import Kubeflow SDK
import kfp
import kfp.dsl as dsl
from kfp.components import OutputPath, InputPath, func_to_container_op, OutputBinaryFile, InputBinaryFile


## 1. Download data

In [None]:
def download(squad_url, dataset_path: OutputPath()):
    """Download and unpack every archive the pipeline needs into ``dataset_path``.

    Four zip archives are fetched: the SemEval 2016 task 3 train/dev data and
    test data (both into ``<dataset_path>/semeval``), the GloVe 6B vectors
    (into ``<dataset_path>/glove``) and the archive at ``squad_url`` (into
    ``dataset_path`` itself). The directory listing is printed at the end.

    Args:
        squad_url: URL of a zip archive extracted directly into ``dataset_path``.
        dataset_path: Output directory that receives the extracted files.

    Raises:
        requests.HTTPError: If any server answers with an error status.
    """
    import os
    import tempfile
    import zipfile

    import requests

    from tqdm import tqdm

    # Read the body in 1 MiB pieces; the GloVe archive is large, so tiny chunks
    # only add Python-level loop overhead.
    chunk_size = 1024 * 1024

    def fetch_and_extract(url, target_dir):
        """Stream the zip at ``url`` into a temp file and unpack it into ``target_dir``."""
        os.makedirs(target_dir, exist_ok=True)
        # stream=True keeps the archive out of memory and lets the progress bar
        # advance while data arrives instead of after the whole body is buffered.
        response = requests.get(url, stream=True)
        try:
            # Fail here with a clear HTTP error rather than later with a
            # BadZipFile raised on an HTML error page.
            response.raise_for_status()
            total_size_in_bytes = int(response.headers.get("content-length", 0))
            with tempfile.TemporaryFile() as tmp:
                with tqdm(
                    total=total_size_in_bytes, unit="iB", unit_scale=True
                ) as progress_bar:
                    for chunk in response.iter_content(chunk_size=chunk_size):
                        progress_bar.update(len(chunk))
                        tmp.write(chunk)
                    received = progress_bar.n
                with zipfile.ZipFile(tmp, "r") as archive:
                    archive.extractall(target_dir)
        finally:
            response.close()
        # Only reported, not raised: the declared length can legitimately differ
        # from the number of bytes yielded (e.g. a compressed transfer encoding).
        if total_size_in_bytes != 0 and received != total_size_in_bytes:
            print("ERROR, something went wrong")

    # Download SemEval
    semeval_dir = dataset_path + "/semeval"
    fetch_and_extract(
        "http://alt.qcri.org/semeval2016/task3/data/uploads/semeval2016-task3-cqa-ql-traindev-v3.2.zip",
        semeval_dir,
    )
    fetch_and_extract(
        "http://alt.qcri.org/semeval2016/task3/data/uploads/semeval2016_task3_test.zip",
        semeval_dir,
    )

    # Download GloVe
    fetch_and_extract(
        "http://nlp.stanford.edu/data/glove.6B.zip", dataset_path + "/glove"
    )

    # Download the archive given by the caller straight into the dataset root
    fetch_and_extract(squad_url, dataset_path)

    print(os.listdir(dataset_path))


## 2. Preprocess semeval

In [None]:
def semeval_prepro(dataset_path: InputPath(str), semeval_path: OutputPath(str)):
    """Turn the SemEval XML threads into tokenized JSON files.

    For each of the "train", "dev" and "test" splits, every question and
    comment is tokenized with NLTK. One ``shared_<split>_<q>_<c>.json`` file is
    written per comment, plus ``data_<split>.json`` and ``shared_<split>.json``
    holding the examples, the character/word counters and the 100-dimensional
    GloVe 6B vectors of the words that were seen.

    Args:
        dataset_path: Directory containing the ``semeval`` and ``glove``
            sub-directories.
        semeval_path: Output directory; files go to ``<semeval_path>/semeval``.
    """
    import json
    import os

    from collections import Counter
    from types import SimpleNamespace

    import nltk
    import numpy as np
    import xmltodict

    from tqdm import tqdm

    nltk.download("punkt")

    def prepro(args):
        """Create the target directory and preprocess the three splits."""
        os.makedirs(args.target_dir, exist_ok=True)
        prepro_each(args, "train")
        prepro_each(args, "dev")
        prepro_each(args, "test")

    def get_args():
        """Bundle the source, target and GloVe directories into one namespace."""
        return SimpleNamespace(
            source_dir=dataset_path + "/semeval",
            target_dir=semeval_path + "/semeval",
            glove_dir=dataset_path + "/glove",
        )

    def dump_json(obj, path):
        """Write ``obj`` as JSON to ``path`` and always close the file."""
        with open(path, "w") as fh:
            json.dump(obj, fh)

    def save(target_dir, data, shared, data_type):
        """Write the per-split ``data_*.json`` and ``shared_*.json`` files."""
        dump_json(data, os.path.join(target_dir, "data_{}.json".format(data_type)))
        dump_json(
            shared, os.path.join(target_dir, "shared_{}.json".format(data_type))
        )

    def get_word2vec(glove_dir, word_counter):
        """Return ``{word: vector}`` for the GloVe words present in ``word_counter``.

        A GloVe word is also matched through its capitalized, lower-cased and
        upper-cased forms, in that order; the vector is stored under the form
        that was found in ``word_counter``.
        """
        glove_corpus = "6B"
        glove_vec_size = 100
        glove_path = os.path.join(
            glove_dir, "glove.{}.{}d.txt".format(glove_corpus, glove_vec_size)
        )
        # Line counts per corpus, used only as the progress-bar total.
        sizes = {
            "6B": int(4e5),
            "42B": int(1.9e6),
            "840B": int(2.2e6),
            "2B": int(1.2e6),
        }
        total = sizes[glove_corpus]
        word2vec_dict = {}
        with open(glove_path, "r", encoding="utf-8") as fh:
            for line in tqdm(fh, total=total):
                array = line.strip().split(" ")
                word = array[0]
                vector = list(map(float, array[1:]))
                if word in word_counter:
                    word2vec_dict[word] = vector
                elif word.capitalize() in word_counter:
                    word2vec_dict[word.capitalize()] = vector
                elif word.lower() in word_counter:
                    word2vec_dict[word.lower()] = vector
                elif word.upper() in word_counter:
                    word2vec_dict[word.upper()] = vector

        print(
            "{}/{} of word vocab have corresponding vectors in {}".format(
                len(word2vec_dict), len(word_counter), glove_path
            )
        )
        return word2vec_dict

    def word_tokenize(tokens):
        """Tokenize with NLTK and map its `` and '' quote tokens back to '"'."""
        return [
            token.replace("''", '"').replace("``", '"')
            for token in nltk.word_tokenize(tokens)
        ]

    def read_threads(path):
        """Parse one SemEval XML file and return its ``Thread`` entries."""
        with open(path, encoding="utf-8") as f:
            return xmltodict.parse(f.read())["xml"]["Thread"]

    def prepro_each(args, data_type):
        """Preprocess one split ("train", "dev" or "test") and save its files."""
        data_list = []
        sub_dir = (
            "SemEval2016_task3_test/English"
            if data_type == "test"
            else "v3.2/%s" % (data_type)
        )
        fileName = "SemEval2016-Task3-CQA-QL-%s-subtaskA.xml"
        # The training data is shipped as two files; dev and test as one each.
        parts = ("train-part1", "train-part2") if data_type == "train" else (data_type,)
        for part in parts:
            data_list += read_threads(
                os.path.join(args.source_dir, sub_dir, fileName % (part))
            )

        questions, comments, answers, question_ids, answer_ids = [], [], [], [], []

        text2answer = {"Good": 0, "PotentiallyUseful": 1, "Bad": 2}
        for data in data_list:
            cs = data["RelComment"]
            if isinstance(cs, dict):
                # xmltodict yields a single mapping, not a list, when a thread
                # holds exactly one comment; iterating it would give its keys.
                cs = [cs]
            questions.append(str(data["RelQuestion"]["RelQBody"]))
            comments.append([c["RelCText"] for c in cs])
            answers.append([text2answer[c["@RELC_RELEVANCE2RELQ"]] for c in cs])
            question_ids.append(data["RelQuestion"]["@RELQ_ID"])
            answer_ids.append([c["@RELC_ID"] for c in cs])

        q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
        char_counter, lower_word_counter = Counter(), Counter()

        print("start for preprocessing for %s" % (data_type))

        q_len, x_len = [], []
        for ai, (question, comment, answer, question_id, answer_id) in tqdm(
            enumerate(zip(questions, comments, answers, question_ids, answer_ids)),
            total=len(questions),
        ):
            qi = word_tokenize(question)
            q_len.append(len(qi))
            cqi = [list(qij) for qij in qi]
            for qij in qi:
                lower_word_counter[qij.lower()] += 1
                for qijk in qij:
                    char_counter[qijk] += 1
            for pi, (story, yi, a_id) in enumerate(zip(comment, answer, answer_id)):
                rxi = [ai, pi]
                xi = [word_tokenize(story)]
                x_len.append(len(xi[0]))
                cxi = [list(xij) for xij in xi]
                for xij in xi:
                    for xijk in xij:
                        lower_word_counter[xijk.lower()] += 1
                        for xijkl in xijk:
                            char_counter[xijkl] += 1
                dump_json(
                    {"x": xi, "cx": cxi, "p": story},
                    os.path.join(
                        args.target_dir,
                        "shared_%s_%s_%s.json"
                        % (data_type, str(ai).zfill(3), str(pi).zfill(3)),
                    ),
                )

                # Every example is stored once. Training examples labelled
                # Good (0) or Bad (2) are repeated three more times, and Good
                # ones get one further copy with probability 9/10.
                copies = 1
                if data_type == "train" and yi == 0:
                    copies += 3
                    if np.random.randint(10) > 0:
                        copies += 1
                elif data_type == "train" and yi == 2:
                    copies += 3
                for _ in range(copies):
                    q.append(qi)
                    cq.append(cqi)
                    y.append(yi)
                    rx.append(rxi)
                    rcx.append(rxi)
                    ids.append((question_id, a_id))
                    idxs.append(len(idxs))

        lower_word2vec_dict = get_word2vec(args.glove_dir, lower_word_counter)
        data = {
            "q": q,
            "cq": cq,
            "y": y,
            "*x": rx,
            "*cx": rcx,
            "idxs": idxs,
            "ids": ids,
            "*p": rx,
        }
        shared = {
            "char_counter": char_counter,
            "lower_word_counter": lower_word_counter,
            "lower_word2vec": lower_word2vec_dict,
        }
        print("saving ...")
        save(args.target_dir, data, shared, data_type)

    args = get_args()
    prepro(args)

    print(os.listdir(semeval_path + "/semeval"))


## 3. Semeval train

In [None]:
def semeval_train(
    dataset_path: InputPath(str),
    semeval_path: InputPath(str),
    load_path,
    shared_path,
    run_id,
    sent_size_th,
    ques_size_th,
    num_epochs,
    num_steps,
    eval_period,
    save_period,
    model_path: OutputPath(str),
):
    """Run ``basic.cli.main`` through ``tf.app.run`` on the preprocessed SemEval data.

    Args:
        dataset_path: Directory that ``load_path`` and ``shared_path`` are
            appended to.
        semeval_path: Directory whose ``semeval`` sub-directory is the data dir.
        load_path: Path suffix passed (after prefixing) as ``--load_path``.
        shared_path: Path suffix passed (after prefixing) as ``--shared_path``.
        run_id: Value of ``--run_id``.
        sent_size_th: Value of ``--sent_size_th``.
        ques_size_th: Value of ``--ques_size_th``.
        num_epochs: Value of ``--num_epochs``.
        num_steps: Value of ``--num_steps``.
        eval_period: Value of ``--eval_period``.
        save_period: Value of ``--save_period``.
        model_path: Output directory; ``<model_path>/out/semeval`` is passed as
            ``--out_base_dir``.
    """
    import tensorflow as tf

    from basic.cli import main

    # The command line is assembled piece by piece, in the order the CLI
    # receives it; the first entry stands in for the program name.
    argv = ["./basic/cli.py"]
    argv += ["--data_dir", semeval_path + "/semeval"]
    argv += ["--out_base_dir", model_path + "/out/semeval"]
    argv += ["--load_path", dataset_path + load_path]
    argv += ["--shared_path", dataset_path + shared_path]
    argv += ["--load_trained_model"]
    argv += ["--run_id", run_id]
    argv += ["--load_shared", "--nocluster"]
    argv += ["--sent_size_th", sent_size_th]
    argv += ["--ques_size_th", ques_size_th]
    argv += ["--num_epochs", num_epochs]
    argv += ["--num_steps", num_steps]
    argv += ["--eval_period", eval_period]
    argv += ["--save_period", save_period]

    tf.app.run(main, argv=argv)


## 4. Semeval test file generation

In [None]:
def generate_semeval_test_files(
    semeval_path: InputPath(str),
    model_path: InputPath(str),
    start_step,
    end_step,
    eval_period,
    run_ids,
    threshold,
    test_path: OutputPath(str),
):
    """Call ``semeval.result.evaluate`` for every run id and evaluation step.

    Steps are ``range(start_step, end_step, eval_period)``, so ``end_step`` is
    exclusive. ``dump_gold`` is set only for the first step of the first run id.
    The contents of ``<test_path>/semeval/store`` are printed at the end.

    Args:
        semeval_path: Directory whose ``semeval`` sub-directory is the data dir.
        model_path: Directory whose ``out/semeval/basic-class`` sub-directory
            is the evaluation dir.
        start_step: First step (converted with ``int``).
        end_step: Exclusive upper bound of the steps (converted with ``int``).
        eval_period: Distance between two steps (converted with ``int``).
        run_ids: Comma-separated run identifiers.
        threshold: Converted with ``float`` and handed on in the args namespace.
        test_path: Output directory; ``<test_path>/semeval/store`` is the
            store dir.
    """
    import os

    from types import SimpleNamespace

    from semeval.result import evaluate
    from semeval.result import load_data

    # Component inputs arrive as text, so the numeric ones are converted first.
    end_step, start_step, eval_period = int(end_step), int(start_step), int(eval_period)
    threshold = float(threshold)

    args = SimpleNamespace(
        data_dir=semeval_path + "/semeval",
        end_step=end_step,
        ensemble=False,
        eval_dir=model_path + "/out/semeval/basic-class",
        eval_name="test",
        eval_period=eval_period,
        run_ids=run_ids,
        start_step=start_step,
        steps="",
        store_dir=test_path + "/semeval/store",
        threshold=threshold,
    )
    print(args)

    if args.ensemble:
        print("Ensemble not implemented yet")
    else:
        data = load_data(args)
        if not os.path.exists(args.store_dir):
            os.makedirs(args.store_dir)

        for run_index, run_id in enumerate(args.run_ids.split(",")):
            for step in range(args.start_step, args.end_step, args.eval_period):
                is_first_evaluation = run_index == 0 and step == args.start_step
                evaluate(args, data, run_id, step, dump_gold=is_first_evaluation)

    print("Generated files:")
    print(os.listdir(test_path + "/semeval/store"))


## 5. Semeval test

In [None]:
def semeval_test(
    test_path: InputPath(str),
    run_id,
    start_step,
    end_step,
    th,
    reranking_th,
    op_format,
    verbose,
    ignore_noanswer,
    eval_period=200,
):
    """Score the stored prediction files against the gold file with ``eval_reranker``.

    For every step in ``range(start_step, end_step + 1, eval_period)`` the file
    ``<test_path>/semeval/store/test-<run_id>-<step, zero-padded to 6>`` is
    compared with ``<test_path>/semeval/store/test-gold``.

    Args:
        test_path: Directory containing ``semeval/store``.
        run_id: Run identifier used in the prediction file names.
        start_step: First step to evaluate (converted with ``int``).
        end_step: Last step to evaluate, inclusive (converted with ``int``).
        th: Passed to ``eval_reranker`` after ``int`` conversion.
        reranking_th: Passed to ``eval_reranker`` after ``int`` conversion.
        op_format: Format argument passed to ``eval_reranker``.
        verbose: Boolean flag, given as a bool or as text; only "false"
            (any letter case) or ``False`` disables it.
        ignore_noanswer: Boolean flag, parsed like ``verbose``.
        eval_period: Distance between evaluated steps. Defaults to 200, the
            stride that used to be hard-coded here.
    """
    from semeval.evaluation.MAP_scripts.ev import eval_reranker

    def as_flag(value):
        """Map a bool or its textual form to a bool; only "false" is False."""
        # Component inputs arrive as text ("False"), but a direct caller may
        # pass a real bool; str() makes both spellings take the same path.
        return str(value).strip().lower() != "false"

    th = int(th)
    reranking_th = int(reranking_th)
    start_step = int(start_step)
    end_step = int(end_step)
    eval_period = int(eval_period)
    verbose = as_flag(verbose)
    ignore_noanswer = as_flag(ignore_noanswer)

    store_dir = test_path + "/semeval/store"
    gold_fname = store_dir + "/test-gold"
    # NOTE(review): end_step is inclusive here, while generate_semeval_test_files
    # treats its end_step as exclusive - confirm a prediction file exists for
    # every step visited by this loop.
    for step in range(start_step, end_step + 1, eval_period):
        pred_fname = store_dir + "/test-" + run_id + "-" + str(step).zfill(6)
        eval_reranker(
            gold_fname,
            pred_fname,
            op_format,
            th,
            verbose,
            reranking_th,
            ignore_noanswer,
        )

## 6. Transform python functions into components

In [None]:
# Base image and extra package shared by every component that runs qatransfer code.
QATRANSFER_IMAGE = "sciling/tensorflow:0.12.0-py3"
QATRANSFER_PACKAGES = (
    "https://github.com/sciling/qatransfer/archive/refs/heads/master.zip#egg=qatransfer",
)

# The download step uses a different image and only needs tqdm on top of it.
download_op = func_to_container_op(
    download,
    base_image="tensorflow/tensorflow:latest-gpu-py3",
    packages_to_install=["tqdm"],
)

# Each qatransfer component receives its own copy of the package list.
semeval_prepro_op = func_to_container_op(
    semeval_prepro,
    base_image=QATRANSFER_IMAGE,
    packages_to_install=list(QATRANSFER_PACKAGES),
)
semeval_train_op = func_to_container_op(
    semeval_train,
    base_image=QATRANSFER_IMAGE,
    packages_to_install=list(QATRANSFER_PACKAGES),
)
semeval_generate_test_files_op = func_to_container_op(
    generate_semeval_test_files,
    base_image=QATRANSFER_IMAGE,
    packages_to_install=list(QATRANSFER_PACKAGES),
)
semeval_test_op = func_to_container_op(
    semeval_test,
    base_image=QATRANSFER_IMAGE,
    packages_to_install=list(QATRANSFER_PACKAGES),
)

## 7. Define pipeline and component connections

In [None]:
#@dsl.pipeline(name="question-answering-pipeline", description="")
def qa_pipeline(
    squad_url: str = "http://github.com/sciling/qatransfer/releases/download/v0.1/save.zip",
    squad_load_path: str = "/save/out/squad/basic/00/save/basic-2000",
    squad_shared_path: str = "/save/out/squad/basic/00/shared.json",
    train_run_id: str = "00",
    train_sent_size_th: str = "150",
    train_ques_size_th: str = "100",
    train_num_epochs: str = "12",
    train_num_steps: str = "5000",
    train_eval_period: str = "200",
    train_save_period: str = "200",
    test_start_step: int = 2001,
    test_end_step: int = 2201,
    test_eval_period: int = 200,
    test_threshold: float = 0.5,
    test_th: int = 10,
    test_reranking_th: int = 10,
    test_format: str = "trec",
    test_verbose: bool = False,
    test_ignore_noanswer: bool = False,
):
    """Chain download, preprocessing, training, test-file generation and testing.

    Each step consumes the output of the previous ones; ``train_run_id`` is
    reused as the run id of both the test-file generation and the test step.
    """
    # Step 1: fetch the datasets and the archive behind squad_url.
    download_task = download_op(squad_url)
    datasets = download_task.output

    # Step 2: preprocess SemEval from the downloaded data.
    prepro_task = semeval_prepro_op(datasets)
    preprocessed = prepro_task.output

    # Step 3: train on SemEval, loading the model found under the SQuAD paths.
    train_task = semeval_train_op(
        datasets,
        preprocessed,
        load_path=squad_load_path,
        shared_path=squad_shared_path,
        run_id=train_run_id,
        sent_size_th=train_sent_size_th,
        ques_size_th=train_ques_size_th,
        num_epochs=train_num_epochs,
        num_steps=train_num_steps,
        eval_period=train_eval_period,
        save_period=train_save_period,
    )

    # Step 4: produce the files the test step reads.
    test_files_task = semeval_generate_test_files_op(
        preprocessed,
        train_task.output,
        test_start_step,
        test_end_step,
        test_eval_period,
        train_run_id,
        test_threshold,
    )

    # Step 5: evaluate the generated files.
    semeval_test_op(
        test_files_task.output,
        train_run_id,
        test_start_step,
        test_end_step,
        test_th,
        test_reranking_th,
        test_format,
        test_verbose,
        test_ignore_noanswer,
    )


## 8. Connect with AWS Client

In [None]:
import urllib3
from kfp_server_api.configuration import Configuration

# Silence the warning urllib3 emits for every unverified HTTPS request.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Disable SSL verification by wrapping Configuration.__init__.
# The original initializer is captured only once: if the cell is re-run,
# old_init must not be rebound to new_init, or new_init would call itself.
if "old_init" not in globals():
    old_init = Configuration.__init__


def new_init(self, *args, **kwargs):
    """Run the original initializer, then switch certificate checks off."""
    old_init(self, *args, **kwargs)
    self.verify_ssl = False


Configuration.__init__ = new_init

# Replace YOUR_TOKEN with the value of a valid authservice_session cookie.
cookies = "authservice_session=YOUR_TOKEN"
client = kfp.Client(
    host="http://istio-ingressgateway.istio-system.svc/pipeline",
    namespace="admin",
    cookies=cookies,
)
client.list_experiments(namespace="admin")

## 9. Compile the pipeline

In [None]:
# Names reused when the run is submitted.
pipeline_func = qa_pipeline
experiment_name = "semeval"
run_name = "{} run".format(pipeline_func.__name__)

# Write the compiled pipeline package to "<experiment_name>.zip".
kfp.compiler.Compiler().compile(pipeline_func, experiment_name + ".zip")

## 10. Define arguments and create a run

In [None]:
# ARGUMENTS DEFINITION
# Only these pipeline parameters are overridden; all others keep the
# defaults declared in the pipeline function signature.
arguments = {
    "train_sent_size_th": "10",
    "train_ques_size_th": "10",
    "train_num_epochs": "1",
    "train_num_steps": "3",
    "train_eval_period": "1",
    "train_save_period": "1",
    "test_start_step": 2001,
    "test_end_step": 2004,
}

# Submit pipeline directly from pipeline function
run_result = client.create_run_from_pipeline_func(
    pipeline_func,
    arguments=arguments,
    run_name=run_name,
    experiment_name=experiment_name,
    namespace="admin",
)