<a href="https://colab.research.google.com/github/ris27hav/devrevs_domain_specific_qa/blob/main/DevRev_Inference_with_Final_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Load packages and import libraries

In [None]:
# Install packages
!pip install --upgrade --no-cache-dir gdown
!pip install -U sentence-transformers
!pip install -U faiss-cpu
!pip install transformers sentencepiece
!pip install optimum[onnxruntime]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Import libraries
import gdown
import nltk
import faiss
import json
import os
import time
import numpy as np
import pandas as pd
import collections
import json
import re
import string
import timeit
import tarfile
import os

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from optimum.onnxruntime import ORTModelForQuestionAnswering, ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig
from optimum.pipelines import pipeline
from tqdm import tqdm
from ast import literal_eval
from zipfile import ZipFile

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Helper functions

### Sentence Encoder

For a given theme, break its paragraphs into sentences and store their paragraph id. Load sentence encoder and calculate embeddings for the sentences from paragraphs and the queries.

In [None]:
def para_to_sentences(para):
    """Tokenize one paragraph into a list of sentences.

    Newline, tab and NUL characters are each replaced by a single space
    before the text is handed to ``nltk.sent_tokenize``.
    """
    for control_char in ('\n', '\t', '\x00'):
        para = para.replace(control_char, ' ')
    return nltk.sent_tokenize(para)

def load_sents_from_para(paras):
    """Flatten paragraph records into sentences plus a parallel id list.

    Args:
        paras: Iterable of records exposing ``'paragraph'`` (text) and
            ``'id'`` keys.

    Returns:
        ``(sents, para_id)`` where ``sents`` holds every sentence of every
        paragraph in input order and ``para_id[i]`` is the id of the
        paragraph that ``sents[i]`` came from.
    """
    sents, para_id = [], []
    for record in paras:
        pieces = para_to_sentences(record['paragraph'])
        sents.extend(pieces)
        # Repeat the paragraph id once per sentence so both lists stay aligned.
        para_id.extend([record['id']] * len(pieces))
    return sents, para_id

In [None]:
def load_encoder():
    """Download the fine-tuned mpnet sentence encoder archive and load it.

    The zip is fetched from Google Drive with ``gdown``, extracted into the
    current working directory, and the model is then loaded from a fixed
    path under ``/content``.

    NOTE(review): the archive is opened from ``/content/`` although the
    download is not given an explicit output path, so this presumably relies
    on the working directory being ``/content`` (Colab) - confirm.
    """
    archive_url = "https://drive.google.com/file/d/137tZvp-iTMR2xIogasglSH4jTTLW4_Sf/view"
    gdown.download(archive_url, fuzzy=True, use_cookies=False, quiet=True)
    with ZipFile('/content/finetuned_mpnet_triplet.zip') as archive:
        archive.extractall()
    return SentenceTransformer('/content/kaggle/working/finetuned_mpnet_triplet')

def get_embeddings(sents, model):
    """Encode a list of sentences with the given sentence encoder.

    Returns whatever ``model.encode(sents)`` produces (one embedding per
    sentence; 768-dimensional for the mpnet encoder per the original note).
    """
    embeddings = model.encode(sents)
    return embeddings

### Nearest Neighbour Search using FAISS

Build a FAISS index (L2 distance) over the computed embeddings, then run a nearest-neighbour search on it to retrieve the top-k closest sentences for each query.

In [None]:
def save_index(source_embeds, output_path):
    """Build a flat FAISS L2 index from ``source_embeds`` and write it to disk.

    Args:
        source_embeds: 2-D array-like of embeddings; its second dimension
            (``shape[1]``) sets the index dimensionality.
        output_path: File path the index is written to.
    """
    dimension = source_embeds.shape[1]
    flat_index = faiss.IndexFlatL2(dimension)
    vectors = np.array(source_embeds)
    flat_index.add(vectors)
    faiss.write_index(flat_index, output_path)

def load_index(path):
    """Read a previously saved FAISS index from ``path`` and return it."""
    return faiss.read_index(path)

def get_k_nearest_neighbours(index, query_embeds, k = 10):
    """Search ``index`` for the ``k`` nearest neighbours of each query embedding.

    ``query_embeds`` is converted to a NumPy array before the search. The
    result of ``index.search`` is returned unchanged; callers in this
    notebook unpack it as ``(distances, indices)``.
    """
    query_matrix = np.array(query_embeds)
    return index.search(query_matrix, k)

def get_nearest_queries(ques_embed, theme):
    """Find the 3 closest already-answered questions for each query embedding.

    Loads the theme's question index from ``/content/indices/`` and returns
    the search result of ``get_k_nearest_neighbours`` with ``k`` fixed at 3.
    """
    index_path = f'/content/indices/{theme}_ques_l2_index'
    return get_k_nearest_neighbours(load_index(index_path), ques_embed, 3)

def get_nearest_sentences(ques_embed, theme):
    """Find the closest paragraph sentences for each query embedding.

    Loads the theme's sentence index from ``/content/indices/``. The number
    of neighbours comes from the module-level ``k``, which must be defined
    before this function is called.
    """
    index_path = f'/content/indices/{theme}_para_l2_index'
    sentence_index = load_index(index_path)
    return get_k_nearest_neighbours(sentence_index, ques_embed, k)

### Load Existing QA and paragraphs data

Load the validation data used for testing, which is based on data missing from the training data of the SQuAD 2.0 dataset. Round 1 data contains themes that are not present in the training data, while round 2 data contains themes that are present in the training data.

In [None]:
def download_existing_data():
    """Download the test data (4 csv files) from Google Drive.

    Each Drive file id is turned into a direct-download URL and fetched
    with ``gdown`` (quietly); no output name is passed to ``gdown``.
    """
    drive_file_ids = (
        "1LY3tKSdcVMb6Q38ZlgLEP9vYF4NPSP-Q",
        "1feZeSoxc2zIBZ3_VPZtgtk1jUoml6J82",
        "1deZKNy6oV3PSnfMpFP576m5XlFLiLWnT",
        "1xHYf9vQbG_y9GCew0mavzdlDSeS9jOTP",
    )
    for file_id in drive_file_ids:
        gdown.download(
            f"https://drive.google.com/u/1/uc?id={file_id}&export=download",
            quiet=True,
        )


def load_existing_data():
    """Load paragraphs and already-answered questions, grouped by theme.

    Reads ``input_paragraph.csv``, ``input_question.csv``,
    ``theme_interval.csv`` and ``ground_truth.csv`` from the current
    working directory.

    Returns:
        ``(paras, solved_ques)``, both keyed by theme:

        * ``paras[theme]`` has ``'sentences'`` (all sentences of the theme's
          paragraphs) and ``'id'`` (the paragraph id of each sentence).
        * ``solved_ques[theme]`` has the parallel lists ``'id'``,
          ``'question'``, ``'paragraph_id'`` and ``'answers'`` for the
          questions inside the theme's interval.
    """
    def _read_records(csv_name):
        # Round-trip through JSON so each row becomes a plain dict.
        return json.loads(pd.read_csv(csv_name).to_json(orient="records"))

    paragraphs = _read_records("input_paragraph.csv")
    questions = _read_records("input_question.csv")
    theme_intervals = _read_records("theme_interval.csv")

    truth = pd.read_csv("ground_truth.csv")
    truth.fillna(value='', inplace=True)
    # Both columns are stored as Python-literal strings in the CSV.
    truth.paragraph_id = truth.paragraph_id.apply(literal_eval)
    truth.answers = truth.answers.apply(literal_eval)

    paras, solved_ques = {}, {}
    for interval in theme_intervals:
        theme = interval["theme"]

        sents, para_id = load_sents_from_para(
            [p for p in paragraphs if p["theme"] == theme]
        )
        paras[theme] = {'id': para_id, 'sentences': sents}

        # "start" is 1-based and "end" is inclusive, hence the -1 on start only.
        first, last = int(interval["start"]) - 1, int(interval["end"])
        solved = {'id': [], 'question': [], 'paragraph_id': [], 'answers': []}
        solved_ques[theme] = solved
        for q in questions[first:last]:
            # When a question id appears more than once, the last row is used.
            truth_row = truth.loc[truth['question_id'] == q["id"]].iloc[-1]
            solved['id'].append(q["id"])
            solved['question'].append(q["question"])
            solved['paragraph_id'].append([int(i) for i in truth_row["paragraph_id"]])
            solved['answers'].append(truth_row["answers"])

    return paras, solved_ques


def store_faiss_indices(paras, solved_ques, encoder):
    """Embed each theme's sentences and questions and save one index per set.

    For every theme in ``paras`` two files are written under
    ``/content/indices/`` (created if missing):
    ``{theme}_para_l2_index`` for the paragraph sentences and
    ``{theme}_ques_l2_index`` for the already-answered questions.

    Args:
        paras: Theme -> dict with a ``'sentences'`` list.
        solved_ques: Theme -> dict with a ``'question'`` list.
        encoder: Sentence encoder passed through to ``get_embeddings``.
    """
    index_dir = '/content/indices/'
    if not os.path.exists(index_dir):
        os.mkdir(index_dir)

    for theme, theme_paras in paras.items():
        theme_ques = solved_ques[theme]

        save_index(
            get_embeddings(theme_paras['sentences'], encoder),
            f'{index_dir}{theme}_para_l2_index',
        )
        save_index(
            get_embeddings(theme_ques['question'], encoder),
            f'{index_dir}{theme}_ques_l2_index',
        )

### Search previously answered queries

In [None]:
def search_previously_answered_queries(q_id, dist, query_idx, solved_queries):
    """Reuse the stored answer of an already-answered query when it is close enough.

    Args:
        q_id: Id of the incoming question; copied into the returned record.
        dist: Distance between the incoming question and its nearest stored
            question. Anything above the module-level ``query_threshold``
            counts as "no match".
        query_idx: Position of that nearest stored question inside
            ``solved_queries``.
        solved_queries: Dict with parallel lists under ``'answers'`` and
            ``'paragraph_id'``; each entry is itself a list (all accepted
            answers / all paragraph ids of one stored question).

    Returns:
        ``(False, None)`` when ``dist`` exceeds the threshold. Otherwise
        ``(True, record)`` where ``record`` has the keys ``question_id``,
        ``answers`` (first stored answer, or ``''`` when there is none) and
        ``paragraph_id`` (first stored paragraph id, or ``-1`` when there
        is none).
    """
    if dist > query_threshold:
        return False, None

    stored_answers = solved_queries['answers'][query_idx]
    stored_para_ids = solved_queries['paragraph_id'][query_idx]
    ans = {
        "question_id": q_id,
        "answers": stored_answers[0] if len(stored_answers) > 0 else '',
        # Emit a single scalar id, with -1 meaning "no paragraph", exactly like
        # the model-based branch of the pipeline does. Returning the stored
        # list would end up as a string such as "[12]" in the prediction CSV
        # and could never match the ground truth during evaluation.
        "paragraph_id": stored_para_ids[0] if len(stored_para_ids) > 0 else -1,
    }
    return True, ans

### Context Generation

Generates a context for a given query and its nearest neighbours. Also provides a method to get the paragraph id given the start idx of the answer.

In [None]:
def get_context(sents, para_ids, nearest_neighbours, distances):
    """Assemble the QA context for one query from its nearest sentences.

    Sentences are appended in neighbour order, separated by single spaces,
    until either a neighbour's distance exceeds
    ``distance_threshold * distances[0]`` or the context holds at least
    ``context_length_threshold`` whitespace-separated words (both are
    module-level settings).

    Returns:
        ``(context, context_para_ids, sent_length)`` where
        ``context_para_ids[i]`` is the paragraph id of the i-th sentence in
        the context and ``sent_length[i]`` is the character offset just past
        that sentence inside ``context`` (cumulative lengths, counting one
        separator per sentence).
    """
    context = ""
    context_para_ids, sent_length = [], []
    end_offset = -1  # so that the first sentence ends at len(sentence)
    for sent_id, dist in zip(nearest_neighbours, distances):
        if dist > distance_threshold*distances[0]:
            break
        sentence = sents[sent_id]
        context += sentence + ' '
        context_para_ids.append(para_ids[sent_id])
        end_offset += len(sentence) + 1
        sent_length.append(end_offset)
        if len(context.split()) >= context_length_threshold:
            break
    return context.strip(), context_para_ids, sent_length


def para_id_retriever(start_idx, sent_length, context_para_ids):
    """Map an answer's start offset in the context to its paragraph id.

    Args:
        start_idx: Character offset of the answer inside the context, or
            ``-1`` when there is no answer.
        sent_length: Cumulative end offsets of the context sentences, as
            produced by ``get_context``.
        context_para_ids: Paragraph id of each context sentence, parallel to
            ``sent_length``.

    Returns:
        The paragraph id of the first sentence whose end offset is at or
        after ``start_idx``; the last sentence's paragraph id when the
        offset lies beyond every sentence; ``-1`` when ``start_idx`` is
        ``-1`` or the context contains no sentences.
    """
    # An empty context has no paragraph to attribute the answer to; report
    # "no paragraph" instead of failing on context_para_ids[-1].
    if start_idx == -1 or len(context_para_ids) == 0:
        return -1
    for end_offset, para_id in zip(sent_length, context_para_ids):
        if start_idx <= end_offset:
            return para_id
    return context_para_ids[-1]

### Load fine-tuned QA models

Given a theme, load the corresponding fine-tuned QA model and load the QA pipeline

In [None]:
def download_fine_tuned_models():
    """Download and unpack the cluster-wise fine-tuned QA model archives.

    An archive is downloaded and extracted (into the current working
    directory) only when its file is not already present. The archive file
    is left on disk afterwards, so later runs skip both steps.

    NOTE(review): ``gdown`` is not given an output name, so the code relies
    on the downloaded file being named exactly as listed here - confirm.
    NOTE(review): ``extractall`` trusts the member paths inside the archive;
    only use this with archives from a trusted source.
    """
    archives = {
        "zipped_0_11.tar.gz": "1-7XfPhjfmUo8xz0iqmFHusbZ74q-SS3A",
        "zipped_12_22.tar.gz": "1-BIhfqK992YZW1eWiG5yOCLiX8vOyrNI",
        "zipped_23_34.tar.gz": "1-B8b2_s9i2pwTn7EPgzMg50nNM6Dp4B-",
        "zipped_35_42.tar.gz": "1-KDxa6wWMGqrDR7ZJq-bYSyWaa_Zsikq",
    }
    for filename, file_id in archives.items():
        if os.path.exists(filename):
            continue
        link = f"https://drive.google.com/u/1/uc?id={file_id}&export=download"
        gdown.download(link, quiet=True, use_cookies=False)
        with tarfile.open(filename, 'r') as tar:
            tar.extractall()

def download_generic_model():
    """Download the generic electra-base QA model and optimize it with ONNX Runtime.

    The model is exported to ONNX, optimized with ``optimization_level=99``
    and written to ``/content/models/generic_model/``. The tokenizer is
    saved into the same directory, because ``load_qa_model_pipeline`` loads
    both the tokenizer and the optimized model from that single path.
    """
    model_id = 'PremalMatalia/electra-base-best-squad2'
    save_path = "/content/models/generic_model/"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    ort_model = ORTModelForQuestionAnswering.from_pretrained(
        model_id, from_transformers=True
    )
    optimizer = ORTOptimizer.from_pretrained(ort_model)
    optimization_config = OptimizationConfig(optimization_level=99)
    optimizer.optimize(save_dir=save_path, optimization_config=optimization_config)
    # Previously the tokenizer was loaded but never used. Persist it next to
    # the optimized model so the directory is self-contained regardless of
    # whether the optimizer copies the tokenizer files itself.
    tokenizer.save_pretrained(save_path)

In [None]:
def load_models_mapping():
    """Build the theme <-> cluster lookup tables from ``clusters.json``.

    The file is downloaded from Google Drive when it is not already in the
    current working directory. It maps a cluster id (string key) to the
    list of themes in that cluster.

    Returns:
        ``(theme_to_cluster, cluster_to_themes)`` with integer cluster ids.
    """
    if not os.path.exists("clusters.json"):
        gdown.download(
            url="https://drive.google.com/file/d/1P6dp7f2m67-iPaUbaNZiDYTmTH7Mw9ec/view?usp=share_link",
            output='clusters.json', quiet=False, fuzzy=True
        )
    with open('clusters.json') as fo:
        clusters = json.load(fo)

    theme_to_cluster, cluster_to_themes = {}, {}
    for cluster_key, themes in clusters.items():
        cluster = int(cluster_key)
        members = cluster_to_themes.setdefault(cluster, [])
        for theme in themes:
            theme_to_cluster[theme] = cluster
            members.append(theme)
    return theme_to_cluster, cluster_to_themes


def load_qa_model_pipeline(model_path, max_attempts=5):
    """Load the question-answering pipeline stored at ``model_path``.

    The tokenizer and the optimized ONNX model (``model_optimized.onnx``)
    are both read from ``model_path``. Loading the model is retried because
    it can fail intermittently.

    Args:
        model_path: Directory containing the tokenizer files and
            ``model_optimized.onnx``.
        max_attempts: How many times to try loading the ONNX model.

    Returns:
        An Optimum question-answering pipeline created with
        ``handle_impossible_answer=True``.

    Raises:
        RuntimeError: If the model could not be loaded in ``max_attempts``
            tries; the last underlying error is attached as the cause.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model, last_error = None, None
    for _ in range(max_attempts):
        try:
            model = ORTModelForQuestionAnswering.from_pretrained(
                model_path, file_name="model_optimized.onnx"
            )
        except Exception as err:  # keep retrying, but remember why it failed
            last_error = err
        else:
            break
    if model is None:
        # Without this check the failure would only surface later as an
        # unbound-variable error that hides the real cause.
        raise RuntimeError(
            f"Could not load the ONNX QA model from {model_path!r} "
            f"after {max_attempts} attempts"
        ) from last_error

    optimum_qa = pipeline(
        task = 'question-answering', model=model,
        tokenizer=tokenizer, handle_impossible_answer=True
    )
    return optimum_qa

## Execution

In [None]:
# Download sentence encoder model and fine-tuned QA models
# (the QA archives are skipped when their files are already on disk).
download_fine_tuned_models()
download_generic_model()
# These three names are read as globals by the prediction functions below.
sentence_encoder = load_encoder()
theme_to_cluster, cluster_to_themes = load_models_mapping()

# Load existing QA pairs for themes and pre-process it
download_existing_data()
# `paras` and `solved_ques` are also read as globals during prediction.
paras, solved_ques = load_existing_data()
# Writes one sentence index and one question index per theme to /content/indices/.
store_faiss_indices(paras, solved_ques, sentence_encoder)

In [None]:
# Parameters for context generation
# All four are module-level settings read directly by the helper functions,
# so this cell must run before any prediction.

# Number of nearest sentences fetched per question (get_nearest_sentences).
k = 8
# Largest distance to a stored question at which its stored answer is reused
# (search_previously_answered_queries).
query_threshold = 0.3
# A retrieved sentence is kept only while its distance is at most this
# multiple of the closest sentence's distance (get_context).
distance_threshold = 1.8
# get_context stops adding sentences once the context holds at least this
# many whitespace-separated words.
context_length_threshold = 205

In [None]:
def get_theme_model(theme):
    """Return the QA pipeline for ``theme``.

    The cluster-specific fine-tuned model is used when the theme has a
    cluster in ``theme_to_cluster`` and that model's directory exists;
    in every other case the generic model is loaded.
    """
    generic_path = '/content/models/generic_model'
    if theme not in theme_to_cluster:
        return load_qa_model_pipeline(generic_path)

    cluster = theme_to_cluster[theme]
    cluster_path = f'/content/models/electra-base-best-squad2-finetuned-squad-{cluster}'
    chosen_path = cluster_path if os.path.exists(cluster_path) else generic_path
    return load_qa_model_pipeline(chosen_path)

In [None]:
def pred_theme_ans(questions, theme_model, pred_out):
    """Answer every question of one theme and append the results to ``pred_out``.

    For each question the nearest already-answered question is checked
    first; when it is close enough its stored answer is reused. Otherwise a
    context is built from the nearest paragraph sentences and
    ``theme_model`` extracts the answer from it.

    Args:
        questions: Non-empty list of question records (``'id'``,
            ``'question'``, ``'theme'``) that all belong to the same theme.
        theme_model: QA pipeline called as
            ``theme_model(question=..., context=...)``.
        pred_out: List that receives one record per question with the keys
            ``question_id``, ``answers`` and ``paragraph_id``
            (``-1`` when the model finds no answer).

    Relies on the module-level ``solved_ques``, ``paras`` and
    ``sentence_encoder``. Prints the theme name and the average retrieval
    and QA times per question.
    """
    theme = questions[0]["theme"]
    solved_queries = solved_ques[theme]
    print(f'Theme: {theme}')

    # Nearest Neighbour Search (timed as a whole for the batch of questions)
    retrieval_start = time.time()
    ques_embed = get_embeddings(
        [q['question'] for q in questions], sentence_encoder
    )
    D_ques, I_ques = get_nearest_queries(ques_embed, theme)
    D_sents, I_sents = get_nearest_sentences(ques_embed, theme)
    ann_inference_time = (time.time() - retrieval_start)*1000.

    # QA Model Prediction
    qa_start = time.time()
    for i, q in enumerate(tqdm(questions)):
        # Reuse the answer of the closest stored question when it matches.
        found, ans = search_previously_answered_queries(
            q["id"], D_ques[i][0], I_ques[i][0], solved_queries
        )
        if found:
            pred_out.append(ans)
            continue

        # Context Generation
        context, context_para_ids, sent_length = get_context(
            paras[theme]['sentences'], paras[theme]['id'], I_sents[i], D_sents[i]
        )

        # Answer Prediction and Paragraph Retrieval
        prediction = theme_model(question=q['question'], context=context)
        answer_text = prediction['answer']
        if answer_text != "":
            para_id = para_id_retriever(
                prediction['start'], sent_length, context_para_ids
            )
        else:
            para_id = -1  # empty answer -> no paragraph
        pred_out.append({
            "question_id": q['id'],
            "answers": answer_text,
            "paragraph_id": para_id
        })
    qna_inference_time = (time.time() - qa_start)*1000.

    # Print Inference Time
    n_questions = len(questions)
    print(
        f'Avg. ANN IT = {round(ann_inference_time/n_questions, 2)} ms, ' +
        f'Avg. QnA IT = {round(qna_inference_time/n_questions,2)} ms\n'
    )

In [None]:
# NOT allowed to make changes.

# All theme prediction.
questions = json.loads(pd.read_csv("input_question.csv").to_json(orient="records"))
theme_intervals = json.loads(pd.read_csv("theme_interval.csv").to_json(orient="records"))
pred_out = []
theme_inf_time = {}
# NOTE(review): the [:2] slice restricts this run to the first two theme
# intervals; themes beyond that get no predictions and no timing entry.
for theme_interval in theme_intervals[:2]:
    # "start" is 1-based and "end" is inclusive, hence the -1 on start only.
    theme_ques = questions[int(theme_interval["start"]) - 1: int(theme_interval["end"])]
    theme = theme_ques[0]["theme"]
    # Load model fine-tuned for this theme.
    theme_model = get_theme_model(theme)
    # Model loading above is outside the timed call; only prediction is timed.
    execution_time = timeit.timeit(lambda: pred_theme_ans(theme_ques, theme_model, pred_out), number=1)
    theme_inf_time[theme_interval["theme"]] = execution_time * 1000 # in milliseconds.
pred_df = pd.DataFrame.from_records(pred_out)
pred_df.fillna(value='', inplace=True)
# Write prediction to a CSV file. Teams are required to submit this csv file.
pred_df.to_csv('output_prediction.csv', index=False)

Theme: IPod


IndexError: ignored

In [None]:
# NOT allowed to make changes.

def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    # The helpers are applied innermost-first in the return statement:
    # lower -> remove_punc -> remove_articles -> white_space_fix.
    def remove_articles(text):
        # Whole-word "a", "an", "the" are replaced by a space.
        regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
        return re.sub(regex, ' ', text)
    def white_space_fix(text):
        # Collapses every run of whitespace to one space and trims both ends.
        return ' '.join(text.split())
    def remove_punc(text):
        # Drops every character listed in string.punctuation.
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)
    def lower(text):
        return text.lower()
    return white_space_fix(remove_articles(remove_punc(lower(s))))

def get_tokens(s):
    """Return the whitespace-separated tokens of the normalized answer ``s``.

    A falsy ``s`` (e.g. an empty string) yields an empty list.
    """
    if not s: return []
    return normalize_answer(s).split()

def calc_f1(a_gold, a_pred):
    """Token-level F1 score between a gold answer and a predicted answer.

    Both answers are normalized and tokenized with ``get_tokens``. Returns
    1 or 0 when either side has no tokens (1 only if both are empty), 0 when
    no token is shared, and the harmonic mean of precision and recall
    otherwise.
    """
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)
    # Multiset intersection: each shared token counts as often as it occurs
    # in both answers.
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

def calc_max_f1(predicted, ground_truths):
    """Best F1 of ``predicted`` against any of the ``ground_truths``.

    When ``ground_truths`` is empty the result is a bool: ``True`` if
    ``predicted`` is empty as well, ``False`` otherwise.
    """
    max_f1 = 0
    if len(ground_truths) == 0:
        return len(predicted) == 0
    for ground_truth in ground_truths:
        # The arguments are passed as (predicted, ground_truth) although
        # calc_f1 names them (a_gold, a_pred); swapping the two only swaps
        # precision and recall, so the F1 value is the same.
        f1 = calc_f1(predicted, ground_truth)
        max_f1 = max(max_f1, f1)
    return max_f1

In [None]:
# NOT allowed to make changes.

# Evaluation methodology.
# Per theme, counts paragraph-level true positives / true negatives and sums
# the answer F1 over every row of the prediction file.
metrics = {}
pred = pd.read_csv("output_prediction.csv")
pred.fillna(value='', inplace=True)
truth = pd.read_csv("ground_truth.csv")
truth.fillna(value='', inplace=True)
# Both columns are stored as Python-literal strings in the CSV.
truth.paragraph_id = truth.paragraph_id.apply(literal_eval)
truth.answers = truth.answers.apply(literal_eval)
questions = pd.read_csv("input_question.csv")
for idx in pred.index:
    q_id = pred["question_id"][idx]
    q_rows = questions.loc[questions['id'] == q_id].iloc[-1]
    theme = q_rows["theme"]
    predicted_paragraph = pred["paragraph_id"][idx]
    predicted_ans = pred["answers"][idx]

    if theme not in metrics.keys():
        metrics[theme] = {"true_positive": 0, "true_negative": 0, "total_predictions": 0, "f1_sum": 0}

    truth_row = truth.loc[truth['question_id'] == q_id].iloc[-1]
    truth_paragraph_id = [ int(i) for i in truth_row["paragraph_id"] ]
    # The predicted paragraph is compared as a single value against the list
    # of integer ground-truth ids.
    if predicted_paragraph in truth_paragraph_id:
        # Increase TP for that theme.
        metrics[theme]["true_positive"] = metrics[theme]["true_positive"] + 1
    # -1 prediction in case there is no paragraph which can answer the query.
    if predicted_paragraph == -1 and truth_row["paragraph_id"] == []:
        # Increase TN.
        metrics[theme]["true_negative"] = metrics[theme]["true_negative"] + 1
    # Increase total predictions for that theme.
    metrics[theme]["total_predictions"] = metrics[theme]["total_predictions"] + 1
    f1 = calc_max_f1(predicted_ans, truth_row["answers"])
    metrics[theme]["f1_sum"] = metrics[theme]["f1_sum"] + f1

In [None]:
# NOT allowed to make changes.

# Final score.
# Averages, over the evaluated themes, the paragraph accuracy and the answer
# F1; the F1 is scaled down for themes whose average time per question
# exceeds the threshold.
inf_time_threshold = 1000.0 # milliseconds.
final_para_score = 0.0
final_qa_score = 0.0
# Weight would stay hidden from teams.
# NOTE(review): theme_weights is defined here but not read anywhere in this
# cell; every theme contributes equally to the averages below.
theme_weights = {"Kubernetes": 0.5, "ChatGPT": 0.4, "Football world cup": 0.1}
for theme in metrics:
    inf_time_score = 1.0
    metric = metrics[theme]
    para_score = (metric["true_positive"] + metric["true_negative"]) / metric["total_predictions"]
    qa_score = metric["f1_sum"] / metric["total_predictions"]
    # theme_inf_time is filled by the prediction cell (milliseconds per theme).
    avg_inf_time = theme_inf_time[theme] / metric["total_predictions"]
    if avg_inf_time > inf_time_threshold:
        inf_time_score = inf_time_threshold / avg_inf_time
    final_qa_score += 1. * inf_time_score * qa_score
    final_para_score += 1. * 1. * para_score
print (final_para_score/len(metrics))
print (final_qa_score/len(metrics))


0.8898943749043251
0.8689999198401515


In [None]:
# 0.8872037021772665
# 0.8519277289714773

# 0.8898943749043251
# 0.8689999198401515