# ChAII inference notebook

This notebook is a continuation of the [ChAII-1 Starter Notebook](https://www.kaggle.com/deeplearning10/chaii-1-starter-notebook?scriptVersionId=71032838) and will be used for inference and submitting to the competition.

We will use the output of the starter notebook (i.e., the trained model) as an input to this notebook. To do this, click the **Add data** button, select the **Notebook Output Files** option, and add the output of the ChAII-1 Starter Notebook.
Alternatively, you can upload a model that you trained locally and use it for inference.

In [None]:
import sys
# Print the interpreter's module search path for inspection.
print(sys.path)

In [None]:
import sys
# Add /kaggle/working/chaii-packages to the module search path; the directory
# itself is created and filled by the %%bash cell that follows.
sys.path.append("/kaggle/working/chaii-packages")

In [None]:
%%bash
# Create the offline package directory (-p keeps the cell re-runnable) and
# copy the bundled package archives into it.
mkdir -p /kaggle/working/chaii-packages
# Stop if the directory is unavailable so the renames below never run elsewhere.
cd /kaggle/working/chaii-packages || exit 1
cp /kaggle/input/external-packages/* /kaggle/working/chaii-packages
# These archives are stored with a .xyz suffix; restore the .tar.gz extension
# (presumably so pip recognises them as source distributions -- TODO confirm).
mv ./botocore-1.21.17.xyz ./botocore-1.21.17.tar.gz
mv ./jieba-0.42.1.xyz ./jieba-0.42.1.tar.gz
mv ./seqeval-1.2.2.xyz ./seqeval-1.2.2.tar.gz
# Copy saved model from training notebook to /kaggle/working directory
mkdir -p /kaggle/working/saved-model
# Change this line to the name of your trained model
cp -r /kaggle/input/chaii-output/outputs-temp/* /kaggle/working/saved-model

In [None]:
%%bash
# First, we need to install required dependencies. Instead of running their install_tools.sh, run this cell, which has a few minor modifications. This may take a few minutes to run.

# Move into the modified-xtreme directory under /kaggle/input so that $PWD
# (captured as REPO further down) points at the repo root.
cd /kaggle/input/ # Optional but recommended
cd modified-xtreme/
# Copyright 2020 Google and DeepMind.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Use the current directory as the repo root and locate its third_party folder.
# Expansions are quoted so paths containing spaces are handled correctly.
REPO="$PWD"
echo "$REPO"
LIB="$REPO/third_party"
mkdir -p "$LIB"

# Install the transformers package found in third_party/transformers without
# contacting PyPI: --no-index plus --find-links resolves dependencies from the
# local package directory only.
# Abort if the directory is missing so `pip install .` never runs in the wrong place.
cd "$LIB/transformers" || exit 1
pip install . --no-index --find-links /kaggle/working/chaii-packages/
cd "$LIB"

# pip install seqeval --no-index --find-links /kaggle/working/chaii-packages/
# pip install tensorboardx --no-index --find-links /kaggle/working/chaii-packages/
# pip install tqdm --no-index --find-links /kaggle/working/chaii-packages/

# # install XLM tokenizer
# pip install sacremoses --no-index --find-links /kaggle/working/chaii-packages/
# pip install pythainlp --no-index --find-links /kaggle/working/chaii-packages/
# pip install jieba --no-index --find-links /kaggle/working/chaii-packages/

# #git clone https://github.com/neubig/kytea.git && cd kytea
# #./configure --prefix=${CONDA_PREFIX}
# #make && make install
# pip install kytea --no-index --find-links /kaggle/working/chaii-packages/

## Data preparation

In [None]:
# Load ChAII dataset
import json
import random
import pandas as pd
from pathlib import Path

# Display at most 20 rows, but never truncate columns, when rendering DataFrames.
pd.set_option("display.max_rows", 20, "display.max_columns", None)

# Directory from which the competition CSV (test.csv) is read below.
data_path = Path("/kaggle/input/chaii-hindi-and-tamil-question-answering/")
# NOTE(review): json_dicts is not referenced anywhere else in the visible notebook -- confirm before removing.
json_dicts = []

def get_dataframe(file_path):
    """Read a CSV file into a DataFrame of whitespace-stripped strings.

    Every column is read as text (``dtype=str``) so that pandas' type
    inference cannot alter values before they are stringified: without it an
    all-numeric-looking id column would turn "007" into "7" or "1e5" into
    "100000.0". The file is opened explicitly as UTF-8 because the data
    contains Hindi and Tamil text.

    Args:
        file_path: Path (str or ``pathlib.Path``) of the CSV file to read.

    Returns:
        pandas.DataFrame: one column per CSV column, all values converted with
        ``astype(str)`` and stripped of leading/trailing whitespace.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        df = pd.read_csv(f, dtype=str)
    # astype(str) is kept so missing cells are stringified exactly as before.
    df = df.astype(str)
    return df.apply(lambda col: col.str.strip())

# Load the test split; the bare expression renders it as the cell output.
test_data = get_dataframe(data_path / "test.csv")
test_data

In [None]:
def convert_to_qa_format_kaggle(row):
    """Convert one ChAII row into a SQuAD-style article dictionary.

    Args:
        row: Mapping (e.g. a pandas Series or dict) with the keys ``context``,
            ``question``, ``language`` and ``id``. ``answer_text`` and
            ``answer_start`` are optional.

    Returns:
        dict: ``{"title": "", "paragraphs": [{"context", "qas": [...]}]}`` with
        a single question whose id is ``"<language>-<id>"``. When the answer
        columns are missing or ``answer_start`` is not an integer, the answer
        falls back to ``{"text": "", "answer_start": -1}``.
    """
    try:
        answer = {
            "text": row["answer_text"],
            "answer_start": int(row["answer_start"]),
        }
    except (KeyError, ValueError, TypeError):
        # Only "column absent" / "value not an int" trigger the fallback; any
        # other error (including KeyboardInterrupt) now propagates instead of
        # being silently swallowed by a bare except.
        answer = {"text": '', "answer_start": -1}

    qa_json = {
        "title": "",
        "paragraphs": [
            {
                "context": row["context"],
                "qas": [
                    {
                        "question": row["question"],
                        "id": row["language"] + '-' + str(row["id"]),
                        "answers": [answer]
                    }
                ]
            }
        ],
    }

    return qa_json

# Process one language at a time
# Here chaii_data is a pandas dataframe
def get_qa_data_from_kaggle_format(chaii_data, language):
    """Collect all rows of one language into a SQuAD-style dataset dictionary.

    Args:
        chaii_data: pandas DataFrame with a ``language`` column plus the
            columns read by ``convert_to_qa_format_kaggle``.
        language: Value of the ``language`` column to keep, e.g. ``'hindi'``.

    Returns:
        dict: ``{"data": [<converted rows>], "version": "chaii_<language>"}``.
        ``data`` is empty when no row matches ``language``.
    """
    language_rows = chaii_data[chaii_data["language"] == language]
    qa_data = {
        "data": [convert_to_qa_format_kaggle(row) for _, row in language_rows.iterrows()],
        "version": f"chaii_{language}",
    }

    print("QA (SQuAD) format:")
    if qa_data["data"]:
        print(qa_data["data"][0])
    else:
        # Previously this case raised IndexError on qa_data["data"][0].
        print(f"No rows found for language {language!r}")
    return qa_data

# Build one SQuAD-style dataset per language found in the test data.
hi_test_qa_data = get_qa_data_from_kaggle_format(test_data, 'hindi')
ta_test_qa_data = get_qa_data_from_kaggle_format(test_data, 'tamil')

In [None]:
# Save the converted QA data of one language as a test file
def split_data(test_qa_data, lang_code, output_dir="/kaggle/working/chaii_data/"):
    """Write the QA data of one language to ``test.<lang_code>.qa.jsonl``.

    Despite its name, this function does not split anything: it re-wraps the
    datapoints with a ``chaii_<lang_code>_test`` version tag and dumps them as
    a single JSON document.

    Args:
        test_qa_data: Dict with a ``"data"`` list of SQuAD-style articles.
        lang_code: Language code used in the file name and version tag
            (e.g. ``'hi'``).
        output_dir: Directory to write into. It is created if missing, so the
            cell can be re-run without a "File exists" error.
    """
    split_data_path = Path(output_dir)
    split_data_path.mkdir(parents=True, exist_ok=True)

    datapoints = test_qa_data["data"]
    payload = {"data": datapoints, "version": f"chaii_{lang_code}_test"}

    with open(split_data_path / f"test.{lang_code}.qa.jsonl", 'w', encoding='utf-8') as f:
        json.dump(payload, f)

    print(f"{lang_code} Test data size: {len(datapoints)}")
    
# Write test.hi.qa.jsonl and test.ta.qa.jsonl under /kaggle/working/chaii_data/.
split_data(hi_test_qa_data, 'hi')
split_data(ta_test_qa_data, 'ta')

## Inference and Evaluation

In [None]:
# Hindi inference
# If you trained the model on a local machine and want to evaluate it, you can use this cell.
# predict.sh ${MODEL_PATH} ${TASK} ${DATA_DIR} ${PREDICTIONS_DIR} ${MODEL} ${MODEL_TYPE} ${GPU} ${PREDICT_FILE_NAME}
# predict_qa.sh ${MODEL} ${MODEL_TYPE} ${MODEL_PATH} ${TGT} ${GPU} ${DATA_DIR} ${PREDICTIONS_DIR} ${PREDICT_FILE_NAME}
# The call below predicts on the converted Hindi test file (test.hi.qa.jsonl), not on the training data.

!bash /kaggle/input/modified-xtreme/predict.sh "/kaggle/input/chaii-output/outputs-temp/chaii_hi/bert-base-multilingual-cased_LR3e-5_EPOCH2.0_maxlen384" \
      chaii_hi "/kaggle/working/chaii_data/" "/kaggle/working/eval_dir/predictions/" "bert-base-multilingual-cased" "bert" 0 test.hi.qa.jsonl


In [None]:
# Tamil inference: same predict.sh invocation as for Hindi, pointed at the
# chaii_ta model directory and the test.ta.qa.jsonl file.

!bash /kaggle/input/modified-xtreme/predict.sh "/kaggle/input/chaii-output/outputs-temp/chaii_ta/bert-base-multilingual-cased_LR3e-5_EPOCH2.0_maxlen384" \
      chaii_ta "/kaggle/working/chaii_data/" "/kaggle/working/eval_dir/predictions/" "bert-base-multilingual-cased" "bert" 0 test.ta.qa.jsonl

In [None]:
def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is
    ``|A ∩ B| / |A ∪ B|`` over the resulting word sets.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        float: similarity in ``[0.0, 1.0]``. Two strings with no words at all
        (empty or whitespace-only) are treated as identical and score ``1.0``;
        previously this case raised ``ZeroDivisionError``.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    union_size = len(a) + len(b) - len(c)
    if union_size == 0:
        # Both word sets are empty: define the similarity of identical empty sets as 1.
        return 1.0
    return float(len(c)) / union_size

def evaluate(lang_code):
    """Write the per-language submission CSV and report the mean Jaccard score.

    Reads ``predictions_<lang_code>_.json`` from
    ``/kaggle/working/eval_dir/predictions/`` and ``test.<lang_code>.qa.jsonl``
    from ``/kaggle/working/chaii_data/``, writes
    ``chaii_<lang_code>_submission.csv`` to ``/kaggle/working/eval_dir/`` and
    prints the mean word-level Jaccard score between the first gold answer of
    every question and its prediction.

    NOTE(review): when the QA file carries no gold answers (empty answer
    text), the printed score is not meaningful.

    Args:
        lang_code: Language code used in the file names, e.g. ``"hi"``.

    Returns:
        list[dict]: one record per question with the keys ``context``,
        ``question``, ``gold_answer``, ``mbert_pred`` and ``id``.

    Raises:
        KeyError: if a question id has no entry in the predictions file.
    """
    import json

    with open(f"/kaggle/working/eval_dir/predictions/predictions_{lang_code}_.json", encoding="utf-8") as f:
        preds = json.load(f)

    with open(f"/kaggle/working/chaii_data/test.{lang_code}.qa.jsonl", encoding="utf-8") as f:
        qa_data = json.load(f)

    # Prediction keys have the form "<language>-<id>". Split on the first
    # hyphen only, so an id that itself contains "-" is not truncated.
    submission_preds = [{'id': k.split('-', 1)[1], 'PredictionString': v} for k, v in preds.items()]

    # write submissions file (explicit columns keep the header even with no predictions)
    df_ = pd.DataFrame(submission_preds, columns=['id', 'PredictionString'])
    df_.to_csv(f'/kaggle/working/eval_dir/chaii_{lang_code}_submission.csv', index=False)

    jaccard_total = 0.0
    answer_pair_matches = []
    for d in qa_data['data']:
        for para in d['paragraphs']:
            for qa in para['qas']:
                prediction = preds[qa['id']]
                jaccard_total += jaccard(qa['answers'][0]['text'], prediction)
                answer_pair_matches.append({'context':para['context'],'question':qa['question'],'gold_answer':qa['answers'],'mbert_pred':prediction,'id':qa['id']})

    # Avoid ZeroDivisionError when the QA file contains no questions.
    jaccard_mean = jaccard_total / len(answer_pair_matches) if answer_pair_matches else 0.0
    print(f"Jaccard Mean for chaii_{lang_code}: {jaccard_mean}")

    return answer_pair_matches
    
    
    
# Score both languages; each call also writes chaii_<lang>_submission.csv under /kaggle/working/eval_dir/.
test_answer_pair_matches_hi = evaluate("hi")
test_answer_pair_matches_ta = evaluate("ta")

In [None]:
%%bash
# Delete any existing submission.csv (-f: no error when the file is absent, e.g. on a first run)
rm -f /kaggle/working/submission.csv
# Combine predictions for all languages into a single submission.csv file
# Stop if the directory is missing so the appends never read from elsewhere.
cd /kaggle/working/eval_dir || exit 1
# The Hindi file is copied whole, including its header row.
cat chaii_hi_submission.csv >> /kaggle/working/submission.csv
# The Tamil file is appended from line 2 onward so the header is not duplicated.
tail -n +2 chaii_ta_submission.csv >> /kaggle/working/submission.csv

In [None]:
# Sanity check: submission.csv should have the Hindi line count plus the Tamil
# line count minus one, because the Tamil header row is skipped when merging.
!wc -l /kaggle/working/eval_dir/chaii_hi_submission.csv
!wc -l /kaggle/working/eval_dir/chaii_ta_submission.csv
!wc -l /kaggle/working/submission.csv

In [None]:
def _write_answer_report(path, records):
    """Write one id/context/question/gold_answer/mbert_pred stanza per record to ``path``."""
    with open(path, 'w', encoding='utf-8') as f:
        for c in records:
            f.write(f"id:{c['id']}\n")
            f.write(f"context:{c['context']}\n")
            f.write(f"question:{c['question']}\n")
            f.write(f"gold_answer:{c['gold_answer'][0]['text']}\n")
            f.write(f"mbert_pred:{c['mbert_pred']}\n")
            f.write("\n\n")


def write_dev_answer_pair_matches(test_answer_pair_matches, lang_code, output_dir='/kaggle/working/eval_dir'):
    """Split prediction records into exact matches and mismatches and save both.

    A record counts as correct when ``mbert_pred`` equals the text of its first
    gold answer exactly. Matches are written to
    ``correct_chaii_<lang_code>_mbert.txt`` and mismatches to
    ``wrong_chaii_<lang_code>_mbert.txt`` inside ``output_dir``.

    Args:
        test_answer_pair_matches: List of dicts with the keys ``id``,
            ``context``, ``question``, ``gold_answer`` (list of answer dicts)
            and ``mbert_pred``.
        lang_code: Language code used in the output file names.
        output_dir: Directory for the two report files; created if missing.

    Returns:
        tuple[list, list]: ``(correct_ans, wrong_ans)``, each preserving the
        input order.
    """
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Partition in a single pass instead of filtering the list twice.
    correct_ans, wrong_ans = [], []
    for d in test_answer_pair_matches:
        is_exact_match = d['mbert_pred'] == d['gold_answer'][0]['text']
        (correct_ans if is_exact_match else wrong_ans).append(d)

    # Matches in predictions
    _write_answer_report(out_dir / f'correct_chaii_{lang_code}_mbert.txt', correct_ans)
    # Mismatches in predictions
    _write_answer_report(out_dir / f'wrong_chaii_{lang_code}_mbert.txt', wrong_ans)

    return correct_ans, wrong_ans
        
# Write the correct/wrong report files for both languages.
# NOTE: the second call rebinds correct_ans and wrong_ans, so after this cell
# they hold only the Tamil results.
correct_ans, wrong_ans = write_dev_answer_pair_matches(test_answer_pair_matches_hi, "hi")
correct_ans, wrong_ans = write_dev_answer_pair_matches(test_answer_pair_matches_ta, "ta")

In [None]:
# Number of exact-match and mismatched predictions for the language processed
# last in the previous cell (Tamil).
len(correct_ans),len(wrong_ans)