# chaii - Dataset Extension
This notebook helps you download three datasets from their sources and use them to extend the training data for the chaii competition.
Note that these three sources are only intended to extend the QA dataset for the Hindi language. They are listed below:
1. XQA <br>
Homepage Link: https://github.com/thunlp/XQA <br>
Downloading Link: https://thunlp.s3-us-west-1.amazonaws.com/data_XQA.tar.gz <br>

## XQA support is still in progress and will be added in a newer version of this notebook once it is complete

2. MLQA <br>
Homepage Link: https://github.com/facebookresearch/MLQA <br>
Downloading Link: https://dl.fbaipublicfiles.com/MLQA/MLQA_V1.zip <br>

3. XQUAD: <br>
Homepage Link: https://github.com/deepmind/xquad <br>
Downloading Link: https://github.com/deepmind/xquad.git <br>

Once these datasets are downloaded, we preprocess them so that they can be used in the pretraining kernels for chaii.

## Importing relevant Modules & Setting up pre-processor function

In [None]:
import tarfile
import zipfile
import json
import os, shutil
import pandas as pd
from tqdm import tqdm

In [None]:
def preprocess(dataset, tier):
    """Flatten a SQuAD-style dataset into a list of question-answer examples.

    Parameters
    ----------
    dataset : dict
        Parsed JSON laid out as ``dataset['data'][i]['paragraphs'][j]``. Each
        paragraph holds a ``'context'`` string and a ``'qas'`` list; each entry
        of ``'qas'`` holds a ``'question'`` and an ``'answers'`` list whose
        items carry ``'text'`` and ``'answer_start'``.
    tier : str
        Name of the split; only used to label the progress bar.

    Returns
    -------
    list of dict
        One dict per answerable question with the keys ``'context'``,
        ``'question'``, ``'answer_text'`` and ``'answer_start'``. Only the
        first listed answer of each question is used. Questions whose
        ``'answers'`` list is missing or empty are skipped instead of raising.

    Notes
    -----
    The number of collected examples is printed before returning. If any
    question had to be skipped, the skipped count is printed as well.
    """
    examples = []
    skipped = 0

    for article in tqdm(dataset['data'], desc="Preprocessing {}".format(tier)):
        for paragraph in article['paragraphs']:
            # Both replacements swap two characters for two characters, so the
            # context keeps its length and every answer_start offset stays valid.
            context = paragraph['context'].replace("''", '" ').replace("``", '" ')

            for qn in paragraph['qas']:
                answers = qn.get('answers') or []
                if not answers:
                    # Unanswerable question (e.g. SQuAD 2.0 style): nothing to extract.
                    skipped += 1
                    continue

                first_answer = answers[0]
                examples.append(
                    {
                        'context': context,
                        'question': qn['question'],
                        'answer_text': first_answer['text'],
                        'answer_start': first_answer['answer_start'],
                    }
                )

    print(len(examples))
    if skipped:
        print("Skipped {} question(s) without answers".format(skipped))
    return examples

## Downloading XQA dataset and expanding its files
Commented out for now; this will be re-evaluated once the dataset is fully understood.

In [None]:
# !wget https://thunlp.s3-us-west-1.amazonaws.com/data_XQA.tar.gz

In [None]:
# xqa_file = r'/kaggle/working/data_XQA.tar.gz'
# with tarfile.open(xqa_file, "r") as tar_file:
#     tar_file.extractall()

In [None]:
# xqa_train_json_data = '/kaggle/working/data/ta/dev_doc.json'
# xqa_test_json_data = '/kaggle/working/data/ta/test_doc.json'
# xqa_train_txt_data = '/kaggle/working/data/ta/dev.txt'
# xqa_test_txt_data = '/kaggle/working/data/ta/test.txt'

# train_file_xqa = [json.loads(line) for line in open(xqa_train_json_data, 'r')]
# test_file_xqa = [json.loads(line) for line in open(xqa_test_json_data, 'r')]

In [None]:
# examples_train_xqa = preprocess(train_file_xqa, 'dev')
# examples_test_xqa = preprocess(test_file_xqa, 'test')

In [None]:
# examples_xqa = examples_train_xqa + examples_test_xqa
# xqa = pd.DataFrame(examples_xqa)
# xqa['language'] = 'tamil'

## Downloading MLQA dataset and expanding its files

In [None]:
# Download the MLQA v1 archive with wget (shell command run through IPython's `!`).
# wget saves into the current directory; the extraction step expects the file at
# /kaggle/working/MLQA_V1.zip, so this assumes the notebook runs from /kaggle/working.
!wget https://dl.fbaipublicfiles.com/MLQA/MLQA_V1.zip

In [None]:
# Unpack every member of the downloaded MLQA archive into the working directory.
mlqa_file = r'/kaggle/working/MLQA_V1.zip'
with zipfile.ZipFile(mlqa_file) as archive:
    archive.extractall(path='/kaggle/working/')

In [None]:
# Hindi-context / Hindi-question files of MLQA. MLQA ships only dev and test
# splits; the dev split is used here under the "train" name.
mlqa_train_data = '/kaggle/working/MLQA_V1/dev/dev-context-hi-question-hi.json'
mlqa_test_data = '/kaggle/working/MLQA_V1/test/test-context-hi-question-hi.json'

# Read as UTF-8 explicitly: the files contain Devanagari text, and relying on
# the platform's default encoding fails on systems whose locale is not UTF-8.
with open(mlqa_train_data, 'r', encoding='utf-8') as file_input:
    train_file_mlqa = json.load(file_input)

with open(mlqa_test_data, 'r', encoding='utf-8') as file_input:
    test_file_mlqa = json.load(file_input)

In [None]:
# Flatten both MLQA splits into lists of example dicts; `tier` only labels the progress bar.
examples_train_mlqa = preprocess(dataset=train_file_mlqa, tier='dev')
examples_test_mlqa = preprocess(dataset=test_file_mlqa, tier='test')

In [None]:
# Merge the dev and test examples into one table and tag every row with its language.
examples_mlqa = [*examples_train_mlqa, *examples_test_mlqa]
mlqa = pd.DataFrame(examples_mlqa).assign(language='hindi')

## Downloading XQUAD dataset and expanding its files

In [None]:
# Clone the XQuAD repository (shell command run through IPython's `!`).
# git creates an `xquad/` folder in the current directory; the loading step expects
# /kaggle/working/xquad/xquad.hi.json, so this assumes the notebook runs from /kaggle/working.
!git clone https://github.com/deepmind/xquad.git

In [None]:
# Hindi portion of XQuAD, taken from the cloned repository.
xquad_train_file = '/kaggle/working/xquad/xquad.hi.json'

# Read as UTF-8 explicitly: the file contains Devanagari text, and relying on
# the platform's default encoding fails on systems whose locale is not UTF-8.
with open(xquad_train_file, 'r', encoding='utf-8') as file_input:
    train_file = json.load(file_input)

# The 'dev' tier is only the progress-bar label for this single file.
examples_train = preprocess(train_file, 'dev')
xquad = pd.DataFrame(examples_train)
xquad['language'] = 'hindi'

## Remove Downloaded files

In [None]:
# Empty the working directory: real directories are removed recursively, while
# regular files and symlinks (including symlinks to directories) are unlinked.
# A failure on one entry is reported and does not stop the remaining deletions.
folder = '/kaggle/working/'
for entry_name in os.listdir(folder):
    entry_path = os.path.join(folder, entry_name)
    try:
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            shutil.rmtree(entry_path)
        elif os.path.isfile(entry_path) or os.path.islink(entry_path):
            os.unlink(entry_path)
    except Exception as err:
        print('Failed to delete %s. Reason: %s' % (entry_path, err))

## Create CSV formats for respective datasets

In [None]:
# Write each extended dataset to its own CSV file, without the row index.
csv_outputs = (
    # ('xqa_tamil.csv', xqa),  # disabled until the XQA section is enabled
    ('mlqa_hindi.csv', mlqa),
    ('xquad_hindi.csv', xquad),
)
for csv_name, frame in csv_outputs:
    frame.to_csv(csv_name, index=False)

In [None]:
# Preview the first five MLQA rows.
mlqa.head(n=5)

In [None]:
# Preview the first five XQuAD rows.
xquad.head(n=5)