# About

Helper functions for converting the original and external [chaii - Hindi and Tamil Question Answering](https://www.kaggle.com/c/chaii-hindi-and-tamil-question-answering) datasets to SQuAD format. You can find an example of how to use the prepared data in this [training notebook](https://www.kaggle.com/oleksandrsirenko/chaii-fine-tuning-model). The converted datasets are available [here](https://www.kaggle.com/oleksandrsirenko/chaii-squad) and will be updated during the competition.

In [None]:
import pandas as pd
import json
from typing import List, Dict, Optional
from pathlib import Path

# SQuAD format

An example data item in the SQuAD format, which is what the conversion process needs to produce.

```python
{'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']},
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'id': '5733be284776f41900661182',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'title': 'University_of_Notre_Dame'}
```

# Utils

In [None]:
def read_json(from_path: Path) -> dict:
    """Parse and return the JSON document stored at ``from_path``.

    The file is opened as UTF-8 text and its whole content is decoded
    as a single JSON value.
    """
    with open(from_path, mode='r', encoding='utf-8') as in_file:
        raw_text = in_file.read()
    return json.loads(raw_text)
        
def write_json(data: dict, out_path: Path) -> None:
    """Serialize ``data`` as JSON into the file at ``out_path``.

    The file is written as UTF-8 text with a two-space indent and sorted
    keys; non-ASCII characters are emitted as-is instead of being escaped.
    """
    dump_options = {'indent': 2, 'sort_keys': True, 'ensure_ascii': False}
    with open(out_path, mode='w', encoding='utf-8') as sink:
        json.dump(data, sink, **dump_options)

# SQuADv2 to SQuAD

In [None]:
def _iter_flat_squad_records(squad_dict):
    """Yield one flat record per answer found in a nested SQuAD-style dict."""
    for article in squad_dict['data']:
        article_title = article['title']
        for paragraph in article['paragraphs']:
            paragraph_text = paragraph['context']
            for qa_item in paragraph['qas']:
                question_text = qa_item['question']
                question_id = qa_item['id']
                for raw_answer in qa_item['answers']:
                    # Wrap every answer field value in a one-element list.
                    yield {
                        'answers': {field: [value] for field, value in raw_answer.items()},
                        'context': paragraph_text,
                        'id': question_id,
                        'question': question_text,
                        'title': article_title,
                    }


def squad_v2_to_squad_format(path: Path, out_name: str) -> Path:
    """Flatten a nested SQuAD-style JSON file and save it as ``./{out_name}.json``.

    The input is read from ``path`` and walked as
    ``data -> paragraphs -> qas -> answers``. One output record is produced
    per answer, so a question with several answers yields several records
    sharing the same ``id``, and a question with an empty ``answers`` list
    yields none.

    Args:
        path: Location of the nested JSON file to convert.
        out_name: Base name of the output file; also stored under the
            ``version`` key of the written document.

    Returns:
        The output location as the string ``./{out_name}.json``.
    """
    records = list(_iter_flat_squad_records(read_json(path)))

    out_path = f'./{out_name}.json'
    write_json({'data': records, 'version': out_name}, out_path)
    print('The data has been converted to SQuAD format and saved as a JSON object.')

    return out_path

## XQuAD Hindi Dataset

In [None]:
# Convert the XQuAD Hindi file: squad_v2_to_squad_format writes
# ./xquad_hindi.json and returns that path.
i_xquad_hindi_path = '../input/xquad-multilingual-data/xquad-master/xquad.hi.json'
o_xquad_hindi_path = squad_v2_to_squad_format(i_xquad_hindi_path, 'xquad_hindi')

# Sanity check: reload the converted file and show its first record.
read_json(o_xquad_hindi_path)['data'][0]

## Facebook MLQA

In [None]:
# Convert the MLQA test file (Hindi context, Hindi question):
# squad_v2_to_squad_format writes ./mlqa_test_hindi.json and returns that path.
i_mlqa_test_hindi_path = '../input/facebook-mlqa/MLQA_V1/test/test-context-hi-question-hi.json'
o_mlqa_test_hindi_path = squad_v2_to_squad_format(i_mlqa_test_hindi_path, 'mlqa_test_hindi')

# Sanity check: reload the converted file and show its first record.
read_json(o_mlqa_test_hindi_path)['data'][0]

In [None]:
# Convert the MLQA dev file (Hindi context, Hindi question):
# squad_v2_to_squad_format writes ./mlqa_dev_hindi.json and returns that path.
i_mlqa_dev_hindi_path = '../input/facebook-mlqa/MLQA_V1/dev/dev-context-hi-question-hi.json'
o_mlqa_dev_hindi_path = squad_v2_to_squad_format(i_mlqa_dev_hindi_path, 'mlqa_dev_hindi')

# Sanity check: reload the converted file and show its first record.
read_json(o_mlqa_dev_hindi_path)['data'][0]

# Dataframe to SQuAD

In [None]:
def df_to_squad_format(path: Path, out_name: str, lang: Optional[str] = None) -> Path:
    """Convert a question-answering CSV into flat SQuAD-style records and save them.

    The CSV at ``path`` is loaded with pandas. Every row becomes one record
    with the keys ``answers``, ``context``, ``id``, ``question`` and ``title``
    (``title`` is always the empty string). The result is written to
    ``./{out_name}.json`` via ``write_json``.

    Args:
        path: Location of the CSV file. Each row must provide ``context``,
            ``id`` and ``question``; ``answer_start`` and ``answer_text`` are
            used when they can be read.
        out_name: Base name of the output file; also stored under the
            ``version`` key of the written document.
        lang: When given (and non-empty), only rows whose ``language`` column
            equals this value are kept and ``_{lang}`` is appended to
            ``out_name``.

    Returns:
        The output location as the string ``./{out_name}.json``.
    """
    df = pd.read_csv(path)
    if lang:
        df = df.loc[df.language == lang].copy()
        out_name = f'{out_name}_{lang}'

    # Placeholder used when a row carries no usable answer.
    data = []
    for _, row in df.iterrows():
        try:
            answers = {
                'answer_start': [int(row['answer_start'])],
                'text': [row['answer_text']],
            }
        except (KeyError, TypeError, ValueError, OverflowError):
            # KeyError: the answer columns are absent from the CSV.
            # TypeError / ValueError / OverflowError: the start offset is
            # missing (NaN), non-numeric or not a finite number.
            # Anything else (e.g. KeyboardInterrupt) is no longer swallowed.
            answers = {'answer_start': [-1], 'text': ['']}
        data.append(
            {
                'answers': answers,
                'context': row['context'],
                'id': row['id'],
                'question': row['question'],
                'title': '',
            }
        )

    df_as_squad = {'data': data, 'version': out_name}

    out_path = f'./{out_name}.json'
    write_json(df_as_squad, out_path)
    print('The data has been converted to SQuAD format and saved as a JSON object.')
    return out_path

## CHAII

In [None]:
# Convert the full CHAII training CSV (no language filter):
# df_to_squad_format writes ./chaii_train.json and returns that path.
train_df_path = '../input/chaii-hindi-and-tamil-question-answering/train.csv'
chaii_train = df_to_squad_format(train_df_path, 'chaii_train')

# Sanity check: reload the converted file and show its first record.
read_json(chaii_train)['data'][0]

In [None]:
# Convert only the rows of the CHAII training CSV whose `language` column
# equals 'hindi'; the language is appended to the output name, so the file
# written is ./chaii_train_hindi.json.
train_df_hindi_path = '../input/chaii-hindi-and-tamil-question-answering/train.csv'
chaii_train_hindi = df_to_squad_format(train_df_hindi_path, 'chaii_train', lang='hindi')

# Sanity check: reload the converted file and show its first record.
read_json(chaii_train_hindi)['data'][0]

In [None]:
# Convert only the rows of the CHAII training CSV whose `language` column
# equals 'tamil'; the language is appended to the output name, so the file
# written is ./chaii_train_tamil.json.
train_df_tamil_path = '../input/chaii-hindi-and-tamil-question-answering/train.csv'
chaii_train_tamil = df_to_squad_format(train_df_tamil_path, 'chaii_train', lang='tamil')

# Sanity check: reload the converted file and show its first record.
read_json(chaii_train_tamil)['data'][0]

In [None]:
# Convert the CHAII test CSV: df_to_squad_format writes ./chaii_test.json and
# returns that path. Rows whose answer fields cannot be read receive the
# placeholder answer {'answer_start': [-1], 'text': ['']}.
test_df_path = '../input/chaii-hindi-and-tamil-question-answering/test.csv'
chaii_test = df_to_squad_format(test_df_path, 'chaii_test')

# Sanity check: reload the converted file and show its first record.
read_json(chaii_test)['data'][0]