In [17]:
import pandas as pd
import json

In [18]:
def sanity_check_mmqa(df=None):
    """Check that every row carries the fields its 'Availability in' flag implies.

    Rules enforced per row:
      * 'English' -> English answer and answer string present, Hindi ones null.
      * 'Hindi'   -> Hindi answer and answer string present, English ones null.
      * 'Both'    -> all four answer columns present.
      * any other availability value -> answer columns are not checked.
      * 'Domain', 'Question (English)' and 'Question (Hindi)' are always required.

    Args:
        df: DataFrame to validate. Defaults to the notebook-global ``df_mmqa``.

    Raises:
        AssertionError: on the first offending row; the message lists the row
            position, its index label, every violated column and the row itself.
    """
    if df is None:
        # Looked up at call time so the function may be defined before the
        # frame is loaded.
        df = df_mmqa

    answer_columns = {
        'English': ('Answer (English)', 'Answer String(English)'),
        'Hindi': ('Answer (Hindi)', 'Answer String(Hindi)'),
    }
    # Which languages must have answers for each availability flag.
    languages_present = {
        'English': ('English',),
        'Hindi': ('Hindi',),
        'Both': ('English', 'Hindi'),
    }
    always_required = ('Domain', 'Question (English)', 'Question (Hindi)')

    for i, (idx, row) in enumerate(df.iterrows()):
        problems = []
        availability = row['Availability in']
        present = languages_present.get(availability)

        if present is not None:
            for language, columns in answer_columns.items():
                should_exist = language in present
                for column in columns:
                    is_missing = pd.isnull(row[column])
                    if should_exist and is_missing:
                        problems.append("'{}' is missing".format(column))
                    elif not should_exist and not is_missing:
                        problems.append("'{}' should be empty".format(column))

        for column in always_required:
            if pd.isnull(row[column]):
                problems.append("'{}' is missing".format(column))

        if problems:
            # Explicit raise (rather than `assert`) so the check survives
            # `python -O` and the message names the failing columns.
            raise AssertionError(
                "Row {} (index {!r}, availability {!r}): {}\n{}".format(
                    i, idx, availability, '; '.join(problems), row
                )
            )
            

In [19]:
# Rows whose field count does not match the header are skipped with a warning.
# `on_bad_lines` replaces `error_bad_lines`, which was deprecated in pandas 1.3
# and removed in pandas 2.0.
df_mmqa = pd.read_csv(
    "./translated_mmqa/QA_Pairs.tsv",
    sep='\t',
    encoding='UTF-8',
    header=0,
    index_col=0,
    on_bad_lines='warn',
)
# NaNs are deliberately left in place: the validation and conversion code in
# this notebook uses pd.isnull() to detect missing answers. (The previous
# `df_mmqa.fillna(value=0)` discarded its result and therefore had no effect.)
header_names = df_mmqa.columns
print(list(enumerate(header_names)))
sanity_check_mmqa()

b'Skipping line 1203: expected 10 fields, saw 17\nSkipping line 1233: expected 10 fields, saw 12\nSkipping line 1315: expected 10 fields, saw 13\nSkipping line 2671: expected 10 fields, saw 12\n'


[(0, 'Question (English)'), (1, 'Question (Hindi)'), (2, 'Domain'), (3, 'Answer (English)'), (4, 'Answer String(English)'), (5, 'Answer (Hindi)'), (6, 'Answer String(Hindi)'), (7, 'Availability in'), (8, 'Answer Type')]


In [20]:
def convert_to_squad(question_type, snippet_type, df=None):
    """Build a SQuAD-v2.0-shaped dict from the MMQA frame.

    Each usable row becomes one ``data`` entry holding a single paragraph with
    a single question/answer pair.

    Args:
        question_type: 'H' selects 'Question (Hindi)'; any other value selects
            'Question (English)'.
        snippet_type: 'H' selects the Hindi answer and answer string; any other
            value selects the English ones.
        df: DataFrame to convert. Defaults to the notebook-global ``df_mmqa``.

    Returns:
        dict with keys ``'data'`` (list of entries) and ``'version'`` ('v2.0').

    Raises:
        AssertionError: if a row has no question in the requested language, or
            has exactly one of answer / context missing. Rows where both are
            missing are skipped silently.
    """
    if df is None:
        # Looked up at call time so the function may be defined before the
        # frame is loaded.
        df = df_mmqa

    random_title = 'TITLE'
    # Compare by value: `is 'H'` tests object identity, which only works by
    # accident of string interning and emits a SyntaxWarning on Python 3.8+.
    hindi_snippet = snippet_type == 'H'
    hindi_question = question_type == 'H'
    answer_col = 'Answer (Hindi)' if hindi_snippet else 'Answer (English)'
    context_col = 'Answer String(Hindi)' if hindi_snippet else 'Answer String(English)'
    question_col = 'Question (Hindi)' if hindi_question else 'Question (English)'

    data = []
    for idx, row in df.iterrows():
        answer = row[answer_col]
        context = row[context_col]
        question = row[question_col]
        if pd.isnull(question):
            raise AssertionError("Question for {} is Nan. Data:{} ".format(idx, row))
        if pd.isnull(answer) and pd.isnull(context):
            # No answer available in the requested snippet language.
            continue
        elif pd.isnull(answer) or pd.isnull(context):
            raise AssertionError("Any of context or answer for {} is Nan. Data:{} ".format(idx, row))
        qas = [{
            "id": idx,
            "answers": [
                {
                    "text": answer,
                    # NOTE(review): always 0, not the real character offset of
                    # `answer` within `context` -- confirm consumers ignore it.
                    "answer_start": 0
                }
            ],
            "is_impossible": not answer,
            "question": question
        }]
        paragraphs = [{
            'context': context,
            'qas': qas
        }]
        data.append({
            'paragraphs': paragraphs,
            'title': random_title
        })

    return {
        'data': data,
        'version': 'v2.0'
    }

In [21]:
# One export per (question language, snippet language) pairing; the "eq/hq"
# and "ec/hc" parts of each file name mirror the two arguments passed below.
squad_exports = (
    ('./translated_mmqa/test_data_eq_ec.json', 'E', 'E'),
    ('./translated_mmqa/test_data_eq_hc.json', 'E', 'H'),
    ('./translated_mmqa/test_data_hq_ec.json', 'H', 'E'),
    ('./translated_mmqa/test_data_hq_hc.json', 'H', 'H'),
)
for export_path, question_lang, snippet_lang in squad_exports:
    with open(export_path, 'w', encoding='utf-8') as fp:
        # Conversion runs after the file is opened, as in the original cells.
        json.dump(convert_to_squad(question_lang, snippet_lang), fp)


