In [5]:
import pandas as pd
import numpy as np
import json
import os
# Paths to the SQuAD v2.0 JSON files; both are passed to generate_squad_dataset_skeleton,
# which reads article titles, paragraph contexts and question ids from them.
dev_file = '../squad/dev-v2.0.json'
train_file = '../squad/train-v2.0.json'
# Column names applied (via `names=`) to the header-less, tab-separated translated files.
# 'eq'/'ea'/'ec' are read as the English question/answer/context and 'hq'/'ha'/'hc' as the
# Hindi question/answer/context by the dataset builders in this notebook.
# NOTE(review): 'es', 'ee', 'hs', 'he' are not referenced by the visible code -- presumably
# answer start/end offsets; confirm against the file that produces the TSVs.
header_names = ['eq','ea','ec', 'es', 'ee', 'hq','ha','hc', 'hs', 'he']

In [6]:
def _load_squad_json(path):
    """Read one SQuAD JSON file as UTF-8 and return the parsed object."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


def _collect_matched_articles(parsed, qs_to_dfid, contexts, dataset_skeleton, qid_to_dfid, require_new_title):
    """Add to the skeleton every article of `parsed` that has at least one question matched in the dataframe.

    A question matches when it is answerable, its text is a key of `qs_to_dfid`, and the paragraph
    context equals the context of the dataframe row that key points to. `dataset_skeleton` and
    `qid_to_dfid` are updated in place.
    """
    for article in parsed['data']:
        pars = []
        for par in article['paragraphs']:
            context = par['context']
            qas = []
            for qa in par['qas']:
                # Files without an 'is_impossible' field are treated as fully answerable.
                if qa.get('is_impossible', False) or qa['question'] not in qs_to_dfid:
                    continue
                df_id = qs_to_dfid[qa['question']]
                # The same question text can occur under several contexts; keep only the
                # occurrence whose context agrees with the dataframe row.
                if context != contexts[df_id]:
                    continue
                qas.append(qa['id'])
                qid_to_dfid[qa['id']] = df_id
            if qas:
                pars.append(qas)
        if pars:
            if require_new_title:
                assert(article['title'] not in dataset_skeleton.keys())
            dataset_skeleton[article['title']] = pars


def generate_squad_dataset_skeleton(df, train_file, dev_file):
    """
    Generates the SQuAD dataset skeleton (titles and question-id sets) which can be reused to create variants.
    Rows whose question is not matched in either SQuAD file are grouped by their context and
    presented under a synthetic title ('NO_TITLE_<n>') with synthetic ids ('NO_TITLE_<n>_Q_<row>').

    Params:
    df = dataframe of the translated SQuAD dataset (columns 'eq' and 'ec' are read)
    train_file = path to the SQuAD train JSON
    dev_file = path to the SQuAD dev JSON

    Returns:
    dataset_skeleton: dict mapping title to a list of question sets (one per context), where each
                      question set is a list of question ids.
    id_map: dict mapping question id to the row *position* in `df` (usable with `df.iloc`).
            For a default RangeIndex the position equals the index label.

    Raises:
    AssertionError if a matched train article reuses a title already present in the skeleton.
    """
    parsed_v = _load_squad_json(dev_file)
    parsed_t = _load_squad_json(train_file)
    qid_to_dfid = {}
    dataset_skeleton = {}

    # Row positions (not index labels) are used throughout so that the ids stay valid for
    # `df.iloc` even when `df` does not carry a default RangeIndex.
    questions = list(df['eq'])
    contexts = list(df['ec'])

    # Question text -> row position. When a question text repeats, the last row wins, so
    # earlier duplicates can only end up in the synthetic 'NO_TITLE_*' groups.
    qs_to_dfid = {question: pos for pos, question in enumerate(questions)}

    # Dev articles first, then train articles; only train titles are asserted to be new.
    _collect_matched_articles(parsed_v, qs_to_dfid, contexts, dataset_skeleton, qid_to_dfid,
                              require_new_title=False)
    _collect_matched_articles(parsed_t, qs_to_dfid, contexts, dataset_skeleton, qid_to_dfid,
                              require_new_title=True)

    # Add those rows not found in the datasets, grouped by context in first-seen order.
    marked = set(qid_to_dfid.values())
    ctx_to_id = {}
    for pos, context in enumerate(contexts):
        if pos not in marked:
            ctx_to_id.setdefault(context, []).append(pos)
    for index, ids in enumerate(ctx_to_id.values()):
        title = 'NO_TITLE_'+str(index)
        qs = []
        for i in ids:
            qid = title + '_Q_'+str(i)
            qid_to_dfid[qid] = i
            qs.append(qid)
        dataset_skeleton[title] = [qs]
    return dataset_skeleton, qid_to_dfid

In [7]:
def generate_squad_dataset_variant(skeleton, df, q_type='e', a_type='e'):
    """
    Takes the skeleton structure (dataset skeleton and id map) and generates the synthetic dataset dict
    for the requested question and answer/context languages. h for hindi and e for english.

    Params:
    skeleton: pair (dataset_skeleton, id_map) returned by generate_squad_dataset_skeleton.
    df: dataframe of the translated SQuAD dataset; rows are looked up by position via id_map.
    q_type: language of question. h for hindi and e for english
    a_type: language of answer and context. h for hindi and e for english

    Returns:
    Synthetic SQuAD dataset dict ({'data': [...], 'version': 'v2.0'}). Should be dumped as json by user.

    Raises:
    ValueError if q_type or a_type is not 'e' or 'h'.
    AssertionError if an answer does not occur in its context.
    """
    types = ['e', 'h']
    if q_type not in types or a_type not in types:
        raise ValueError("types should be e or h")
    # Resolve the column names once instead of once per question.
    question_col = 'eq' if q_type == 'e' else 'hq'
    context_col = 'ec' if a_type == 'e' else 'hc'
    answer_col = 'ea' if a_type == 'e' else 'ha'

    dataset_skeleton = skeleton[0]
    id_map = skeleton[1]
    data = []
    for title, qsets in dataset_skeleton.items():
        paragraphs = []
        for qset in qsets:
            qas = []
            # The paragraph context is taken from the last question of the set
            # (None when the set is empty).
            context = None
            for q in qset:
                row = df.iloc[id_map[q]]
                context = row[context_col]
                answer = row[answer_col]
                answer_start = context.find(answer)
                if answer_start == -1:
                    # Raised explicitly so the check also holds when asserts are disabled (-O).
                    raise AssertionError(
                        "answer not found in context: {!r} / {!r}".format(answer, context))
                qas.append({
                    "id": q,
                    "answers": [
                        {
                            "text": answer,
                            "answer_start": answer_start
                        }
                    ],
                    "is_impossible": not answer,
                    "question": row[question_col]
                })
            paragraphs.append({
                'context': context,
                'qas': qas
            })
        data.append({
            'paragraphs': paragraphs,
            'title': title
        })

    return {
        'data' : data,
        'version' : 'v2.0'
    }

In [140]:
# Cleanse unneeded whitespace in answers 
def cleanse_answer_eng(text):
    """
    Remove unneeded whitespace from (and normalise a few tokens in) an English answer or context.

    The text is rebuilt one character at a time; after each appended character the rules below
    are applied, in this order, to the end of the text built so far:
      1. drop the single space in front of a token from `no_space_before`;
      2. drop a trailing space that follows a token from `no_space_after`;
      3. replace any suffix found in `replace_with` (all entries are tried, in order);
      4. re-space a degree sign (space before '°C'/'°F', otherwise space after '°');
      5. join two digits separated by one whitespace character.
    Finally one trailing quote, period or comma is removed and the result is stripped.

    Params:
    text: string to clean.

    Returns:
    The cleaned string.
    """
    # The rule tables are loop-invariant, so they are built once per call.
    # phrases that take no space before them
    no_space_before = [',', '.', '%', '°', 'FM', '/','-', '–', '♠', '×', 'n\'t', ';','+',':', '\'ve','´']
    # phrases that take no space after them
    no_space_after = ['#', '$', '°', '/','-', '–', '♠', '×','+','´']
    # suffixes to replace, with their replacements
    replace_with = {'--':'–', '\'\'':'"', '\u00a0':' ', '``':'"', '\'`':'"', '`':'\'',
                   'can not':'cannot', '. nf':'.nf', ' \'s': '\'s', '’':'\'', '..':'.', 'a " tree':'a "tree',
                    's \'':'s\'', '\' aesthetic \'':'\'aesthetic\'', '. mp3':'.mp3', '" a':'"a'}
    new_text = ''
    for ch in text:
        new_text = new_text + ch
        # Only the first matching token is applied.
        for ele in no_space_before:
            if new_text.endswith(ele) and len(new_text)>len(ele) and new_text[-len(ele)-1]==' ':
                new_text = new_text[:-len(ele)-1] + new_text[-len(ele):]
                break
        if new_text[-1]==' ':
            for ele in no_space_after:
                if new_text[:-1].endswith(ele):
                    new_text = new_text[:-1]
                    break
        # No break here: a replacement may expose a suffix handled by a later entry.
        for ele, rele in replace_with.items():
            if new_text.endswith(ele):
                new_text = new_text[:-len(ele)] + rele
        # Resolve degree issues - temperature unit after, degree sign before.
        if len(new_text) > 2 and new_text[-2] == '°':
            if new_text[-1] in ['C', 'F']:
                new_text = new_text[:-2] + ' ' + new_text[-2:]
            else:
                new_text = new_text[:-1] + ' ' + new_text[-1:]
        if ch.isdigit() and len(new_text)>=3 and new_text[-2].isspace() and new_text[-3].isdigit():
            new_text = new_text[:-2]+new_text[-1]
    dont_end_with = [ '\'', '"', '.', ',']
    for ele in dont_end_with:
        if new_text.endswith(ele):
            new_text = new_text[:-len(ele)]
            break
    return new_text.strip()

def cleanse_answer_hin(text):
    """
    Hindi counterpart of the answer cleaner: rebuilds `text` character by character and, after
    each appended character, tidies the end of the text built so far.

    Per-character steps, in order:
      1. remove the single space in front of a token that attaches to the previous word;
      2. remove a trailing space that follows a token that attaches to the next word;
      3. apply every matching suffix substitution (all entries are tried, in order);
      4. drop a comma that directly follows a digit;
      5. join two digits separated by one whitespace character.
    After the loop one trailing comma, period, double quote, danda or single quote is removed
    and the result is stripped.

    Params:
    text: string to clean.

    Returns:
    The cleaned string.
    """
    attach_to_previous = [',', '.', '%', '°', 'FM', '/','-', '–', '♠', '×', 'n\'t', ';','+', ':']
    attach_to_next = ('#', '$', '°', '/','-', '–', '♠', '×')
    substitutions = {', और':' और', '--':'–', '\'\'':'"', '\u00a0':' ', '``':'"', '\'`':'"', '`':'\'',
                     ', या':' या', 'अक्टूबर ': 'अक्टूबर, ', 'दिसंबर ': 'दिसंबर, ', 'मई ': 'मई, ', 'जनवरी ': 'जनवरी, ',
                     'मार्च ': 'मार्च, ', 'नवंबर ': 'नवंबर, ', 'अगस्त ': 'अगस्त, ', 'सितंबर ': 'सितंबर, ', 'अप्रैल ': 'अप्रैल, ',
                     'जुलाई ': 'जुलाई, ', 'जून ': 'जून, ', 'जे।': 'जे', 'एडी':'ए.डी.', 'ए.डी':'ए.डी.', 'एस।': 'एस', 'यू.एस.':'यूएस',
                     'आर।':'आर', 'डॉ।':'डॉ.', 'डॉ ':'डॉ. ', '..':'.', 'सी।':'सी', 'एल।':'एल', 'जी।':'जी'}
    cleaned = ''
    for ch in text:
        cleaned += ch

        # Step 1: only the first token that matches is applied.
        for token in attach_to_previous:
            width = len(token)
            preceded_by_space = len(cleaned) > width and cleaned[-width - 1] == ' '
            if cleaned.endswith(token) and preceded_by_space:
                cleaned = cleaned[:-width - 1] + cleaned[-width:]
                break

        # Step 2: `cleaned` is never empty here, so endswith(' ') equals a last-character test.
        if cleaned.endswith(' ') and cleaned[:-1].endswith(attach_to_next):
            cleaned = cleaned[:-1]

        # Step 3: no early exit, a substitution may expose a suffix handled by a later entry.
        for old, new in substitutions.items():
            if cleaned.endswith(old):
                cleaned = cleaned[:-len(old)] + new

        # Step 4: drop a comma that directly follows a digit.
        if ch == ',' and len(cleaned) > 1 and cleaned[-2].isdigit():
            cleaned = cleaned[:-1]

        # Step 5: join "d d" into "dd".
        if ch.isdigit() and len(cleaned) >= 3 and cleaned[-2].isspace() and cleaned[-3].isdigit():
            cleaned = cleaned[:-2] + cleaned[-1]

    # Remove at most one trailing punctuation mark (first match in this order).
    for tail in (',', '.', '"', '।', '\''):
        if cleaned.endswith(tail):
            cleaned = cleaned[:-len(tail)]
            break
    return cleaned.strip()
    
def santize_answers_in_dataset(df):
    """
    Clean the English and Hindi answer/context columns of `df` in place and verify that every
    answer still occurs in its context.

    'ea' and 'ec' are cleaned with cleanse_answer_eng, 'ha' and 'hc' with cleanse_answer_hin.
    Answers and contexts go through the same cleaner so that an answer stays a substring of
    its context.

    Params:
    df: dataframe with the columns 'ea', 'ec', 'ha' and 'hc'.

    Returns:
    The same dataframe object, with the four columns cleaned.

    Raises:
    AssertionError if a cleaned answer is not found in its cleaned context (the offending row is
    printed first).
    """
    # Columns are addressed by name rather than by hard-coded position, and cleaned with one
    # pass per column instead of repeated per-cell iloc lookups.
    for column, cleanse in (('ea', cleanse_answer_eng), ('ec', cleanse_answer_eng),
                            ('ha', cleanse_answer_hin), ('hc', cleanse_answer_hin)):
        df[column] = df[column].map(cleanse)

    # Rows are checked in order, English before Hindi, so the first failure is the one reported.
    for label, ea, ec, ha, hc in zip(df.index, df['ea'], df['ec'], df['ha'], df['hc']):
        for answer, context in ((ea, ec), (ha, hc)):
            if context.find(answer) == -1:
                print(str(label), '='*5, answer, '='*5, context)
                # Raised explicitly so the check also holds when asserts are disabled (-O).
                raise AssertionError("answer not found in context for row " + str(label))
    return df

# Smoke test for the English cleaner: spaces before punctuation and between digits are removed.
assert cleanse_answer_eng('hello , hi . ') == 'hello, hi.'
assert cleanse_answer_eng('9 0') == '90'

In [141]:
#Train - incomplete
df_train = pd.read_csv("./translated/train_data.txt", sep='\t', encoding='UTF-8', header=None, index_col=False,
                       names=header_names)
df_train = santize_answers_in_dataset(df_train)
# Round-trip the sanitized frame through a temporary TSV so that the frame used below is one
# freshly parsed by read_csv. NOTE(review): presumably this normalises values changed by the
# cleaning step -- confirm why the round-trip is needed.
df_train.to_csv("./translated/train_data_t.txt", sep='\t', header=None, index=False)
df_train = pd.read_csv("./translated/train_data_t.txt", sep='\t', encoding='UTF-8', header=None, index_col=False,
                       names=header_names)
# The temporary file is only needed for the round-trip. The earlier version rewrote it once more
# right before deleting it, which had no lasting effect.
# NOTE(review): if the sanitized frame was meant to be persisted (as the test cell does), write it
# to its final path here instead.
os.remove("./translated/train_data_t.txt")
skeleton_train = generate_squad_dataset_skeleton(df_train, train_file, dev_file)
# One JSON file per (question language, answer/context language) combination.
for variant_path, variant_q, variant_a in (
        ('./translated/train_data_eq_ec.json', 'e', 'e'),
        ('./translated/train_data_eq_hc.json', 'e', 'h'),
        ('./translated/train_data_hq_ec.json', 'h', 'e'),
        ('./translated/train_data_hq_hc.json', 'h', 'h')):
    # Explicit UTF-8, matching how sanity_check reads these files back.
    with open(variant_path, 'w', encoding='utf-8') as fp:
        json.dump(generate_squad_dataset_variant(skeleton_train, df_train, q_type=variant_q, a_type=variant_a), fp)

In [142]:
import copy
def sanity_check(dataset_file):
    """
    Verify, and where possible repair, the answer offsets of a SQuAD-style JSON file in place.

    For every answer of every question, the answer text must start at `answer_start` in the
    paragraph context. If it does not, but the text occurs later in the context, `answer_start`
    is moved to that later occurrence (and the change is printed). Questions with an empty
    answers list are left untouched. The file is always rewritten.

    Params:
    dataset_file: path of the JSON file to check; it is overwritten with the checked dataset.

    Raises:
    AssertionError if an answer text does not occur at or after its `answer_start`.
    """
    with open(dataset_file, 'r', encoding='utf-8') as fp:
        dataset = json.load(fp)
    # The parsed dataset is private to this call, so offsets are corrected in place.
    for data in dataset['data']:
        for par in data['paragraphs']:
            context = par['context']
            for qa in par['qas']:
                for answer_entry in qa['answers']:
                    answer = answer_entry['text']
                    start = answer_entry['answer_start']
                    # Offset of the answer relative to `start`; 0 means the stored start is right.
                    offset = context[start:].find(answer)
                    if offset == 0:
                        continue
                    if offset == -1:
                        # Raised explicitly so the check also holds when asserts are disabled (-O).
                        raise AssertionError(
                            "answer {!r} not found at or after position {} for question {!r}".format(
                                answer, start, qa['question']))
                    print(answer, start, offset)
                    answer_entry['answer_start'] = start + offset
    with open(dataset_file, 'w', encoding='utf-8') as fp:
        json.dump(dataset, fp)
                    

In [143]:
# Verify (and rewrite) each of the four train variants in turn.
for train_variant_path in (
        "./translated/train_data_eq_ec.json",
        "./translated/train_data_eq_hc.json",
        "./translated/train_data_hq_ec.json",
        "./translated/train_data_hq_hc.json"):
    sanity_check(train_variant_path)

In [5]:
#Test
df_test = santize_answers_in_dataset(
    pd.read_csv("./translated/test_data.txt", sep='\t', encoding='UTF-8', header=None, index_col=False,
                names=header_names))
skeleton_test = generate_squad_dataset_skeleton(df_test, train_file, dev_file)
# One JSON file per (question language, answer/context language) combination.
for test_variant_path, test_variant_q, test_variant_a in (
        ('./translated/test_data_eq_ec.json', 'e', 'e'),
        ('./translated/test_data_eq_hc.json', 'e', 'h'),
        ('./translated/test_data_hq_ec.json', 'h', 'e'),
        ('./translated/test_data_hq_hc.json', 'h', 'h')):
    with open(test_variant_path, 'w', encoding='utf-8') as fp:
        json.dump(generate_squad_dataset_variant(skeleton_test, df_test, q_type=test_variant_q, a_type=test_variant_a), fp)
# The sanitized frame is written back over the input file.
df_test.to_csv("./translated/test_data.txt", sep='\t', header=None, index=False)

In [7]:
# Verify (and rewrite) each of the four test variants in turn.
for test_json_path in (
        "./translated/test_data_eq_ec.json",
        "./translated/test_data_eq_hc.json",
        "./translated/test_data_hq_ec.json",
        "./translated/test_data_hq_hc.json"):
    sanity_check(test_json_path)

In [8]:
#Val
df_val = pd.read_csv("./translated/val_data.txt", sep='\t', encoding='UTF-8', header=None, index_col=False,
                     names=header_names)
df_val = santize_answers_in_dataset(df_val)
# Round-trip the sanitized frame through a temporary TSV so that the frame used below is one
# freshly parsed by read_csv. NOTE(review): presumably this normalises values changed by the
# cleaning step -- confirm why the round-trip is needed.
df_val.to_csv("./translated/val_data_t.txt", sep='\t', header=None, index=False)
df_val = pd.read_csv("./translated/val_data_t.txt", sep='\t', encoding='UTF-8', header=None, index_col=False,
                     names=header_names)
# The temporary file is only needed for the round-trip. The earlier version rewrote it once more
# right before deleting it, which had no lasting effect.
# NOTE(review): if the sanitized frame was meant to be persisted (as the test cell does), write it
# to its final path here instead.
os.remove("./translated/val_data_t.txt")
skeleton_val = generate_squad_dataset_skeleton(df_val, train_file, dev_file)
# One JSON file per (question language, answer/context language) combination.
for val_variant_path, val_variant_q, val_variant_a in (
        ('./translated/val_data_eq_ec.json', 'e', 'e'),
        ('./translated/val_data_eq_hc.json', 'e', 'h'),
        ('./translated/val_data_hq_ec.json', 'h', 'e'),
        ('./translated/val_data_hq_hc.json', 'h', 'h')):
    # Explicit UTF-8, matching how sanity_check reads these files back.
    with open(val_variant_path, 'w', encoding='utf-8') as fp:
        json.dump(generate_squad_dataset_variant(skeleton_val, df_val, q_type=val_variant_q, a_type=val_variant_a), fp)

In [9]:
# Verify (and rewrite) each of the four validation variants in turn.
for val_json_path in (
        "./translated/val_data_eq_ec.json",
        "./translated/val_data_eq_hc.json",
        "./translated/val_data_hq_ec.json",
        "./translated/val_data_hq_hc.json"):
    sanity_check(val_json_path)