## Handle data

In [None]:
!pip install -q transformers

## **Import**

In [None]:
import numpy as np
import pandas as pd
import sys
import random
from tqdm import tqdm
import re
import string
import os
import shutil
import json
from transformers import AutoTokenizer, TFBertMainLayer, TFBertForPreTraining, BertConfig, TFBertModel
import tensorflow as tf
from tensorflow.keras.losses import sparse_categorical_crossentropy as sce

In [None]:
# Kaggle input paths of the simplified Natural Questions JSONL files.
# `f_test` is the default `filename` argument of the parsing helpers below.
f_test = '../input/tensorflow2-question-answering/simplified-nq-test.jsonl'
f_train = '../input/tensorflow2-question-answering/simplified-nq-train.jsonl'
# Not referenced by the code visible in this notebook; presumably the number
# of lines in the train/test files above (TODO confirm).
num_train_samples = 307372
num_test_samples = 346

In [None]:
def get_id_df(filename=f_test):
    """Collect the example ids of a JSONL file into a one-column DataFrame.

    Args:
        filename: path of a JSON-lines file whose records carry an
            ``example_id`` field.

    Returns:
        DataFrame with a single ``example_id`` column (ids as strings), one
        row per line of the file, in file order.
    """
    with open(filename) as handle:
        id_records = [
            {'example_id': str(json.loads(raw_line)['example_id'])}
            for raw_line in tqdm(handle)
        ]
    return pd.DataFrame(id_records)

In [None]:
# Answer-type labels in class-id order; the position in the tuple is the id.
_ANSWER_TYPE_NAMES = ('NO_ANSWER', 'YES', 'NO', 'SHORT', 'LONG')

# label -> class id
AnswerType = {label: code for code, label in enumerate(_ANSWER_TYPE_NAMES)}

# class id -> label (inverse of AnswerType)
AnswerTypeRev = {code: label for label, code in AnswerType.items()}

In [None]:
def preprocess_data(data, tokenizer, debug=False):
    """Tokenize question/context instances into model input tensors.

    Args:
        data: sequence of instance dicts with the keys ``question``,
            ``context``, ``start``, ``stop`` and ``target`` (a key of
            ``AnswerType``).
        tokenizer: tokenizer exposing ``encode_plus``.
        debug: accepted for interface compatibility; not used.

    Returns:
        Tuple ``(input_ids, token_type_ids, attention_mask, labels)`` of
        tensors. Each label row is ``[start, stop, answer_type_id]``.
    """
    samples = tqdm(data, total=len(data))
    input_ids, segment_ids, attention_masks, labels = [], [], [], []

    for sample in samples:
        # Every pair is padded/truncated to exactly 512 tokens.
        encoded = tokenizer.encode_plus(
            sample['question'],
            sample['context'],
            padding='max_length',
            truncation=True,
            max_length=512,
            add_special_tokens=True,
        )
        for bucket, key in (
            (input_ids, 'input_ids'),
            (segment_ids, 'token_type_ids'),
            (attention_masks, 'attention_mask'),
        ):
            bucket.append(tf.cast(encoded[key], tf.int32))

        labels.append([sample['start'], sample['stop'], AnswerType[sample['target']]])

    return (
        tf.convert_to_tensor(input_ids),
        tf.convert_to_tensor(segment_ids),
        tf.convert_to_tensor(attention_masks),
        tf.convert_to_tensor(labels),
    )

In [None]:
def get_strategy():
    """Pick the distribution strategy used to build and run the models.

    Tries to resolve and initialise a TPU; if any step raises ``ValueError``
    the error is printed and TensorFlow's current strategy is returned
    instead.

    Returns:
        A ``tf.distribute`` strategy object.
    """
    try:
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
        print('Running on TPU ', resolver.cluster_spec().as_dict()['worker'])
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        return tf.distribute.experimental.TPUStrategy(resolver)
    except ValueError as err:
        print(err)
        print('No TPU detected')
        return tf.distribute.get_strategy()

In [None]:
def mergeInstanceResult(test_res, list_test_ins):
    """Attach the arg-max predictions of the model to each instance dict.

    Args:
        test_res: indexable model output; ``test_res[i][0]``, ``[1]`` and
            ``[2]`` are the start, stop and target score rows of instance i.
        list_test_ins: list of instance dicts, updated in place.

    Returns:
        The same ``list_test_ins`` list, each dict extended with ``start``,
        ``stop``, ``target`` (arg-max indices), their scores, and the scores
        at index 0 of the start/stop rows (``start_CLS`` / ``stop_CLS``).
    """
    for position, instance in enumerate(list_test_ins):
        prediction = test_res[position]
        start_row, stop_row, target_row = prediction[0], prediction[1], prediction[2]

        best_start = np.argmax(start_row)
        best_stop = np.argmax(stop_row)
        best_target = np.argmax(target_row)

        instance.update({
            'start': best_start,
            'stop': best_stop,
            'target': best_target,
            'start_score': start_row[best_start],
            'stop_score': stop_row[best_stop],
            'target_score': target_row[best_target],
            # Index 0 is kept as the reference score for the span scoring later.
            'start_CLS': start_row[0],
            'stop_CLS': stop_row[0],
        })
    return list_test_ins

In [None]:
def mergeDocumentRes(ins_df, val_id_df, threshold=0.0001, stride=128, debug=False):
    """Reduce per-instance predictions to one best span per document.

    Args:
        ins_df: DataFrame of instance predictions with the columns
            ``example_id``, ``part_start``, ``start``, ``stop``, ``target``,
            ``start_score``, ``stop_score``, ``start_CLS`` and ``stop_CLS``.
        val_id_df: DataFrame with an ``example_id`` column; one output row is
            produced per row of this frame, in the same order.
        threshold: minimum score a span must exceed to be selected; also the
            ``score`` reported when no span qualifies.
        stride: accepted for interface compatibility; not used.
        debug: when true, prints the result of the row whose index label is 101.

    Returns:
        DataFrame with the columns ``example_id``, ``start``, ``stop``,
        ``target`` and ``score``. ``start``/``stop`` are document-level
        positions (instance position + ``part_start``), or -1 when nothing
        qualified.
    """
    merged_rows = []
    for row_label, id_row in val_id_df.iterrows():
        example_id = id_row['example_id']
        doc_instances = ins_df[ins_df['example_id'] == example_id]

        # Instances whose start or stop is not position 0. Start-matches come
        # first, so on equal scores the earliest of those wins.
        candidates = pd.concat([
            doc_instances[doc_instances['start'] != 0],
            doc_instances[doc_instances['stop'] != 0],
        ]).drop_duplicates()

        winner = {
            'example_id': example_id,
            'start': -1,
            'stop': -1,
            'target': 0,
            'score': threshold,
        }

        for _, cand in candidates.iterrows():
            local_start = int(cand['start'])
            local_stop = int(cand['stop'])
            answer_type = int(cand['target'])
            offset = cand['part_start']

            doc_start = int(local_start + offset)
            doc_stop = int(local_stop + offset)

            start_score, stop_score = cand['start_score'], cand['stop_score']
            start_ref, stop_ref = cand['start_CLS'], cand['stop_CLS']

            if doc_stop <= doc_start:
                continue  # only spans that end after they begin are considered

            # Span score relative to the scores at position 0.
            span_score = start_score - start_ref + stop_score - stop_ref
            if span_score > winner['score']:
                winner.update(start=doc_start, stop=doc_stop,
                              target=answer_type, score=span_score)

        if debug and row_label == 101:
            print(winner)

        merged_rows.append(winner)

    return pd.DataFrame(merged_rows)

## **Get instances (HTML tags removed) for long-answer prediction**

In [None]:
# Non-greedy match of anything between '<' and the next '>'.
cleanr = re.compile('<.*?>')


def clean_html(raw_html):
    """Return ``raw_html`` with every ``<...>`` span replaced by the literal ``<tag>``."""
    return cleanr.sub('<tag>', raw_html)

def parseDataClean(filename=f_test, is_val=True, drop_noanswer_rate = 0.95, drop_null_instances_rate = 0.98, debug=False):
    """Split every document of a JSONL file into overlapping question/context instances.

    HTML tags are removed from ``document_text`` (each tag becomes one
    ``<tag>`` token which is then dropped), and the remaining words are cut
    into windows of ``500 - len(question words)`` words, advancing by 128
    words per window.

    Args:
        filename: path of the JSON-lines file; each record needs
            ``example_id``, ``document_text`` and ``question_text``.
        is_val, drop_noanswer_rate, drop_null_instances_rate, debug:
            accepted for interface compatibility; not used by this function.

    Returns:
        List of instance dicts with the keys ``example_id``, ``part_start``,
        ``part_stop`` (word offsets into the tag-free document),
        ``question``, ``context`` and the placeholder labels ``start`` = 0,
        ``stop`` = 0, ``target`` = 'NO_ANSWER'.
    """
    INSTANCE_WORDS_LEN = 500
    STRIDE = 128
    list_instances = []

    with open(filename) as f:
        for line in tqdm(f):
            data = json.loads(line)
            example_id = str(data['example_id'])

            # Change all html tags to the form <tag>, then drop those tokens.
            doc_tag_split = clean_html(data['document_text']).split()
            clean_doc = [token for token in doc_tag_split if token != '<tag>']

            question = data['question_text']
            part_len = INSTANCE_WORDS_LEN - len(question.split())

            # Floor division goes negative for documents shorter than
            # part_len - STRIDE words, which used to yield an empty range and
            # therefore no instance at all. Clamp so that every document
            # produces at least one window.
            last_part_id = max(0, (len(clean_doc) - part_len)//STRIDE + 1)

            for part_id in range(last_part_id + 1):
                part_start = part_id*STRIDE
                part_stop = min(len(clean_doc), part_start + part_len)
                context = ' '.join(clean_doc[part_start:part_stop])

                list_instances.append({
                    'example_id': example_id,
                    'part_start': part_start,
                    'part_stop': part_stop,
                    'question': question,
                    'context': context,
                    'start': 0,
                    'stop': 0,
                    'target': 'NO_ANSWER',
                })
    return list_instances

In [None]:
def getMapping(set_id, filename=f_test):
    """Map long-answer candidates from raw token offsets to tag-free offsets.

    For every document whose id is in ``set_id`` the candidate boundaries are
    shifted left by the number of HTML-tag tokens that precede them, so they
    line up with the tag-free word positions used for the instances.

    Args:
        set_id: container of example ids (strings) to process.
        filename: path of the JSON-lines file; each record needs
            ``example_id``, ``document_text`` and ``long_answer_candidates``
            (dicts with ``start_token`` / ``end_token``).

    Returns:
        List of dicts with ``example_id``, ``new_candidates`` (shifted
        boundaries) and ``old_candidates`` (the original candidate list), in
        file order.
    """
    list_cand_maps = []
    with open(filename) as f:
        for line in tqdm(f):
            data = json.loads(line)
            example_id = str(data['example_id'])
            if example_id not in set_id:
                continue

            # Change all html tags to the form <tag>.
            doc_text_split = clean_html(data['document_text']).split()
            num_tokens = len(doc_text_split)

            # tags_upto[k] == number of '<tag>' tokens among the first k
            # tokens. Built once per document instead of re-counting a slice
            # for every candidate boundary.
            tags_upto = [0]
            for token in doc_text_split:
                tags_upto.append(tags_upto[-1] + (1 if token == '<tag>' else 0))

            list_candidates = data['long_answer_candidates']
            list_new_candidates = []
            for cand in list_candidates:
                cand_start = cand['start_token']
                cand_stop = cand['end_token']

                # slice.indices applies the same clamping / negative-index
                # rules as doc_text_split[0:k], so out-of-range boundaries
                # are handled exactly as list slicing would handle them.
                start_cut = slice(0, cand_start).indices(num_tokens)[1]
                stop_cut = slice(0, cand_stop).indices(num_tokens)[1]

                new_cand = {}
                new_cand['end_token'] = cand_stop - tags_upto[stop_cut]
                new_cand['start_token'] = cand_start - tags_upto[start_cut]
                list_new_candidates.append(new_cand)

            list_cand_maps.append({
                'example_id': example_id,
                'new_candidates': list_new_candidates,
                'old_candidates': list_candidates,
            })
    return list_cand_maps

In [None]:
def build_model(model_name, debug=False):
    """Build the BERT-based QA model used for long-answer prediction.

    Loads a ``TFBertModel`` encoder and a tokenizer from ``model_name``,
    registers three extra special tokens on the tokenizer and resizes the
    encoder's token embeddings to the new vocabulary size.

    Args:
        model_name: path or name passed to ``from_pretrained``.
        debug: not used.

    Returns:
        An un-compiled ``MyQAModel`` instance. Its output stacks, along
        axis 1, the start logits, the stop logits and the answer-type logits
        (the latter zero-padded on the right).
    """
    encoder = TFBertModel.from_pretrained(model_name)
    
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # Extra tokens added to the vocabulary; the tokenizer itself is local to
    # this function and is only used to size the embedding matrix.
    tags = ['``', '\'\'', '--']

    special_tokens_dict = {'additional_special_tokens': tags}

    num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
    
    encoder.resize_token_embeddings(len(tokenizer))

    # Number of answer types (matches the five entries of AnswerType).
    NUM_TARGET = 5
    class MyQAModel(tf.keras.Model):
        """Encoder plus three dense heads: start, stop and answer type."""

        def __init__(self, *inputs, **kwargs):
            super().__init__(*inputs, **kwargs)            
            self.bert = encoder

            # One logit per token position for span start / span stop.
            self.start_logits = tf.keras.layers.Dense(1)
            self.stop_logits = tf.keras.layers.Dense(1)
            
            self.target = tf.keras.layers.Dense(NUM_TARGET)

        def call(self, inputs, **kwargs):
            """Run the encoder on (input_ids, token_type_ids, attention_mask)."""
            bert_res=self.bert(inputs[0], 
                               token_type_ids=inputs[1], 
                               attention_mask=inputs[2]
                               )
            # NOTE(review): despite the dropout_* names no dropout is applied.
            # bert_res[0] is presumably the per-token output and bert_res[1]
            # the pooled output (TODO confirm against the transformers version).
            dropout_res1 = bert_res[0]

            start_logits = tf.squeeze(self.start_logits(dropout_res1), -1)
            dropout_res2 = bert_res[0]

            stop_logits = tf.squeeze(self.stop_logits(dropout_res2), -1)
            dropout_res3 = bert_res[1]
            
            targets = self.target(dropout_res3)
            
            # Right-pad the NUM_TARGET logits with zeros up to width 512 so
            # they can be stacked with the start/stop logits below.
            paddings = tf.constant([[0, 0,], [0, 512-NUM_TARGET]])
            targets = tf.pad(targets, paddings)
            
            res = tf.stack([start_logits, stop_logits, targets], axis=1)
            return res
        
    model = MyQAModel()
    return model 

In [None]:
def getRawInstanceResults(list_test, verbose = True, debug = False):  
    """Run the long-answer model over all instances and return its raw output.

    Args:
        list_test: list of instance dicts as accepted by ``preprocess_data``.
        verbose: when true, prints progress messages.
        debug: not used.

    Returns:
        The array returned by ``testModel.predict`` for all instances.
    """
    if verbose:
        print('Getting raw result for all the instances generated from test file')
        
    model_name = '../input/tensorflow-question-answer-fine-data'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # Same three extra special tokens that build_model registers.
    tags = ['``', '\'\'', '--']

    special_tokens_dict = {'additional_special_tokens': tags}

    num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
    print(num_added_toks)
    print(len(tokenizer))
    
    # y_test holds only the placeholder labels and is not used afterwards.
    x_test1, x_test2, x_test3, y_test = preprocess_data(list_test, tokenizer)
    if verbose:
        print("Finish tokenizing ", len(list_test), " data for the first model")
        print(x_test1.shape)
    
    if verbose:
        print("Preparing model")
        
    strategy = get_strategy()
    with strategy.scope():
        testModel = build_model(model_name)
        # One dummy predict call before load_weights; presumably needed so
        # the subclassed model has created its variables (TODO confirm).
        x = np.ones([1, 512], dtype=int)
        testModel.predict([x, x, x])
        testModel.load_weights('../input/model1/weights-02.h5')
        # Optimizer, loss and metric are only supplied to compile(); this
        # function performs no training.
        optAdam = tf.keras.optimizers.Adam(learning_rate=0.00005)
        lossSCE = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        metricSCA = tf.keras.metrics.SparseCategoricalAccuracy()
        testModel.compile(optimizer=optAdam, loss=lossSCE, metrics=[metricSCA])
    
    if verbose:
        print("Finish loading pretrained weights for the model")
        
    test_res = testModel.predict([x_test1, x_test2, x_test3], verbose=1)
    
    if verbose:
        print("Finish calculating raw result, get an array of size: ", test_res.shape)
    return test_res


In [None]:
def getSubmissionLan(doc_res_df, doc_cand_df, threshold=0.0001, debug=False):
    """Turn per-document spans into long-answer submission rows.

    For every document the predicted span is matched against the tag-free
    candidates (``new_candidates``); the candidate with the largest token
    overlap wins, ties going to the shortest candidate. The submission then
    reports the boundaries of the corresponding original candidate
    (``old_candidates``).

    When no candidate overlaps the span (or the candidate list is empty) the
    prediction is left blank. Previously candidate 0 was reported in that
    case, and an empty candidate list raised ``IndexError``.

    Args:
        doc_res_df: DataFrame with ``example_id``, ``start``, ``stop``,
            ``target`` and ``score``. Its ``example_id`` column is converted
            to ``str`` in place.
        doc_cand_df: DataFrame with ``example_id``, ``new_candidates`` and
            ``old_candidates``. Its ``example_id`` column is converted to
            ``str`` in place.
        threshold: accepted for interface compatibility; not used.
        debug: when true, prints dtypes and the result of merged row 101.

    Returns:
        DataFrame with ``example_id`` (``<id>_long``) and
        ``PredictionString`` (``"start:stop"`` or ``""``), sorted by
        ``example_id``.
    """
    doc_res_df.example_id = doc_res_df.example_id.astype(str)
    doc_cand_df.example_id = doc_cand_df.example_id.astype(str)
    if debug:
        print(doc_res_df.dtypes)
        print(doc_cand_df.dtypes)

    combine_df = pd.merge(doc_res_df, doc_cand_df, on='example_id')
    lines = []
    for row_idx, doc in combine_df.iterrows():
        long_id = str(doc['example_id']) + '_long'

        an_start = int(doc['start'])
        an_stop = int(doc['stop'])
        an_target = doc['target']
        lan_start, lan_stop = -1, -1

        # find long answer
        if an_start > 0 and an_stop > 0:
            best_id = None
            best_inter = 0
            shortest = None
            for cidx, cand in enumerate(doc['new_candidates']):
                c_start = int(cand['start_token'])
                c_stop = int(cand['end_token'])

                # Both ranges are inclusive; this equals the size of the
                # intersection of range(start, stop + 1) for span and
                # candidate, without materialising the ranges.
                c_len = max(0, c_stop - c_start + 1)
                inter = max(0, min(an_stop, c_stop) - max(an_start, c_start) + 1)
                if inter == 0:
                    continue  # a candidate must share at least one token

                if inter > best_inter or (inter == best_inter and c_len < shortest):
                    best_id = cidx
                    best_inter = inter
                    shortest = c_len

            if best_id is not None:
                real_candidate = doc['old_candidates'][best_id]
                lan_start = real_candidate['start_token']
                lan_stop = real_candidate['end_token']

            if debug and row_idx == 101:
                print(lan_start, lan_stop)

        if lan_start > 0 and lan_stop > 0 and an_target != 0:
            long_string = str(lan_start) + ':' + str(lan_stop)
        else:
            long_string = ''

        lines.append({'example_id': long_id, 'PredictionString': long_string})

    # Explicit columns keep sort_values working when there are no rows.
    lines_df = pd.DataFrame(lines, columns=['example_id', 'PredictionString'])
    return lines_df.sort_values('example_id')

## **Process short answer**

In [None]:
def getSanCandidate(sub, filename=f_test, debug=False):
    """Build short-answer instances around the predicted long answers.

    The long-answer boundaries are parsed from ``sub`` and, for every
    document that has one, context windows are cut from the raw
    (tag-included) document tokens:

    * long answer of at most 500 tokens: one window, the long answer padded
      equally on both sides up to roughly 500 tokens;
    * longer long answer: 500-token windows starting at the long answer and
      advancing by 256 tokens.

    Args:
        sub: DataFrame with ``example_id`` (``<id>_long``) and
            ``PredictionString`` (``"start:stop"`` or ``""``). If an id
            occurs more than once, the last row wins.
        filename: path of the JSON-lines file with ``example_id``,
            ``document_text`` and ``question_text``.
        debug: when true, prints the parsed boundaries and each instance.

    Returns:
        List of instance dicts (``example_id``, ``part_start``,
        ``part_stop``, ``question``, ``context`` and the placeholder labels
        ``start`` = 0, ``stop`` = 0, ``target`` = 'NO_ANSWER'), in file order.
    """
    INSTANCE_WORDS_LEN = 500
    STRIDE = 256

    # example id -> (lan_start, lan_stop); (-1, -1) means no long answer.
    # A dict replaces the per-document DataFrame scan of the earlier version.
    lan_by_id = {}
    for _, row in sub.iterrows():
        example_id = str(row['example_id']).replace('_long', "")
        prediction = str(row['PredictionString'])
        lan_start, lan_stop = -1, -1
        if prediction != '':
            tokens = prediction.split(':')
            lan_start = int(tokens[0])
            lan_stop = int(tokens[1])
        lan_by_id[example_id] = (lan_start, lan_stop)

    list_san_ins = []

    with open(filename) as f:
        for line in tqdm(f):
            data = json.loads(line)
            example_id = str(data['example_id'])
            if example_id not in lan_by_id:
                continue

            lan_start, lan_stop = lan_by_id[example_id]
            if debug:
                print(example_id, lan_start, lan_stop)
            if lan_start <= -1 or lan_stop <= -1:
                continue  # no long answer, hence no short-answer context

            doc_text_split = data['document_text'].split()
            question = data['question_text']
            lan_len = lan_stop - lan_start

            if lan_len <= INSTANCE_WORDS_LEN:
                # Centre the long answer inside a single window.
                offset = (INSTANCE_WORDS_LEN - lan_len)//2
                windows = [(max(0, lan_start - offset),
                            min(lan_stop + offset, len(doc_text_split)))]
            else:
                # In case the found long answer is longer than the context
                # length limit, split it into parts and slide with STRIDE.
                num_parts = (lan_len - INSTANCE_WORDS_LEN)//STRIDE + 1
                windows = [
                    (lan_start + part_id*STRIDE,
                     min(len(doc_text_split),
                         lan_start + part_id*STRIDE + INSTANCE_WORDS_LEN))
                    for part_id in range(num_parts + 1)
                ]

            for part_start, part_stop in windows:
                context = ' '.join(doc_text_split[part_start:part_stop])
                ins = {'example_id': example_id, 'part_start': part_start, 'part_stop': part_stop,
                       'question': question, 'context': context, 'start': 0, 'stop': 0, 'target': 'NO_ANSWER'}
                list_san_ins.append(ins)
                if debug:
                    print(ins)
    return list_san_ins


In [None]:
def create_model_san(tokenizer_san, model_name_san, debug=False):
    """Build the BERT-based QA model used for short-answer prediction.

    Args:
        tokenizer_san: tokenizer whose length determines the size the
            encoder's token embeddings are resized to.
        model_name_san: path or name passed to ``TFBertModel.from_pretrained``.
        debug: when true, prints a default ``BertConfig``.

    Returns:
        An un-compiled ``MyQAModel`` instance. Its output stacks, along
        axis 1, the start logits, the stop logits and the answer-type logits
        (the latter zero-padded on the right).
    """
    # A default config, created only so it can be printed in debug mode; it
    # is not passed to the encoder.
    config = BertConfig()
    if debug:
        print(config)
    encoder = TFBertModel.from_pretrained(model_name_san)
    encoder.resize_token_embeddings(len(tokenizer_san))

    # Number of answer types (matches the five entries of AnswerType).
    NUM_TARGET = 5
    class MyQAModel(tf.keras.Model):
        """Encoder plus three dense heads: start, stop and answer type."""

        def __init__(self, *inputs, **kwargs):
            super().__init__(*inputs, **kwargs)            
            self.bert = encoder
            # One logit per token position for span start / span stop.
            self.start_logits = tf.keras.layers.Dense(1)
            self.stop_logits = tf.keras.layers.Dense(1)
            
            self.target = tf.keras.layers.Dense(NUM_TARGET)

        def call(self, inputs, **kwargs):
            """Run the encoder on (input_ids, token_type_ids, attention_mask)."""
            bert_res=self.bert(inputs[0], 
                               token_type_ids=inputs[1], 
                               attention_mask=inputs[2]
                               )
            
            # NOTE(review): despite the dropout_* names no dropout is applied.
            # bert_res[0] is presumably the per-token output and bert_res[1]
            # the pooled output (TODO confirm against the transformers version).
            dropout_res1 = bert_res[0]

            start_logits = tf.squeeze(self.start_logits(dropout_res1), -1)

            dropout_res2 = bert_res[0]

            stop_logits = tf.squeeze(self.stop_logits(dropout_res2), -1)

            dropout_res3 = bert_res[1]
            
            targets = self.target(dropout_res3)
            
            # Right-pad the NUM_TARGET logits with zeros up to width 512 so
            # they can be stacked with the start/stop logits below.
            paddings = tf.constant([[0, 0,], [0, 512-NUM_TARGET]])
            targets = tf.pad(targets, paddings)
            
            res = tf.stack([start_logits, stop_logits, targets], axis=1)
            return res
        
    model = MyQAModel()
    return model 

In [None]:
def getSanRawRes(list_san_ins, verbose=1):
    """Run the short-answer model over all instances and return its raw output.

    Args:
        list_san_ins: list of instance dicts as accepted by
            ``preprocess_data``.
        verbose: when truthy, prints the two messages after model loading
            and prediction (the other messages are printed unconditionally).

    Returns:
        The array returned by ``sanModel.predict`` for all instances.
    """
    print("Getting raw result for short answer instance generated from found long answers")
    
    model_name_san = '../input/tensorflow-question-answer-fine-data'

    tokenizer_san = AutoTokenizer.from_pretrained(model_name_san)

    # HTML tag tokens (opening and closing) plus three punctuation tokens are
    # registered as special tokens; the short-answer contexts are cut from
    # the raw document text, which still contains its tags.
    tags_san = ['<Dd>', '<Dl>', '<Dt>', '<H1>', '<H2>', '<H3>', '<Li>', '<Ol>', '<P>', '<Table>', '<Td>', '<Th>', '<Tr>', '<Ul>',
            '</Dd>', '</Dl>', '</Dt>', '</H1>', '</H2>', '</H3>', '</Li>', '</Ol>', '</P>', '</Table>', '</Td>', '</Th>', '</Tr>', '</Ul>',
            '<Th_colspan=', '</Th_colspan=', '``', '\'\'', '--']

    special_tokens_dict_san = {'additional_special_tokens': tags_san}

    num_added_toks_san = tokenizer_san.add_special_tokens(special_tokens_dict_san)
    print("Short answer vocab size: ", len(tokenizer_san))
    
    # y_san holds only the placeholder labels and is not used afterwards.
    x_san1, x_san2, x_san3, y_san = preprocess_data(list_san_ins, tokenizer_san)
    print("Finish tokenizing ", len(list_san_ins), " instances for short answer candidates")
    print(x_san1.shape)
    
    strategy_san = get_strategy()
    with strategy_san.scope():
        sanModel = create_model_san(tokenizer_san, model_name_san)
        # One dummy predict call before load_weights; presumably needed so
        # the subclassed model has created its variables (TODO confirm).
        x = np.ones([1, 512], dtype=int)
        sanModel.predict([x, x, x])
        sanModel.load_weights('../input/model1/weights-14.h5')
        # Optimizer, loss and metric are only supplied to compile(); this
        # function performs no training.
        optAdam = tf.keras.optimizers.Adam(learning_rate=0.00005)
        lossSCE = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        metricSCA = tf.keras.metrics.SparseCategoricalAccuracy()
        sanModel.compile(optimizer=optAdam, loss=lossSCE, metrics=[metricSCA])
    
    if verbose:
        print("Finish loading pretrained weights for the model for short answer")
        
    test_res = sanModel.predict([x_san1, x_san2, x_san3], verbose=1)
    
    if verbose:
        print("Finish calculating raw result, get an array of size: ", test_res.shape)
    return test_res

In [None]:
def getSanSubmission(doc_res_df, threshold=0.0001, debug=False):
    """Turn per-document spans into short-answer submission rows.

    A span is reported as ``"start:stop"`` when both boundaries are positive,
    the answer type is not 4 (LONG) and the span is shorter than 30 tokens.
    Answer types 1 and 2 override everything and are reported as the
    corresponding ``AnswerTypeRev`` label.

    Args:
        doc_res_df: DataFrame with ``example_id``, ``start``, ``stop``,
            ``target`` and ``score``. Its ``example_id`` column is converted
            to ``str`` in place.
        threshold: accepted for interface compatibility; not used.
        debug: accepted for interface compatibility; not used.

    Returns:
        DataFrame with ``example_id`` (``<id>_short``) and
        ``PredictionString``, sorted by ``example_id``.
    """
    doc_res_df.example_id = doc_res_df.example_id.astype(str)

    submission_rows = []
    for _, record in doc_res_df.iterrows():
        row_id = str(record['example_id']) + '_short'

        span_start = int(record['start'])
        span_stop = int(record['stop'])
        answer_type = int(record['target'])
        _score = float(record['score'])  # parsed, but plays no part in the decision

        if answer_type in (1, 2):
            prediction = AnswerTypeRev[answer_type]
        elif (span_start > 0 and span_stop > 0
              and answer_type != 4 and span_stop - span_start < 30):
            prediction = f'{span_start}:{span_stop}'
        else:
            prediction = ''

        submission_rows.append({'example_id': row_id, 'PredictionString': prediction})

    return pd.DataFrame(submission_rows).sort_values('example_id')

In [None]:
def refineLan(sub, list_mapping_df, debug=False):
    """Make each long answer consistent with its short answer.

    For every ``*_long`` row the matching ``*_short`` row is looked up. If
    the short answer is a token span that is not contained in the long
    answer, the long answer is replaced by the original candidate that
    overlaps the short span the most (ties go to the shortest candidate).

    When no candidate overlaps the short span (or the candidate list is
    empty) the long answer is left unchanged. Previously candidate 0 was
    substituted in that case, and an empty candidate list raised
    ``IndexError``.

    Args:
        sub: DataFrame with ``example_id`` (``<id>_long`` / ``<id>_short``)
            and ``PredictionString``. Every long row needs a short row.
        list_mapping_df: DataFrame with ``example_id`` and ``old_candidates``
            (dicts with ``start_token`` / ``end_token``).
        debug: accepted for interface compatibility; not used.

    Returns:
        DataFrame with one long and one short row per document, sorted by
        ``example_id``.
    """
    newsub = []
    for _, row in sub.iterrows():
        longid = str(row['example_id'])
        if 'long' not in longid:
            continue

        example_id = longid.replace('_long', "")
        longStr = str(row['PredictionString'])

        lan_start, lan_stop = -1, -1
        if longStr != '':
            tokens = longStr.split(':')
            lan_start = int(tokens[0])
            lan_stop = int(tokens[1])

        # find corresponding short answer
        sanid = str(example_id) + '_short'
        san = sub.loc[sub['example_id'] == sanid].iloc[0]
        sanStr = str(san['PredictionString'])

        if sanStr != '' and sanStr != 'YES' and sanStr != 'NO':
            tokensans = sanStr.split(':')
            san_start = int(tokensans[0])
            san_stop = int(tokensans[1])

            if san_start < lan_start or san_stop > lan_stop:  # san is not in lan
                # find candidate list of this example
                cands = list_mapping_df.loc[list_mapping_df['example_id'] == example_id].iloc[0]['old_candidates']

                best_id = None
                best_inter = 0
                shortest = None
                for cidx, cand in enumerate(cands):
                    c_start = int(cand['start_token'])
                    c_stop = int(cand['end_token'])

                    # Inclusive-range overlap, equal to intersecting
                    # range(start, stop + 1) of short span and candidate.
                    c_len = max(0, c_stop - c_start + 1)
                    inter = max(0, min(san_stop, c_stop) - max(san_start, c_start) + 1)
                    if inter == 0:
                        continue  # a candidate must share at least one token

                    if inter > best_inter or (inter == best_inter and c_len < shortest):
                        best_id = cidx
                        best_inter = inter
                        shortest = c_len

                if best_id is not None:
                    lan_start = cands[best_id]['start_token']
                    lan_stop = cands[best_id]['end_token']
                    longStr = str(lan_start) + ":" + str(lan_stop)

        newsub.append({'example_id': longid, 'PredictionString': longStr})
        newsub.append({'example_id': sanid, 'PredictionString': sanStr})

    # Explicit columns keep sort_values working when there are no rows.
    newsubdf = pd.DataFrame(newsub, columns=['example_id', 'PredictionString'])
    return newsubdf.sort_values('example_id')

## **From here on: run inference on the test set**

In [None]:
# Example ids of the test file (get_id_df defaults to f_test).
list_id_df = get_id_df()

In [None]:
# Set of test ids, then the raw -> tag-free candidate offset mapping for them.
set_id = set(list_id_df['example_id'].values.tolist())
lan_map = getMapping(set_id)

In [None]:
# One row per document: example_id, new_candidates, old_candidates.
list_mappings_df = pd.DataFrame(lan_map)
list_mappings_df.head()

In [None]:
# Cut the tag-free test documents into instances and run the long-answer model.
list_all_ins= parseDataClean(f_test)
all_ins_res = getRawInstanceResults(list_all_ins)

In [None]:
# mergeInstanceResult updates the dicts of list_all_ins in place and returns
# the same list.
list_fine_res_all_ins = mergeInstanceResult(all_ins_res, list_all_ins)
fine_res_all_ins_df = pd.DataFrame(list_fine_res_all_ins)

In [None]:
# Preview the per-instance predictions.
fine_res_all_ins_df.head()

In [None]:
# Best span per document, in tag-free token positions.
docAnsDf = mergeDocumentRes(fine_res_all_ins_df, list_id_df)

In [None]:
# Preview the per-document spans.
docAnsDf.head()

In [None]:
# Long-answer submission rows ("<id>_long"), expressed in original candidate offsets.
subLan = getSubmissionLan(docAnsDf, list_mappings_df)

In [None]:
# Preview the long-answer predictions.
subLan.head(20)

In [None]:
# Short-answer contexts cut from the raw test documents around each long answer.
list_san_ins = getSanCandidate(subLan, debug=False)

In [None]:
# Raw short-answer model output for those contexts.
sanRawRes = getSanRawRes(list_san_ins)

In [None]:
# mergeInstanceResult updates the dicts of list_san_ins in place and returns
# the same list.
list_fine_res_san_ins = mergeInstanceResult(sanRawRes, list_san_ins)
fine_res_san_ins_df = pd.DataFrame(list_fine_res_san_ins)

In [None]:
# Preview the per-instance short-answer predictions.
fine_res_san_ins_df.head()

In [None]:
# Best short span per document. part_start here indexes the raw document
# tokens, because getSanCandidate splits document_text without removing tags.
docSanAnsDf = mergeDocumentRes(fine_res_san_ins_df, list_id_df)

In [None]:
# Preview the per-document short spans.
docSanAnsDf.head()

In [None]:
# NOTE(review): getSanSubmission never reads `threshold`, so 0.2 has no
# effect on the result.
subSan = getSanSubmission(docSanAnsDf, threshold=0.2)

In [None]:
# Preview the short-answer predictions.
subSan.head(20)

In [None]:
# Long and short rows in one frame; the sorted copy is only used for display.
sub = pd.concat([subLan, subSan])
sub_sorted = sub.sort_values('example_id')

In [None]:
# Preview the combined predictions.
sub_sorted.head(20)

In [None]:
# The unsorted `sub` is passed; refineLan sorts its own output. refineLan
# does not read `debug`.
refineSub = refineLan(sub, list_mappings_df, debug=True)

In [None]:
# Write the final submission file with the two required columns and no index.
refineSub.to_csv('./submission.csv', 
                  index=False, 
                  columns=['example_id', 'PredictionString'])