# Introduction

Main notebook for trying out code fragments, or unit different modules.  Similar to def main().

## Data preprocessing

Various code fragments to process the Kaggle dataset.  The full Kaggle data must be stored locally.  Functions are stored in some external python scripts which are also stored locally (e.g. final_project/*).

In [123]:
import pandas as pd
import numpy as np
import json
import tensorflow as tf
from tqdm import tqdm

In [124]:
def random_sample_negative_candidates(distribution):
    temp = np.random.random()
    value = 0.
    for index in range(len(distribution)):
        value += distribution[index]
        if value > temp:
            break
    return index

def inputdata_KaggleWinner(json_dir, max_data = 9999999999):
    # prepare input
    #json_dir = '../../input/simplified-nq-train.jsonl'
    #max_data = 9999999999

    id_list = []
    data_dict = {}
    with open(json_dir) as f:
        for n, line in tqdm(enumerate(f)):
            if n > max_data:
                break
            data = json.loads(line)

            is_pos = False
            annotations = data['annotations'][0]
            if annotations['yes_no_answer'] == 'YES':
                is_pos = True
            elif annotations['yes_no_answer'] == 'NO':
                is_pos = True
            elif annotations['short_answers']:
                is_pos = True
            elif annotations['long_answer']['candidate_index'] != -1:
                is_pos = True

            if is_pos and len(data['long_answer_candidates']) > 1:
                data_id = data['example_id']
                id_list.append(data_id)

                # uniform sampling
                distribution = np.ones((len(data['long_answer_candidates']),), dtype=np.float32)
                if is_pos:
                    distribution[data['annotations'][0]['long_answer']['candidate_index']] = 0.
                distribution /= len(distribution)
                negative_candidate_index = random_sample_negative_candidates(distribution)

                #
                doc_words = data['document_text'].split()
                # negative
                candidate = data['long_answer_candidates'][negative_candidate_index]
                negative_candidate_words = doc_words[candidate['start_token']:candidate['end_token']]
                negative_candidate_start = candidate['start_token']
                negative_candidate_end = candidate['end_token']
                # positive
                candidate = data['long_answer_candidates'][annotations['long_answer']['candidate_index']]
                positive_candidate_words = doc_words[candidate['start_token']:candidate['end_token']]
                positive_candidate_start = candidate['start_token']
                positive_candidate_end = candidate['end_token']

                print('data annots' , data['annotations'])
                # initialize data_dict
                data_dict[data_id] = {
                    'question_text': data['question_text'],
                    'annotations': data['annotations'],
                    'positive_text': positive_candidate_words,
                    'positive_start': positive_candidate_start,
                    'positive_end': positive_candidate_end,
                    'negative_text': negative_candidate_words,
                    'negative_start': negative_candidate_start,
                    'negative_end': negative_candidate_end,
                }

    return data_dict

def compute_statistics(datasetKaggle):
    """
    Outputs various statistics of the Kaggle dataset

    :param datasetKaggle: the input dataset formatted as a dictionary
    :type datasetKaggle: dict
    :return: dictionary with the processed information
    :rtype: dict


    """
    print('compute_statistics called')
    yesNoAnswer = 0
    annotationsMax = 0
    totalLength = 0
    totalExamples = len(datasetKaggle)

    i=0
    #print('datasetKaggle=',datasetKaggle)
    for example in datasetKaggle:
        #print('example:', i);
        #print(example)
        print('value part')
        print(example['annotations'])
        annotationsMax = max(len(example['annotations']), annotationsMax)  # check for the maximum number of annotations
        if example['annotations'][0]['yes_no_answer'] != 'NONE':
            yesNoAnswer += 1

        totalLength += len(example['document_text']) 
    averageLength = totalLength / totalExamples
    output = {'annotationsMax': annotationsMax, 'num_yesNo': yesNoAnswer, 'text_avgLength': averageLength}
    i = i + 1
    return output


In [121]:
json_dir = 'cs230_spring2020/final_project/simplified-nq-train.jsonl.full'
num_entries = 5000

dataset_kaggle = inputdata_KaggleWinner(json_dir, num_entries)

len(dataset_kaggle)

dataset_kaggle

271it [00:00, 1359.36it/s]

data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 1952, 'candidate_index': 54, 'end_token': 2019}, 'short_answers': [{'start_token': 1960, 'end_token': 1969}], 'annotation_id': 593165450220027640}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 212, 'candidate_index': 15, 'end_token': 310}, 'short_answers': [{'start_token': 213, 'end_token': 215}], 'annotation_id': 12034874153783787365}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 319, 'candidate_index': 24, 'end_token': 438}, 'short_answers': [], 'annotation_id': 10527123009892725162}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 509, 'candidate_index': 59, 'end_token': 576}, 'short_answers': [{'start_token': 512, 'end_token': 514}], 'annotation_id': 14634796365152556576}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 82, 'candidate_index': 7, 'end_token': 169}, 'short_answers': [{'start_token': 83, 'end_token': 85}], 

561it [00:00, 1403.95it/s]

data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 1618, 'candidate_index': 73, 'end_token': 1813}, 'short_answers': [{'start_token': 1673, 'end_token': 1674}], 'annotation_id': 18317396663960143170}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 53, 'candidate_index': 0, 'end_token': 230}, 'short_answers': [], 'annotation_id': 8394227903812414210}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 540, 'candidate_index': 18, 'end_token': 641}, 'short_answers': [{'start_token': 562, 'end_token': 567}], 'annotation_id': 2519173687451435589}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 185, 'candidate_index': 11, 'end_token': 334}, 'short_answers': [], 'annotation_id': 14240855338080410049}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 638, 'candidate_index': 4, 'end_token': 750}, 'short_answers': [{'start_token': 684, 'end_token': 685}], 'annotation_id': 11915006256841949820

817it [00:00, 1332.12it/s]

[{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 698, 'candidate_index': 91, 'end_token': 817}, 'short_answers': [{'start_token': 758, 'end_token': 759}, {'start_token': 760, 'end_token': 761}, {'start_token': 763, 'end_token': 764}], 'annotation_id': 4679192294004651337}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 9196, 'candidate_index': 112, 'end_token': 9279}, 'short_answers': [{'start_token': 9201, 'end_token': 9202}], 'annotation_id': 1140859560363660861}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 457, 'candidate_index': 26, 'end_token': 470}, 'short_answers': [{'start_token': 465, 'end_token': 468}], 'annotation_id': 3024264918733613793}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 40, 'candidate_index': 0, 'end_token': 237}, 'short_answers': [{'start_token': 100, 'end_token': 104}], 'annotation_id': 7516590246220905857}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token':

1060it [00:00, 1253.37it/s]

[{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 12024, 'candidate_index': 312, 'end_token': 12815}, 'short_answers': [], 'annotation_id': 17227263206914125295}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 1973, 'candidate_index': 44, 'end_token': 2033}, 'short_answers': [{'start_token': 2029, 'end_token': 2031}], 'annotation_id': 17505549913380224749}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 551, 'candidate_index': 16, 'end_token': 838}, 'short_answers': [], 'annotation_id': 6548689843950671128}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 403, 'candidate_index': 34, 'end_token': 458}, 'short_answers': [{'start_token': 452, 'end_token': 456}], 'annotation_id': 1366702545677166834}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 1145, 'candidate_index': 76, 'end_token': 1205}, 'short_answers': [{'start_token': 1174, 'end_token': 1176}, {'start_token': 1190, 'end_token': 119

1309it [00:01, 1240.29it/s]

data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 26, 'candidate_index': 0, 'end_token': 369}, 'short_answers': [{'start_token': 278, 'end_token': 282}], 'annotation_id': 8563599553058033182}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 638, 'candidate_index': 24, 'end_token': 752}, 'short_answers': [], 'annotation_id': 11630925138351843693}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 398, 'candidate_index': 25, 'end_token': 456}, 'short_answers': [{'start_token': 419, 'end_token': 421}, {'start_token': 422, 'end_token': 424}], 'annotation_id': 10409417425046209926}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 166, 'candidate_index': 12, 'end_token': 302}, 'short_answers': [{'start_token': 167, 'end_token': 170}], 'annotation_id': 7203732121612998734}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 458, 'candidate_index': 49, 'end_token': 606}, 'short_answers': [{'s

1557it [00:01, 1237.94it/s]

data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 1421, 'candidate_index': 41, 'end_token': 1436}, 'short_answers': [{'start_token': 1422, 'end_token': 1424}], 'annotation_id': 6325894142671632792}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 1887, 'candidate_index': 38, 'end_token': 2115}, 'short_answers': [{'start_token': 2011, 'end_token': 2013}], 'annotation_id': 523191122451738784}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 688, 'candidate_index': 50, 'end_token': 779}, 'short_answers': [{'start_token': 713, 'end_token': 715}, {'start_token': 716, 'end_token': 718}, {'start_token': 719, 'end_token': 721}], 'annotation_id': 13643561275896565507}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 3695, 'candidate_index': 31, 'end_token': 3919}, 'short_answers': [], 'annotation_id': 10486841121883878265}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 221, 'candidate_i

1680it [00:01, 1198.42it/s]

data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 544, 'candidate_index': 8, 'end_token': 2576}, 'short_answers': [{'start_token': 665, 'end_token': 668}, {'start_token': 988, 'end_token': 990}, {'start_token': 1011, 'end_token': 1013}, {'start_token': 1055, 'end_token': 1057}, {'start_token': 1562, 'end_token': 1564}], 'annotation_id': 4425720064939966422}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 500, 'candidate_index': 7, 'end_token': 656}, 'short_answers': [{'start_token': 508, 'end_token': 509}], 'annotation_id': 12801609680721123867}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 390, 'candidate_index': 23, 'end_token': 556}, 'short_answers': [{'start_token': 454, 'end_token': 456}], 'annotation_id': 1801488384043431678}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 31, 'candidate_index': 0, 'end_token': 99}, 'short_answers': [{'start_token': 87, 'end_token': 88}], 'annotation_id': 5

1917it [00:01, 1163.24it/s]

data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 1034, 'candidate_index': 21, 'end_token': 1113}, 'short_answers': [], 'annotation_id': 4199995972183943750}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 506, 'candidate_index': 52, 'end_token': 574}, 'short_answers': [{'start_token': 554, 'end_token': 555}], 'annotation_id': 10860811318395312114}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 307, 'candidate_index': 27, 'end_token': 428}, 'short_answers': [{'start_token': 375, 'end_token': 377}], 'annotation_id': 13327833673639775263}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 20, 'candidate_index': 0, 'end_token': 442}, 'short_answers': [], 'annotation_id': 7553395319018500917}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 18, 'candidate_index': 0, 'end_token': 358}, 'short_answers': [{'start_token': 32, 'end_token': 34}], 'annotation_id': 6874233665476456098}]
dat

2187it [00:01, 1243.30it/s]

data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 410, 'candidate_index': 32, 'end_token': 478}, 'short_answers': [{'start_token': 441, 'end_token': 442}], 'annotation_id': 12613185160996988537}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 245, 'candidate_index': 18, 'end_token': 301}, 'short_answers': [{'start_token': 270, 'end_token': 272}], 'annotation_id': 17797353331228697847}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 948, 'candidate_index': 27, 'end_token': 1114}, 'short_answers': [{'start_token': 1090, 'end_token': 1093}], 'annotation_id': 13235982278583773923}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 20, 'candidate_index': 0, 'end_token': 207}, 'short_answers': [{'start_token': 34, 'end_token': 35}], 'annotation_id': 11491817565868865737}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 1540, 'candidate_index': 69, 'end_token': 1950}, 'short_answers': [

2465it [00:01, 1312.29it/s]

data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 515, 'candidate_index': 30, 'end_token': 703}, 'short_answers': [{'start_token': 593, 'end_token': 596}], 'annotation_id': 5495364051185243599}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 365, 'candidate_index': 22, 'end_token': 464}, 'short_answers': [{'start_token': 374, 'end_token': 376}], 'annotation_id': 7588757957738641174}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 1653, 'candidate_index': 151, 'end_token': 1809}, 'short_answers': [{'start_token': 1769, 'end_token': 1773}], 'annotation_id': 17836641698290244227}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 2723, 'candidate_index': 218, 'end_token': 2758}, 'short_answers': [{'start_token': 2750, 'end_token': 2752}, {'start_token': 2753, 'end_token': 2756}], 'annotation_id': 1905677564408355538}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 34, 'candidate_in

2728it [00:02, 1289.74it/s]

data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 1301, 'candidate_index': 53, 'end_token': 1364}, 'short_answers': [], 'annotation_id': 15370099523437154678}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 551, 'candidate_index': 16, 'end_token': 678}, 'short_answers': [{'start_token': 552, 'end_token': 555}], 'annotation_id': 13162996050221354834}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 141, 'candidate_index': 11, 'end_token': 240}, 'short_answers': [{'start_token': 142, 'end_token': 144}], 'annotation_id': 9104871488250431064}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 928, 'candidate_index': 45, 'end_token': 1037}, 'short_answers': [{'start_token': 935, 'end_token': 947}], 'annotation_id': 4027844027240105276}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 253, 'candidate_index': 18, 'end_token': 324}, 'short_answers': [{'start_token': 318, 'end_token': 320}

3001it [00:02, 1317.37it/s]

data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 22, 'candidate_index': 0, 'end_token': 335}, 'short_answers': [], 'annotation_id': 13215204790318004040}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 14, 'candidate_index': 0, 'end_token': 162}, 'short_answers': [], 'annotation_id': 14546560288846352796}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 425, 'candidate_index': 19, 'end_token': 492}, 'short_answers': [{'start_token': 430, 'end_token': 447}], 'annotation_id': 10961913788185613831}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 126, 'candidate_index': 10, 'end_token': 168}, 'short_answers': [{'start_token': 155, 'end_token': 156}, {'start_token': 157, 'end_token': 158}], 'annotation_id': 3352482689535862364}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 1273, 'candidate_index': 36, 'end_token': 1440}, 'short_answers': [], 'annotation_id': 11867366509630332555

3265it [00:02, 1284.26it/s]

[{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 72, 'candidate_index': 1, 'end_token': 2028}, 'short_answers': [{'start_token': 103, 'end_token': 108}], 'annotation_id': 10599709442219908676}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 1497, 'candidate_index': 46, 'end_token': 1564}, 'short_answers': [{'start_token': 1525, 'end_token': 1532}, {'start_token': 1538, 'end_token': 1545}, {'start_token': 1550, 'end_token': 1554}], 'annotation_id': 12905331611311097336}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 311, 'candidate_index': 1, 'end_token': 892}, 'short_answers': [{'start_token': 708, 'end_token': 710}], 'annotation_id': 15982765106982556736}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 195, 'candidate_index': 15, 'end_token': 340}, 'short_answers': [{'start_token': 217, 'end_token': 224}], 'annotation_id': 8441123036812032799}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_t

3552it [00:02, 1294.66it/s]

data annots [{'yes_no_answer': 'YES', 'long_answer': {'start_token': 143, 'candidate_index': 10, 'end_token': 315}, 'short_answers': [], 'annotation_id': 6271157883442109704}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 5377, 'candidate_index': 101, 'end_token': 5791}, 'short_answers': [{'start_token': 5459, 'end_token': 5460}], 'annotation_id': 6345190684333521427}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 155, 'candidate_index': 2, 'end_token': 216}, 'short_answers': [], 'annotation_id': 11520664542992474361}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 1122, 'candidate_index': 126, 'end_token': 1305}, 'short_answers': [{'start_token': 1292, 'end_token': 1303}], 'annotation_id': 6043884482038376787}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 316, 'candidate_index': 6, 'end_token': 415}, 'short_answers': [{'start_token': 322, 'end_token': 324}], 'annotation_id': 661240285512327

3816it [00:02, 1267.27it/s]

[{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 407, 'candidate_index': 27, 'end_token': 464}, 'short_answers': [{'start_token': 426, 'end_token': 427}], 'annotation_id': 17437871582026175987}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 191, 'candidate_index': 15, 'end_token': 245}, 'short_answers': [{'start_token': 207, 'end_token': 209}, {'start_token': 210, 'end_token': 212}, {'start_token': 213, 'end_token': 215}, {'start_token': 216, 'end_token': 218}, {'start_token': 220, 'end_token': 222}], 'annotation_id': 16705328388542067142}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 511, 'candidate_index': 28, 'end_token': 671}, 'short_answers': [{'start_token': 517, 'end_token': 519}, {'start_token': 520, 'end_token': 522}], 'annotation_id': 390829949041265501}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 1081, 'candidate_index': 9, 'end_token': 1146}, 'short_answers': [{'start_token': 1094, 'end_tok

4076it [00:03, 1238.61it/s]

data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 49, 'candidate_index': 0, 'end_token': 518}, 'short_answers': [{'start_token': 243, 'end_token': 246}], 'annotation_id': 10310124055203248348}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 140, 'candidate_index': 11, 'end_token': 206}, 'short_answers': [{'start_token': 157, 'end_token': 159}], 'annotation_id': 7255748263565259945}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 158, 'candidate_index': 11, 'end_token': 232}, 'short_answers': [{'start_token': 202, 'end_token': 208}, {'start_token': 209, 'end_token': 217}], 'annotation_id': 6540263721000340017}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 151, 'candidate_index': 14, 'end_token': 220}, 'short_answers': [{'start_token': 213, 'end_token': 215}, {'start_token': 216, 'end_token': 218}], 'annotation_id': 15366226762245712960}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'star

4351it [00:03, 1302.10it/s]

data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 109, 'candidate_index': 8, 'end_token': 177}, 'short_answers': [{'start_token': 110, 'end_token': 112}], 'annotation_id': 15506845901130929166}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 1660, 'candidate_index': 40, 'end_token': 1792}, 'short_answers': [], 'annotation_id': 14598865484958726729}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 204, 'candidate_index': 13, 'end_token': 213}, 'short_answers': [{'start_token': 209, 'end_token': 211}], 'annotation_id': 1491067977369385664}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 279, 'candidate_index': 2, 'end_token': 467}, 'short_answers': [], 'annotation_id': 5526716610511835221}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 113, 'candidate_index': 1, 'end_token': 264}, 'short_answers': [{'start_token': 168, 'end_token': 170}], 'annotation_id': 8287562956551820958}]


4643it [00:03, 1379.71it/s]

data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 39, 'candidate_index': 0, 'end_token': 195}, 'short_answers': [], 'annotation_id': 7837783456407684872}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 1285, 'candidate_index': 36, 'end_token': 1395}, 'short_answers': [{'start_token': 1286, 'end_token': 1290}], 'annotation_id': 13183678599409217974}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 6340, 'candidate_index': 64, 'end_token': 6662}, 'short_answers': [], 'annotation_id': 7424763965496329939}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 225, 'candidate_index': 19, 'end_token': 275}, 'short_answers': [{'start_token': 237, 'end_token': 239}], 'annotation_id': 7316100679253941526}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 24, 'candidate_index': 0, 'end_token': 378}, 'short_answers': [{'start_token': 341, 'end_token': 342}], 'annotation_id': 8777976454714384538}

4927it [00:03, 1350.64it/s]

data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 425, 'candidate_index': 41, 'end_token': 503}, 'short_answers': [{'start_token': 450, 'end_token': 452}], 'annotation_id': 3510021150369975071}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 793, 'candidate_index': 69, 'end_token': 974}, 'short_answers': [{'start_token': 867, 'end_token': 868}], 'annotation_id': 1573610838878712038}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 183, 'candidate_index': 15, 'end_token': 225}, 'short_answers': [{'start_token': 184, 'end_token': 186}], 'annotation_id': 3791314156069538645}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 79, 'candidate_index': 0, 'end_token': 331}, 'short_answers': [{'start_token': 130, 'end_token': 131}], 'annotation_id': 2350292552701851392}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 36, 'candidate_index': 0, 'end_token': 186}, 'short_answers': [{'start_t

5001it [00:03, 1287.54it/s]


data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 123, 'candidate_index': 11, 'end_token': 260}, 'short_answers': [{'start_token': 247, 'end_token': 248}], 'annotation_id': 614297059522215212}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 109, 'candidate_index': 3, 'end_token': 246}, 'short_answers': [{'start_token': 136, 'end_token': 138}], 'annotation_id': 927393246966995434}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 6564, 'candidate_index': 228, 'end_token': 6659}, 'short_answers': [], 'annotation_id': 12904186001711505878}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 221, 'candidate_index': 20, 'end_token': 267}, 'short_answers': [{'start_token': 238, 'end_token': 241}], 'annotation_id': 12125728296553915539}]
data annots [{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 12098, 'candidate_index': 211, 'end_token': 12323}, 'short_answers': [{'start_token': 12122, 'end_token': 

{5655493461695504401: {'annotations': [{'annotation_id': 593165450220027640,
    'long_answer': {'candidate_index': 54,
     'end_token': 2019,
     'start_token': 1952},
    'short_answers': [{'end_token': 1969, 'start_token': 1960}],
    'yes_no_answer': 'NONE'}],
  'negative_end': 1792,
  'negative_start': 1730,
  'negative_text': ['<P>',
   'Companies',
   'considering',
   'the',
   'use',
   'of',
   'an',
   'email',
   'marketing',
   'program',
   'must',
   'make',
   'sure',
   'that',
   'their',
   'program',
   'does',
   'not',
   'violate',
   'spam',
   'laws',
   'such',
   'as',
   'the',
   'United',
   'States',
   "'",
   'Controlling',
   'the',
   'Assault',
   'of',
   'Non-Solicited',
   'Pornography',
   'and',
   'Marketing',
   'Act',
   '(',
   'CAN',
   '-',
   'SPAM',
   ')',
   ',',
   'the',
   'European',
   'Privacy',
   'and',
   'Electronic',
   'Communications',
   'Regulations',
   '2003',
   ',',
   'or',
   'their',
   'Internet',
   'service',

In [122]:
dataset_kaggle_statistics = compute_statistics(dataset_kaggle)

print(dataset_kaggle_statistics)

compute_statistics called
value part


TypeError: 'int' object is not subscriptable

Code to product an extract of the Kaggle set data