In [0]:
# Reference: https://github.com/google-research/bert/
# Special handling is needed for data entries that have no questions or answers.

from google.colab import drive 
import json
import tensorflow as tf

In [0]:
# Mount Google Drive at /gdrive so files stored there can be opened by path.
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [0]:
# KorQuAD data load: read the v1.0 training set from the mounted Google Drive.
file_path = '/gdrive/My Drive/Colab Notebooks/korquad/KorQuAD_v1.0_train.json'

# Decode explicitly as UTF-8. The file holds Korean text, so relying on the
# platform's default encoding could fail or silently corrupt it.
with open(file_path, encoding='utf-8') as data_file:
    data = json.load(data_file)

In [0]:
# A single training/test example 
class SquadExample:
  """Plain record holding one question about one context passage.

  Attributes:
    qas_id: Identifier of the question.
    question_text: The question string.
    context_tokens: The context as a list of tokens.
    answer_text: The answer string, or None when not supplied.
    start_position: Position of the answer start, or None when not supplied.
    end_position: Position of the answer end, or None when not supplied.
  """

  def __init__(self, qas_id, question_text, context_tokens,
               answer_text=None, start_position=None, end_position=None):
    # Question side of the example.
    self.qas_id, self.question_text = qas_id, question_text
    self.context_tokens = context_tokens
    # Answer side of the example; every field defaults to None.
    self.answer_text = answer_text
    self.start_position, self.end_position = start_position, end_position


In [0]:
# Read a KorQuAD json file into a list of SquadExample.
def read_squad_examples(input_data):
  """Convert parsed KorQuAD-style JSON entries into a list of SquadExample.

  Args:
    input_data: Iterable of entries. Each entry has a "paragraphs" list; each
      paragraph has a "context" string and a "qas" list. Each qa provides
      "id" and "question" and, optionally, an "answers" list whose first item
      carries "text" and "answer_start" (a character offset into the context).

  Returns:
    A list with one SquadExample per qa. start_position and end_position are
    inclusive indices into the whitespace-split context_tokens. For a qa whose
    "answers" list is missing or empty, answer_text, start_position and
    end_position are all None instead of raising.
  """
  def is_whitespace(c):
    # U+202F (narrow no-break space) is treated as a separator as well.
    return c in (" ", "\t", "\r", "\n", "\u202f")

  examples = []
  for entry in input_data:
    for paragraph in entry["paragraphs"]:
      context = paragraph["context"]

      # Split the context on whitespace and, in the same pass, record for
      # every character the index of the token it belongs to.
      context_tokens = []
      char_to_word_offset = []
      prev_is_whitespace = True
      for c in context:
        if is_whitespace(c):
          prev_is_whitespace = True
        else:
          if prev_is_whitespace:
            context_tokens.append(c)      # first character of a new token
          else:
            context_tokens[-1] += c       # extend the current token
          prev_is_whitespace = False
        # A whitespace character maps to the token that precedes it.
        char_to_word_offset.append(len(context_tokens) - 1)

      for qa in paragraph["qas"]:
        answer_text = None
        start_position = None
        end_position = None

        # Only the first answer is used. A qa without answers keeps the
        # None defaults rather than failing on answers[0].
        answers = qa.get("answers")
        if answers:
          answer_text = answers[0]["text"]
          answer_offset = answers[0]["answer_start"]
          start_position = char_to_word_offset[answer_offset]
          end_position = char_to_word_offset[
              answer_offset + len(answer_text) - 1]

        # Every example of a paragraph shares the same context_tokens list.
        examples.append(SquadExample(
            qas_id=qa["id"],
            question_text=qa["question"],
            context_tokens=context_tokens,
            answer_text=answer_text,
            start_position=start_position,
            end_position=end_position))

  return examples

In [0]:
def checkSquadExample(num):
  """Print each field of the example at index `num` of the global `examples`."""
  example = examples[num]
  field_names = ('qas_id', 'question_text', 'context_tokens',
                 'answer_text', 'start_position', 'end_position')
  # One "name: value" line per field, in the order listed above.
  for field_name in field_names:
    print(field_name + ':', getattr(example, field_name))

In [0]:
# Build the example list from the entries under the top-level 'data' key of the loaded JSON.
examples = read_squad_examples(data['data'])

In [0]:
# Number of examples produced from the training file.
len(examples)

60407

In [0]:
# Spot check: print the fields of the example at index 500.
checkSquadExample(500)

qas_id: 6484373-1-1
question_text: 농심그룹의 유통회사는 무엇인가?
context_tokens: ['농심그룹은', '사업주', '일가가', '농심홀딩스를', '통해', '식품사업인', "'농심'과", '화학사업인', "'율촌화학'을", '경영하고', '있다.', '농심은', '창업주인', '신춘호', '회장', '체제에서', '2세', '경영이', '한창', '진행', '중이다.', '그', '주인공은', '신', '회장의', '세', '아들인', '신동원(장남)', '농심', '부회장과', '신동윤(차남)', '율촌화학', '부회장,', '신동익(삼남)', '메가마트', '부회장이다.', '장남에게는', '식품사업을,', '차남에게는', '화학사업을', '맡기고,', '삼남에게는', '유통회사인', '메가마트', '경영을', '넘기며', '계열', '분리를', '준비해왔다.', '신춘호', '회장은', '2세', '중', '누구에게', '회사', '를', '맡기겠다고', '공식적으로', '밝힌', '적은', '없다.', '다만', '20', '여', '년', '전부터', '업무를', '차별화하고,', '지주사인', '농심홀딩스', '지분을', '차등', '배분하는', '방식으로', '후계구도를', '잤다.', '2017년', '5월과', '6월에', '진행된', '농심의', '지분', '변화가', '눈에', '띈다.', '5월', '4일', '신동원', '농심', '부회장과', '신동윤', '율촌화학', '부회장은', '자신들이', '보유하던', '상대방', '회사의', '주식을', '서로', '주고받아', '각자의', '회사', '지분을', '늘렸다.', '이', '주식', '교환으로', '장남은', '농심홀딩스', '지분을', '36.93%에서', '42.92%로', '늘렸고,', '차남은', '율촌화학', '지분을', '5.10%에서', '13.93%로', '확대했다.', '농심의', '지주회사인', '농심홀딩스의', '최대주