In [18]:
import os
import json
import re
from tqdm.notebook import tqdm
import collections

In [2]:
data_path="/opt/ml/project/odqa/data/"
context_path= "wikipedia_documents.json"
with open(os.path.join(data_path, context_path), "r", encoding="utf-8") as f:
    wiki = json.load(f)

In [3]:
wiki_data = []

for key in wiki.keys() :
    data = wiki[key]
    doc_id = data['document_id']
    text = data['text']
    wiki_data.append(text)

In [4]:
print('Data Size : %d' %len(wiki_data))

Data Size : 60613


## Preprocessor

In [5]:

class Preprocessor :
    def __init__(self,) :
        self.base_sub = re.compile(r'(\\n|\n)[*#]*')
        self.unk_sub = re.compile('[\u3000-\u303f\ud800—\udbff\ue000—\uf8ff]')
        self.outrange_sub = re.compile('[\uffff-\U000e007f]')

    def preprocess4train(self, dataset) :
        assert isinstance(dataset, dict)
        context = dataset['context']
        question = dataset['question']
        answer = dataset['answers']

        answer_start, answer_txt = answer['answer_start'][0], answer['text'][0]

        context_prev = context[:answer_start]
        context_next = context[answer_start+len(answer_txt):]

        context_prev = self.doc_preprocess(context_prev)
        context_next = self.doc_preprocess(context_next)
        answer_txt = self.doc_preprocess(answer_txt)

        answer_start = len(context_prev)

        dataset['context'] = context_prev + answer_txt + context_next
        dataset['answers'] = {'answer_start' : [answer_start], 'text' : [answer_txt]}
        dataset['question'] = self.doc_preprocess(question)
        return dataset

    def preprocess4test(self, dataset) :
        assert isinstance(dataset, dict)
        question = dataset['question']
        dataset['question'] = self.doc_preprocess(question)
        return dataset

    def doc_preprocess(self, txt) :
        txt = self.base_sub.sub(' ', txt)
        txt = self.unk_sub.sub(' ', txt)
        txt = self.outrange_sub.sub(' ', txt)
        txt = re.sub('\s+', ' ', txt)
        return txt



In [6]:
preprocessor = Preprocessor()

In [7]:
wiki_data = list(map(lambda x : preprocessor.doc_preprocess(x), wiki_data))
wiki_data = list(set(wiki_data))

In [8]:
print('Data Size : %d' %len(wiki_data))

Data Size : 55963


In [9]:
wiki_data[0]

' 사계절의 사나이 (A Man for all seasons)는 토머스 모어 경에 대해 다룬 로버트 볼트의 희곡 "A Man for All Seasons"를 바탕으로 한 1966년 영화이다. 프레드 진네만이 감독을 맡았다. 1966년 12월 12일에 개봉되었다. 이 영화는 작품상, 남우주연상을 포함하여 6개의 아카데미상을 받았다. 미국인 감독에 의해서 만들어진 가장 격조높은 작품으로, 신념대로 살다가 죽어 간 모어를 고전적(古典的)인 아름다운 무드와 드라마적인 감동으로 그려내고 있다. 무대극적인 제재를 받으면서도 영화적인 표현이 풍부하다. 이 성공은 스코필드의 훌륭한 연출에 힘입은 바가 컸으며, 쇼 이하 공연자(公演者)의 절도 있는 뛰어난 연기도 작품의 풍격(風格)을 높이는 원인이 되고 있다.'

## Korean Characters Rate

In [10]:
def check_rate(txt) :
    txt = re.sub('\s', '', txt)
    kor_size = 0
    for ch in txt :
        if ord(ch) in range(ord('가'), ord('힣')+1) :
            kor_size += 1

    kor_rate = kor_size / len(txt)
    return kor_rate

In [11]:
wiki_data = [txt for txt in wiki_data if check_rate(txt) > 0.5]
print('Data Size : %d' %len(wiki_data))

Data Size : 53653


## UNK Token

In [15]:
from nltk.tokenize import sent_tokenize, word_tokenize
from transformers import AutoTokenizer


In [16]:
# -- Tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    'klue/bert-base',
    use_fast=True,
)



In [19]:
unk_id = tokenizer.convert_tokens_to_ids('[UNK]')

unk_words = []
for text in tqdm(wiki_data) :
    tok_list = word_tokenize(text)
    for tok in tok_list :
        if unk_id in tokenizer.encode(tok) :
            unk_words.append(tok)
            

HBox(children=(FloatProgress(value=0.0, max=53653.0), HTML(value='')))

Token indices sequence length is longer than the specified maximum sequence length for this model (1226 > 512). Running this sequence through the model will result in indexing errors





In [20]:
unk_words = list(set(unk_words))

In [21]:
unk_chars = []

for word in unk_words :
    for ch in word :
        if tokenizer.convert_tokens_to_ids(ch) == unk_id :
            unk_chars.append(ch)

In [22]:
unk_ch_counter = collections.Counter()
unk_ch_counter.update(unk_chars)

unk_ch_counter = dict(unk_ch_counter)

In [24]:
sorted(unk_ch_counter.items(), key=lambda x : x[1], reverse=True)

[('李', 1528),
 ('–', 1092),
 ('ー', 801),
 ('ン', 550),
 ('郡', 547),
 ('縣', 518),
 ('α', 439),
 ('い', 426),
 ('á', 402),
 ('興', 400),
 ('к', 391),
 ('в', 390),
 ('国', 374),
 ('皇', 373),
 ('ا', 372),
 ('ス', 355),
 ('忠', 354),
 ('ん', 353),
 ('л', 348),
 ('ο', 344),
 ('永', 337),
 ('守', 335),
 ('ā', 334),
 ('羅', 329),
 ('松', 327),
 ('イ', 326),
 ('ル', 318),
 ('樂', 316),
 ('慶', 315),
 ('郎', 314),
 ('°', 312),
 ('尹', 308),
 ('六', 302),
 ('校', 294),
 ('等', 289),
 ('里', 288),
 ('監', 287),
 ('ς', 285),
 ('直', 278),
 ('ν', 276),
 ('藤', 276),
 ('衛', 274),
 ('趙', 270),
 ('か', 269),
 ('ó', 269),
 ('し', 267),
 ('令', 265),
 ('御', 265),
 ('ι', 264),
 ('ü', 262),
 ('ト', 259),
 ('像', 258),
 ('殿', 257),
 ('奉', 257),
 ('ラ', 257),
 ('左', 256),
 ('景', 255),
 ('リ', 255),
 ('昌', 253),
 ('ل', 252),
 ('崔', 250),
 ('應', 250),
 ('館', 250),
 ('功', 247),
 ('派', 247),
 ('谷', 246),
 ('ö', 245),
 ('順', 245),
 ('右', 244),
 ('ρ', 242),
 ('う', 240),
 ('星', 240),
 ('阿', 239),
 ('千', 238),
 ('な', 237),
 ('）', 235),
 ('（', 234

In [25]:
unk_ch_item = sorted(unk_ch_counter.items(), key=lambda x : x[1], reverse=True)
print('Size of UNK Characters : %d' %len(unk_ch_item))

Size of UNK Characters : 17138


In [30]:
unk_ch_list = []
unk_ch_count = []

for i in range(500) :
    if isinstance(unk_ch_item[i][1], int) == False :
        continue
    unk_ch_list.append(unk_ch_item[i][0])
    unk_ch_count.append(unk_ch_item[i][1])


In [28]:
import pandas as pd

In [33]:
unk_ch_map = pd.DataFrame({'Character' : unk_ch_list, 
    'ID' : range(len(unk_ch_list)),
    'Count' : unk_ch_count})

In [35]:
unk_ch_map.head()

Unnamed: 0,Character,ID,Count
0,李,0,1528
1,–,1,1092
2,ー,2,801
3,ン,3,550
4,郡,4,547


In [34]:
unk_ch_map.to_csv('../Tokenizer/unk_characters.csv')