In [96]:
import re
import os
import json
import torch
import random
import pandas as pd
import numpy as np
import multiprocessing
from datasets import DatasetDict, Dataset
from models.model import T5EncoderModel
from utils.metrics import NERMetrics
from utils.loader import Loader
from utils.parser import NERParser
from utils.seperate import Spliter
from utils.encoder import NEREncoder
from tqdm import tqdm

from transformers import (
    T5Config,
    T5TokenizerFast,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)


#### Load Datasets

In [97]:
 # -- Loading datasets
print("\nLoad datasets")
# Loader is project code (utils.loader); presumably this points at data/klue_ner_test_20.txt — confirm.
loader = Loader("data", "klue_ner_test_20.txt")
# The sample displayed in the next cell shows that each item is a sentence string with inline <word:TAG> markup.
raw_dataset = loader.load(test_flag=True)


Load datasets


In [98]:
# Peek at the first three raw examples (entities are marked inline as <word:TAG>).
raw_dataset[:3]

['초반에 약간 뭐하는거지 할수는있는데뒤로갈수록 몰입도장난아님 주제도좋고 진짜편집좋다 <앤드류거필드:PS>도좋고',
 '국내 업체들이 마진이 적다는 이유로 경차 개발을 꺼린 탓에 경차가 자동차 시장 전체에서 차지하는 비중은 <10%:QT>를 간신히 넘는 상황이다.',
 '역시 <한지원:PS> 감독님.!코피루왁에서 눈물흘리면서 힐링 잘했습니다']

#### Extract Datasets

In [99]:
# Strip the inline NER markup (`<word:TAG>`) so only the plain sentence text remains.
#  - `[^<>]+` stops a match from starting at an unrelated "<" earlier in the sentence
#    and swallowing everything up to the next real annotation.
#  - The group is greedy, so the split happens at the LAST colon before the two-letter
#    tag; entity words that themselves contain ":" (e.g. a clock time) are unwrapped too.
NER_TAG_PATTERN = re.compile(r"<([^<>]+):[A-Z]{2}>")

raw_sentences = []

for i in tqdm(range(len(raw_dataset))) :
    data = raw_dataset[i]

    # A single pass replaces every annotation with its surface word.
    data = NER_TAG_PATTERN.sub(lambda m: m.group(1), data)

    # Collapse runs of spaces and trim both ends.
    data = re.sub(" {1,}", " ", data).strip()
    raw_sentences.append(data)

100%|██████████| 5201/5201 [00:00<00:00, 84085.58it/s]


In [100]:
# Spot-check one randomly chosen cleaned sentence.
# No seed is set in this cell, so the pick may differ from run to run.
rand_id = np.random.randint(len(raw_sentences))
rand_sen = raw_sentences[rand_id]
print(rand_sen)

29일 금융권에 따르면 금융위원회와 금융감독원은 설 연휴가 끝난 직후인 2월 초에 대규모 인력을 투입해 신한카드, 삼성카드, 현대카드, 하나SK카드, 우리카드, 비씨카드 등 6개 전업 카드사에 대한 현장 검사를 실시한다.


#### Parsing Datasets

In [101]:
# Wrap the cleaned sentences in a one-column DataFrame; the column is named "sentences".
df = pd.DataFrame({"sentences" : raw_sentences})

In [102]:
# Preview the first rows of the cleaned sentences.
df.head()

Unnamed: 0,sentences
0,초반에 약간 뭐하는거지 할수는있는데뒤로갈수록 몰입도장난아님 주제도좋고 진짜편집좋다 ...
1,국내 업체들이 마진이 적다는 이유로 경차 개발을 꺼린 탓에 경차가 자동차 시장 전체...
2,역시 한지원 감독님.!코피루왁에서 눈물흘리면서 힐링 잘했습니다
3,슬라이가 너무 평법한 액션 스릴러로 풀어버린 작품이다.
4,일본 애니메이션 진격의 거인을 연상시키는 진격의 농부가 등장해 각종 커뮤니티 사이트...


In [103]:
# Convert the DataFrame into a datasets.Dataset so the encoder can be applied with .map().
dataset = Dataset.from_pandas(df)

In [104]:
# Display the dataset summary (features and row count) before encoding.
dataset

Dataset({
    features: ['sentences'],
    num_rows: 5201
})

#### Load Tokenizer

In [105]:
# Checkpoint directory; the tokenizer here and the config/model further down are all loaded from it.
PLM = "exps/ner/fold-0"
# NOTE(review): `use_fast` is normally an AutoTokenizer option and T5TokenizerFast is already
# the fast implementation — confirm the argument is needed.
tokenizer = T5TokenizerFast.from_pretrained(PLM, use_fast=True)

Didn't find file exps/ner/fold-0/added_tokens.json. We won't load it.
loading file exps/ner/fold-0/spiece.model
loading file exps/ner/fold-0/tokenizer.json
loading file None
loading file exps/ner/fold-0/special_tokens_map.json
loading file exps/ner/fold-0/tokenizer_config.json


#### Encode Datasets

In [106]:
# NEREncoder is project code (utils.encoder), built with the tokenizer and a 128 max length.
# label_dict=None — presumably because this is inference on text without gold labels; confirm.
encoder = NEREncoder(tokenizer, max_length=128, label_dict=None)

In [107]:
# Apply the encoder batch-wise in a single process.
dataset = dataset.map(encoder, batched=True, num_proc=1)

  0%|          | 0/6 [00:00<?, ?ba/s]

In [108]:
# Drop the raw text column so that only the encoded features are left.
dataset = dataset.remove_columns(["sentences"])

In [109]:
# Display the dataset summary after encoding to confirm which features remain.
dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 5201
})

#### Load Model

In [110]:
# Load the saved configuration from the same checkpoint directory as the tokenizer.
config = T5Config.from_pretrained(PLM)
# T5EncoderModel is the project's class imported from models.model, not imported from transformers.
model = T5EncoderModel.from_pretrained(PLM, config=config)

loading configuration file exps/ner/fold-0/config.json
Model config T5Config {
  "_name_or_path": "KETI-AIR/ke-t5-base",
  "architectures": [
    "T5EncoderModel"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "label_size": 7,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "torch_dtype": "float32",
  "transformers_version": "4.20.0",
  "use_cache": true,
  "vocab_size": 64128
}

loading weights file exps/ner/fold-0/pytorch_model.bin
All model checkpoint weights were used when initializing T5EncoderModel.

All the weights of T5EncoderModel were initialized from 

#### Data Collator

In [111]:
# Collator that pads every batch through the tokenizer.
# NOTE(review): with padding=True batches are padded to their longest sequence, so
# max_length probably has no effect here — confirm against the transformers documentation.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, 
    padding=True,
    max_length=128
)

#### Training Arguments

In [112]:
# The visible notebook only calls predict(), so just the output directory and the
# evaluation batch size are set; every other argument keeps its default.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=8
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


#### Inference

In [113]:
# Build a Trainer from the loaded model, arguments, collator and tokenizer.
# No train or eval dataset is passed; the test set is handed to predict() directly.
trainer = Trainer(
    model,        
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
)


In [114]:
# Run the model over the whole encoded test set; results[0] is read below as the prediction array.
results = trainer.predict(test_dataset=dataset)

***** Running Prediction *****
  Num examples = 5201
  Batch size = 8


In [115]:
# Unpack the three dimensions of the prediction array.
# NOTE(review): the first dimension is presumably the number of examples rather than a
# batch size — confirm. None of these names is read again in the visible cells.
batch_size, seq_size, tag_size = results[0].shape

#### Postprocess

In [116]:
from konlpy.tag import Mecab

In [117]:
# Instantiate the KoNLPy Mecab tagger; it is passed to get_pos() during post-processing.
mecab = Mecab()

In [118]:
# Take the index of the highest score along the last axis of the predictions.
# assumes the last axis holds per-label scores, so each index is a label id — TODO confirm
tags = np.argmax(results[0], axis=-1)

In [119]:
# Re-tokenize the cleaned sentences to obtain the character span of every token.
# assumes NEREncoder tokenized with the same settings (truncation, max_length=128) so that
# token positions line up with the rows of `tags` — TODO confirm
tokenized_sentences = tokenizer(
    raw_sentences,
    return_token_type_ids=False,
    return_offsets_mapping=True,
    truncation=True,
    max_length=128
)

# Detach the offsets; the post-processing cell reads only these.
offsets = tokenized_sentences.pop("offset_mapping")

In [None]:
def get_pos(data, tokenizer) :
    """Return one coarse part-of-speech label per character of `data`.

    Args:
        data: The sentence to analyse.
        tokenizer: Any object with a `pos(text)` method that returns a sequence of
            `(surface, tag)` pairs in sentence order.

    Returns:
        A list with exactly `len(data)` entries. Each character covered by a morpheme
        gets the first letter of that morpheme's tag; whitespace and any character not
        covered by a morpheme get "U".
    """
    pos_label = []
    i = 0  # index of the next character of `data` that has not been labelled yet

    for word, tag in tokenizer.pos(data) :
        # Whitespace belongs to no morpheme. Step over ALL of it (tabs, repeated or
        # leading spaces) before labelling, otherwise every later label shifts left.
        while i < len(data) and data[i].isspace() :
            pos_label.append("U")
            i += 1

        # Guard against an empty tag instead of failing on tag[0].
        coarse_tag = tag[0] if tag else "U"
        pos_label.extend([coarse_tag] * len(word))
        i += len(word)

    # Keep the result index-compatible with `data`: pad whatever the tagger did not
    # cover (e.g. trailing whitespace) and cut off anything beyond the sentence.
    if len(pos_label) < len(data) :
        pos_label.extend(["U"] * (len(data) - len(pos_label)))

    return pos_label[:len(data)]

In [243]:
def _char_labels(offset, tag, num_chars) :
    """Project token-level label ids onto character positions.

    Returns a list of `num_chars + 1` ints; the extra trailing 0 is a sentinel that
    closes an entity reaching the end of the sentence.
    """
    labels = [0] * (num_chars + 1)

    # Writing by position (instead of concatenating span lengths) keeps token j paired
    # with tag[j] even when the first two tokens report the same start offset, and
    # tokens with an empty span (such as the end-of-sequence token) write nothing.
    for (start_p, end_p), label in zip(offset, tag) :
        for p in range(start_p, min(end_p, num_chars)) :
            labels[p] = int(label)

    return labels


def _extract_entities(sentence, labels, pos) :
    """Group consecutive characters that share a non-zero label into entity words.

    Trailing characters whose POS label is "J" are trimmed from each entity. The
    trimming never leaves the current run and never indexes outside `pos`.
    Returns `(words, label_ids)` as two parallel lists.
    """
    words, word_tags = [], []
    run_start = 0

    for k in range(1, len(labels)) :
        if labels[k] == labels[k-1] :
            continue

        label = labels[k-1]
        if label > 0 :
            last = k - 1
            while last >= run_start and last < len(pos) and pos[last] == "J" :
                last -= 1

            word = sentence[run_start:last+1].strip()
            if not word :
                # The whole span was tagged as particles: keep the model's span as it is
                # rather than emitting an empty word.
                word = sentence[run_start:k].strip()

            if word :
                words.append(word)
                word_tags.append(label)

        run_start = k

    return words, word_tags


postprocessed_words = []
postprocessed_tags = []

for i in tqdm(range(len(tags))) :

    sentence = raw_sentences[i]
    pos = get_pos(sentence, mecab)

    # assumes tags[i] and offsets[i] come from identical tokenizations of sentence i — TODO confirm
    label_list = _char_labels(offsets[i], tags[i], len(sentence))
    word_list, tag_list = _extract_entities(sentence, label_list, pos)

    postprocessed_words.append(word_list)
    postprocessed_tags.append(tag_list)

100%|██████████| 5201/5201 [00:01<00:00, 4602.79it/s]


#### Checking Results

In [247]:
# Show one random example end to end: its index, the tagged source line, the cleaned
# sentence, and the extracted entity words with their predicted label ids.
rand_id = np.random.randint(len(postprocessed_words))
print(rand_id)
print(raw_dataset[rand_id])
print(raw_sentences[rand_id])
print(postprocessed_words[rand_id])
print(postprocessed_tags[rand_id])

3152
<육군:OG>은 <24일:DT> 인터넷 홈페이지에 내가 아는 선배, 친구, 가족들이 군대에 가면 ~~카더라, 진실은이라는 제목의 흥미로운 글을 올렸다.
육군은 24일 인터넷 홈페이지에 내가 아는 선배, 친구, 가족들이 군대에 가면 ~~카더라, 진실은이라는 제목의 흥미로운 글을 올렸다.
['육군', '24일']
[6, 2]
