In [1]:
!pip install transformers
!pip install datasets
!pip install seqeval
!pip install Korpora

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m57.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m52.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, https://

In [2]:
import warnings

warnings.filterwarnings(action='ignore')

In [3]:
import torch

print("Torch version:{}".format(torch.__version__))
print("cuda version: {}".format(torch.version.cuda))
print("cudnn version:{}".format(torch.backends.cudnn.version()))

Torch version:2.0.0+cu118
cuda version: 11.8
cudnn version:8700


In [4]:
# Import package

from datasets import DatasetDict, load_dataset, concatenate_datasets, load_metric
from sklearn.dummy import DummyClassifier
from tqdm import tqdm
import pandas as pd
import numpy as np

In [5]:
from Korpora import Korpora

Korpora.fetch("naver_changwon_ner")
corpus = Korpora.load("naver_changwon_ner")

[naver_changwon_ner] download train_data: 16.9MB [00:00, 281MB/s]



    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : 네이버 + 창원대
    Repository : https://github.com/naver/nlp-challenge/tree/master/missions/ner
    References : http://air.changwon.ac.kr/?page_id=10

    개체명(Named Entity)은 인명, 기관명, 지명 등과 같이 문장 또는 문서에서 특정한 의미를 가지고 있는 단어 또는 어구를 말합니다.
    이 때문에 개체명은 정보 검색 및 언어 이해를 위한 분석에서 주요한 대상으로 다루어지고 있습니다.
    Data.ly에서는 개체명 코퍼스를 제공하여 연구에 도움을 드리고자 하며, 공개적인 리더보드를 통해 많은 분들의 연구 동향을 논의/공유하고자 합니다.
    제공되는 코퍼스는 Data.ly에서 제작한 것으로, 연구 및 리더보드를 위한 학습으로 사용 가능하며 상업적인 목적으로 사용될 수 없습니다.

    # License
    연구 및 리더보드를 위한 학습으로 사용 가능하며 상업적인 목적으로 사용될 수 없습니다.

[Korpora] Corpus `naver_changwon_ner` is already installed at /root/Korpora/naver_changwon_ner/train_data


In [6]:
df = pd.DataFrame()
df['text'] = corpus.train.texts
df['words'] = corpus.train.words
df['tags'] = corpus.train.tags

In [7]:
all_tag = []
for tags in tqdm(corpus.train.tags):
    for tag in tags:
        if tag not in all_tag:
            all_tag.append(tag)

100%|██████████| 90000/90000 [00:00<00:00, 386089.83it/s]


In [8]:
df['tags'] = df['tags'].apply(lambda x: ' '.join(x).replace('-','O'))

In [9]:
labels = [label.split() for label in df['tags'].values.tolist()]

label_list = []

for label in labels:
    [label_list.append(l) for l in label]

label_list = list(set(label_list)) 
print(label_list)

['ANM_I', 'CVL_B', 'AFW_I', 'LOC_I', 'TIM_B', 'NUM_I', 'DAT_I', 'FLD_B', 'ORG_I', 'PLT_B', 'ANM_B', 'EVT_I', 'PLT_I', 'LOC_B', 'O', 'AFW_B', 'PER_I', 'NUM_B', 'CVL_I', 'TRM_B', 'MAT_I', 'TRM_I', 'FLD_I', 'ORG_B', 'MAT_B', 'PER_B', 'TIM_I', 'DAT_B', 'EVT_B']


In [10]:
id2tag = {id: tag for id, tag in enumerate(label_list)}
tag2id = {tag: id for id, tag in enumerate(label_list)}
tag2id

{'ANM_I': 0,
 'CVL_B': 1,
 'AFW_I': 2,
 'LOC_I': 3,
 'TIM_B': 4,
 'NUM_I': 5,
 'DAT_I': 6,
 'FLD_B': 7,
 'ORG_I': 8,
 'PLT_B': 9,
 'ANM_B': 10,
 'EVT_I': 11,
 'PLT_I': 12,
 'LOC_B': 13,
 'O': 14,
 'AFW_B': 15,
 'PER_I': 16,
 'NUM_B': 17,
 'CVL_I': 18,
 'TRM_B': 19,
 'MAT_I': 20,
 'TRM_I': 21,
 'FLD_I': 22,
 'ORG_B': 23,
 'MAT_B': 24,
 'PER_B': 25,
 'TIM_I': 26,
 'DAT_B': 27,
 'EVT_B': 28}

In [11]:
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification

model_name = "beomi/kcbert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/250k [00:00<?, ?B/s]

In [12]:
df['ner_tags'] = df['tags'].apply(lambda x: [tag2id[tag] for tag in x.split()])

In [13]:
df = df.iloc[:5000]

In [14]:
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42),
                            [int(.8 * len(df)), int(.9 * len(df))])

In [15]:
df_train.reset_index(drop=True,inplace=True)
df_val.reset_index(drop=True,inplace=True)
df_test.reset_index(drop=True,inplace=True)

In [16]:
from datasets import Dataset

dataset1 = Dataset.from_pandas(df_train)
dataset2 = Dataset.from_pandas(df_val)
dataset3 = Dataset.from_pandas(df_test)

In [17]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [18]:
updated_dataset = DatasetDict({"train":dataset1,
                               "val":dataset2,
                               "test":dataset3})

In [19]:
# hub에 존재.

# updated_dataset.push_to_hub("Hansollll/ssolllll")

In [21]:
dataset = load_dataset("Hansollll/ssolllll")

Downloading readme:   0%|          | 0.00/584 [00:00<?, ?B/s]

Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/Hansollll___parquet/Hansollll--ssolllll-b1b362d10d305141/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/16.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.03M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.02M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/72000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/9000 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/9000 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/Hansollll___parquet/Hansollll--ssolllll-b1b362d10d305141/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [22]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'words', 'tags', 'ner_tags'],
        num_rows: 72000
    })
    test: Dataset({
        features: ['text', 'words', 'tags', 'ner_tags'],
        num_rows: 9000
    })
    val: Dataset({
        features: ['text', 'words', 'tags', 'ner_tags'],
        num_rows: 9000
    })
})

In [23]:
train_dataset = concatenate_datasets([dataset["train"],dataset["val"],dataset['test']])
train_dataset

Dataset({
    features: ['text', 'words', 'tags', 'ner_tags'],
    num_rows: 90000
})

In [24]:
ith_example=2

print(dataset["train"][ith_example]['words'])
print([label for label in train_dataset[2]['tags'].split()])

['박승문의', '평가에', '간탐한', '아발론지역', '지역지', '‘맨체스터', '당군', '군부’는', '아스널의', '그림자만', '쫓으면서', '경기에', '파상문을', '미치지', '못했다”는', '단평과', '낮은', '11점을', '매겼다', '.']
['PER_B', 'O', 'O', 'LOC_B', 'O', 'ORG_B', 'ORG_I', 'ORG_I', 'ORG_B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'NUM_B', 'O', 'O']


In [25]:
tokenized_input = tokenizer(tokenizer.tokenize(dataset['train'][2]['text']), is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

In [26]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['words'], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f'ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [27]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/72000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/90000 [00:00<?, ? examples/s]

In [28]:
id2tag[-100]='ignore'
exml=tokenized_train_dataset[2]

pd.DataFrame({'tokens':tokenizer.convert_ids_to_tokens(exml["input_ids"]), 'ner_labels':exml['labels'], 'ner_tags': [id2tag[label] for label in exml['labels']] })

Unnamed: 0,tokens,ner_labels,ner_tags
0,[CLS],-100,ignore
1,박,19,TRM_B
2,##승,-100,ignore
3,##문의,-100,ignore
4,평가,21,TRM_I
5,##에,-100,ignore
6,간,21,TRM_I
7,##탐,-100,ignore
8,##한,-100,ignore
9,아,24,MAT_B


In [29]:
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(pd.Series(tokenized_train_dataset['input_ids']).explode(), pd.Series(tokenized_train_dataset['labels']).explode().astype(str))
dummy_clf.score(pd.Series(tokenized_train_dataset['input_ids']).explode(), pd.Series(tokenized_train_dataset['labels']).explode().astype(str))

0.6039230075770557

In [30]:
exploded_values=pd.Series(tokenized_train_dataset['labels']).explode()
exploded_values=pd.DataFrame(exploded_values,columns=['B'])

most_frequent_elem_by_doc=pd.Series(tokenized_train_dataset['labels']).apply(lambda x:  max(set(x), key=x.count))
most_frequent_elem_by_doc=pd.DataFrame(most_frequent_elem_by_doc,columns=list('A'))

df_most_freq_token=exploded_values.merge(most_frequent_elem_by_doc, how='right', left_index=True, right_index=True)

dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(pd.Series(tokenized_train_dataset['input_ids']).explode(), df_most_freq_token['A'])
dummy_clf.score(pd.Series(tokenized_train_dataset['input_ids']).explode(), df_most_freq_token['A'])

0.942242644593618

In [31]:
#Data Collator

from transformers import DataCollatorForTokenClassification, AutoModelForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_list))

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the

In [32]:
metric_seqeval = load_metric("seqeval")
example = dataset["train"][2]

labels = [label_list[i] for i in example["ner_tags"]]
metric_seqeval.compute(predictions=[labels], references=[labels])

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

{'AT_B': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'IM_B': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'IM_I': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'RM_B': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'RM_I': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 4},
 'UM_I': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [33]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric_seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [34]:
from transformers import EarlyStoppingCallback, TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='./log_results',
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=16,   
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    warmup_steps=500, 
    eval_steps=500,
    save_steps=500,
    evaluation_strategy="steps",
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

In [35]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
500,1.0753,0.495571,0.651771,0.573298,0.610022,0.854917
1000,0.4624,0.380679,0.721715,0.670312,0.695065,0.886541
1500,0.3903,0.339277,0.750158,0.691158,0.71945,0.89733
2000,0.3616,0.294215,0.771847,0.73853,0.754821,0.909535
2500,0.3484,0.27527,0.77706,0.762866,0.769898,0.914395
3000,0.3186,0.256264,0.797031,0.762149,0.7792,0.919564
3500,0.3088,0.239768,0.805962,0.790537,0.798175,0.92569
4000,0.297,0.232538,0.810832,0.788443,0.799481,0.926938
4500,0.2918,0.217709,0.817477,0.807776,0.812598,0.931788
5000,0.2814,0.203313,0.830816,0.808648,0.819583,0.935737


TrainOutput(global_step=11250, training_loss=0.301698978000217, metrics={'train_runtime': 4053.2678, 'train_samples_per_second': 44.409, 'train_steps_per_second': 2.776, 'total_flos': 7061349245190240.0, 'train_loss': 0.301698978000217, 'epoch': 2.0})

In [36]:
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric_seqeval.compute(predictions=true_predictions, references=true_labels)
results

{'AT_B': {'precision': 0.8719709208400647,
  'recall': 0.8733818770226537,
  'f1': 0.8726758286176233,
  'number': 2472},
 'AT_I': {'precision': 0.8118279569892473,
  'recall': 0.7475247524752475,
  'f1': 0.7783505154639175,
  'number': 202},
 'ER_B': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 9},
 'ER_I': {'precision': 0.870931326434619,
  'recall': 0.8686432726590355,
  'f1': 0.8697857948139798,
  'number': 5329},
 'FW_B': {'precision': 0.8321564367191744,
  'recall': 0.8179391350774159,
  'f1': 0.8249865374259558,
  'number': 1873},
 'IM_B': {'precision': 0.7837837837837838,
  'recall': 0.7488262910798122,
  'f1': 0.765906362545018,
  'number': 426},
 'IM_I': {'precision': 0.9580221997981837,
  'recall': 0.9582155833669762,
  'f1': 0.9581188818246038,
  'number': 4954},
 'LD_B': {'precision': 0.8820754716981132,
  'recall': 0.8852071005917159,
  'f1': 0.8836385115180153,
  'number': 845},
 'LD_I': {'precision': 0.8463713477851084,
  'recall': 0.8528015194681862,
  'f1': 

In [37]:
def tag_sentence(text:str):
    # convert our text to a  tokenized sequence
    inputs = tokenizer(text, truncation=True, return_tensors="pt").to("cuda")
    # get outputs
    outputs = model(**inputs)
    # convert to probabilities with softmax
    probs = outputs[0][0].softmax(1)
    # get the tags with the highest probability
    word_tags = [(tokenizer.decode(inputs['input_ids'][0][i].item()), id2tag[tagid.item()]) 
                  for i, tagid in enumerate (probs.argmax(axis=1))]

    return pd.DataFrame(word_tags, columns=['word', 'tag'])

In [38]:
text = '''신카이 마코토 감독의 일본 애니메이션 '스즈메의 문단속'이 국내 누적 관객 수 500만 명을 돌파했습니다.
수입사 미디어캐슬은 오늘(28일) 보도자료를 내고 '스즈메의 문단속'이 오늘 오후 4시쯤
누적 관객 수 500만 명을 넘어섰다고 밝혔습니다.
올해 개봉한 영화 가운데 500만 관객을 넘긴 것은 이 영화가 처음입니다.'''

df_tag = tag_sentence(text)

In [39]:
df_tag.head(20)

Unnamed: 0,word,tag
0,[CLS],TRM_I
1,신,TRM_B
2,##카,TRM_B
3,##이,ANM_I
4,마,ANM_I
5,##코,TRM_B
6,##토,ANM_I
7,감독,PER_I
8,##의,TRM_I
9,일본,MAT_B
