# 패키지 설치하기
pip 명령어로 의존성 있는 패키지를 설치합니다.

In [1]:
!pip install ratsnlp

Collecting ratsnlp
[?25l  Downloading https://files.pythonhosted.org/packages/4b/2c/f57b2f8b27007140f29c5758ffd5d89411c6a06e536c34c0d54471fbd1ec/ratsnlp-0.0.95-py3-none-any.whl (55kB)
[K     |██████                          | 10kB 25.3MB/s eta 0:00:01[K     |███████████▉                    | 20kB 31.2MB/s eta 0:00:01[K     |█████████████████▊              | 30kB 21.5MB/s eta 0:00:01[K     |███████████████████████▋        | 40kB 20.1MB/s eta 0:00:01[K     |█████████████████████████████▌  | 51kB 22.3MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 8.7MB/s 
Collecting flask-ngrok>=0.0.25
  Downloading https://files.pythonhosted.org/packages/af/6c/f54cb686ad1129e27d125d182f90f52b32f284e6c8df58c1bae54fa1adbc/flask_ngrok-0.0.25-py3-none-any.whl
Collecting Korpora>=0.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/1a/b1/5e563e23f1f705574bbeb55555e0cb95c9813e9396d654cd42709418ab66/Korpora-0.2.0-py3-none-any.whl (57kB)
[K     |████████████████████████

# 구글 드라이브 연동하기
모델 체크포인트 등을 저장해 둘 구글 드라이브를 연결합니다. 자신의 구글 계정에 적용됩니다.

In [2]:
# Mount Google Drive at /gdrive; the checkpoint directory configured in the
# settings cell lives under "/gdrive/My Drive/...".
from google.colab import drive
# force_remount=True requests a fresh mount even when this cell is re-run.
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


# 각종 설정
모델 하이퍼파라메터(hyperparameter)와 저장 위치 등 설정 정보를 선언합니다.

In [3]:
from ratsnlp.nlpbook.classification import ClassificationTrainArguments

# One arguments object carries every setting; the later cells all read from it.
args = ClassificationTrainArguments(
    # Pretrained checkpoint, also handed to the tokenizer/config/model loaders.
    pretrained_model_name="beomi/kcbert-base",
    # Downstream task and the corpus it is trained on.
    downstream_task_name="pair-classification",
    downstream_corpus_name="kornli",
    downstream_corpus_root_dir="/root/Korpora",
    # Checkpoint directory on the mounted Google Drive.
    downstream_model_dir="/gdrive/My Drive/nlpbook/checkpoint-paircls",
    # Training-loop settings; do_eval also gates building the evaluation loader.
    batch_size=32,
    epochs=3,
    do_eval=True,
)

# 랜덤 시드 고정
학습 재현을 위해 랜덤 시드를 고정합니다.

In [4]:
# `nlpbook` is imported here and reused by later cells
# (set_logger, data_collator, get_trainer).
from ratsnlp import nlpbook
# Fix the random seed from `args` (the logged arguments show seed=7).
nlpbook.seed_setting(args)

# 로거 설정
메시지 출력 등을 위한 logger를 설정합니다.

In [5]:
# Configure logging; per the recorded output this also logs the full set of
# training/evaluation arguments, including defaults not set explicitly above.
nlpbook.set_logger(args)

01/23/2021 04:05:33 - INFO - ratsnlp.nlpbook.utils -   Training/evaluation parameters ClassificationTrainArguments(pretrained_model_name='beomi/kcbert-base', downstream_task_name='pair-classification', downstream_corpus_name='kornli', downstream_corpus_root_dir='/root/Korpora', downstream_model_dir='/gdrive/My Drive/nlpbook/checkpoint-paircls', max_seq_length=128, overwrite_model=False, save_top_k=1, monitor='max val_acc', seed=7, overwrite_cache=False, force_download=False, test_mode=False, learning_rate=5e-06, optimizer='AdamW', lr_scheduler='exp', epochs=3, batch_size=32, cpu_workers=2, fp16=False, do_train=True, do_eval=True, do_predict=False, tpu_cores=0, report_cycle=100, stat_window_length=30)


# 말뭉치 다운로드
실습에 사용할 말뭉치(KorNLI Corpus)를 다운로드합니다.

In [6]:
# Download the corpus named in `args` into the configured root directory.
from Korpora import Korpora
# NOTE(review): force_download is hard-coded to True here, while the logged
# arguments carry force_download=False — confirm the re-download on every run
# is intended.
Korpora.fetch(
    corpus_name=args.downstream_corpus_name,
    root_dir=args.downstream_corpus_root_dir,
    force_download=True,
)

[kornli] download multinli.train.ko.tsv: 83.6MB [00:00, 251MB/s]                            
[kornli] download snli_1.0_train.ko.tsv: 78.5MB [00:00, 202MB/s]                            
[kornli] download xnli.dev.ko.tsv: 516kB [00:00, 19.2MB/s]
[kornli] download xnli.test.ko.tsv: 1.04MB [00:00, 29.0MB/s]


# 토크나이저 준비
토큰화를 수행하는 토크나이저를 선언합니다.

In [7]:
from transformers import BertTokenizer

# Load the vocabulary that belongs to the pretrained checkpoint named in `args`;
# lower-casing is explicitly switched off.
tokenizer = BertTokenizer.from_pretrained(args.pretrained_model_name, do_lower_case=False)

01/23/2021 04:05:41 - INFO - transformers.tokenization_utils_base -   Model name 'beomi/kcbert-base' not found in model shortcut name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese, bert-base-german-cased, bert-large-uncased-whole-word-masking, bert-large-cased-whole-word-masking, bert-large-uncased-whole-word-masking-finetuned-squad, bert-large-cased-whole-word-masking-finetuned-squad, bert-base-cased-finetuned-mrpc, bert-base-german-dbmdz-cased, bert-base-german-dbmdz-uncased, TurkuNLP/bert-base-finnish-cased-v1, TurkuNLP/bert-base-finnish-uncased-v1, wietsedv/bert-base-dutch-cased). Assuming 'beomi/kcbert-base' is a path, a model identifier, or url to a directory containing tokenizer files.
01/23/2021 04:05:41 - INFO - filelock -   Lock 139790862413552 acquired on /root/.cache/torch/transformers/8d1e655d205732689406462e2fa1fa62566629a0625aa980eeae4599d873bb66.4e15945a0369

Downloading:   0%|          | 0.00/250k [00:00<?, ?B/s]

01/23/2021 04:05:41 - INFO - transformers.file_utils -   storing https://s3.amazonaws.com/models.huggingface.co/bert/beomi/kcbert-base/vocab.txt in cache at /root/.cache/torch/transformers/8d1e655d205732689406462e2fa1fa62566629a0625aa980eeae4599d873bb66.4e15945a03694aa613f931134ea9a6f64624cd748a3cfe607ca1e2b066eb9f91
01/23/2021 04:05:41 - INFO - transformers.file_utils -   creating metadata file for /root/.cache/torch/transformers/8d1e655d205732689406462e2fa1fa62566629a0625aa980eeae4599d873bb66.4e15945a03694aa613f931134ea9a6f64624cd748a3cfe607ca1e2b066eb9f91
01/23/2021 04:05:41 - INFO - filelock -   Lock 139790862413552 released on /root/.cache/torch/transformers/8d1e655d205732689406462e2fa1fa62566629a0625aa980eeae4599d873bb66.4e15945a03694aa613f931134ea9a6f64624cd748a3cfe607ca1e2b066eb9f91.lock
01/23/2021 04:05:42 - INFO - filelock -   Lock 139790862413552 acquired on /root/.cache/torch/transformers/22b5f7b39de8c16e82d058e2d5116222ce1fc616a291c5a6ad9c2c24e802104f.53b84dc0c694dad783dde

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

01/23/2021 04:05:42 - INFO - transformers.file_utils -   storing https://s3.amazonaws.com/models.huggingface.co/bert/beomi/kcbert-base/tokenizer_config.json in cache at /root/.cache/torch/transformers/22b5f7b39de8c16e82d058e2d5116222ce1fc616a291c5a6ad9c2c24e802104f.53b84dc0c694dad783dde1213af7f7c990c093d7453b07066497d5ffcc953289
01/23/2021 04:05:42 - INFO - transformers.file_utils -   creating metadata file for /root/.cache/torch/transformers/22b5f7b39de8c16e82d058e2d5116222ce1fc616a291c5a6ad9c2c24e802104f.53b84dc0c694dad783dde1213af7f7c990c093d7453b07066497d5ffcc953289
01/23/2021 04:05:42 - INFO - filelock -   Lock 139790862413552 released on /root/.cache/torch/transformers/22b5f7b39de8c16e82d058e2d5116222ce1fc616a291c5a6ad9c2c24e802104f.53b84dc0c694dad783dde1213af7f7c990c093d7453b07066497d5ffcc953289.lock
01/23/2021 04:05:42 - INFO - transformers.tokenization_utils_base -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/beomi/kcbert-base/vocab.txt from cache at /roo

# 학습데이터 구축
학습데이터를 만듭니다.

In [8]:
from ratsnlp.nlpbook.paircls import KorNLICorpus
from ratsnlp.nlpbook.classification import ClassificationDataset
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler

# Corpus reader; the evaluation cell and the model-config cell reuse `corpus`.
corpus = KorNLICorpus()

# Training split, built with the tokenizer prepared above.
train_dataset = ClassificationDataset(args=args, corpus=corpus, tokenizer=tokenizer, mode="train")

# Training batches are drawn in random order, without replacement.
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset, replacement=False),
    batch_size=args.batch_size,
    num_workers=args.cpu_workers,
    collate_fn=nlpbook.data_collator,
    drop_last=False,
)

01/23/2021 04:06:01 - INFO - filelock -   Lock 139790849013184 acquired on /root/Korpora/kornli/cached_train_BertTokenizer_128_kornli_pair-classification.lock
01/23/2021 04:06:01 - INFO - ratsnlp.nlpbook.classification.corpus -   Creating features from dataset file at /root/Korpora/kornli
01/23/2021 04:06:01 - INFO - ratsnlp.nlpbook.paircls.corpus -   loading train data... LOOKING AT /root/Korpora/kornli
01/23/2021 04:06:06 - INFO - ratsnlp.nlpbook.classification.corpus -   tokenize sentences, it could take a lot of time...
01/23/2021 04:09:58 - INFO - ratsnlp.nlpbook.classification.corpus -   tokenize sentences [took 232.160 s]
01/23/2021 04:10:06 - INFO - ratsnlp.nlpbook.classification.corpus -   *** Example ***
01/23/2021 04:10:06 - INFO - ratsnlp.nlpbook.classification.corpus -   sentence A, B: 개념적으로 크림 스키밍은 제품과 지리라는 두 가지 기본 차원을 가지고 있다. + 제품과 지리학은 크림 스키밍을 작동시키는 것이다.
01/23/2021 04:10:06 - INFO - ratsnlp.nlpbook.classification.corpus -   tokens: [CLS] 개념 ##적으로 크 ##림 스 ##키 ##밍 ##은 제품 

# 테스트 데이터 구축
학습 중에 평가할 테스트 데이터를 구축합니다.

In [9]:
# The evaluation loader is only built when evaluation is enabled; otherwise the
# trainer later receives None as its validation loader.
if not args.do_eval:
    val_dataloader = None
else:
    # Evaluation uses the corpus' "test" split, read in its original order.
    val_dataset = ClassificationDataset(args=args, corpus=corpus, tokenizer=tokenizer, mode="test")
    val_dataloader = DataLoader(
        val_dataset,
        sampler=SequentialSampler(val_dataset),
        batch_size=args.batch_size,
        num_workers=args.cpu_workers,
        collate_fn=nlpbook.data_collator,
        drop_last=False,
    )

01/23/2021 04:11:54 - INFO - filelock -   Lock 139790862478584 acquired on /root/Korpora/kornli/cached_test_BertTokenizer_128_kornli_pair-classification.lock
01/23/2021 04:11:54 - INFO - ratsnlp.nlpbook.classification.corpus -   Creating features from dataset file at /root/Korpora/kornli
01/23/2021 04:11:54 - INFO - ratsnlp.nlpbook.paircls.corpus -   loading test data... LOOKING AT /root/Korpora/kornli
01/23/2021 04:11:54 - INFO - ratsnlp.nlpbook.classification.corpus -   tokenize sentences, it could take a lot of time...
01/23/2021 04:11:55 - INFO - ratsnlp.nlpbook.classification.corpus -   tokenize sentences [took 1.377 s]
01/23/2021 04:11:55 - INFO - ratsnlp.nlpbook.classification.corpus -   *** Example ***
01/23/2021 04:11:55 - INFO - ratsnlp.nlpbook.classification.corpus -   sentence A, B: 글쎄, 나는 그것에 관해 생각조차 하지 않았지만, 나는 너무 좌절했고, 결국 그에게 다시 이야기하게 되었다. + 나는 그와 다시 이야기하지 않았다.
01/23/2021 04:11:55 - INFO - ratsnlp.nlpbook.classification.corpus -   tokens: [CLS] 글쎄 , 나는 그것 ##에 관 ##해 생각 ##

# 모델 초기화
프리트레인이 완료된 BERT 모델을 읽고, 문장 쌍 분류를 수행할 모델을 초기화합니다.

In [10]:
from transformers import BertConfig, BertForSequenceClassification

# Start from the checkpoint's own configuration and set the number of output
# labels to the number of labels the corpus defines.
pretrained_model_config = BertConfig.from_pretrained(args.pretrained_model_name, num_labels=corpus.num_labels)

01/23/2021 04:12:32 - INFO - filelock -   Lock 139790857829176 acquired on /root/.cache/torch/transformers/11ab69ed90bcc2d01ac229deb193678b3b22dc986959a7115be9f7e328d57956.5c73fbc761bca6713f2361fb4816b98c9db40d7c41a6e56197122ef2450ba4b2.lock
01/23/2021 04:12:32 - INFO - transformers.file_utils -   https://s3.amazonaws.com/models.huggingface.co/bert/beomi/kcbert-base/config.json not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmp_la_wmb2


Downloading:   0%|          | 0.00/619 [00:00<?, ?B/s]

01/23/2021 04:12:33 - INFO - transformers.file_utils -   storing https://s3.amazonaws.com/models.huggingface.co/bert/beomi/kcbert-base/config.json in cache at /root/.cache/torch/transformers/11ab69ed90bcc2d01ac229deb193678b3b22dc986959a7115be9f7e328d57956.5c73fbc761bca6713f2361fb4816b98c9db40d7c41a6e56197122ef2450ba4b2
01/23/2021 04:12:33 - INFO - transformers.file_utils -   creating metadata file for /root/.cache/torch/transformers/11ab69ed90bcc2d01ac229deb193678b3b22dc986959a7115be9f7e328d57956.5c73fbc761bca6713f2361fb4816b98c9db40d7c41a6e56197122ef2450ba4b2
01/23/2021 04:12:33 - INFO - filelock -   Lock 139790857829176 released on /root/.cache/torch/transformers/11ab69ed90bcc2d01ac229deb193678b3b22dc986959a7115be9f7e328d57956.5c73fbc761bca6713f2361fb4816b98c9db40d7c41a6e56197122ef2450ba4b2.lock
01/23/2021 04:12:33 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/beomi/kcbert-base/config.json from cache at /r

In [11]:
# Load the pretrained weights into a sequence-classification model, using the
# configuration prepared in the previous cell.
model = BertForSequenceClassification.from_pretrained(
    args.pretrained_model_name,
    config=pretrained_model_config,
)

01/23/2021 04:12:35 - INFO - filelock -   Lock 139790862350376 acquired on /root/.cache/torch/transformers/a0348cdf9a93056f4e4adc497208b8853967239b4c6acccffcac0196ae7b6c90.3eb9d0c1847ce30bbb6bde7ce4902413737fcd46a212efb3fe8c2a708f2a47d5.lock
01/23/2021 04:12:35 - INFO - transformers.file_utils -   https://cdn.huggingface.co/beomi/kcbert-base/pytorch_model.bin not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmpnaqskdqm


Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

01/23/2021 04:12:40 - INFO - transformers.file_utils -   storing https://cdn.huggingface.co/beomi/kcbert-base/pytorch_model.bin in cache at /root/.cache/torch/transformers/a0348cdf9a93056f4e4adc497208b8853967239b4c6acccffcac0196ae7b6c90.3eb9d0c1847ce30bbb6bde7ce4902413737fcd46a212efb3fe8c2a708f2a47d5
01/23/2021 04:12:40 - INFO - transformers.file_utils -   creating metadata file for /root/.cache/torch/transformers/a0348cdf9a93056f4e4adc497208b8853967239b4c6acccffcac0196ae7b6c90.3eb9d0c1847ce30bbb6bde7ce4902413737fcd46a212efb3fe8c2a708f2a47d5
01/23/2021 04:12:40 - INFO - filelock -   Lock 139790862350376 released on /root/.cache/torch/transformers/a0348cdf9a93056f4e4adc497208b8853967239b4c6acccffcac0196ae7b6c90.3eb9d0c1847ce30bbb6bde7ce4902413737fcd46a212efb3fe8c2a708f2a47d5.lock
01/23/2021 04:12:40 - INFO - transformers.modeling_utils -   loading weights file https://cdn.huggingface.co/beomi/kcbert-base/pytorch_model.bin from cache at /root/.cache/torch/transformers/a0348cdf9a93056f4e4

# 학습 준비
Task와 Trainer를 준비합니다.

In [12]:
# Wrap the model and the run arguments in the task object that is handed to
# the trainer in the training cell.
from ratsnlp.nlpbook.classification import ClassificationTask
task = ClassificationTask(model, args)

In [13]:
# Build the trainer from `args`; the recorded output reports the GPU/TPU
# devices it detected.
trainer = nlpbook.get_trainer(args)

01/23/2021 04:12:45 - INFO - lightning -   GPU available: True, used: True
01/23/2021 04:12:45 - INFO - lightning -   TPU available: False, using: 0 TPU cores
01/23/2021 04:12:45 - INFO - lightning -   CUDA_VISIBLE_DEVICES: [0]


# 학습
준비한 데이터와 모델로 학습을 시작합니다. 학습 결과물(체크포인트)은 미리 연동해둔 구글 드라이브의 준비된 위치(`/gdrive/My Drive/nlpbook/checkpoint-paircls`)에 저장됩니다.

In [None]:
# Start training. `val_dataloader` is None when args.do_eval is False.
# NOTE(review): the keyword names (`train_dataloader`, `val_dataloaders`) are
# presumably tied to the trainer library version installed by ratsnlp — confirm
# before upgrading dependencies.
trainer.fit(
    task,
    train_dataloader=train_dataloader,
    val_dataloaders=val_dataloader,
)

01/23/2021 04:13:05 - INFO - lightning -   
  | Name  | Type                          | Params
--------------------------------------------------------
0 | model | BertForSequenceClassification | 108 M 


Training: 0it [00:00, ?it/s]