## 교재에서는 ratsnlp를 사용하여 Colab에서 구현
### -> Local에서 구현하기 위해 HuggingFace 참고

In [2]:
import torch
from transformers import TrainingArguments,BertForSequenceClassification,BertTokenizer,Trainer,DataCollatorWithPadding
from Korpora import Korpora
from datasets import load_dataset

In [3]:
# Hyperparameters for the fine-tuning run.
# The per-device batch size depends on whether PyTorch can see a CUDA device:
# 32 with CUDA, 4 without.
has_cuda = torch.cuda.is_available()
batch_size = 32 if has_cuda else 4

args = TrainingArguments(
    # NOTE(review): absolute, machine-specific path; checkpoints are written here.
    output_dir='E:/Model/checkpoint-doccls',
    num_train_epochs=3,
    learning_rate=5e-5,
    seed=7,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # 0 when CUDA is available, 8 otherwise.
    # NOTE(review): presumably 8 targets a TPU runtime (e.g. Colab) -- confirm
    # this is still wanted for a local, CPU-only run.
    tpu_num_cores=8 if not has_cuda else 0,
)

### BertTokenizer pretrained 모델인 kcbert 사용

In [4]:
# Load the tokenizer published with the 'beomi/kcbert-base' checkpoint.
# do_lower_case=False is passed so the tokenizer does not lowercase its input.
tokenizer = BertTokenizer.from_pretrained('beomi/kcbert-base', do_lower_case=False)

### load_dataset을 사용해 nsmc 데이터 로드

In [5]:
# Load the 'nsmc' dataset and display the first row of its 'test' split as a
# quick sanity check of the record layout.
dataset = load_dataset('nsmc')
test_split = dataset['test']
test_split[0]

Found cached dataset nsmc (C:/Users/고성민/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3)


  0%|          | 0/2 [00:00<?, ?it/s]

{'id': '6270596', 'document': '굳 ㅋ', 'label': 1}

### document(text)를 기준으로 tokenizing
> * batched : map을 적용할 때 예제를 하나씩 처리하지 않고 batch 단위로 묶어서 처리함
> * remove_columns: id와 document column은 tokenizing 이후 필요없어 제거

In [6]:
def tokenize_batch(examples):
    """Tokenize the 'document' texts of one batch of rows.

    Every text is truncated and padded to a fixed length of 128 tokens
    (padding='max_length', truncation=True, max_length=128).
    """
    return tokenizer(
        examples['document'],
        padding='max_length',
        truncation=True,
        max_length=128,
    )


# batched=True hands `tokenize_batch` a batch of rows per call instead of one
# row at a time. The raw 'id' and 'document' columns are dropped afterwards.
# NOTE: this cell rebinds `dataset`; re-running it without reloading the data
# fails because the 'document' column has already been removed.
dataset = dataset.map(tokenize_batch, batched=True).remove_columns(['id', 'document'])

Loading cached processed dataset at C:\Users\고성민\.cache\huggingface\datasets\nsmc\default\1.1.0\bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3\cache-7bd95b1122c48121.arrow
Loading cached processed dataset at C:\Users\고성민\.cache\huggingface\datasets\nsmc\default\1.1.0\bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3\cache-facb9451f9c7c751.arrow


### DataCollatorWithPadding을 사용해 패딩 설정 가능

In [7]:
# Collator that pads every batch to a fixed length of 128 using `tokenizer`.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding='max_length',
    max_length=128,
)

In [8]:
# Pull the two splits out of the dataset dict for the Trainer.
train_dataset, test_dataset = dataset['train'], dataset['test']

### BertForSequenceClassification 모델의 kcbert-base 사용
> * cuda를 사용해서 train

In [9]:
# Load the pretrained KcBERT weights into a BertForSequenceClassification model.
model = BertForSequenceClassification.from_pretrained("beomi/kcbert-base")

# Move the model to the GPU only when one is actually available. An
# unconditional `model.to('cuda')` raises on CPU-only machines, even though
# TrainingArguments above already handles the no-CUDA case.
# The call stays the last expression so the cell still displays the model.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initiali

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(300, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [10]:
# Wire the model, hyperparameters, data, and batching helpers into a Trainer.
trainer = Trainer(
    # What to train and how.
    model=model,
    args=args,
    # Data splits.
    # NOTE(review): `args` sets no evaluation strategy -- confirm whether
    # eval_dataset is actually used during train().
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # Batch construction.
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [11]:
# Run fine-tuning. The returned TrainOutput is bound to a name so it can be
# inspected later, and shown as the cell's output.
train_output = trainer.train()
train_output

Step,Training Loss
500,0.3949
1000,0.3359
1500,0.3258
2000,0.3084
2500,0.2937
3000,0.2866
3500,0.2888
4000,0.2824
4500,0.2762
5000,0.2243


TrainOutput(global_step=14064, training_loss=0.20394258287579534, metrics={'train_runtime': 12666.2256, 'train_samples_per_second': 35.528, 'train_steps_per_second': 1.11, 'total_flos': 2.9599993728e+16, 'train_loss': 0.20394258287579534, 'epoch': 3.0})