## 교재에서는 ratsnlp를 사용하여 Colab에서 구현
### -> Local에서 구현하기 위해 HuggingFace 참고

In [1]:
import torch
from transformers import TrainingArguments,BertForSequenceClassification,BertTokenizer,Trainer,DataCollatorWithPadding
from Korpora import Korpora
from datasets import load_dataset

In [3]:
# Training configuration for the document-classification fine-tuning run.
args = TrainingArguments(
    # Checkpoints are written under this directory.
    output_dir='E:/Model/checkpoint-doccls',
    seed=7,
    learning_rate=5e-5,
    num_train_epochs=3,
    # Per-device batch size: 32 when CUDA is available, otherwise 4
    # (same rule for training and evaluation).
    per_device_eval_batch_size=32 if torch.cuda.is_available() else 4,
    per_device_train_batch_size=32 if torch.cuda.is_available() else 4,
    # NOTE(review): this asks for 8 TPU cores whenever CUDA is unavailable;
    # it looks carried over from a Colab setup -- confirm it is intended
    # for a local run.
    tpu_num_cores=0 if torch.cuda.is_available() else 8,
)

### BertTokenizer pretrained 모델인 kcbert 사용

In [9]:
# Load the tokenizer published with the 'beomi/kcbert-base' checkpoint.
# do_lower_case=False is passed so the input text is not lowercased.
tokenizer = BertTokenizer.from_pretrained('beomi/kcbert-base', do_lower_case=False)

### load_dataset을 사용해 nsmc 데이터 로드

In [5]:
# Load the 'nsmc' dataset through the `datasets` loader.
dataset = load_dataset('nsmc')
# Peek at the first test example (displayed because it is the cell's last
# expression); the output shows the fields 'id', 'document' and 'label'.
dataset['test'][0]

Found cached dataset nsmc (C:/Users/고성민/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3)


  0%|          | 0/2 [00:00<?, ?it/s]

{'id': '6270596', 'document': '굳 ㅋ', 'label': 1}

### document(text)를 기준으로 tokenizing
> * batched : map을 적용할 때 한 번에 하지 않고 batch 단위로 나눠서 처리함
> * remove_columns: id와 document column은 tokenizing 이후 필요없어 제거

In [6]:
# Tokenize the 'document' text of every split, padding/truncating each example
# to exactly 128 tokens, then drop the raw 'id' and 'document' columns, which
# are no longer needed once the token features exist.
dataset = dataset.map(
    lambda batch: tokenizer(
        batch['document'],
        max_length=128,
        truncation=True,
        padding='max_length',
    ),
    batched=True,  # the lambda receives a batch of rows per call
).remove_columns(['id', 'document'])

Loading cached processed dataset at C:\Users\고성민\.cache\huggingface\datasets\nsmc\default\1.1.0\bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3\cache-7bd95b1122c48121.arrow
Loading cached processed dataset at C:\Users\고성민\.cache\huggingface\datasets\nsmc\default\1.1.0\bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3\cache-facb9451f9c7c751.arrow


### DataCollatorWithPadding을 사용해 패딩 설정 가능

In [91]:
# Collator that pads every batch to a fixed length of 128 tokens.
# NOTE(review): the dataset is already padded to max_length=128 when it is
# tokenized, so this re-pads to the same length -- confirm it is still needed.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    max_length=128,
    padding='max_length',
)

In [92]:
# Split handles handed to the Trainer: 'train' for fitting, 'test' for evaluation.
train_dataset, test_dataset = dataset['train'], dataset['test']

NameError: name 'dataset' is not defined

### BertForSequenceClassification 모델의 kcbert-base 사용
> * cuda사용해서 train

In [3]:
# Load the pretrained KcBERT weights into a sequence-classification model.
model = BertForSequenceClassification.from_pretrained("beomi/kcbert-base")
# Move the model to the GPU when one is present and stay on the CPU otherwise,
# matching the CUDA check already used for the batch sizes in TrainingArguments.
# An unconditional .to('cuda') raises on machines without CUDA.
model.to('cuda' if torch.cuda.is_available() else 'cpu')

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initiali

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(300, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [10]:
# Wire the model, the training configuration and the data into a Trainer.
trainer = Trainer(
    args=args,
    model=model,
    # Tokenizer and collator used when batches are assembled.
    tokenizer=tokenizer,
    data_collator=data_collator,
    # NSMC 'train' split for fitting, 'test' split for evaluation.
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [11]:
# Run fine-tuning with the Trainer configured above. The call returns a
# TrainOutput (global step, training loss, run metrics), displayed as the
# cell's result.
trainer.train()

Step,Training Loss
500,0.3949
1000,0.3359
1500,0.3258
2000,0.3084
2500,0.2937
3000,0.2866
3500,0.2888
4000,0.2824
4500,0.2762
5000,0.2243


TrainOutput(global_step=14064, training_loss=0.20394258287579534, metrics={'train_runtime': 12666.2256, 'train_samples_per_second': 35.528, 'train_steps_per_second': 1.11, 'total_flos': 2.9599993728e+16, 'train_loss': 0.20394258287579534, 'epoch': 3.0})

# 모델 테스트

## 가장 성능이 좋은 모델 찾기

In [4]:
import os
import json

# Directory of the last checkpoint saved during training.
# Raw string: the Windows backslashes must not be parsed as escape sequences
# ('\M' and '\c' are invalid escapes and trigger a warning in a plain literal).
path = r'E:\Model\checkpoint-doccls\checkpoint-14000'
state = os.path.join(path, 'trainer_state.json')

# trainer_state.json stores the logging history of the run under 'log_history'.
with open(state, 'r', encoding='utf-8') as f:
    log_history = json.load(f)['log_history']

# Collect (loss, step) pairs. Entries that carry no 'loss' key are skipped
# instead of aborting the whole scan with a KeyError.
loss_step = [
    (log['loss'], log['step'])
    for log in log_history
    if 'loss' in log
]

In [5]:
# Pick the logged entry with the smallest training loss. Tuples compare by
# their first element (the loss) first, so min() returns the same pair that
# sorting and taking the first item would, without sorting the whole list.
# NOTE(review): this ranks checkpoints by training loss only -- confirm that is
# the intended notion of "best".
loss, step = min(loss_step)
loss, step

(0.1002, 14000)

In [6]:
# Build the checkpoint directory for the selected step and load the
# fine-tuned classifier stored there.
model_path = f'E:/Model/checkpoint-doccls/checkpoint-{step}'
fine_tuned_model = BertForSequenceClassification.from_pretrained(model_path)

### 예측해보기
> * predict : fine_tuning 전 bert 모델
> * nsmc_predict : nsmc 데이터셋으로 fine_tuning한 모델

In [7]:
def _classify(sentence, classifier, fine_tuned):
    """Score one sentence with ``classifier`` and summarise the result.

    Args:
        sentence: Raw input text.
        classifier: Sequence-classification model whose output exposes
            ``.logits`` with the negative class at index 0 and the positive
            class at index 1.
        fine_tuned: Flag copied verbatim into the result dictionary.

    Returns:
        dict with keys 'fine_tuned', 'sentence', 'pos', 'neg' (probabilities
        rounded to 4 decimals) and 'pred' ("긍정" or "부정").
    """
    features = tokenizer(
        [sentence],
        max_length=128,
        padding='max_length',
        truncation=True,
    )
    # Build the inputs on the same device as the model; otherwise a model that
    # was moved to the GPU fails on CPU tensors.
    device = next(classifier.parameters()).device
    features = {k: torch.tensor(v, device=device) for k, v in features.items()}

    # Inference must run in eval mode so dropout is disabled; a model that has
    # just been trained is otherwise still in training mode.
    classifier.eval()
    with torch.no_grad():
        logits = classifier(**features).logits

    prob = logits.softmax(dim=1)[0]
    return {
        'fine_tuned': fine_tuned,
        'sentence': sentence,
        'pos': round(prob[1].item(), 4),
        'neg': round(prob[0].item(), 4),
        'pred': "긍정" if int(torch.argmax(prob)) == 1 else "부정",
    }


def predict(sentence):
    """Classify ``sentence`` with the global ``model``.

    NOTE(review): ``model`` is the same object that is handed to the Trainer;
    if it was trained in this session it is no longer the pre-fine-tuning
    model -- reload it first if a true baseline is wanted.
    """
    return _classify(sentence, model, False)


def nsmc_predict(sentence):
    """Classify ``sentence`` with the NSMC fine-tuned ``fine_tuned_model``."""
    return _classify(sentence, fine_tuned_model, True)

In [12]:
# Read one sentence from the user and show the results of predict() and
# nsmc_predict() side by side (the tuple is displayed as the cell's result).
sentence = input('문장 입력 :')
predict(sentence),nsmc_predict(sentence)

문장 입력 :한번 더 보고싶네


({'fine_tuned': False,
  'sentence': '한번 더 보고싶네',
  'pos': 0.3856,
  'neg': 0.6144,
  'pred': '부정'},
 {'fine_tuned': True,
  'sentence': '한번 더 보고싶네',
  'pos': 0.9812,
  'neg': 0.0188,
  'pred': '긍정'})

## 다른 방법 -> TextClassificationPipeline 사용

In [106]:
from transformers import TextClassificationPipeline

In [225]:
# Same comparison through TextClassificationPipeline, asking for the score of
# every label rather than only the top one.
# NOTE(review): these assignments rebind `predict` / `nsmc_predict`, replacing
# the functions of the same name defined earlier in the notebook.
# NOTE(review): newer transformers releases reportedly deprecate
# return_all_scores in favour of top_k=None -- confirm against the installed
# version.
predict = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
)
nsmc_predict = TextClassificationPipeline(
    model=fine_tuned_model,
    tokenizer=tokenizer,
    return_all_scores=True,
)



In [226]:
# Read one sentence and run it through both pipeline objects; each returns a
# list of per-label {'label', 'score'} dictionaries (see the cell output).
sentence = input('문장 입력 :')
predict(sentence),nsmc_predict(sentence)

문장 입력 :한번 더 보고싶다.


([[{'label': 'LABEL_0', 'score': 0.5195181965827942},
   {'label': 'LABEL_1', 'score': 0.4804818332195282}]],
 [[{'label': 'LABEL_0', 'score': 0.02928623929619789},
   {'label': 'LABEL_1', 'score': 0.97071373462677}]])