### 24-1. 커스텀 프로젝트 직접 만들기

In [7]:
import tensorflow as tf
import numpy as np
import transformers
import datasets
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from transformers import create_optimizer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
from datasets import load_metric
import torch
import time
from datasets import Dataset
import logging
from transformers import Trainer, TrainingArguments, BertTokenizer, TFBertForSequenceClassification, DataCollatorWithPadding
logging.basicConfig(level=logging.INFO)

#### STEP 1. NSMC 데이터 분석 및 Huggingface dataset 구성


In [8]:
# Load the NSMC movie-review sentiment dataset from the Hugging Face hub
# (or the local cache) and print its split layout.
dataset = load_dataset('nsmc')
print(dataset)



  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 150000
    })
    test: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 50000
    })
})


In [9]:
# STEP 2. Load the klue/bert-base model and tokenizer.


# TensorFlow alternative (the TF baseline section further down uses this form):
# model_name = "klue/bert-base"
# tokenizer = BertTokenizer.from_pretrained(model_name)
# model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=2, from_pt=True)

# PyTorch model with a newly initialised classification head.
# num_labels is spelled out (binary sentiment: 0 = negative, 1 = positive) so the
# PyTorch and TensorFlow paths are configured the same way instead of relying on
# the library default.
model = AutoModelForSequenceClassification.from_pretrained('klue/bert-base', num_labels=2)
tokenizer = AutoTokenizer.from_pretrained('klue/bert-base')

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

In [10]:
# Peek at the first training record to see the raw fields (id, label, document).
dataset['train'][0]

{'id': '9976970', 'label': 0, 'document': '아 더빙.. 진짜 짜증나네요 목소리'}

In [11]:
def tokenize_function(examples):
    """Tokenize a batch of reviews to a fixed width of 128 tokens.

    Args:
        examples: A batch from ``Dataset.map(batched=True)``; its
            ``"document"`` entry holds the review texts.

    Returns:
        The tokenizer output for the batch, padded to ``max_length`` and
        truncated to at most 128 tokens.
    """
    texts = examples["document"]
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(texts, **encode_options)


# Apply the tokenizer to every split; the new columns are added next to the originals.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

  0%|          | 0/150 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [12]:
# Convert the tokenized columns to NumPy arrays.
# Only input_ids and attention_mask are extracted; labels go into separate arrays.
train_inputs = {key: np.array(tokenized_datasets["train"][key]) for key in ["input_ids", "attention_mask"]}
train_labels = np.array(tokenized_datasets["train"]["label"])

test_inputs = {key: np.array(tokenized_datasets["test"][key]) for key in ["input_ids", "attention_mask"]}
test_labels = np.array(tokenized_datasets["test"]["label"])

In [13]:
# Display the converted training inputs (token ids and attention masks).
train_inputs

{'input_ids': array([[   2, 1376,  831, ...,    0,    0,    0],
        [   2, 1963,   18, ...,    0,    0,    0],
        [   2,    1,    3, ...,    0,    0,    0],
        ...,
        [   2, 4380, 1097, ...,    0,    0,    0],
        [   2, 9300, 3771, ...,    0,    0,    0],
        [   2, 3629, 3771, ...,    0,    0,    0]]),
 'attention_mask': array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]])}

In [14]:
# Both arrays should be (num_examples, max_length) after fixed-width padding.
print(train_inputs['input_ids'].shape)
print(train_inputs['attention_mask'].shape)

(150000, 128)
(150000, 128)


In [13]:
# Loss for the Keras runs: integer class labels against raw (un-softmaxed) logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# NOTE(review): every later compile() call is preceded by a fresh `optimizer = Adam(...)`,
# so this lr=2e-5 instance appears to be unused — confirm before relying on it.
optimizer = Adam(learning_rate=2e-5)

In [63]:
# Convert the NumPy inputs back into Hugging Face Datasets.
train_data = Dataset.from_dict(train_inputs)
test_data = Dataset.from_dict(test_inputs)

# Attach the targets under the column name "labels".
train_data = train_data.add_column("labels", train_labels)
test_data = test_data.add_column("labels", test_labels)

#### [Pytorch] baseline 2 epoch

In [19]:
# Accuracy metric used by the Trainer during evaluation.
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Accuracy is calculated directly with NumPy, so no metric script has to be
    downloaded and the function does not depend on the deprecated
    ``datasets.load_metric`` helper.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` has the class scores
            on its last axis (or is a tuple whose first item does); ``labels``
            holds the integer class ids.

    Returns:
        dict: ``{"accuracy": value}`` with ``value`` a float in ``[0, 1]``.
        An empty evaluation set yields ``0.0`` instead of NaN.
    """
    logits, labels = eval_pred
    # Some models return several prediction tensors; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]

    labels = np.asarray(labels)
    if labels.size == 0:
        return {"accuracy": 0.0}

    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == labels))}


In [21]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

def inference(sentence):
    """Classify one review with the PyTorch model and print the verdict.

    Args:
        sentence: Review text to classify.

    Returns:
        None. Prints ``"<sentence> , <Positive|Negative> (<probability>%)"``.
    """
    # Switch to evaluation mode so dropout is disabled; otherwise repeated calls
    # on the same sentence can give different probabilities.
    model.eval()

    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=128)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits

    # Move to the CPU before converting to NumPy.
    probabilities = torch.nn.functional.softmax(logits, dim=-1).cpu().numpy()
    # Only the first row is reported, so take the argmax over that row rather
    # than over the flattened batch.
    pred_label = int(np.argmax(probabilities[0]))

    result = "Positive" if pred_label == 1 else "Negative"
    # NOTE: the printed number is a probability in [0, 1] even though it is
    # followed by '%'; the format is left as-is to match the recorded outputs.
    print(f"{sentence} , {result} ({probabilities[0][pred_label]:.4f}%)")

In [20]:
# Collator that pads each batch to a common length. The inputs used here were
# already padded to max_length=128 at tokenization time.
data_collator = DataCollatorWithPadding(tokenizer)

# NOTE(review): the recorded log below shows checkpoints under ./results, so that
# output presumably came from a run with a different output_dir — confirm.
training_args = TrainingArguments(
    output_dir='./assets',               # where checkpoints are written
    num_train_epochs=2,                  # full passes over the training split
    per_device_train_batch_size=8,       # training batch size per device
    per_device_eval_batch_size=8,        # evaluation batch size per device
    logging_dir='./logs',                # log directory
    logging_steps=1,                     # log every optimisation step
    evaluation_strategy="epoch",         # evaluate at the end of each epoch
    save_strategy="epoch",               # checkpoint at the end of each epoch
    report_to="none",                    # disable external experiment trackers
    logging_first_step=True,             # also log the very first step
    load_best_model_at_end=True,         # restore the best checkpoint after training
    metric_for_best_model="accuracy",    # "best" is judged by the accuracy from compute_metrics
)


# Baseline run on the fixed-width (max_length-padded) tokenized splits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,         
    compute_metrics=compute_metrics      
)

PyTorch: setting up devices


In [66]:
# Start the baseline fine-tuning run configured by `training_args`.
trainer.train()

PyTorch: setting up devices
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.


GPU Memory: 1.71 GB


***** Running training *****
  Num examples = 150000
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 37500


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0393,0.311077,0.88354
2,0.2397,0.394901,0.89246


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-18750
Configuration saved in ./results/checkpoint-18750/config.json
Model weights saved in ./results/checkpoint-18750/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-37500
Configuration saved in ./results/checkpoint-37500/config.json
Model weights saved in ./results/checkpoint-37500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results/checkpoint-37500 (score: 0.89246).


GPU Memory: 2.53 GB
Training Time: 9253.83 seconds


In [71]:
# Probe: lukewarm, noncommittal review ("it's just so-so; whether to watch is up to you").
inference("이 영화 그냥 그래.. 볼지 말지는 너 선택이야.")

이 영화 그냥 그래.. 볼지 말지는 너 선택이야. , Negative (0.9862%)


In [72]:
# Probe: negative expectation reversed into praise ("looked awful, but it's great fun").
inference("이 영화 완전 별로인것 같았는데 꿀잼이야.")

이 영화 완전 별로인것 같았는데 꿀잼이야. , Positive (0.9859%)


In [73]:
# Probe: implicit criticism with no explicit sentiment word ("now I see why the theater is empty").
inference("아 왜 영화관에 사람들이 없는지 알겠다.")

아 왜 영화관에 사람들이 없는지 알겠다. , Negative (0.9475%)


In [74]:
# Probe: short concessive praise ("but it is fun, though").
inference("근데 재미있긴 해.")

근데 재미있긴 해. , Positive (0.9747%)


In [77]:
# Probe: sarcasm ("it was so fun I slept soundly through the whole screening").
inference("재미있어서 상영내내 꿀잠잤어.")

재미있어서 상영내내 꿀잠잤어. , Positive (0.9582%)


In [75]:
# Probe: same sarcastic sentence with an elongated intensifier ("sooo fun ...").
inference("너~무 재미있어서 상영내내 꿀잠잤어.")

너~무 재미있어서 상영내내 꿀잠잤어. , Positive (0.9656%)


In [76]:
# Probe: same sarcastic sentence with the plain intensifier ("so fun ...").
inference("너무 재미있어서 상영내내 꿀잠잤어.")

너무 재미있어서 상영내내 꿀잠잤어. , Positive (0.9885%)


In [5]:
###### Bucketing test ######

def tokenize_function_new(examples):
    """Tokenize a batch of reviews without padding.

    Sequences keep their own length (truncation only), so padding can be
    deferred to batch-collation time.

    Args:
        examples: A batch from ``Dataset.map(batched=True)``; its
            ``"document"`` entry holds the review texts.

    Returns:
        The tokenizer output for the batch.
    """
    documents = examples["document"]
    return tokenizer(documents, truncation=True)


# Tokenize every split; rows now have variable-length input_ids.
tokenized_datasets_b = dataset.map(tokenize_function_new, batched=True)

  0%|          | 0/150 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [6]:
# Inspect the columns of the unpadded training split.
tokenized_datasets_b['train']

Dataset({
    features: ['attention_mask', 'document', 'id', 'input_ids', 'label', 'token_type_ids'],
    num_rows: 150000
})

In [7]:
# Convert to NumPy arrays.
def _to_object_array(sequences):
    """Pack variable-length token lists into a 1-D object array, one list per element."""
    packed = np.empty(len(sequences), dtype=object)
    for row, seq in enumerate(sequences):
        packed[row] = seq
    return packed


# The unpadded token lists are ragged. Calling np.array() on them directly emits a
# VisibleDeprecationWarning on older NumPy and raises ValueError on NumPy >= 1.24,
# so the object arrays are built explicitly instead.
# NOTE(review): ragged object arrays cannot be batched by Keras `model.fit`; the TF
# cells presumably rely on the max_length-padded arrays — confirm execution order.
train_inputs = {key: _to_object_array(tokenized_datasets_b["train"][key]) for key in ["input_ids", "attention_mask"]}
train_labels = np.array(tokenized_datasets_b["train"]["label"])

test_inputs = {key: _to_object_array(tokenized_datasets_b["test"][key]) for key in ["input_ids", "attention_mask"]}
test_labels = np.array(tokenized_datasets_b["test"]["label"])

  train_inputs = {key: np.array(tokenized_datasets_b["train"][key]) for key in ["input_ids", "attention_mask"]}
  test_inputs = {key: np.array(tokenized_datasets_b["test"][key]) for key in ["input_ids", "attention_mask"]}


In [15]:
# Rebuild the Hugging Face Datasets from the converted inputs, with the targets
# stored under the column name "labels".
train_data = Dataset.from_dict({**train_inputs, "labels": train_labels})
test_data = Dataset.from_dict({**test_inputs, "labels": test_labels})

In [16]:
# Display the converted inputs.
# NOTE(review): the recorded output shows zero-padded rows, which suggests it was
# captured from a run on the max_length-padded data — confirm.
train_inputs

{'input_ids': array([[   2, 1376,  831, ...,    0,    0,    0],
        [   2, 1963,   18, ...,    0,    0,    0],
        [   2,    1,    3, ...,    0,    0,    0],
        ...,
        [   2, 4380, 1097, ...,    0,    0,    0],
        [   2, 9300, 3771, ...,    0,    0,    0],
        [   2, 3629, 3771, ...,    0,    0,    0]]),
 'attention_mask': array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]])}

In [17]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Accuracy is calculated directly with NumPy, so no metric script has to be
    downloaded and the function does not depend on the deprecated
    ``datasets.load_metric`` helper.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` has the class scores
            on its last axis (or is a tuple whose first item does); ``labels``
            holds the integer class ids.

    Returns:
        dict: ``{"accuracy": value}`` with ``value`` a float in ``[0, 1]``.
        An empty evaluation set yields ``0.0`` instead of NaN.
    """
    logits, labels = eval_pred
    # Some models return several prediction tensors; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]

    labels = np.asarray(labels)
    if labels.size == 0:
        return {"accuracy": 0.0}

    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == labels))}

In [17]:
# Dynamic padding: every batch is padded only up to its own longest sequence.
data_collator = DataCollatorWithPadding(tokenizer)


# NOTE(review): the benchmark cells share output_dir='./assets' and the same `model`
# object, so checkpoints overwrite each other and each run continues from the
# previous run's weights — confirm this is intended for the comparison.
training_args1 = TrainingArguments(
    output_dir='./assets',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    group_by_length=True,  # Bucketing: batch together samples of similar length
)

trainer1 = Trainer(
    model=model,
    args=training_args1,
    train_dataset=train_data,
    eval_dataset=test_data,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Measure wall-clock time and peak GPU memory the same way as the non-bucketing
# run, so the two results are directly comparable.
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()
start_time = time.time()
trainer1.train()
end_time = time.time()
max_mem_bucket = torch.cuda.max_memory_allocated() / 1024 ** 3 if torch.cuda.is_available() else 0.0
print(f"Max Memory: {max_mem_bucket:.2f} GB, Time: {end_time - start_time:.2f} sec")


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 150000
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 18750


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2844,0.340221,0.89292


***** Running Evaluation *****
  Num examples = 50000
  Batch size = 8
Saving model checkpoint to ./bucket/checkpoint-18750
Configuration saved in ./bucket/checkpoint-18750/config.json
Model weights saved in ./bucket/checkpoint-18750/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




Max Memory: 3.40 GB, Time: 1804.12 sec


In [22]:
# Probe after the bucketing run: lukewarm, noncommittal review.
inference("이 영화 그냥 그래.. 볼지 말지는 너 선택이야.")

이 영화 그냥 그래.. 볼지 말지는 너 선택이야. , Negative (0.9871%)


In [23]:
# Probe after the bucketing run: negative expectation reversed into praise.
inference("이 영화 완전 별로인것 같았는데 꿀잼이야.")

이 영화 완전 별로인것 같았는데 꿀잼이야. , Positive (0.9925%)


In [24]:
# Probe after the bucketing run: sarcasm ("so fun I slept through the whole screening").
inference("재미있어서 상영내내 꿀잠잤어.")

재미있어서 상영내내 꿀잠잤어. , Negative (0.9125%)


In [25]:
# Probe after the bucketing run: the sarcastic sentence with an elongated intensifier.
inference("너~무 재미있어서 상영내내 꿀잠잤어.")

너~무 재미있어서 상영내내 꿀잠잤어. , Negative (0.8062%)


In [27]:
#Default learning rate: 5e-5 / AdamW

# Dynamic padding only (no length bucketing): every batch is padded up to its
# own longest sequence.
data_collator = DataCollatorWithPadding(tokenizer)


# NOTE(review): the benchmark cells share output_dir='./assets' and the same `model`
# object, so checkpoints overwrite each other and each run continues from the
# previous run's weights — confirm this is intended for the comparison.
training_args2 = TrainingArguments(
    output_dir='./assets',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    group_by_length=False,  # batches are not grouped by sequence length
)

trainer2 = Trainer(
    model=model,
    args=training_args2,
    train_dataset=train_data,
    eval_dataset=test_data,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# reset_peak_memory_stats() replaces the deprecated reset_max_memory_allocated();
# it is skipped on CPU-only hosts, where there is no CUDA peak counter to reset.
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()
start_time = time.time()
trainer2.train()
end_time = time.time()
# Peak allocated GPU memory in GiB (0.0 when no GPU is present).
max_mem_no_bucket = torch.cuda.max_memory_allocated() / 1024 ** 3 if torch.cuda.is_available() else 0.0
print(f"Max Memory: {max_mem_no_bucket:.2f} GB, Time: {end_time - start_time:.2f} sec")


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 150000
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 18750


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2775,0.339782,0.8872


***** Running Evaluation *****
  Num examples = 50000
  Batch size = 8
Saving model checkpoint to ./bucket/checkpoint-18750
Configuration saved in ./bucket/checkpoint-18750/config.json
Model weights saved in ./bucket/checkpoint-18750/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




Max Memory: 3.47 GB, Time: 2328.61 sec


### Pytorch Base line
- Each epoch takes around an hour 
- group_by_length=True (Bucketing)	: 배치 내 샘플 길이를 비슷하게 묶음
- DataCollatorWithPadding : 배치 내에서 패딩을 자동으로 맞춤   
- [ 위 결과 ]
즉 문장 토큰 길이를 고려하지 않고 모든 문장을 max_length=128로 패딩한 경우 에폭당 대략 4500 sec가 걸렸으나, DataCollatorWithPadding만 사용했을 경우 2328.61 sec, Bucketing과 DataCollatorWithPadding을 같이 사용한 경우 1804.12 sec로 학습 시간을 줄일 수 있었다. 이는 불필요한 패딩 토큰에 들어가는 연산과 메모리를 줄인 최적화에서 나온 차이로 사료된다.


#### [TF] baseline 2 epoch

In [39]:
# Rebind the global tokenizer and model to the TensorFlow versions.
model_name = "klue/bert-base"
tokenizer = BertTokenizer.from_pretrained(model_name)
# Convert the PyTorch checkpoint to TensorFlow weights while loading.
model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=2, from_pt=True)

optimizer = Adam(learning_rate=1e-5) 
# `loss` is the SparseCategoricalCrossentropy(from_logits=True) object created in an earlier cell.
model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])
# NOTE(review): fit() needs rectangular input arrays, so train_inputs/test_inputs are
# presumably the max_length=128 padded versions at this point — confirm execution order.
model.fit(train_inputs, train_labels, validation_data=(test_inputs, test_labels), batch_size=16, epochs=2)


loading file https://huggingface.co/klue/bert-base/resolve/main/vocab.txt from cache at /aiffel/.cache/huggingface/transformers/1a36e69d48a008e522b75e43693002ffc8b6e6df72de7c53412c23466ec165eb.085110015ec67fc02ad067f712a7c83aafefaf31586a3361dd800bcac635b456
loading file https://huggingface.co/klue/bert-base/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/klue/bert-base/resolve/main/special_tokens_map.json from cache at /aiffel/.cache/huggingface/transformers/aeaaa3afd086a040be912f92ffe7b5f85008b744624f4517c4216bcc32b51cf0.054ece8d16bd524c8a00f0e8a976c00d5de22a755ffb79e353ee2954d9289e26
loading file https://huggingface.co/klue/bert-base/resolve/main/tokenizer_config.json from cache at /aiffel/.cache/huggingface/transformers/f8f71eb411bb03f57b455cfb1b4e04ae124201312e67a3ad66e0a92d0c228325.78871951edcb66032caa0a9628d77b3557c23616c653dacdb7a1a8f33011a843
loading file https://huggingface.co/klue/bert-base/resolve/main/tokenizer.json from cache at /aiffe

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7cbc063d6f40>

In [40]:
def inference(sentence):
    """Classify one review with the TensorFlow model and print the verdict.

    Args:
        sentence: Review text to classify.

    Returns:
        None. Prints ``" <sentence> , <Positive|Negative> (<probability>%)"``;
        the number is a softmax probability in [0, 1].
    """
    encoded = tokenizer(sentence, return_tensors="tf", padding=True, truncation=True, max_length=128)
    outputs = model(encoded["input_ids"], attention_mask=encoded["attention_mask"])

    # Softmax over the class axis, then pick the highest-scoring class.
    probs = tf.nn.softmax(outputs.logits).numpy()
    best = np.argmax(probs)

    if best == 1:
        verdict = "Positive"
    else:
        verdict = "Negative"
    print(f" {sentence} , {verdict} ({probs[0][best]:.4f}%)")

In [48]:
# Probe the TF model: sarcasm ("so fun I slept through the whole screening").
inference("재미있어서 상영내내 꿀잠잤어.")

 재미있어서 상영내내 꿀잠잤어. , Negative (0.6970%)


In [46]:
# Probe the TF model: the sarcastic sentence with an elongated intensifier.
inference("너~무 재미있어서 상영내내 꿀잠잤어.")

 너~무 재미있어서 상영내내 꿀잠잤어. , Negative (0.6475%)


In [49]:
# Probe the TF model: the sarcastic sentence with the plain intensifier.
inference("너무 재미있어서 상영내내 꿀잠잤어.")

 너무 재미있어서 상영내내 꿀잠잤어. , Positive (0.5792%)


In [51]:
# summary() prints the table itself and returns None, which is why the wrapping
# print() adds a trailing "None" line to the output.
print(model.summary())

Model: "tf_bert_for_sequence_classification_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  110617344 
_________________________________________________________________
dropout_189 (Dropout)        multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 110,618,882
Trainable params: 110,618,882
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
###############

### 전체 학습 3 epoch tensorflow

### Best Val Acc : 0.9022

In [30]:
# Full fine-tuning (all weights trainable) for 3 epochs with a lower learning rate
# than the Trainer default.
optimizer = Adam(learning_rate=1e-5)  # Trainer Default : 5e-5
# `loss` is the SparseCategoricalCrossentropy(from_logits=True) object created in an earlier cell.
model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])
model.fit(train_inputs, train_labels, validation_data=(test_inputs, test_labels), batch_size=16, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x77fb841a3250>

In [23]:
def inference(sentence):
    """Run the TensorFlow classifier on one review and print the verdict.

    Args:
        sentence: Review text to classify.

    Returns:
        None. Prints ``" <sentence> , <Positive|Negative> (<probability>%)"``;
        the number is a softmax probability in [0, 1].
    """
    tokens = tokenizer(sentence, return_tensors="tf", padding=True, truncation=True, max_length=128)
    model_output = model(tokens["input_ids"], attention_mask=tokens["attention_mask"])

    # Class probabilities as a NumPy array, then the index of the most likely class.
    class_probs = tf.nn.softmax(model_output.logits).numpy()
    top_class = np.argmax(class_probs)

    label_text = "Negative"
    if top_class == 1:
        label_text = "Positive"
    print(f" {sentence} , {label_text} ({class_probs[0][top_class]:.4f}%)")

Model config BertConfig {
  "_name_or_path": "klue/bert-base",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}



### Freezing

#### 9 ~ 12 Train

In [101]:
# Freeze the embeddings and encoder layers 0-8; only the top 3 encoder layers
# (9-11) and the layers above the encoder stay trainable.
model.bert.embeddings.trainable = False

for index, layer in enumerate(model.bert.encoder.layer):
    # 3 layers: indices 9, 10 and 11 remain trainable.
    layer.trainable = index >= 9

# Every entry in `trainable_variables` is one trainable weight tensor, so the
# count is simply the list length. Despite the label in the message, this counts
# weight tensors rather than layers; the text is kept to match the recorded output.
trainable_count = len(model.trainable_variables)
print(f"Trainable layers: {trainable_count}")


Trainable layers: 52


In [102]:
# Recompile after changing the `trainable` flags, then train for 2 epochs.
optimizer = Adam(learning_rate=1e-5)  # a small learning rate works better when fine-tuning

model.compile(optimizer=optimizer, 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=["accuracy"])

model.fit(train_inputs, train_labels, validation_data=(test_inputs, test_labels), batch_size=32, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x77fa1fad01f0>

#### 6 Train

In [120]:
# Freeze the embeddings and encoder layers 0-5; the upper half of the encoder
# (layers 6-11) and the layers above it stay trainable.
model.bert.embeddings.trainable = False

for index, layer in enumerate(model.bert.encoder.layer):
    # Train the upper half: indices 6 through 11 remain trainable.
    layer.trainable = index >= 6

# Every entry in `trainable_variables` is one trainable weight tensor, so the
# count is simply the list length. Despite the label in the message, this counts
# weight tensors rather than layers; the text is kept to match the recorded output.
trainable_count = len(model.trainable_variables)
print(f"Trainable layers: {trainable_count}")

Trainable layers: 100


In [121]:
# Recompile after changing the `trainable` flags, then train for 3 epochs.
# The learning rate here is 1e-4, ten times the 1e-5 used by the other TF runs in this file.
optimizer = Adam(learning_rate=1e-4) 

model.compile(optimizer=optimizer, 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=["accuracy"])

model.fit(train_inputs, train_labels, validation_data=(test_inputs, test_labels), batch_size=32, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x77fa2dff4bb0>

#### Summary
<1>   
- max_length=128 : 4500sec 
- DataCollatorWithPadding : 2328.61 sec 
- Bucketing & DataCollatorWithPadding : 1804.12 sec   
`즉 패딩에 사용되는 메모리 할당 부분을 최적화 함으로써 학습시간을 줄일 수 있다`

-  Trainer 의 default lr 5e-5를 수정하여 테스트 해볼 수 있겠다. `lr:1e-5로 설정했을때 동일 에폭에서 유일하게 0.9 val acc를 넘겼다`. 
- `Freezing fine-tuning에서 학습 속도는 빨라졌지만, 성능의 변화는 미미했다.` 프루닝을 진행해 볼 수 있을 것 같다.

의문점
- Pytorch 기반 Trainer를 사용했을 때, 현재 환경에서 사용할 수 있었던 최대 batch size는 8이었다.
  하지만 TF로 바꿔서 학습했을 때는 batch size를 16으로 늘릴 수 있었다. 다만 학습 시간에는 큰 변화가 없는 것 같다.