# HuggingFace 커스텀 프로젝트 만들기

## Step 0. 준비

In [24]:
import datasets
from datasets import load_dataset, load_metric
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, BertForTokenClassification
import os
import numpy as np

## Step 1. NSMC 데이터 분석 및 Huggingface dataset 구성

In [2]:
data = load_dataset('nsmc')
data

Downloading:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/807 [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset nsmc/default (download: 18.62 MiB, generated: 20.90 MiB, post-processed: Unknown size, total: 39.52 MiB) to /aiffel/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3...


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/6.33M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset nsmc downloaded and prepared to /aiffel/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 150000
    })
    test: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 50000
    })
})

In [3]:
train = data['train']
test = data['test']

In [6]:
for i in range(5):
    print(train[i])

{'document': '아 더빙.. 진짜 짜증나네요 목소리', 'label': 0, 'id': '9976970'}
{'document': '흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', 'label': 1, 'id': '3819312'}
{'document': '너무재밓었다그래서보는것을추천한다', 'label': 0, 'id': '10265843'}
{'document': '교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정', 'label': 0, 'id': '9045019'}
{'document': '사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 던스트가 너무나도 이뻐보였다', 'label': 1, 'id': '6483659'}


## Step 2. klue/bert-base model 및 tokenizer 불러오기

In [9]:
model = AutoModelForSequenceClassification.from_pretrained('klue/bert-base')
tokenizer = AutoTokenizer.from_pretrained('klue/bert-base')

Downloading:   0%|          | 0.00/425 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/424M [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

Downloading:   0%|          | 0.00/289 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [12]:
def transform(data):
    return tokenizer(
        data['document'],
        truncation = True,
        padding = 'max_length',
        return_token_type_ids = False,
        )

## Step 3. 위에서 불러온 tokenizer로 데이터셋을 전처리하고, model 학습 진행해 보기

In [13]:
train = train.map(transform, batched=True)
test = test.map(transform, batched=True)

  0%|          | 0/150 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [14]:
num_samples = len(train)
num_train_samples = int(num_samples * 0.7)
num_val_samples = num_samples - num_train_samples

train_data = train.select(range(num_train_samples))
val_data = train.select(range(num_train_samples, num_samples))
test_data = test

In [17]:
# model 학습
path = os.getenv('HOME') + '/aiffel/transformers'

training_arguments = TrainingArguments(
    path,
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01)

In [19]:
metric = load_metric('glue', 'sst2')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

Downloading:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

In [20]:
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics,
)

trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: document, id.
***** Running training *****
  Num examples = 105000
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 13125


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3058,0.276839,0.9024


Saving model checkpoint to /aiffel/aiffel/transformers/checkpoint-500
Configuration saved in /aiffel/aiffel/transformers/checkpoint-500/config.json
Model weights saved in /aiffel/aiffel/transformers/checkpoint-500/pytorch_model.bin
Saving model checkpoint to /aiffel/aiffel/transformers/checkpoint-1000
Configuration saved in /aiffel/aiffel/transformers/checkpoint-1000/config.json
Model weights saved in /aiffel/aiffel/transformers/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to /aiffel/aiffel/transformers/checkpoint-1500
Configuration saved in /aiffel/aiffel/transformers/checkpoint-1500/config.json
Model weights saved in /aiffel/aiffel/transformers/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to /aiffel/aiffel/transformers/checkpoint-2000
Configuration saved in /aiffel/aiffel/transformers/checkpoint-2000/config.json
Model weights saved in /aiffel/aiffel/transformers/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to /aiffel/aiffel/transformers/checkpoi

TrainOutput(global_step=13125, training_loss=0.32031826782226563, metrics={'train_runtime': 12142.1192, 'train_samples_per_second': 8.648, 'train_steps_per_second': 1.081, 'total_flos': 2.76266608128e+16, 'train_loss': 0.32031826782226563, 'epoch': 1.0})

In [21]:
import json

model.save_pretrained("saved_model_directory")
training_args_dict = training_arguments.to_dict()  
with open("training_args.json", "w") as training_args_file:
    json.dump(training_args_dict, training_args_file)

Configuration saved in saved_model_directory/config.json
Model weights saved in saved_model_directory/pytorch_model.bin


In [25]:
loaded_model = BertForTokenClassification.from_pretrained("saved_model_directory")
loaded_training_args_dict = json.load(open("training_args.json", "r"))
loaded_training_args = TrainingArguments.from_dict(loaded_training_args_dict)

loading configuration file saved_model_directory/config.json
Model config BertConfig {
  "_name_or_path": "klue/bert-base",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.11.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}

loading weights file saved_model_directory/pytorch_model.bin
All model checkpoint weights were used when initializing BertForTokenClassification.

All the weights of BertForTokenClassification were initialized from the model 

AttributeError: type object 'TrainingArguments' has no attribute 'from_dict'

In [27]:
trainer.evaluate(test_data)

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: document, id.
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 8


{'eval_loss': 0.28323811292648315,
 'eval_accuracy': 0.89938,
 'eval_runtime': 1874.5661,
 'eval_samples_per_second': 26.673,
 'eval_steps_per_second': 3.334,
 'epoch': 1.0}