# 허깅페이스 모델 사용

In [3]:
from transformers import pipeline

classifier = pipeline('sentiment-analysis', framework='tf')
classifier('We are very happy to include pipeline into the transformers repository.')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


[{'label': 'POSITIVE', 'score': 0.9978194236755371}]

In [4]:
# 감정분석 테스트
classifier('오늘은 기분이 좋아. 랄랄라 랄랄랄랄라~ 마음속 깊이, 간직한 꿈이, 이루어질 것 같아요!')

[{'label': 'POSITIVE', 'score': 0.8796578645706177}]

## 허깅 페이스에서 모델을 불러오는 방식


In [5]:
# 모델명 지정 방식

from transformers import TFBertForPreTraining
model_fpmethod = TFBertForPreTraining.from_pretrained('bert-base-cased') # ID를 입력

print(model_fpmethod.__class__)

All PyTorch model weights were used when initializing TFBertForPreTraining.

All the weights of TFBertForPreTraining were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForPreTraining for predictions without further training.


<class 'transformers.models.bert.modeling_tf_bert.TFBertForPreTraining'>


In [6]:
# AutoModel 사용
from transformers import TFAutoModel
model_am = TFAutoModel.from_pretrained("bert-base-cased")

print(model_am.__class__)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

<class 'transformers.models.bert.modeling_tf_bert.TFBertModel'>



## 토크나이저

In [7]:
# 특정 모델의 토크나이저를 지정하는 방법
from transformers import BertTokenizer
tokenizer_specific = BertTokenizer.from_pretrained('bert-base-cased')

In [8]:
# 토크나이저 자동 지정
from transformers import AutoTokenizer
tokenizer_ID = AutoTokenizer.from_pretrained('bert-base-cased') # 모델 로드 시 사용한 ID 지정

In [9]:
# 토크나이저 실행 결과
encoded = tokenizer_specific("This is Test for aiffel")
print(f'specified encoded: {encoded}')

encoded_2 = tokenizer_ID("This is Test for aiffel_2")
print(f'ID-based encoded: {encoded_2}')

specified encoded: {'input_ids': [101, 1188, 1110, 5960, 1111, 170, 11093, 1883, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}
ID-based encoded: {'input_ids': [101, 1188, 1110, 5960, 1111, 170, 11093, 1883, 168, 123, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [10]:
# 문장 분할 메서드
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
bert_tokens = bert_tokenizer.tokenize("This is Test for aiffel")
print("BertTokenizer로 토큰화된 결과:", bert_tokens)

# AutoTokenizer 사용
auto_tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
auto_tokens = auto_tokenizer.tokenize("This is Test for aiffel")
print("AutoTokenizer로 토큰화된 결과:", auto_tokens)

BertTokenizer로 토큰화된 결과: ['This', 'is', 'Test', 'for', 'a', '##iff', '##el']
AutoTokenizer로 토큰화된 결과: ['This', 'is', 'Test', 'for', 'a', '##iff', '##el']


In [11]:
# batch 단위 토크나이징도 가능

batch_sentences = ["Hello I'm a single sentence",
                    "And another sentence",
                    "And the very very last one"]

encoded_batch = tokenizer_ID(batch_sentences)
print(encoded_batch)

{'input_ids': [[101, 8667, 146, 112, 182, 170, 1423, 5650, 102], [101, 1262, 1330, 5650, 102], [101, 1262, 1103, 1304, 1304, 1314, 1141, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}


In [12]:
# 다양한 옵션 설정

batch = tokenizer_ID(batch_sentences, padding=True, truncation=True, return_tensors="tf")
print(batch)

{'input_ids': <tf.Tensor: shape=(3, 9), dtype=int32, numpy=
array([[ 101, 8667,  146,  112,  182,  170, 1423, 5650,  102],
       [ 101, 1262, 1330, 5650,  102,    0,    0,    0,    0],
       [ 101, 1262, 1103, 1304, 1304, 1314, 1141,  102,    0]],
      dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(3, 9), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(3, 9), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 0]], dtype=int32)>}


## Config: 하이퍼 파라미터를 포함한 전반적인 설정을 처리하는 설정 클래스

In [13]:
# 모델 지정 기반의 설정 불러오기
from transformers import BertConfig

config = BertConfig.from_pretrained("bert-base-cased")
print(config.__class__)
print(config)

<class 'transformers.models.bert.configuration_bert.BertConfig'>
BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.35.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}



In [14]:
# AutoConfig
from transformers import AutoConfig

config = AutoConfig.from_pretrained("bert-base-cased")
print(config.__class__)
print(config)

<class 'transformers.models.bert.configuration_bert.BertConfig'>
BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.35.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}



In [15]:
# 생성된 모델의 설정 불러오기
print(f'사전훈련 모델에서 불러온 경우\n{model_fpmethod.config}')
print(f'오토모델\n{model_am.config}')

사전훈련 모델에서 불러온 경우
BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.35.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

오토모델
BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3

## Trainer: 학습을 진행하는 클래스

In [16]:
# 자연어처리 연구 및 애플리케이션 개발용 허깅페이스 라이브러리 설치
!pip install datasets



In [17]:
from datasets import load_dataset

# 데이터셋 불러오기: 일반 언어이해 평가(GLUE) 벤치마크 중 하나인 CoLA 데이터셋
raw_datasets = load_dataset("glue", "cola")
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

In [18]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# 모델 로드
model_name_or_path = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, num_labels=2)    # COLA dataset의 라벨은 0(unacceptable)과 1(accpetable) 두 가지로 구분됨

# 토크나이저 정의
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

def tokenize_function(example):
    return tokenizer(example["sentence"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1043 [00:00<?, ? examples/s]

In [19]:
# 라이브러리 업데이트(세션 재시작 필요)
!pip install accelerate -U



In [20]:
from transformers import TrainingArguments
# 훈련 인자(Arguments)
training_args = TrainingArguments(
    output_dir='./results',              # output이 저장될 경로
    num_train_epochs=1,              # train 시킬 총 epochs
    per_device_train_batch_size=16,  # 각 device 당 batch size
    per_device_eval_batch_size=64,   # evaluation 시에 batch size
    warmup_steps=500,                # learning rate scheduler에 따른 warmup_step 설정
    weight_decay=0.01,                 # weight decay
    logging_dir='./logs',                 # log가 저장될 경로
    do_train=True,                        # train 수행여부
    do_eval=True,                        # eval 수행여부
    eval_steps=1000,
    group_by_length=False,
)

In [21]:
from transformers import Trainer
trainer = Trainer(
    model,                                                                    # 학습시킬 model
    args=training_args,                                                # TrainingArguments을 통해 설정한 arguments
    train_dataset=tokenized_datasets["train"],         # training dataset
    eval_dataset=tokenized_datasets["validation"], # validation dataset
    tokenizer=tokenizer,
)

# 모델 학습
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.5467


TrainOutput(global_step=535, training_loss=0.5442507734922605, metrics={'train_runtime': 68.6234, 'train_samples_per_second': 124.608, 'train_steps_per_second': 7.796, 'total_flos': 91092439031580.0, 'train_loss': 0.5442507734922605, 'epoch': 1.0})