# HuggingFace Task 커스텀 프로젝트
- model(klue/ber-base)를 활용하여 NSMC(Naver Sentiment Movie Corpus) task 수행
- 데이터: https://github.com/e9t/nsmc
- model: https://huggingface.co/klue/bert-base

In [1]:
import tensorflow
import numpy as np
import transformers
import datasets

print(tensorflow.__version__)
print(np.__version__)
print(transformers.__version__)
print(datasets.__version__)

2.6.0
1.21.4
4.11.3
1.14.0


## Dataset

### Huggingface dataset에서 불러오기
NSMC 데이터셋 https://huggingface.co/datasets/Blpeng/nsmc

In [2]:
from datasets import load_dataset

ds = load_dataset("Blpeng/nsmc")

Using custom data configuration Blpeng___nsmc-55757a98c8abea78
Reusing dataset csv (/aiffel/.cache/huggingface/datasets/csv/Blpeng___nsmc-55757a98c8abea78/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)


  0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
ds

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'id', 'document', 'label'],
        num_rows: 400000
    })
})

train 데이터만 존재 -> 데이터 분할 필요

In [4]:
train = ds['train']
cols = train.column_names
for i in range(5):
    for col in cols:
        print(col, ":", train[col][i])
    print('\n')

Unnamed: 0 : 0
id : 8112052
document : 어릴때보고 지금다시봐도 재밌어요ㅋㅋ
label : 1


Unnamed: 0 : 1
id : 8132799
document : 디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산업이 부러웠는데. 사실 우리나라에서도 그 어려운시절에 끝까지 열정을 지킨 노라노 같은 전통이있어 저와 같은 사람들이 꿈을 꾸고 이뤄나갈 수 있다는 것에 감사합니다.
label : 1


Unnamed: 0 : 2
id : 4655635
document : 폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.
label : 1


Unnamed: 0 : 3
id : 9251303
document : 와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런게 진짜 영화지
label : 1


Unnamed: 0 : 4
id : 10067386
document : 안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.
label : 1




### 결측치 제거

In [5]:
# 결측치를 제거하는 함수 정의
def remove_missing_values(data):
    # 모든 값이 존재하는지 확인
    return all(value is not None for value in data.values())

# train, test 데이터셋에서 결측치 제거
dataset = train.filter(remove_missing_values)

Loading cached processed dataset at /aiffel/.cache/huggingface/datasets/csv/Blpeng___nsmc-55757a98c8abea78/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a/cache-7ae5fa8d6b787a51.arrow


In [6]:
dataset

Dataset({
    features: ['Unnamed: 0', 'id', 'document', 'label'],
    num_rows: 399984
})

약 16개의 데이터 제거됨

> Q. tokeinzer에 전처리 과정도 포함되어 있는 거 아닌가? 왜 결측치가 있는 데이터는 못 받지?

## Tokenizer & Model
klue/ber-base 모델 https://huggingface.co/klue/bert-base  
Auto Classes 활용

In [7]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")
model = AutoModelForSequenceClassification.from_pretrained("klue/bert-base")

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

### 토큰화

In [8]:
def transform(data):
    return tokenizer(
        data['document'],
        truncation = True,
        padding = 'max_length',
        return_token_type_ids = False,    # mrpc task에는 필요 없음
        )

tokenized_dataset = dataset.map(transform)

Loading cached processed dataset at /aiffel/.cache/huggingface/datasets/csv/Blpeng___nsmc-55757a98c8abea78/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a/cache-5593fbd717ec625c.arrow


In [9]:
tokenized_dataset

Dataset({
    features: ['Unnamed: 0', 'id', 'document', 'label', 'input_ids', 'attention_mask'],
    num_rows: 399984
})

### 데이터 분할

In [10]:
train_validtest = tokenized_dataset.train_test_split(test_size=0.2)
valid_test = train_validtest['test'].train_test_split(test_size=0.5)

train_data = train_validtest["train"]
valid_data = valid_test["train"]
test_data = valid_test["test"]

Loading cached split indices for dataset at /aiffel/.cache/huggingface/datasets/csv/Blpeng___nsmc-55757a98c8abea78/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a/cache-eeac95bd82894cee.arrow and /aiffel/.cache/huggingface/datasets/csv/Blpeng___nsmc-55757a98c8abea78/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a/cache-d830928c05c0a664.arrow


In [11]:
train_data

Dataset({
    features: ['Unnamed: 0', 'id', 'document', 'label', 'input_ids', 'attention_mask'],
    num_rows: 319987
})

## Trainer

### 학습 준비

In [12]:
# 학습 관련 설정을 미리 지정
from transformers import Trainer, TrainingArguments

output_dir = './outputs'

training_arguments = TrainingArguments(
    output_dir,                             # output이 저장될 경로
    evaluation_strategy="epoch",           #evaluation하는 빈도
    learning_rate = 2e-5,                         #learning_rate
    per_device_train_batch_size = 16,   # 각 device 당 batch size
    per_device_eval_batch_size = 8,    # evaluation 시에 batch size
    num_train_epochs = 1,                     # train 시킬 총 epochs
    weight_decay = 0.01,                        # weight decay
)

In [13]:
from datasets import load_metric
metric = load_metric("accuracy")

# binary classification
def compute_metrics(eval_pred):    
    predictions,labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references = labels)

### 학습

In [14]:
trainer = Trainer(
    model=model,           # 학습시킬 model
    args=training_arguments,           # TrainingArguments을 통해 설정한 arguments
    train_dataset=train_data,    # training dataset
    eval_dataset=valid_data,       # evaluation dataset
    compute_metrics=compute_metrics,
)
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: Unnamed: 0, id, document.
***** Running training *****
  Num examples = 319987
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 20000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33m4rldur0[0m ([33m4-rldur0[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss


RuntimeError: CUDA out of memory. Tried to allocate 192.00 MiB (GPU 0; 14.76 GiB total capacity; 13.16 GiB already allocated; 169.69 MiB free; 13.33 GiB reserved in total by PyTorch)

In [None]:
# 평가
trainer.evaluate(test_data)

## wandb와 함께 사용
- https://docs.wandb.ai/guides/integrations/huggingface
- https://wandb.ai/matt24/vit-snacks-sweeps/reports/Hyperparameter-Search-for-HuggingFace-Transformer-Models--VmlldzoyMTUxNTg0

In [16]:
#!pip install wandb

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting wandb
  Downloading wandb-0.17.4-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.9 MB)
     |████████████████████████████████| 6.9 MB 5.6 MB/s            
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-2.10.0-py2.py3-none-any.whl (302 kB)
     |████████████████████████████████| 302 kB 78.2 MB/s            
Collecting gitpython!=3.1.29,>=1.0.0
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
     |████████████████████████████████| 207 kB 86.5 MB/s            
[?25hCollecting setproctitle
  Downloading setproctitle-1.3.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting 

In [14]:
import wandb

wandb.login()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[34m[1mwandb[0m: Currently logged in as: [33m4rldur0[0m ([33m4-rldur0[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [20]:
# method
sweep_config = {
    'method': 'random'
}

# hyperparameters
parameters_dict = {
    'epochs': {
        'values': [2, 3, 4, 5]
        },
    'batch_size': {
        'values': [8, 16, 32]
        },
    'learning_rate': {
        'distribution': 'log_uniform_values',
        'min': 1e-5,
        'max': 1e-3
    },
    'weight_decay': {
        'values': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
    },
}

sweep_config['parameters'] = parameters_dict


In [21]:
def train(config=None):
    with wandb.init(config=config):
        config = wandb.config
        
        training_arguments = TrainingArguments(
            output_dir='./sweeps',                             # output이 저장될 경로
            evaluation_strategy="epoch",           #evaluation하는 빈도
            learning_rate = config.learning_rate,                         #learning_rate
            per_device_train_batch_size = config.batch_size,   # 각 device 당 batch size
            per_device_eval_batch_size = 8,    # evaluation 시에 batch size
            num_train_epochs = config.epochs,                     # train 시킬 총 epochs
            weight_decay = config.weight_decay,                        # weight decay
            report_to="wandb",  # enable logging to W&B
            logging_steps=1,  # how often to log to W&B
        )
        
        model = AutoModelForSequenceClassification.from_pretrained("klue/bert-base")

        trainer = Trainer(
            model=model,           # 학습시킬 model
            args=training_arguments,           # TrainingArguments을 통해 설정한 arguments
            train_dataset=train_data,    # training dataset
            eval_dataset=valid_data,       # evaluation dataset
            compute_metrics=compute_metrics,
        )
        
        trainer.train()

In [22]:
sweep_id = wandb.sweep(sweep_config, project='nsmc_klue')

wandb.agent(sweep_id, train, count=1)

Create sweep with ID: 8vl20onk
Sweep URL: https://wandb.ai/4-rldur0/nsmc_klue/sweeps/8vl20onk


[34m[1mwandb[0m: Agent Starting Run: tl2b4z7n with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	epochs: 3
[34m[1mwandb[0m: 	learning_rate: 2.764547851183588e-05
[34m[1mwandb[0m: 	weight_decay: 0.1


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


PyTorch: setting up devices
loading configuration file https://huggingface.co/klue/bert-base/resolve/main/config.json from cache at /aiffel/.cache/huggingface/transformers/fbd0b2ef898c4653902683fea8cc0dd99bf43f0e082645b913cda3b92429d1bb.99b3298ed554f2ad731c27cdb11a6215f39b90bc845ff5ce709bb4e74ba45621
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}

loading weights file https://huggingface.co/klue/bert-base/resolve/main/pytorch_model.bin from cache at /aiffel/

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

Run tl2b4z7n errored:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/wandb/agents/pyagent.py", line 307, in _run_job
    self._function()
  File "/tmp/ipykernel_595/607916351.py", line 27, in train
    trainer.train()
  File "/opt/conda/lib/python3.9/site-packages/transformers/trainer.py", line 1316, in train
    tr_loss_step = self.training_step(model, inputs)
  File "/opt/conda/lib/python3.9/site-packages/transformers/trainer.py", line 1849, in training_step
    loss = self.compute_loss(model, inputs)
  File "/opt/conda/lib/python3.9/site-packages/transformers/trainer.py", line 1881, in compute_loss
    outputs = model(**inputs)
  File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
    return forward_call(*input, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 1529, in forward
    outputs = self.bert(
  File "/opt/conda/lib/python3.9/site-package

In [None]:
with wandb.init(project="nsmc_klue") as run:
    # Pass the name and version of Artifact
    my_model_name = "<run_name>:latest"
    my_model_artifact = run.use_artifact(my_model_name)

    # Download model weights to a folder and return the path
    model_dir = my_model_artifact.download()

    # Load your Hugging Face model from that folder
    #  using the same model class
    model = AutoModelForSequenceClassification.from_pretrained(
        model_dir, num_labels=num_labels
    )