# HuggingFace 커스텀 프로젝트
- model(klue/bert-base)를 활용하여 NSMC(Naver Sentiment Movie Corpus) task 수행
- 데이터: https://github.com/e9t/nsmc
- model: https://huggingface.co/klue/bert-base

In [1]:
import tensorflow
import numpy as np
import transformers
import datasets

# Print the installed version of each core library, in import order.
for library in (tensorflow, np, transformers, datasets):
    print(library.__version__)

2.6.0
1.21.4
4.11.3
1.14.0


## Dataset

### Huggingface dataset에서 불러오기
NSMC 데이터셋 https://huggingface.co/datasets/Blpeng/nsmc

In [2]:
from datasets import load_dataset

# Load the NSMC dataset published on the Hugging Face Hub as "Blpeng/nsmc".
ds = load_dataset("Blpeng/nsmc")
# Display the loaded object to inspect its splits and columns.
ds

Using custom data configuration Blpeng___nsmc-55757a98c8abea78
Reusing dataset csv (/aiffel/.cache/huggingface/datasets/csv/Blpeng___nsmc-55757a98c8abea78/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'id', 'document', 'label'],
        num_rows: 400000
    })
})

train 데이터만 존재 -> 데이터 분할 필요

In [3]:
# Preview the first five rows of the train split, one field per line.
train = ds['train']
cols = train.column_names
for i in range(5):
    # Fetch the row once. Indexing `train[col][i]` would materialize the
    # entire column for every single field that is printed.
    row = train[i]
    for col in cols:
        print(col, ":", row[col])
    print('\n')

Unnamed: 0 : 0
id : 8112052
document : 어릴때보고 지금다시봐도 재밌어요ㅋㅋ
label : 1


Unnamed: 0 : 1
id : 8132799
document : 디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산업이 부러웠는데. 사실 우리나라에서도 그 어려운시절에 끝까지 열정을 지킨 노라노 같은 전통이있어 저와 같은 사람들이 꿈을 꾸고 이뤄나갈 수 있다는 것에 감사합니다.
label : 1


Unnamed: 0 : 2
id : 4655635
document : 폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.
label : 1


Unnamed: 0 : 3
id : 9251303
document : 와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런게 진짜 영화지
label : 1


Unnamed: 0 : 4
id : 10067386
document : 안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.
label : 1




### 불필요한 열 제거

In [4]:
# Keep only the train split and drop the columns this notebook treats as
# unnecessary, leaving 'document' and 'label'.
columns_to_drop = ['Unnamed: 0', 'id']
ds = ds['train'].remove_columns(columns_to_drop)
ds

Dataset({
    features: ['document', 'label'],
    num_rows: 400000
})

### 데이터셋 줄이기

In [5]:
# Shuffle with a fixed seed, then keep the first 10% of the rows as a
# smaller working set.
subset_size = int(0.1 * len(ds))
small_ds = ds.shuffle(seed=42).select(range(subset_size))

Loading cached shuffled indices for dataset at /aiffel/.cache/huggingface/datasets/csv/Blpeng___nsmc-55757a98c8abea78/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a/cache-4b7488257526ee44.arrow


## 전처리

### 결측치 제거

In [23]:
# Predicate used to filter out rows that contain missing values
def remove_missing_values(data):
    """Return True when no field of the row ``data`` is None.

    Args:
        data: Mapping of column name to value for a single row.

    Returns:
        False as soon as any value is None, otherwise True (an empty
        mapping therefore yields True).
    """
    for field_value in data.values():
        if field_value is None:
            return False
    return True

# Drop every row of the down-sampled dataset (small_ds) that contains a None value
dataset = small_ds.filter(remove_missing_values)
dataset

Loading cached processed dataset at /aiffel/.cache/huggingface/datasets/csv/Blpeng___nsmc-55757a98c8abea78/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a/cache-c3384dd26d1364fe.arrow


Dataset({
    features: ['document', 'label'],
    num_rows: 39996
})

4개의 데이터 제거됨 (40,000개 → 39,996개)

> Q. tokenizer에 전처리 과정도 포함되어 있는 거 아닌가? 왜 결측치가 있는 데이터는 못 받지?  
A. 텍스트 데이터를 모델이 이해할 수 있는 형식으로 변환하는 작업은 가능하지만, 결측치 제거와 같은 데이터 전처리 과정은 포함되어 있지 않음

## Tokenizer & Model
klue/bert-base 모델 https://huggingface.co/klue/bert-base  
Auto Classes 활용

In [24]:
# Load the tokenizer and the sequence-classification model via the Auto classes
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_NAME = "klue/bert-base"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

loading configuration file https://huggingface.co/klue/bert-base/resolve/main/config.json from cache at /aiffel/.cache/huggingface/transformers/fbd0b2ef898c4653902683fea8cc0dd99bf43f0e082645b913cda3b92429d1bb.99b3298ed554f2ad731c27cdb11a6215f39b90bc845ff5ce709bb4e74ba45621
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}

loading file https://huggingface.co/klue/bert-base/resolve/main/vocab.txt from cache at /aiffel/.cache/huggingface/transformers/1a36e69d48a0

### 토큰화

In [25]:
def transform(data):
    """Tokenize the ``document`` field of ``data`` with the module-level tokenizer.

    Inputs are truncated and padded with the ``'max_length'`` strategy.

    Args:
        data: Row or batch of rows exposing a ``'document'`` entry.

    Returns:
        Whatever the tokenizer call returns for those documents.
    """
    tokenizer_options = {
        'truncation': True,
        'padding': 'max_length',
        # token_type_ids are skipped: each example passes a single text
        # field, not a pair of sentences.
        'return_token_type_ids': False,
    }
    return tokenizer(data['document'], **tokenizer_options)

# Apply transform to the whole dataset; batched=True hands it batches of rows per call
tokenized_dataset = dataset.map(transform, batched=True)
tokenized_dataset

Loading cached processed dataset at /aiffel/.cache/huggingface/datasets/csv/Blpeng___nsmc-55757a98c8abea78/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a/cache-853cf1e1ad9f6aa7.arrow


Dataset({
    features: ['attention_mask', 'document', 'input_ids', 'label'],
    num_rows: 39996
})

In [26]:
# Drop the raw text column, which is no longer needed once the inputs are tokenized
tokenized_dataset = tokenized_dataset.remove_columns(['document'])
tokenized_dataset

Dataset({
    features: ['attention_mask', 'input_ids', 'label'],
    num_rows: 39996
})

### 데이터 분할

In [27]:
# Split into train / validation / test at 80% / 10% / 10%.
# A fixed seed (the same value used when shuffling the dataset earlier)
# makes the split reproducible, so re-running the notebook does not move
# rows between the train and held-out sets.
SPLIT_SEED = 42

# First cut: 80% train, 20% held out.
train_validtest = tokenized_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
# Second cut: halve the held-out part into validation and test.
valid_test = train_validtest['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)

train_data = train_validtest["train"]
valid_data = valid_test["train"]
test_data = valid_test["test"]

In [28]:
# Display the training split to check its columns and row count
train_data

Dataset({
    features: ['attention_mask', 'input_ids', 'label'],
    num_rows: 31996
})

## Trainer

### 학습 준비

In [29]:
from datasets import load_metric
# Accuracy metric object; compute_metrics calls metric.compute on it
metric = load_metric("accuracy")

# Metric callback for the Trainer (binary classification)
def compute_metrics(eval_pred):
    """Compute the module-level metric from model outputs and labels.

    Args:
        eval_pred: Pair that unpacks into (model outputs, reference labels).

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row.
    """
    scores, references = eval_pred
    # The predicted class is the index of the highest score along axis 1.
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

### 학습

#### wandb와 함께 사용
- https://docs.wandb.ai/guides/integrations/huggingface
- https://wandb.ai/matt24/vit-snacks-sweeps/reports/Hyperparameter-Search-for-HuggingFace-Transformer-Models--VmlldzoyMTUxNTg0

In [13]:
# Install the wandb package into the notebook environment
!pip install wandb

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting wandb
  Downloading wandb-0.17.4-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.9 MB)
     |████████████████████████████████| 6.9 MB 4.7 MB/s            
Collecting gitpython!=3.1.29,>=1.0.0
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
     |████████████████████████████████| 207 kB 71.6 MB/s            
[?25hCollecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-2.10.0-py2.py3-none-any.whl (302 kB)
     |████████████████████████████████| 302 kB 68.3 MB/s            
Collecting setproctitle
  Downloading setproctitle-1.3.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting 

In [14]:
import wandb

# Authenticate with Weights & Biases; prompts for an API key when none is stored
wandb.login()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /aiffel/.netrc


True

In [30]:
# Hyperparameter search space for the sweep
parameters_dict = {
    'epochs': {'values': [1, 2]},
    'learning_rate': {
        'distribution': 'log_uniform_values',
        'min': 1e-5,
        'max': 7e-5,
    },
    'weight_decay': {'values': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]},
}

# Sweep definition: search method plus the search space above
sweep_config = {
    'method': 'random',
    'parameters': parameters_dict,
}


In [31]:
from transformers import Trainer, TrainingArguments

def train(config=None):
    """Run one sweep trial: fine-tune a fresh klue/bert-base classifier.

    Intended to be passed to ``wandb.agent`` as the trial function. The
    trial's hyperparameters (``learning_rate``, ``weight_decay`` and
    ``epochs``) are read from ``wandb.config`` once the run is initialised.

    Args:
        config: Optional configuration forwarded to ``wandb.init``.
    """
    with wandb.init(config=config):
        # From here on, read the hyperparameters from the active run's config.
        config = wandb.config

        wandb_training_arguments = TrainingArguments(
            fp16=True,
            output_dir='./sweeps',                   # directory where outputs are saved
            evaluation_strategy="steps",             # evaluate on a step schedule
            eval_steps=1000,                         # run evaluation every 1000 steps
            learning_rate=config.learning_rate,      # taken from the run config
            per_device_train_batch_size=8,           # training batch size per device
            per_device_eval_batch_size=8,            # evaluation batch size per device
            # Use the 'epochs' value from the sweep configuration; it was
            # previously hard-coded to 1, so sweeping over it had no effect.
            num_train_epochs=config.epochs,
            weight_decay=config.weight_decay,        # taken from the run config
            report_to="wandb",                       # enable logging to W&B
            logging_steps=1,                         # how often to log to W&B
        )

        # Load the pretrained weights anew for every trial so that trials do
        # not share a model object.
        model = AutoModelForSequenceClassification.from_pretrained("klue/bert-base")

        trainer = Trainer(
            model=model,                             # model to train
            args=wandb_training_arguments,           # arguments defined above
            train_dataset=train_data,                # training dataset
            eval_dataset=valid_data,                 # evaluation dataset
            compute_metrics=compute_metrics,
        )

        trainer.train()

In [32]:
# Clear caches
import gc
import torch

gc.collect()              # run Python garbage collection first
torch.cuda.empty_cache()  # then empty PyTorch's CUDA cache

In [33]:
# Create the sweep described by sweep_config in the 'nsmc_klue' project
sweep_id = wandb.sweep(sweep_config, project='nsmc_klue')

# Launch an agent that calls train for this sweep, with count=3
wandb.agent(sweep_id, train, count=3)

Create sweep with ID: 8cvin4xn
Sweep URL: https://wandb.ai/4-rldur0/nsmc_klue/sweeps/8cvin4xn


[34m[1mwandb[0m: Agent Starting Run: avx5fdkm with config:
[34m[1mwandb[0m: 	epochs: 2
[34m[1mwandb[0m: 	learning_rate: 2.0514280895740115e-06
[34m[1mwandb[0m: 	weight_decay: 0.4


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


PyTorch: setting up devices
loading configuration file https://huggingface.co/klue/bert-base/resolve/main/config.json from cache at /aiffel/.cache/huggingface/transformers/fbd0b2ef898c4653902683fea8cc0dd99bf43f0e082645b913cda3b92429d1bb.99b3298ed554f2ad731c27cdb11a6215f39b90bc845ff5ce709bb4e74ba45621
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}

loading weights file https://huggingface.co/klue/bert-base/resolve/main/pytorch_model.bin from cache at /aiffel/

Step,Training Loss,Validation Loss,Accuracy
1000,0.6124,0.371049,0.83975
2000,0.7037,0.359285,0.85225
3000,0.1954,0.34999,0.855
4000,0.2793,0.337643,0.86225
5000,0.2667,0.342916,0.8625
6000,0.278,0.35158,0.8625
7000,0.6938,0.34479,0.865
8000,0.1482,0.342576,0.8635


  nn.utils.clip_grad_norm_(
Saving model checkpoint to ./sweeps/checkpoint-500
Configuration saved in ./sweeps/checkpoint-500/config.json
Model weights saved in ./sweeps/checkpoint-500/pytorch_model.bin
  nn.utils.clip_grad_norm_(
***** Running Evaluation *****
  Num examples = 4000
  Batch size = 8
Saving model checkpoint to ./sweeps/checkpoint-1000
Configuration saved in ./sweeps/checkpoint-1000/config.json
Model weights saved in ./sweeps/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./sweeps/checkpoint-1500
Configuration saved in ./sweeps/checkpoint-1500/config.json
Model weights saved in ./sweeps/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4000
  Batch size = 8
Saving model checkpoint to ./sweeps/checkpoint-2000
Configuration saved in ./sweeps/checkpoint-2000/config.json
Model weights saved in ./sweeps/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./sweeps/checkpoint-2500
Configuration saved in ./sweeps/checkpoint-

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▄▅▇▇▇██
eval/loss,█▆▄▁▂▄▂▂
eval/runtime,█▁▂▁▁▄▅█
eval/samples_per_second,▁█▇██▅▄▁
eval/steps_per_second,▁█▆██▆▄▁
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,███▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,▆▅▅▃▃▂▅▆▁▃▃▄▆▃▄▅▁▇▆▂▂▂▂▂▆▄▆▂▃▃▁▁▄▃▄▃▃█▁▆
train/total_flos,▁

0,1
eval/accuracy,0.8635
eval/loss,0.34258
eval/runtime,138.9414
eval/samples_per_second,28.789
eval/steps_per_second,3.599
train/epoch,2.0
train/global_step,8000.0
train/learning_rate,0.0
train/loss,0.1482
train/total_flos,1.683700265459712e+16


[34m[1mwandb[0m: Agent Starting Run: bncbljgi with config:
[34m[1mwandb[0m: 	epochs: 2
[34m[1mwandb[0m: 	learning_rate: 8.617277189439932e-05
[34m[1mwandb[0m: 	weight_decay: 0.5


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


PyTorch: setting up devices
loading configuration file https://huggingface.co/klue/bert-base/resolve/main/config.json from cache at /aiffel/.cache/huggingface/transformers/fbd0b2ef898c4653902683fea8cc0dd99bf43f0e082645b913cda3b92429d1bb.99b3298ed554f2ad731c27cdb11a6215f39b90bc845ff5ce709bb4e74ba45621
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}

loading weights file https://huggingface.co/klue/bert-base/resolve/main/pytorch_model.bin from cache at /aiffel/

Step,Training Loss,Validation Loss,Accuracy
1000,0.5777,0.501703,0.78175


  nn.utils.clip_grad_norm_(
Saving model checkpoint to ./sweeps/checkpoint-500
Configuration saved in ./sweeps/checkpoint-500/config.json
Model weights saved in ./sweeps/checkpoint-500/pytorch_model.bin
  nn.utils.clip_grad_norm_(
***** Running Evaluation *****
  Num examples = 4000
  Batch size = 8
Saving model checkpoint to ./sweeps/checkpoint-1000
Configuration saved in ./sweeps/checkpoint-1000/config.json
Model weights saved in ./sweeps/checkpoint-1000/pytorch_model.bin
[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


In [36]:
# Evaluation
with wandb.init(project="nsmc_klue") as run:
    # Pass the name and version of Artifact
    # NOTE(review): "run-<id>-history" looks like the run-history artifact
    # rather than a saved model. The folder downloaded from it has no
    # config.json, so from_pretrained below fails with OSError. Replace this
    # with an artifact that actually contains the model files — TODO confirm
    # the correct artifact name.
    my_model_name = "run-c8ifvt62-history:latest"
    my_model_artifact = run.use_artifact(my_model_name)

    # Download model weights to a folder and return the path
    model_dir = my_model_artifact.download()

    # Load your Hugging Face model from that folder
    #  using the same model class
    num_labels = 2
    model = AutoModelForSequenceClassification.from_pretrained(
        model_dir, num_labels=num_labels
    )
    # Define evaluation arguments
    training_args = TrainingArguments(
        output_dir="./results",
        per_device_eval_batch_size=16,
        do_train=False,
        do_eval=True,
    )

    # Initialize the Trainer.
    # Evaluate on the held-out test split only: tokenized_dataset is the
    # full dataset the training split was drawn from, so scoring on it would
    # include rows the model was trained on.
    trainer = Trainer(
        model=model,
        args=training_args,
        eval_dataset=test_data,
        compute_metrics=compute_metrics,
    )

    # Evaluate the model
    results = trainer.evaluate()

    print(results)



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m:   1 of 1 files downloaded.  
file /aiffel/aiffel/AIFFEL-Quest/huggingface/artifacts/run-c8ifvt62-history:v0/config.json not found
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/transformers/configuration_utils.py", line 546, in get_config_dict
    resolved_config_file = cached_path(
  File "/opt/conda/lib/python3.9/site-packages/transformers/file_utils.py", line 1417, in cached_path
    raise EnvironmentError(f"file {url_or_filename} not found")
OSError: file /aiffel/aiffel/AIFFEL-Quest/huggingface/artifacts/run-c8ifvt62-history:v0/config.json not found

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/tmp/ipykernel_671/250046739.py", line 12, in <module>
    model = AutoModelForSequenceClassification.from_pretrained(
  File "/opt/conda/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 396, in from_pretrained
    config, kwargs = AutoConfig.fr

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

OSError: Can't load config for '/aiffel/aiffel/AIFFEL-Quest/huggingface/artifacts/run-c8ifvt62-history:v0'. Make sure that:

- '/aiffel/aiffel/AIFFEL-Quest/huggingface/artifacts/run-c8ifvt62-history:v0' is a correct model identifier listed on 'https://huggingface.co/models'

- or '/aiffel/aiffel/AIFFEL-Quest/huggingface/artifacts/run-c8ifvt62-history:v0' is the correct path to a directory containing a config.json file

