In [None]:
# 자연어처리 연구 및 애플리케이션 개발용 허깅페이스 라이브러리 설치
!pip install datasets



In [None]:
!pip install accelerate -U # accelerate 버전 업데이트 (업데이트 후 세션 재시작)



# 데이터셋 로드

In [None]:
import datasets
from datasets import load_dataset

# Microsoft Research Paraphrase Corpus (MRPC): a paraphrase-detection task,
# loaded here as the 'mrpc' configuration of the 'glue' dataset.
huggingface_mrpc_dataset = load_dataset(path='glue', name='mrpc')
print(huggingface_mrpc_dataset)

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})


In [None]:
# Inspect the first five training examples.
# The split is indexed by row: `split[column][i]` materializes the entire
# column on every access, whereas `split[i]` only reads the requested row.
train_split = huggingface_mrpc_dataset['train']
for i in range(5):
    row = train_split[i]
    for column in train_split.column_names:
        print(column, ":", row[column])
    print('\n')

sentence1 : Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .
sentence2 : Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .
label : 1
idx : 0


sentence1 : Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .
sentence2 : Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .
label : 0
idx : 1


sentence1 : They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .
sentence2 : On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .
label : 1
idx : 2


sentence1 : Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .
sentence2 : Tab shares jumped 20 cents , or 4.6 % , to set a record closing high at A $ 4.57 .
label : 0

In [None]:
# Download the MRPC dataset through TensorFlow Datasets; it is the raw
# material for building a custom Hugging Face dataset further down.

import tensorflow_datasets as tfds
from datasets import Dataset

# with_info=True additionally returns the dataset's metadata object.
tf_dataset, tf_dataset_info = tfds.load(name='glue/mrpc', with_info=True)

In [None]:
# Show the first five TFDS training examples, printing the fields in the
# column order of the Hugging Face training split.
column_names = huggingface_mrpc_dataset['train'].column_names
for example in tf_dataset['train'].take(5):
    for column in column_names:
        print(column, ":", example[column])
    print('\n')

sentence1 : tf.Tensor(b'The identical rovers will act as robotic geologists , searching for evidence of past water .', shape=(), dtype=string)
sentence2 : tf.Tensor(b'The rovers act as robotic geologists , moving on six wheels .', shape=(), dtype=string)
label : tf.Tensor(0, shape=(), dtype=int64)
idx : tf.Tensor(1680, shape=(), dtype=int32)


sentence1 : tf.Tensor(b"Less than 20 percent of Boise 's sales would come from making lumber and paper after the OfficeMax purchase is completed .", shape=(), dtype=string)
sentence2 : tf.Tensor(b"Less than 20 percent of Boise 's sales would come from making lumber and paper after the OfficeMax purchase is complete , assuming those businesses aren 't sold .", shape=(), dtype=string)
label : tf.Tensor(0, shape=(), dtype=int64)
idx : tf.Tensor(1456, shape=(), dtype=int32)


sentence1 : tf.Tensor(b'Spider-Man snatched $ 114.7 million in its debut last year and went on to capture $ 403.7 million .', shape=(), dtype=string)
sentence2 : tf.Tensor(b'Spi

In [None]:
# Convert the TFDS data into the layout used for the Hugging Face dataset:
# one dict per split, mapping each column name to a list of values.

def _tfds_split_to_columns(split_name):
    """Return one TFDS split as a {column name: list of values} dict."""
    frame = tfds.as_dataframe(tf_dataset[split_name], tf_dataset_info)
    return frame.to_dict('list')

# Keep the train / validation / test partition.
train_dataset = _tfds_split_to_columns('train')
val_dataset = _tfds_split_to_columns('validation')
test_dataset = _tfds_split_to_columns('test')

# Wrap each column dict in a Hugging Face Dataset.
tf_train_dataset = Dataset.from_dict(train_dataset)
tf_val_dataset = Dataset.from_dict(val_dataset)
tf_test_dataset = Dataset.from_dict(test_dataset)

# 토크나이저와 모델

In [None]:
# Load the pretrained model and its tokenizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# With the Auto classes it is convenient to pick the model class by task
# (sequence classification here) and the tokenizer by checkpoint name.
checkpoint = 'distilbert-base-uncased'
huggingface_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
huggingface_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# 훈련데이터로부터 샘플을 불러와 토크나이저 적용

# Tokenization function
def transform(data):
    """Tokenize the sentence pair(s) in ``data`` with ``huggingface_tokenizer``.

    ``data['sentence1']`` and ``data['sentence2']`` are passed as the first and
    second sequence. Truncation is enabled, padding uses the 'max_length'
    strategy, and token type ids are not requested. The tokenizer's result is
    returned unchanged.
    """
    tokenizer_options = dict(
        truncation=True,
        padding='max_length',
        return_token_type_ids=False,
    )
    first_sentence = data['sentence1']
    second_sentence = data['sentence2']
    return huggingface_tokenizer(first_sentence, second_sentence, **tokenizer_options)

# Tokenize five samples. The rows are selected BEFORE mapping so the tokenizer
# runs on those five rows only, not on the whole training split.
transformed_samples = huggingface_mrpc_dataset['train'].select(range(5)).map(transform)

# Print the results
for sample in transformed_samples:
    print(sample)


{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0, 'input_ids': [101, 2572, 3217, 5831, 5496, 2010, 2567, 1010, 3183, 2002, 2170, 1000, 1996, 7409, 1000, 1010, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102, 7727, 2000, 2032, 2004, 2069, 1000, 1996, 7409, 1000, 1010, 2572, 3217, 5831, 5496, 2010, 2567, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
# Tokenize every split through map(); batched=True passes batches of examples
# to the transform instead of one example at a time.
hf_dataset = huggingface_mrpc_dataset.map(transform, batched=True)

# Pull out the train / validation / test splits
hf_train_dataset, hf_val_dataset, hf_test_dataset = (
    hf_dataset[split_name] for split_name in ('train', 'validation', 'test')
)

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

In [None]:
# Tokenization function for the TFDS-derived dataset (converts the data type first)
def transform_tf(batch):
    """Decode a batch of byte-string sentence pairs and tokenize them.

    Every item of ``batch['sentence1']`` and ``batch['sentence2']`` is decoded
    as UTF-8, then both lists are passed to ``huggingface_tokenizer`` with
    truncation enabled, 'max_length' padding and no token type ids. The
    tokenizer's result is returned unchanged.
    """
    def _decode_column(name):
        # The sentences are stored as bytes; the tokenizer is given str.
        return [raw.decode('utf-8') for raw in batch[name]]

    first_sentences = _decode_column('sentence1')
    second_sentences = _decode_column('sentence2')
    return huggingface_tokenizer(
        first_sentences,
        second_sentences,
        truncation=True,
        padding='max_length',
        return_token_type_ids=False,
    )

# Apply tokenization and padding to each split (train, validation, test in that order)
tf_train_dataset, tf_val_dataset, tf_test_dataset = (
    split.map(transform_tf, batched=True)
    for split in (tf_train_dataset, tf_val_dataset, tf_test_dataset)
)

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

# 훈련/평가 및 테스트

## 허깅페이스 데이터셋으로 학습 진행

In [None]:
import os
import numpy as np
from transformers import Trainer, TrainingArguments

# Directory for the training outputs.
# A path relative to the working directory replaces the root-level
# '/transformers', which typically cannot be created without root privileges.
output_dir = './transformers'

# Training hyperparameters
# NOTE(review): newer transformers releases appear to rename `evaluation_strategy`
# to `eval_strategy` — confirm against the installed version before changing it.
training_arguments = TrainingArguments(
    output_dir,                          # where the outputs are written
    evaluation_strategy="epoch",         # how often evaluation runs
    learning_rate=2e-5,                  # learning rate
    per_device_train_batch_size=16,      # training batch size per device
    per_device_eval_batch_size=16,       # evaluation batch size per device
    num_train_epochs=3,                  # total number of training epochs
    weight_decay=0.01,                   # weight decay
)

In [None]:
from datasets import load_metric

# Load the 'glue' metric in its 'mrpc' configuration; compute_metrics calls
# metric.compute() on it.
# NOTE(review): the recorded cell output echoes this line as the source of a
# warning. `load_metric` is reportedly deprecated in newer `datasets` releases
# in favour of the separate `evaluate` package — confirm against the installed version.
metric = load_metric('glue', 'mrpc')

# Metric function that accounts for the task's output format
def compute_metrics(eval_pred):
    """Score model outputs with the module-level ``metric``.

    ``eval_pred`` unpacks into ``(predictions, labels)``. The predicted class of
    each row is the arg-max of the predictions along axis 1; those classes and
    the labels are handed to ``metric.compute`` and its result is returned as is.
    """
    scores, gold_labels = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=gold_labels)

  metric = load_metric('glue', 'mrpc')


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

In [None]:
# Fine-tune on the Hugging Face dataset

trainer = Trainer(
    args=training_arguments,            # settings built with TrainingArguments
    model=huggingface_model,            # model to train
    compute_metrics=compute_metrics,    # metric function applied to evaluation results
    train_dataset=hf_train_dataset,     # training split
    eval_dataset=hf_val_dataset,        # evaluation split
)
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.402888,0.821078,0.873484
2,No log,0.388856,0.845588,0.895175
3,0.439900,0.398399,0.835784,0.884682


TrainOutput(global_step=690, training_loss=0.38678739796514094, metrics={'train_runtime': 558.8896, 'train_samples_per_second': 19.689, 'train_steps_per_second': 1.235, 'total_flos': 1457671254810624.0, 'train_loss': 0.38678739796514094, 'epoch': 3.0})

In [None]:
# Evaluate the trained model on the test split
trainer.evaluate(eval_dataset=hf_test_dataset)

{'eval_loss': 0.4222221076488495,
 'eval_accuracy': 0.8260869565217391,
 'eval_f1': 0.8717948717948718,
 'eval_runtime': 31.299,
 'eval_samples_per_second': 55.114,
 'eval_steps_per_second': 3.451,
 'epoch': 3.0}

In [None]:
# Train on the dataset built from TensorFlow Datasets.
# `huggingface_model` was already fine-tuned by `trainer.train()`, so reusing it
# would continue from those weights instead of running an independent
# experiment. A fresh copy of the pretrained checkpoint is loaded instead.
# NOTE(review): the recorded output of this cell is a RuntimeError whose message
# was not preserved; this change is not known to address that error.
huggingface_model_tf = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=2
)

trainer_tf = Trainer(
    model=huggingface_model_tf,        # freshly initialised model to train
    args=training_arguments,           # settings built with TrainingArguments
    train_dataset=tf_train_dataset,    # training split
    eval_dataset=tf_val_dataset,       # evaluation split
    compute_metrics=compute_metrics,
)
trainer_tf.train()

RuntimeError: ignored

In [None]:
import torch

# Report the installed PyTorch version and the CUDA version of that build, one per line
print(torch.__version__, torch.version.cuda, sep='\n')

2.1.0+cu118
11.8


In [None]:
# Diagnostic: report whether PyTorch's CUDA state has been initialized in this process.
print(torch.cuda.is_initialized())

True


In [None]:
# Evaluate on the test split of the TFDS-derived dataset
trainer_tf.evaluate(eval_dataset=tf_test_dataset)

RuntimeError: ignored