In [1]:
!pip install datasets
!pip install evaluate



In [2]:
import torch
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import pickle
from datasets import Dataset, DatasetDict
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_path = '/content/drive/MyDrive/klue'

model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=7).to(device)

tokenizer = AutoTokenizer.from_pretrained(model_path)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/klue and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
data_path = '/content/drive/MyDrive/pkl/senti.pkl'

with open(data_path, 'rb') as file:
    data = pickle.load(file)

# 데이터 형식을 맞춤
formatted_data = {'text': [item[0] for item in data], 'label': [int(item[1]) for item in data]}

# Dataset 객체로 변환
dataset = Dataset.from_dict(formatted_data)

# 데이터셋을 train, test로 분할
train_test_split = dataset.train_test_split(test_size=0.2)  # 20%를 테스트 데이터로 사용

# DatasetDict 객체로 변환
dataset_dict = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

Map:   0%|          | 0/15499 [00:00<?, ? examples/s]

Map:   0%|          | 0/3875 [00:00<?, ? examples/s]

In [5]:
training_args = TrainingArguments(
    output_dir='./results',                      # 출력 디렉토리
    evaluation_strategy="epoch",                 # 매 에폭마다 평가
    learning_rate=1e-5,                          # 학습률
    per_device_train_batch_size=16,              # 학습 배치 크기
    per_device_eval_batch_size=16,               # 평가 배치 크기
    num_train_epochs=10,                         # 학습 에폭 수
    weight_decay=0.01,                           # 가중치 감쇠
    seed=42,                                     # 시드 값
    lr_scheduler_type="linear",                  # 학습률 스케줄러 타입
    warmup_ratio=0.1,                            # 워밍업 비율
    optim="adamw_torch",                         # 옵티마이저 (기본값은 AdamW)
)



In [6]:
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Calculate accuracy
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)

    # Calculate precision, recall, f1-score for each class
    precision = precision_metric.compute(predictions=predictions, references=labels, average=None)["precision"]
    recall = recall_metric.compute(predictions=predictions, references=labels, average=None)["recall"]
    f1 = f1_metric.compute(predictions=predictions, references=labels, average=None)["f1"]

    # Convert precision, recall, f1-score to lists
    return {
        "accuracy": accuracy["accuracy"],
        "precision": precision.tolist(),  # Convert NumPy array to list
        "recall": recall.tolist(),        # Convert NumPy array to list
        "f1": f1.tolist()                 # Convert NumPy array to list
    }

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)


trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5596,0.332534,0.889548,"[0.97265625, 0.9040697674418605, 0.890728476821192, 0.8242811501597445, 0.8611111111111112, 0.8887722980062959, 0.9730392156862745]","[0.8736842105263158, 0.9367469879518072, 0.8163884673748103, 0.8973913043478261, 0.8830584707646177, 0.9559819413092551, 0.8428874734607219]","[0.9205175600739373, 0.9201183431952663, 0.8519398258115598, 0.8592839300582847, 0.8719467061435974, 0.921152800435019, 0.9032992036405004]"
2,0.2802,0.238902,0.923097,"[0.9342560553633218, 0.9692307692307692, 0.8846153846153846, 0.9087779690189329, 0.9114114114114115, 0.94874715261959, 0.9260869565217391]","[0.9473684210526315, 0.9487951807228916, 0.9074355083459787, 0.9182608695652174, 0.9100449775112444, 0.9401805869074492, 0.9044585987261147]","[0.9407665505226481, 0.9589041095890412, 0.8958801498127341, 0.9134948096885813, 0.9107276819204801, 0.9444444444444444, 0.9151450053705692]"
3,0.1712,0.26715,0.926968,"[0.9640287769784173, 0.9668674698795181, 0.8849557522123894, 0.9402173913043478, 0.9410852713178295, 0.9217391304347826, 0.9127659574468086]","[0.9403508771929825, 0.9668674698795181, 0.9104704097116844, 0.9026086956521739, 0.9100449775112444, 0.9571106094808126, 0.910828025477707]","[0.9520426287744227, 0.9668674698795181, 0.8975317875841436, 0.9210292812777285, 0.9253048780487806, 0.9390919158361019, 0.9117959617428268]"
4,0.114,0.318735,0.923097,"[0.9075907590759076, 0.9690402476780186, 0.9035222052067381, 0.9142367066895368, 0.95, 0.9355555555555556, 0.8823529411764706]","[0.9649122807017544, 0.9427710843373494, 0.8952959028831563, 0.9269565217391305, 0.8830584707646177, 0.9503386004514672, 0.9235668789808917]","[0.9353741496598639, 0.9557251908396946, 0.899390243902439, 0.920552677029361, 0.9153069153069152, 0.942889137737962, 0.9024896265560166]"
5,0.0898,0.332362,0.928516,"[0.967391304347826, 0.9697885196374623, 0.9019607843137255, 0.9155172413793103, 0.9492063492063492, 0.9390243902439024, 0.8843813387423936]","[0.9368421052631579, 0.9668674698795181, 0.9074355083459787, 0.9234782608695652, 0.896551724137931, 0.9559819413092551, 0.9256900212314225]","[0.9518716577540107, 0.9683257918552036, 0.9046898638426627, 0.9194805194805195, 0.9221279876638396, 0.9474272930648769, 0.9045643153526971]"
6,0.0673,0.367516,0.928,"[0.9712230215827338, 0.9670658682634731, 0.8628005657708628, 0.955719557195572, 0.9433070866141732, 0.9298245614035088, 0.9164882226980728]","[0.9473684210526315, 0.9728915662650602, 0.9256449165402124, 0.9008695652173913, 0.8980509745127436, 0.9571106094808126, 0.9087048832271762]","[0.9591474245115452, 0.96996996996997, 0.8931185944363104, 0.9274843330349148, 0.9201228878648232, 0.9432703003337041, 0.9125799573560769]"
7,0.0527,0.394032,0.926452,"[0.9475524475524476, 0.9666666666666667, 0.8690647482014389, 0.9221453287197232, 0.9389671361502347, 0.9640371229698376, 0.8907216494845361]","[0.9508771929824561, 0.9608433734939759, 0.9165402124430956, 0.9269565217391305, 0.8995502248875562, 0.9379232505643341, 0.9171974522292994]","[0.9492119089316987, 0.9637462235649547, 0.8921713441654358, 0.9245446660884649, 0.9188361408882082, 0.9508009153318078, 0.9037656903765691]"
8,0.0368,0.41499,0.929548,"[0.9645390070921985, 0.95, 0.882525697503671, 0.9391771019677997, 0.940625, 0.9495515695067265, 0.8981288981288982]","[0.9543859649122807, 0.9728915662650602, 0.9119878603945372, 0.9130434782608695, 0.9025487256371814, 0.9559819413092551, 0.9171974522292994]","[0.9594356261022927, 0.9613095238095238, 0.8970149253731343, 0.9259259259259259, 0.9211935730680948, 0.952755905511811, 0.907563025210084]"
9,0.0286,0.422649,0.928774,"[0.9611307420494699, 0.9554896142433235, 0.8950524737631185, 0.9232111692844677, 0.9392523364485982, 0.9451287793952967, 0.9]","[0.9543859649122807, 0.9698795180722891, 0.9059180576631259, 0.92, 0.904047976011994, 0.9525959367945824, 0.9171974522292994]","[0.9577464788732394, 0.9626307922272048, 0.9004524886877829, 0.921602787456446, 0.9213139801375096, 0.9488476672287802, 0.9085173501577287]"
10,0.0289,0.431525,0.928258,"[0.951048951048951, 0.9611940298507463, 0.882179675994109, 0.9265734265734266, 0.9388714733542319, 0.9493243243243243, 0.9056603773584906]","[0.9543859649122807, 0.9698795180722891, 0.9089529590288316, 0.9217391304347826, 0.8980509745127436, 0.9514672686230248, 0.9171974522292994]","[0.9527145359019263, 0.9655172413793104, 0.8953662182361732, 0.9241499564080208, 0.918007662835249, 0.9503945885005637, 0.9113924050632911]"


Trainer is attempting to log a value of "[0.97265625, 0.9040697674418605, 0.890728476821192, 0.8242811501597445, 0.8611111111111112, 0.8887722980062959, 0.9730392156862745]" of type <class 'list'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.8736842105263158, 0.9367469879518072, 0.8163884673748103, 0.8973913043478261, 0.8830584707646177, 0.9559819413092551, 0.8428874734607219]" of type <class 'list'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.9205175600739373, 0.9201183431952663, 0.8519398258115598, 0.8592839300582847, 0.8719467061435974, 0.921152800435019, 0.9032992036405004]" of type <class 'list'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


TrainOutput(global_step=9690, training_loss=0.19104553956857528, metrics={'train_runtime': 3440.1383, 'train_samples_per_second': 45.053, 'train_steps_per_second': 2.817, 'total_flos': 4.0781413187328e+16, 'train_loss': 0.19104553956857528, 'epoch': 10.0})

In [8]:
model.save_pretrained('./klue_senti')
tokenizer.save_pretrained('./klue_senti')

('./klue_senti/tokenizer_config.json',
 './klue_senti/special_tokens_map.json',
 './klue_senti/vocab.txt',
 './klue_senti/added_tokens.json',
 './klue_senti/tokenizer.json')

In [9]:
import os
import shutil

source_path      = '/content/klue_senti'
destination_path = '/content/drive/MyDrive/klue_senti'

if not os.path.exists(destination_path):
    os.makedirs(destination_path)

# source_path 경로에 있는 모든 파일과 폴더를 이동
for filename in os.listdir(source_path):
    file_path = os.path.join(source_path, filename)
    if os.path.isfile(file_path) or os.path.isdir(file_path):
        shutil.move(file_path, destination_path)

In [13]:
tokenized_datasets["train"]

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 15499
})