In [1]:
from unsloth import FastLanguageModel

# Maximum sequence length passed to the model loader here and to the trainer later on.
max_seq_length = 2048

# Forwarded as-is to from_pretrained; presumably None lets the loader pick the dtype — confirm.
dtype = None

# Forwarded as the loader's load_in_4bit flag.
load_in_4bit = True

# Model setup: load the base checkpoint and its tokenizer from the local ./mbti-model directory.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="./mbti-model",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    cache_dir='/data'
)

Unsloth: Your Flash Attention 2 installation seems to be broken?
A possible explanation is you have a new CUDA version which isn't
yet compatible with FA2? Please file a ticket to Unsloth or FA2.
We shall now use Xformers instead, which does not have any performance hits!
We found this negligible impact by benchmarking on 1x A100.
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2024-09-16 10:52:02.598440: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-16 10:52:04.353720: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce RTX 3060. Max memory: 11.66 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.0+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.24. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [2]:
# Wrap the base model with rank-16 LoRA adapters on the attention and MLP projections.
model = FastLanguageModel.get_peft_model(
    model,
    # Adapter shape and scaling (r may be any positive rank; 8-128 are the suggested values).
    r=16,
    lora_alpha=16,
    lora_dropout=0,   # any value is accepted; 0 is the optimized setting
    bias="none",      # any value is accepted; "none" is the optimized setting
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    # Training-time behaviour.
    use_gradient_checkpointing=True,
    random_state=3407,
    # Optional variants, both left disabled: rank-stabilized LoRA and LoftQ.
    use_rslora=False,
    loftq_config=None,
)

Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [3]:
import pandas as pd
from datasets import Dataset

# Read the sentiment corpus and wrap it as a Hugging Face Dataset.
df = pd.read_csv('./senti.csv', encoding='UTF-8')
dataset = Dataset.from_pandas(df)

# Peek at the first utterance (row-first indexing avoids materialising the whole column).
dataset[0]['text']

'헐! 나 이벤트에 당첨 됐어.'

In [4]:
def formatting_senti_prompts_func(examples):
    """Render a batch of (text, senti) rows as chat-formatted strings.

    Every row becomes a system / user / assistant conversation that is run
    through the chat template of the notebook-level ``tokenizer``, without
    tokenizing and without appending a generation prompt.

    Args:
        examples: Batch mapping with index-aligned 'text' and 'senti' columns.

    Returns:
        dict with a single 'text' key holding one rendered string per row.
    """
    system_prompt = "정확한 챗봇으로서 상대방의 입력에 대해 감정을 맞추자. 모든 대답은 '행복', '분노', '슬픔', '중립', '혐오', '놀람', '공포' 중 하나로 대답해줘."

    rendered = []
    for row, utterance in enumerate(examples['text']):
        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"{utterance}"},
            {"role": "assistant", "content": f"{examples['senti'][row]}"},
        ]
        # Apply the model's default chat template to the conversation.
        rendered.append(
            tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=False)
        )

    return {"text": rendered}

In [5]:
# Render every row through the chat template, then hold out 20% with a fixed seed.
formatted_dataset = dataset.map(formatting_senti_prompts_func, batched=True)
split_dataset = formatted_dataset.train_test_split(test_size=0.2, seed=42)

# Pull the training and test partitions out of the split.
train_dataset, test_dataset = split_dataset['train'], split_dataset['test']

Map:   0%|          | 0/19374 [00:00<?, ? examples/s]

In [6]:
from sklearn.metrics import precision_recall_fscore_support
from collections import defaultdict
from trl import SFTTrainer
from transformers import TrainingArguments
import torch

def compute_metrics(eval_pred):
    """Report precision, recall and F1 per sentiment class plus a weighted overall.

    Args:
        eval_pred: ``(predictions, labels)`` pair. ``predictions`` is either an
            array of scores whose last axis is reduced with argmax, or an array
            of ids already shaped like ``labels``. A tuple of model outputs is
            reduced to its first element.

    Returns:
        dict: one entry per name in ``class_labels`` plus ``'overall'``; each
        entry is a dict with ``'precision'``, ``'recall'`` and ``'f1'`` stored
        as plain Python floats.

    NOTE(review): ids 0-6 are assumed to follow the order of ``class_labels``.
    The trainer in this notebook fine-tunes a causal LM on chat text, so the
    labels it produces are presumably token ids rather than class indices —
    confirm before relying on these numbers.
    """
    import numpy as np  # local import: the notebook's import cells do not bring numpy in

    ignore_index = -100  # label value Hugging Face collators use for positions excluded from the loss
    class_labels = ['행복', '분노', '슬픔', '중립', '혐오', '놀람', '공포']
    class_ids = list(range(len(class_labels)))

    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Scores carry one more axis than the labels; ids that already match the label shape are kept.
    if predictions.ndim > labels.ndim:
        predictions = predictions.argmax(axis=-1)

    # Flatten and drop ignored positions so the metric call always receives 1-D id arrays.
    predictions = predictions.reshape(-1)
    labels = labels.reshape(-1)
    scored = labels != ignore_index
    predictions, labels = predictions[scored], labels[scored]

    # Passing labels= pins the output to one entry per class, in class_labels order,
    # even when a class is absent from this evaluation set.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, labels=class_ids, average=None, zero_division=0
    )
    metrics = {
        name: {
            'precision': float(precision[idx]),
            'recall': float(recall[idx]),
            'f1': float(f1[idx]),
        }
        for idx, name in enumerate(class_labels)
    }

    # Support-weighted average over the same fixed class set.
    weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(
        labels, predictions, labels=class_ids, average='weighted', zero_division=0
    )
    metrics['overall'] = {
        'precision': float(weighted_precision),
        'recall': float(weighted_recall),
        'f1': float(weighted_f1),
    }

    return metrics

In [7]:
# Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()

# Optimisation schedule: 1000 steps at an effective batch of 2 x 4, cosine decay after 100 warm-up steps.
training_args = TrainingArguments(
    output_dir="senti_outputs",
    seed=3407,
    max_steps=1000,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_steps=100,
    optim="adamw_torch",
    weight_decay=0.01,
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=50,
    save_steps=100,
)

# Supervised fine-tuning on the rendered "text" column, one example per sequence (no packing).
# Evaluation is switched off for this run: no eval_dataset and no compute_metrics are passed.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

Map (num_proc=2):   0%|          | 0/15499 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [8]:
# Run fine-tuning with the trainer built in the previous cell; the returned TrainOutput is shown as the cell result.
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 15,499 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 1,000
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
50,2.3728
100,0.5995
150,0.4908
200,0.3732
250,0.3265
300,0.3337
350,0.325
400,0.316
450,0.302
500,0.3015


TrainOutput(global_step=1000, training_loss=0.42951299381256103, metrics={'train_runtime': 3212.4884, 'train_samples_per_second': 2.49, 'train_steps_per_second': 0.311, 'total_flos': 3.674732901315379e+16, 'train_loss': 0.42951299381256103, 'epoch': 0.5161290322580645})

In [11]:
# Export the fine-tuned model and tokenizer into the "mbti-senti_model" directory as GGUF, quantized with q4_k_m.
model.save_pretrained_gguf("mbti-senti_model", tokenizer, quantization_method = "q4_k_m")

make: Entering directory '/PNU/grad/llama.cpp'
I ccache not found. Consider installing it for faster compilation.
I llama.cpp build info: 
I UNAME_S:   Linux
I UNAME_P:   x86_64
I UNAME_M:   x86_64
I CFLAGS:    -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -std=c11   -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -fopenmp -Wdouble-promotion 
I CXXFLAGS:  -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE 
I NVCCFLAG

100%|██████████| 32/32 [00:13<00:00,  2.38it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at mbti-senti_model into bf16 GGUF format.
The output location will be ./mbti-senti_model/unsloth.BF16.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: mbti-senti_model
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00004.safetensors'
INFO:hf-t

In [13]:
import pickle

# Serialize the held-out split to test_set.pkl.
# NOTE(review): no visible cell reads this file back; the later evaluation cells rebuild
# a split from senti.csv instead — confirm which test set is meant to be used.
with open('test_set.pkl', 'wb') as file:
    pickle.dump(test_dataset, file)

: 

In [26]:
# Shell escape: print the current GPU memory and utilisation snapshot.
!nvidia-smi

Mon Sep 16 00:35:39 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.28.03              Driver Version: 560.28.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3060        Off |   00000000:01:00.0 Off |                  N/A |
| 30%   44C    P8              7W /  170W |    8998MiB /  12288MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [18]:
from unsloth import FastLanguageModel

# Reload the exported ./mbti-senti_model directory (model and tokenizer) in 4-bit for the evaluation cells below.
loaded_model, loaded_tokenizer = FastLanguageModel.from_pretrained(
    model_name="./mbti-senti_model",
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
    cache_dir='/data'
)

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce RTX 3060. Max memory: 11.66 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.0+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.24. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [20]:
# Wrap the reloaded model with the same rank-16 LoRA configuration used for training.
# NOTE(review): these adapters are newly created, not restored from the training run —
# presumably they are only attached so the trainer accepts the 4-bit model; confirm.
loaded_model = FastLanguageModel.get_peft_model(
    loaded_model,
    # Adapter shape and scaling (r may be any positive rank; 8-128 are the suggested values).
    r=16,
    lora_alpha=16,
    lora_dropout=0,   # any value is accepted; 0 is the optimized setting
    bias="none",      # any value is accepted; "none" is the optimized setting
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    # Training-time behaviour.
    use_gradient_checkpointing=True,
    random_state=3407,
    # Optional variants, both left disabled: rank-stabilized LoRA and LoftQ.
    use_rslora=False,
    loftq_config=None,
)

Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [21]:
def formatting_senti_prompts_func(examples):
    """Turn a batch of (text, senti) rows into chat-template strings.

    Each row is laid out as a system / user / assistant exchange and rendered
    with the chat template of the notebook-level ``loaded_tokenizer``; the
    result is left untokenized and carries no generation prompt.

    Args:
        examples: Batch mapping with index-aligned 'text' and 'senti' columns.

    Returns:
        dict whose 'text' entry lists the rendered conversation for every row.
    """
    system_prompt = "정확한 챗봇으로서 상대방의 입력에 대해 감정을 맞추자. 모든 대답은 '행복', '분노', '슬픔', '중립', '혐오', '놀람', '공포' 중 하나로 대답해줘."

    def render(position, utterance):
        # One conversation per row: fixed instruction, the utterance, then its gold sentiment.
        chat = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": "{}".format(utterance)},
            {"role": "assistant", "content": "{}".format(examples['senti'][position])},
        ]
        # Apply the model's default chat template to the conversation.
        return loaded_tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False)

    return {"text": [render(position, utterance) for position, utterance in enumerate(examples['text'])]}

In [22]:
import pandas as pd
from datasets import Dataset

# Read the sentiment corpus again for the evaluation run and wrap it as a Hugging Face Dataset.
df = pd.read_csv('./senti.csv', encoding='UTF-8')
dataset = Dataset.from_pandas(df)

# Peek at the first utterance (row-first indexing avoids materialising the whole column).
dataset[0]['text']

'헐! 나 이벤트에 당첨 됐어.'

In [23]:
# Render every row through the chat template of the reloaded tokenizer.
formatted_dataset = dataset.map(formatting_senti_prompts_func, batched=True)

# Reuse seed=42, the seed of the split made before training. An unseeded split would
# draw a different random 20%, most of which the model has already seen during
# fine-tuning, and the evaluation would be contaminated.
# NOTE(review): reproducing the exact partition assumes the same senti.csv and the same
# `datasets` version as the training run — confirm.
split_dataset = formatted_dataset.train_test_split(test_size=0.2, seed=42)

# Pull the training and test partitions out of the split.
train_dataset = split_dataset['train']
test_dataset = split_dataset['test']

Map:   0%|          | 0/19374 [00:00<?, ? examples/s]

In [24]:
def compute_metrics(eval_pred):
    """Report precision, recall and F1 per sentiment class plus a weighted overall.

    Args:
        eval_pred: ``(predictions, labels)`` pair. ``predictions`` is either an
            array of scores whose last axis is reduced with argmax, or an array
            of ids already shaped like ``labels``. A tuple of model outputs is
            reduced to its first element.

    Returns:
        dict: one entry per name in ``class_labels`` plus ``'overall'``; each
        entry is a dict with ``'precision'``, ``'recall'`` and ``'f1'`` stored
        as plain Python floats.

    NOTE(review): ids 0-6 are assumed to follow the order of ``class_labels``.
    The trainer in this notebook evaluates a causal LM on chat text, so the
    labels it produces are presumably token ids rather than class indices —
    confirm before relying on these numbers.
    """
    # Imported here because none of the visible cells of the evaluation run import these names.
    import numpy as np
    from sklearn.metrics import precision_recall_fscore_support

    ignore_index = -100  # label value Hugging Face collators use for positions excluded from the loss
    class_labels = ['행복', '분노', '슬픔', '중립', '혐오', '놀람', '공포']
    class_ids = list(range(len(class_labels)))

    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Scores carry one more axis than the labels; ids that already match the label shape are kept.
    if predictions.ndim > labels.ndim:
        predictions = predictions.argmax(axis=-1)

    # Flatten and drop ignored positions so the metric call always receives 1-D id arrays.
    predictions = predictions.reshape(-1)
    labels = labels.reshape(-1)
    scored = labels != ignore_index
    predictions, labels = predictions[scored], labels[scored]

    # Passing labels= pins the output to one entry per class, in class_labels order,
    # even when a class is absent from this evaluation set.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, labels=class_ids, average=None, zero_division=0
    )
    metrics = {
        name: {
            'precision': float(precision[idx]),
            'recall': float(recall[idx]),
            'f1': float(f1[idx]),
        }
        for idx, name in enumerate(class_labels)
    }

    # Support-weighted average over the same fixed class set.
    weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(
        labels, predictions, labels=class_ids, average='weighted', zero_division=0
    )
    metrics['overall'] = {
        'precision': float(weighted_precision),
        'recall': float(weighted_recall),
        'f1': float(weighted_f1),
    }

    return metrics

In [25]:
from trl import SFTTrainer
from transformers import TrainingArguments

import torch

# Follow the precision choice of the training run: bf16 where the GPU supports it, fp16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()

loaded_trainer = SFTTrainer(
    model=loaded_model,
    tokenizer=loaded_tokenizer,
    eval_dataset=test_dataset,  # evaluate on the held-out split
    dataset_text_field="text",
    max_seq_length=2048,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_eval_batch_size=2,
        # Without this setting every batch's logits stay on the GPU until evaluation ends,
        # which is how the recorded run hit CUDA out-of-memory; flush them to the CPU every
        # few steps instead.
        # NOTE(review): full-vocabulary logits are still collected in host RAM. If that is
        # too large, pass preprocess_logits_for_metrics to keep only token ids, together
        # with a compute_metrics that accepts ids.
        eval_accumulation_steps=4,
        fp16=not use_bf16,  # mixed precision
        bf16=use_bf16,
        output_dir="senti_outputs",
    ),
    compute_metrics=compute_metrics  # metric function defined in an earlier cell
)

# Run evaluation on the test split.
evaluation_metrics = loaded_trainer.evaluate()
print(evaluation_metrics)

Map (num_proc=2):   0%|          | 0/3875 [00:00<?, ? examples/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 3.00 GiB. GPU 0 has a total capacity of 11.66 GiB of which 2.87 GiB is free. Including non-PyTorch memory, this process has 8.78 GiB memory in use. Of the allocated memory 8.58 GiB is allocated by PyTorch, and 65.38 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)