In [1]:
%config Completer.use_jedi = False

In [2]:
!pip install --upgrade trl torch torchvision transformers bitsandbytes peft datasets accelerate

Collecting trl
  Downloading trl-0.17.0-py3-none-any.whl.metadata (12 kB)
Collecting torch
  Downloading torch-2.7.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting torchvision
  Downloading torchvision-0.22.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting transformers
  Downloading transformers-4.52.2-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.2/40.2 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting peft
  Downloading peft-0.15.2-py3-none-any.whl.metadata (13 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting accelerate
  Downloading accelerate-1.7.0-py3-none-any.whl.metadata (19 kB)
Collecting rich (from trl)
  Downloading rich-14.0.0-py3-none-any.whl.metadata (18 kB)
Collecting typing-extensions>=4.10.0 (from torch)
  Do

In [None]:
# 허깅페이스 로그인

In [36]:
import tempfile
tempfile.tempdir = "/workspace/tmp"

import os
import tempfile

os.environ["TRANSFORMERS_CACHE"] = "/workspace/hf_cache"
os.environ["HF_HOME"] = "/workspace/hf_cache"
os.environ["TMPDIR"] = "/workspace/tmp"
tempfile.tempdir = "/workspace/tmp"

os.makedirs("/workspace/hf_cache", exist_ok=True)
os.makedirs("/workspace/tmp", exist_ok=True)

In [37]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from datasets import Dataset

# # PyTorch 메모리 관리 설정
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# 1. JSON 데이터셋 로드 및 전처리
def load_dataset(file_path):
    dataset = Dataset.from_json(file_path)
    return dataset

def preprocess_data(example):
    input_text = f"{example['instruction']}\n질문: {example['question']}\n답변: {example['answer']}"
    return {"text": input_text}

dataset = load_dataset('/workspace/skin_custormer.json')
dataset = dataset.map(preprocess_data)

# 2. 토크나이저 및 모델 로드 (4-bit 양자화)
model_name = "meta-llama/Llama-3.1-8B-Instruct"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)
# 디스크 캐시 경로 설정
os.environ["TRANSFORMERS_CACHE"] = "/workspace/hf_cache"
os.environ["HF_HOME"] = "/workspace/hf_cache"

tokenizer = AutoTokenizer.from_pretrained(model_name,cache_dir="/workspace/hf_cache")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",    
    cache_dir="/workspace/hf_cache"
)
# 패딩 토큰 설정
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id
# 데이터 토크나이징
def tokenize_function(example):
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=256)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# 3. LoRA 설정
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj","v_proj","o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
from peft import prepare_model_for_kbit_training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# 4. 훈련 설정
training_args = TrainingArguments(
    output_dir="./llama31-nemotron-finetuned",
    overwrite_output_dir = True,
    num_train_epochs = 1,    
    per_device_train_batch_size = 2,
    save_steps = 1000,
    save_total_limit = 2,
    logging_steps=1,
    report_to='none'
)

# 5. Trainer 설정 및 훈련
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

RuntimeError: CUDA error: no kernel image is available for execution on the device
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
model.print_trainable_parameters()

In [7]:
trainer.train()

  return fn(*args, **kwargs)


Step,Training Loss
1,4.0948
2,4.6528
3,4.7063
4,4.5132
5,4.8386


TrainOutput(global_step=5, training_loss=4.561138725280761, metrics={'train_runtime': 3.1051, 'train_samples_per_second': 3.22, 'train_steps_per_second': 1.61, 'total_flos': 63352492523520.0, 'train_loss': 4.561138725280761, 'epoch': 1.0})

In [None]:
# 6. 모델 저장
model.save_pretrained("./llama31-nemotron-finetuned")
tokenizer.save_pretrained("./llama31-nemotron-finetuned")

In [14]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 3072)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linea

In [38]:
# 현재 사용 용량
# overlay                    20G   20G  2.7M 100% /   -->여유공간이 없음
!df -h

Filesystem                Size  Used Avail Use% Mounted on
overlay                    20G  8.7G   12G  44% /
tmpfs                      64M     0   64M   0% /dev
mfs#euro.runpod.net:9421  1.7P  1.3P  431T  76% /workspace
shm                       132G  4.0K  132G   1% /dev/shm
/dev/md1                   14T  1.1T   13T   8% /etc/hosts
/dev/md0p1                878G   28G  807G   4% /usr/bin/nvidia-smi
tmpfs                     1.2T     0  1.2T   0% /sys/fs/cgroup
tmpfs                     1.2T   12K  1.2T   1% /proc/driver/nvidia
tmpfs                     1.2T  4.0K  1.2T   1% /etc/nvidia/nvidia-application-profiles-rc.d
tmpfs                     227G  4.3M  227G   1% /run/nvidia-persistenced/socket
tmpfs                     1.2T     0  1.2T   0% /proc/acpi
tmpfs                     1.2T     0  1.2T   0% /proc/scsi
tmpfs                     1.2T     0  1.2T   0% /sys/firmware
tmpfs                     1.2T     0  1.2T   0% /sys/devices/virtual/powercap


In [31]:
!du -sh ~/.cache/huggingface  # 다운로드 캐시에 사용된 용량 확인

12G	/root/.cache/huggingface


In [32]:
!rm -rf ~/.cache/huggingface # 토크나이져와 모델 다운로드할때 사용한 임시 캐시를 삭제(용량부족으로 에러발생시 삭제)

In [34]:
!mkdir -p /workspace/tmp /workspace/hf_cache  # 캐시경로를 변경하기위해서 로컬 경로 생성

In [26]:
# 환경변수에 다운로드 캐쉬 경로 변경
import os
os.environ["TMPDIR"] = "/workspace/tmp"
os.environ["TRANSFORMERS_CACHE"] = "/workspace/hf_cache"
os.environ["HF_HOME"] = "/workspace/hf_cache"

# Ensure tmp dir exists
os.makedirs("/workspace/tmp", exist_ok=True)