In [1]:
# Mount Google Drive so that files stored under MyDrive are reachable from this Colab runtime.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install accelerate
!pip install -i https://pypi.org/simple/ bitsandbytes
!pip install datasets
!pip install transformers[torch] -U
!pip install adapter-transformers
!pip install huggingface_hub

Collecting accelerate
  Downloading accelerate-0.32.0-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.0/314.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

In [3]:
# Path of the training data (a .jsonl file) on the mounted Google Drive.
file_path = '/content/drive/MyDrive/DACON/INHA-DACON.jsonl'

In [4]:
import json
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer

# Read the JSON Lines file: every non-blank line holds one JSON record.
# The encoding is pinned to UTF-8 so the text decodes the same way on every platform,
# and blank lines (e.g. a trailing newline) are skipped instead of crashing json.loads.
with open(file_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Build the Dataset object from the list of records.
dataset = Dataset.from_list(data)

# Load the tokenizer that belongs to the base checkpoint.
checkpoint_id = "beomi/Llama-3-Open-Ko-8B"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_id)

# When the tokenizer defines no padding token, reuse the EOS token for padding.
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

# Preprocessing function
def preprocess_function(examples):
    """Turn a batch of records into fixed-length causal-LM training features.

    Each prompt is built as "<context> <instruction>" and truncated to 512 tokens;
    each response is truncated to 128 tokens including a trailing EOS token.
    Prompt and response are concatenated into one sequence so that ``labels``
    has exactly the same length as ``input_ids``. Prompt and padding positions
    in ``labels`` are set to -100.

    Args:
        examples: Batched mapping with the keys 'context', 'instruction' and
            'response', each holding a list of strings of equal length.

    Returns:
        dict with the keys 'input_ids', 'attention_mask' and 'labels'; every
        value is a list of integer lists of length 640 (512 + 128).
    """
    max_prompt_len = 512
    max_response_len = 128
    max_total_len = max_prompt_len + max_response_len

    prompts = [f"{ctx} {inst}" for ctx, inst in zip(examples['context'], examples['instruction'])]
    prompt_ids_batch = tokenizer(prompts, max_length=max_prompt_len, truncation=True)["input_ids"]

    # The response is tokenized without special tokens; one slot is reserved for the
    # EOS token that is appended explicitly below.
    response_ids_batch = tokenizer(
        examples["response"],
        max_length=max_response_len - 1,
        truncation=True,
        add_special_tokens=False,
    )["input_ids"]

    eos_id = tokenizer.eos_token_id
    # Fall back to EOS for padding when the tokenizer has no dedicated pad token id.
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else eos_id

    model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
    for prompt_ids, response_ids in zip(prompt_ids_batch, response_ids_batch):
        target_ids = list(response_ids) + [eos_id]
        real_len = len(prompt_ids) + len(target_ids)
        pad_len = max_total_len - real_len

        model_inputs["input_ids"].append(list(prompt_ids) + target_ids + [pad_id] * pad_len)
        model_inputs["attention_mask"].append([1] * real_len + [0] * pad_len)
        # Masking is positional (not by token id), so a real EOS target stays a label
        # even when the pad token and the EOS token share the same id.
        model_inputs["labels"].append([-100] * len(prompt_ids) + target_ids + [-100] * pad_len)

    return model_inputs


# Run the preprocessing over the whole dataset, one batch of records at a time.
tokenized_datasets = dataset.map(
    function=preprocess_function,
    batched=True,
)

ImportError: cannot import name 'insecure_hashlib' from 'huggingface_hub.utils' (/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/__init__.py)

In [None]:
from transformers import AutoModelWithHeads
import torch

model = AutoModelWithHeads.from_pretrained("beomi/Llama-3-Open-Ko-8B")

# Adapter setup
# NOTE(review): "how_to_train/your_adapter" looks like a placeholder Hub id; confirm that
# an adapter with this id exists before running the cell.
adapter_name = model.load_adapter("how_to_train/your_adapter", source="hf", config="pfeiffer")
model.active_adapters = adapter_name

# Train only the adapter: train_adapter() freezes the base model weights and enables
# gradients for the named adapter. Calling freeze_model() on its own would leave the
# adapter weights frozen as well, so nothing would be trained.
model.train_adapter(adapter_name)


In [None]:
from transformers import Trainer, TrainingArguments

# Settings for the fine-tuning run, collected in one place before being handed over.
training_options = {
    "output_dir": "./results",
    "num_train_epochs": 3,
    "per_device_train_batch_size": 8,
    "logging_dir": './logs',
    "logging_steps": 10,
}
training_args = TrainingArguments(**training_options)

# The Trainer receives the model, the run settings and the tokenized dataset prepared earlier.
trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_datasets)



In [None]:
# Start training with the Trainer configured above; the call's return value is the cell output.
trainer.train()

In [None]:
# Store the model and the tokenizer files in the same Drive folder.
save_dir = '/content/drive/MyDrive/DACON/trained_model'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
