In [7]:
!pip install transformers datasets torch accelerate peft bitsandbytes -q

import os
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, get_linear_schedule_with_warmup
from datasets import Dataset
from google.colab import files
from peft import LoraConfig, get_peft_model
from torch.optim import AdamW
import evaluate
import numpy as np

# GPUの確認
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# モデルとトークナイザーのロード (bitsandbytesで量子化、オフロード有効化)
model_name = "rinna/japanese-gpt-neox-3.6b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_8bit=True,
    device_map="auto",
    llm_int8_enable_fp32_cpu_offload=True
)

# LoRAの設定
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# データのアップロード
print("学習データ（data.json）をアップロードしてください。")
uploaded = files.upload()
data_file = list(uploaded.keys())[0]

# 学習データのロードと形式チェック
try:
    with open(data_file, "r", encoding="utf-8") as f:
        raw_data = json.load(f)
        for entry in raw_data:
            if not ("prompt" in entry and "response" in entry):
                raise ValueError("Invalid data format: 'prompt' and 'response' keys are required.")
except json.JSONDecodeError:
    raise ValueError("Invalid JSON format.")

# データセットの整形
def format_data(entry):
    return {"text": f"{entry['prompt']}\n{entry['response']}"}

formatted_data = [format_data(entry) for entry in raw_data]

# データセットを作成 (labelsの追加と修正)
def preprocess_function(examples):
    inputs = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
    inputs["labels"] = inputs["input_ids"].copy()

    new_labels = []
    for label in inputs["labels"]:
        # labelが整数ならリストに変換
        if isinstance(label, int):
            label = [label]
        new_label = [(l if l != tokenizer.pad_token_id else -100) for l in label]
        new_labels.append(new_label)
    inputs["labels"] = new_labels
    return inputs

dataset = Dataset.from_list(formatted_data) #formatted_dataの形式も確認
dataset = dataset.map(preprocess_function, batch_size=32, num_proc=os.cpu_count())
dataset = dataset.train_test_split(test_size=0.1)

# 評価指標の設定
metric = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: v for k, v in result.items()}

# 学習パラメータの設定 (変更なし)
training_args = TrainingArguments(
    output_dir="./LLM",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    save_steps=500,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=100,
    learning_rate=3e-4,
    fp16=True,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    report_to="tensorboard"
)

# オプティマイザーとスケジューラーの設定 (変更なし)
optimizer = AdamW(model.parameters(), lr=training_args.learning_rate)
num_training_steps = len(dataset["train"]) * training_args.num_train_epochs // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps)
warmup_steps = int(num_training_steps * 0.1)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps)

# Trainerの作成 (変更なし)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    optimizers=(optimizer, scheduler),
    compute_metrics=compute_metrics
)

# ファインチューニングの実行
trainer.train()

# モデルの保存 (LoRAアダプターのみ保存)
model.save_pretrained("./LLM")
tokenizer.save_pretrained("./LLM")

print("Fine-tuning complete! Model saved to ./LLM")

Using device: cuda


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


trainable params: 3,244,032 || all params: 3,610,489,344 || trainable%: 0.0899
学習データ（data.json）をアップロードしてください。


Saving data.json to data (5).json


Map (num_proc=2):   0%|          | 0/116 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss,Validation Loss


Fine-tuning complete! Model saved to ./LLM


In [8]:
!zip -r LLM.zip ./LLM
files.download("LLM.zip")

  adding: LLM/ (stored 0%)
  adding: LLM/adapter_config.json (deflated 53%)
  adding: LLM/tokenizer.json (deflated 73%)
  adding: LLM/runs/ (stored 0%)
  adding: LLM/runs/Jan02_06-54-31_c7d419a8bbef/ (stored 0%)
  adding: LLM/runs/Jan02_06-54-31_c7d419a8bbef/events.out.tfevents.1735800872.c7d419a8bbef.1395.0 (deflated 62%)
  adding: LLM/runs/Jan02_07-05-07_c7d419a8bbef/ (stored 0%)
  adding: LLM/runs/Jan02_07-05-07_c7d419a8bbef/events.out.tfevents.1735801508.c7d419a8bbef.1395.1 (deflated 60%)
  adding: LLM/README.md (deflated 66%)
  adding: LLM/checkpoint-39/ (stored 0%)
  adding: LLM/checkpoint-39/adapter_config.json (deflated 53%)
  adding: LLM/checkpoint-39/tokenizer.json (deflated 73%)
  adding: LLM/checkpoint-39/scheduler.pt (deflated 54%)
  adding: LLM/checkpoint-39/trainer_state.json (deflated 55%)
  adding: LLM/checkpoint-39/optimizer.pt (deflated 7%)
  adding: LLM/checkpoint-39/README.md (deflated 66%)
  adding: LLM/checkpoint-39/tokenizer_config.json (deflated 78%)
  adding: 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>