In [101]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 中間状態の更新処理をLoRAとして実装

In [None]:
import os, sys
os.sys.path.append("../src")
from utils.constant import ViTExperiment
from transformers import ViTForImageClassification
from utils.helper import get_device
device = get_device()

pretrained_dir = getattr(ViTExperiment, "c100").OUTPUT_DIR.format(k=0)
model = ViTForImageClassification.from_pretrained(pretrained_dir).to(device)
model.eval();

In [48]:
print(model.vit.encoder.layer[-2].intermediate)
print(model.vit.encoder.layer[-1].intermediate)

ViTIntermediate(
  (dense): Linear(in_features=768, out_features=3072, bias=True)
  (repair): Linear(in_features=3072, out_features=3072, bias=False)
  (intermediate_act_fn): GELUActivation()
)
ViTIntermediate(
  (dense): Linear(in_features=768, out_features=3072, bias=True)
  (repair): Linear(in_features=3072, out_features=3072, bias=False)
  (intermediate_act_fn): GELUActivation()
)


In [4]:
from peft import LoraConfig, get_peft_model

lora_cfg = LoraConfig(
    r=1,
    lora_alpha=1,
    lora_dropout=0.0,
    bias="none",
    target_modules=["vit.encoder.layer.11.intermediate.repair"],  # 明示的に11層だけ指定
)

model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224-in21k")
model.eval()
lora_model = get_peft_model(model, lora_cfg)
lora_model.print_trainable_parameters()

Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'encoder.layer.3.intermediate.repair.weight', 'encoder.layer.9.intermediate.repair.weight', 'encoder.layer.2.intermediate.repair.weight', 'encoder.layer.0.intermedi

trainable params: 6144 || all params: 199052546 || trainable%: 0.003086622162572088


In [5]:
print(lora_model.base_model.model.vit.encoder.layer[-2].intermediate)
print(lora_model.base_model.model.vit.encoder.layer[-1].intermediate)

ViTIntermediate(
  (dense): Linear(in_features=768, out_features=3072, bias=True)
  (repair): Linear(in_features=3072, out_features=3072, bias=False)
  (intermediate_act_fn): GELUActivation()
)
ViTIntermediate(
  (dense): Linear(in_features=768, out_features=3072, bias=True)
  (repair): Linear(
    in_features=3072, out_features=3072, bias=False
    (lora_dropout): ModuleDict(
      (default): Identity()
    )
    (lora_A): ModuleDict(
      (default): Linear(in_features=3072, out_features=1, bias=False)
    )
    (lora_B): ModuleDict(
      (default): Linear(in_features=1, out_features=3072, bias=False)
    )
    (lora_embedding_A): ParameterDict()
    (lora_embedding_B): ParameterDict()
  )
  (intermediate_act_fn): GELUActivation()
)


In [6]:
# どのパラメータが “学習対象” になっているか確認
for name, param in lora_model.named_parameters():
    if param.requires_grad:
        print(f"{name:60s}  {tuple(param.shape)}")

base_model.model.vit.encoder.layer.11.intermediate.repair.lora_A.default.weight  (1, 3072)
base_model.model.vit.encoder.layer.11.intermediate.repair.lora_B.default.weight  (3072, 1)


In [7]:
# どのレイヤにloraが挿入されたか確認
for name, module in lora_model.named_modules():
    if hasattr(module, "lora_A"):
        print(name, "→ LoRA injected")

base_model.model.vit.encoder.layer.11.intermediate.repair → LoRA injected


# ViTモジュール変更後の学習が問題ないかチェック

In [219]:
from datasets import load_from_disk
from transformers import DefaultDataCollator, ViTForImageClassification, TrainingArguments, Trainer
from utils.vit_util import processor, transforms, transforms_c100, compute_metrics
import torch

dataset_dir = ViTExperiment.DATASET_DIR
ds = load_from_disk(os.path.join(dataset_dir, f"c100_fold0"))
tf_func = transforms_c100
label_col = "fine_label"

# 読み込まれた時にリアルタイムで前処理を適用するようにする
ds_preprocessed = ds.with_transform(tf_func)
# バッチごとの処理のためのdata_collator
data_collator = DefaultDataCollator()
# ラベルを示す文字列のlist
labels = ds_preprocessed["train"].features[label_col].names

# 学習の設定
batch_size = ViTExperiment.BATCH_SIZE
logging_steps = len(ds_preprocessed["train"]) // batch_size

In [220]:
# pretrained modelのロード
model = ViTForImageClassification.from_pretrained(
    ViTExperiment.ViT_PATH,
    num_labels=len(labels),
    id2label={str(i): c for i, c in enumerate(labels)},
    label2id={c: str(i) for i, c in enumerate(labels)}
).to(device)

state_dict = model.state_dict()

# 'repair' という名前を含むパラメータを抽出して表示
repair_keys = [k for k in state_dict.keys() if 'repair' in k]

# 'repair' を含むキーを抽出
for key in state_dict:
    if 'repair' in key:
        print(f"🔑 Key: {key}")
        print(f"🧠 Weight Tensor (shape={state_dict[key].shape}):")
        print(state_dict[key])
        print("===" * 20)

🔑 Key: vit.encoder.layer.0.intermediate.repair.weight
🧠 Weight Tensor (shape=torch.Size([3072, 3072])):
tensor([[-0.0068, -0.0041, -0.0011,  ..., -0.0170, -0.0345, -0.0077],
        [ 0.0241, -0.0017,  0.0212,  ...,  0.0049,  0.0206,  0.0086],
        [-0.0070, -0.0263, -0.0012,  ..., -0.0171,  0.0389,  0.0080],
        ...,
        [ 0.0048,  0.0103, -0.0158,  ...,  0.0230,  0.0158, -0.0026],
        [-0.0284, -0.0115,  0.0029,  ..., -0.0030, -0.0244,  0.0234],
        [-0.0018,  0.0036,  0.0327,  ...,  0.0135,  0.0163, -0.0211]],
       device='cuda:0')
🔑 Key: vit.encoder.layer.1.intermediate.repair.weight
🧠 Weight Tensor (shape=torch.Size([3072, 3072])):
tensor([[ 0.0172, -0.0247,  0.0552,  ..., -0.0100,  0.0045,  0.0093],
        [-0.0122, -0.0097,  0.0232,  ..., -0.0339,  0.0100,  0.0094],
        [-0.0391,  0.0108, -0.0155,  ..., -0.0002, -0.0028, -0.0198],
        ...,
        [ 0.0111, -0.0093,  0.0042,  ..., -0.0140, -0.0157, -0.0186],
        [ 0.0057, -0.0280, -0.0284,  ...,

In [221]:
# pretrained modelのロード
model = ViTForImageClassification.from_pretrained(
    ViTExperiment.ViT_PATH,
    num_labels=len(labels),
    id2label={str(i): c for i, c in enumerate(labels)},
    label2id={c: str(i) for i, c in enumerate(labels)}
).to(device)

# intermediate.repairレイヤの恒等初期化を明示的にやり直す (訓練した時はrepairなかったので)
# repairレイヤを増やす前のモデルからfrom_pretrainedする時はこれが必要？
with torch.no_grad():
    for i in range(len(model.vit.encoder.layer)):
        model.vit.encoder.layer[i].intermediate.repair.weight.copy_(torch.eye(3072))
        model.vit.encoder.layer[i].intermediate.repair.weight.requires_grad = False

state_dict = model.state_dict()

# 'repair' という名前を含むパラメータを抽出して表示
repair_keys = [k for k in state_dict.keys() if 'repair' in k]

# 'repair' を含むキーを抽出
for key in state_dict:
    if 'repair' in key:
        print(f"🔑 Key: {key}")
        print(f"🧠 Weight Tensor (shape={state_dict[key].shape}):")
        print(state_dict[key])
        print("===" * 20)

🔑 Key: vit.encoder.layer.0.intermediate.repair.weight
🧠 Weight Tensor (shape=torch.Size([3072, 3072])):
tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.]], device='cuda:0')
🔑 Key: vit.encoder.layer.1.intermediate.repair.weight
🧠 Weight Tensor (shape=torch.Size([3072, 3072])):
tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.]], device='cuda:0')
🔑 Key: vit.encoder.layer.2.intermediate.repair.weight
🧠 Weight Tensor (shape=torch.Size([3072, 3072])):
tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 

In [222]:
use_lora = False

if use_lora:
    # LoRAの設定
    lora_cfg = LoraConfig(
        r=1,
        lora_alpha=1,
        lora_dropout=0.0,
        bias="none",
        target_modules=["vit.encoder.layer.11.intermediate.repair"],  # 明示的に11層だけ指定
    )
    model = get_peft_model(model, lora_cfg)
    model.train()
    # model.print_trainable_parameters()  # LoRA param ∼ 100 k だけ
else:
    # print(model.vit.encoder.layer[-1].intermediate.repair.weight)
    model.train()
    # print(model.vit.encoder.layer[-1].intermediate.repair.weight)
    # 最終レイヤだけ訓練可能にする
    # for name, param in model.named_parameters():
    #     if "vit.encoder.layer.11" not in name:
    #         param.requires_grad = False

# 訓練可能なパラメータ数の割合
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f"Trainable params: {trainable_params / total_params:.6%} of total params ({trainable_params} / {total_params})")

Trainable params: 43.127157% of total params (85875556 / 199121764)


In [223]:
training_args = TrainingArguments(
    output_dir="./",
    num_train_epochs=2,
    learning_rate=2e-4,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    remove_unused_columns=False, # img列がないとエラーになるので必要
    evaluation_strategy="epoch", # エポックの終わりごとにeval_datasetで評価
    logging_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=False,
    disable_tqdm=False,
    log_level="error",
    report_to="tensorboard",
    load_best_model_at_end=True,
)

In [224]:
print(len(ds_preprocessed["train"]), len(ds_preprocessed["test"]))

40000 10000


In [None]:
# 学習の実行
# 訓練データを100件だけに制限
ds_limited = ds_preprocessed.copy()
ds_limited["train"] = ds_preprocessed["train"].select(range(100))
ds_limited["test"] = ds_preprocessed["test"].select(range(100))
# NOTE: 表示されるプログレスバーの分母の数字は，num_epoch*num_sample/batch_size
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=ds_preprocessed["train"],
    eval_dataset=ds_preprocessed["test"],
    tokenizer=processor,
)
train_results = trainer.train()

self.args.label_names=None
default_label_names=['labels']




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,4.1455,0,{'accuracy': 0.17},{'f1': 0.10282909578684225}
2,3.7716,0,{'accuracy': 0.17},{'f1': 0.09838475296221776}


self.label_names = ['labels']
has_labels = True
loss=4.404483795166016, logits=tensor([[-0.0537,  0.1130,  0.0404,  ...,  0.1223, -0.1201,  0.0136],
        [-0.3235, -0.0561, -0.1776,  ...,  0.2033, -0.2095, -0.0517],
        [-0.2215, -0.0197, -0.0164,  ...,  0.2380, -0.2078, -0.1188],
        ...,
        [ 0.0806, -0.0085, -0.0609,  ...,  0.1577,  0.1127,  0.2258],
        [-0.2563,  0.0035, -0.0694,  ...,  0.1316, -0.2163,  0.0128],
        [-0.2347, -0.1956, -0.1319,  ...,  0.1039, -0.1405,  0.1511]],
       device='cuda:0'), labels=tensor([49, 33, 72, 51, 71, 92, 15, 14, 23,  0, 71, 75, 81, 69, 40, 43, 92, 97,
        70, 53, 70, 49, 75, 29, 21, 16, 39,  8,  8, 70, 20, 61],
       device='cuda:0')
self.label_names = ['labels']
has_labels = True
loss=4.49449348449707, logits=tensor([[-0.1759,  0.0829, -0.0594,  ...,  0.1920, -0.2288,  0.0972],
        [-0.0441, -0.0234,  0.2048,  ...,  0.1287,  0.0015, -0.0070],
        [-0.1753,  0.0496,  0.1112,  ...,  0.3630, -0.1223,  0.0627]

In [118]:
print("📋 パラメータ一覧（trainable / untrainable 含む）")

total_elements = 0
trainable_elements = 0

for name, param in model.named_parameters():
    numel = param.numel()
    total_elements += numel
    if param.requires_grad:
        trainable_elements += numel
        status = "✅ trainable"
    else:
        status = "❌ frozen"
    print(f"{status:12} | {name:60} | shape: {str(tuple(param.shape)):25} | #params: {numel}")

print("\n📊 Summary")
print(f"Trainable: {trainable_elements:,} / {total_elements:,} ({100 * trainable_elements / total_elements:.2f}%)")


📋 パラメータ一覧（trainable / untrainable 含む）
✅ trainable  | vit.embeddings.cls_token                                     | shape: (1, 1, 768)               | #params: 768
✅ trainable  | vit.embeddings.position_embeddings                           | shape: (1, 197, 768)             | #params: 151296
✅ trainable  | vit.embeddings.patch_embeddings.projection.weight            | shape: (768, 3, 16, 16)          | #params: 589824
✅ trainable  | vit.embeddings.patch_embeddings.projection.bias              | shape: (768,)                    | #params: 768
✅ trainable  | vit.encoder.layer.0.attention.attention.query.weight         | shape: (768, 768)                | #params: 589824
✅ trainable  | vit.encoder.layer.0.attention.attention.query.bias           | shape: (768,)                    | #params: 768
✅ trainable  | vit.encoder.layer.0.attention.attention.key.weight           | shape: (768, 768)                | #params: 589824
✅ trainable  | vit.encoder.layer.0.attention.attention.key.bias     

# `get_peft_model()` でラップして実行

ここまでで,  `ViTIntermediate` の実装をLoRA用にカスタマイズした際に，*LoRAを使用しない場合で，* 訓練・推論時に問題なさそうなことがわかった．

次は3072x3072の行列にLoRAを適用してどうなるかチェック

In [228]:
# pretrained modelのロード
model = ViTForImageClassification.from_pretrained(
    ViTExperiment.ViT_PATH,
    num_labels=len(labels),
    id2label={str(i): c for i, c in enumerate(labels)},
    label2id={c: str(i) for i, c in enumerate(labels)}
).to(device)

    # LoRAの設定
lora_cfg = LoraConfig(
    r=1,
    lora_alpha=1,
    lora_dropout=0.0,
    bias="none",
    target_modules=["vit.encoder.layer.11.intermediate.repair"],  # 明示的に11層だけ指定
)
model = get_peft_model(model, lora_cfg)
model.train()
model.print_trainable_parameters()  # LoRA param ∼ 100 k だけ

# 訓練可能なパラメータ数の割合
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f"Trainable params: {trainable_params / total_params:.6%} of total params ({trainable_params} / {total_params})")

trainable params: 6144 || all params: 199127908 || trainable%: 0.0030854539987433603
Trainable params: 0.003085% of total params (6144 / 199127908)


In [229]:
print("📋 パラメータ一覧（trainable / untrainable 含む）")

total_elements = 0
trainable_elements = 0

for name, param in model.named_parameters():
    numel = param.numel()
    total_elements += numel
    if param.requires_grad:
        trainable_elements += numel
        status = "✅ trainable"
    else:
        status = "❌ frozen"
    print(f"{status:12} | {name:60} | shape: {str(tuple(param.shape)):25} | #params: {numel}")

print("\n📊 Summary")
print(f"Trainable: {trainable_elements:,} / {total_elements:,} ({100 * trainable_elements / total_elements:.2f}%)")
# 上の出力から，最終レイヤの中間状態のアップデートをLoRAしたいことがわかる
# 訓練する重みパラメータの数 = 3072 x r x 2

📋 パラメータ一覧（trainable / untrainable 含む）
❌ frozen     | base_model.model.vit.embeddings.cls_token                    | shape: (1, 1, 768)               | #params: 768
❌ frozen     | base_model.model.vit.embeddings.position_embeddings          | shape: (1, 197, 768)             | #params: 151296
❌ frozen     | base_model.model.vit.embeddings.patch_embeddings.projection.weight | shape: (768, 3, 16, 16)          | #params: 589824
❌ frozen     | base_model.model.vit.embeddings.patch_embeddings.projection.bias | shape: (768,)                    | #params: 768
❌ frozen     | base_model.model.vit.encoder.layer.0.attention.attention.query.weight | shape: (768, 768)                | #params: 589824
❌ frozen     | base_model.model.vit.encoder.layer.0.attention.attention.query.bias | shape: (768,)                    | #params: 768
❌ frozen     | base_model.model.vit.encoder.layer.0.attention.attention.key.weight | shape: (768, 768)                | #params: 589824
❌ frozen     | base_model.model.vit

In [230]:
training_args = TrainingArguments(
    output_dir="./",
    num_train_epochs=2,
    learning_rate=2e-4,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    remove_unused_columns=False, # img列がないとエラーになるので必要
    evaluation_strategy="epoch", # エポックの終わりごとにeval_datasetで評価
    logging_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=False,
    disable_tqdm=False,
    log_level="error",
    report_to="tensorboard",
    load_best_model_at_end=True,
)

In [None]:
# 学習の実行
# 訓練データを100件だけに制限
ds_limited = ds_preprocessed.copy()
ds_limited["train"] = ds_preprocessed["train"].select(range(100))
ds_limited["test"] = ds_preprocessed["test"].select(range(100))
# NOTE: 表示されるプログレスバーの分母の数字は，num_epoch*num_sample/batch_size
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=ds_preprocessed["train"],
    eval_dataset=ds_preprocessed["test"],
    tokenizer=processor,
)
train_results = trainer.train()

PeftModel
self.args.label_names=None
default_label_names=['labels']




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,4.6104,0,{'accuracy': 0.0},{'f1': 0.0}
2,4.5858,0,{'accuracy': 0.0},{'f1': 0.0}


self.label_names = ['labels']
has_labels = True
loss=4.629871845245361, logits=tensor([[-0.0772,  0.0243,  0.0111,  ...,  0.0638, -0.0674,  0.0436],
        [-0.0770,  0.0245,  0.0113,  ...,  0.0636, -0.0673,  0.0437],
        [-0.0770,  0.0244,  0.0112,  ...,  0.0636, -0.0673,  0.0436],
        ...,
        [-0.0769,  0.0242,  0.0111,  ...,  0.0636, -0.0673,  0.0435],
        [-0.0770,  0.0242,  0.0112,  ...,  0.0636, -0.0673,  0.0434],
        [-0.0770,  0.0243,  0.0112,  ...,  0.0635, -0.0672,  0.0437]],
       device='cuda:0'), labels=tensor([49, 33, 72, 51, 71, 92, 15, 14, 23,  0, 71, 75, 81, 69, 40, 43, 92, 97,
        70, 53, 70, 49, 75, 29, 21, 16, 39,  8,  8, 70, 20, 61],
       device='cuda:0')
self.label_names = ['labels']
has_labels = True
loss=4.601065158843994, logits=tensor([[-0.0770,  0.0243,  0.0111,  ...,  0.0636, -0.0673,  0.0435],
        [-0.0771,  0.0243,  0.0113,  ...,  0.0636, -0.0671,  0.0435],
        [-0.0770,  0.0244,  0.0112,  ...,  0.0635, -0.0673,  0.0437

パラメータ数は全然違う割にはあんまり訓練時間変わらなくない...？

LoRAの論文でも変更するパラメータ数の割に精度が高いことを言ってるだけで，別に訓練時間の削減はいってなかったように見える．