In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to the project sources are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# 中間状態の更新処理をLoRAとして実装

In [2]:
import os
import sys

# Make the project's source directory importable. The path is relative to the
# notebook's working directory. The membership check keeps the cell idempotent:
# re-running it does not append the same entry to sys.path again.
SRC_DIR = "../src"
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from utils.constant import ViTExperiment
from transformers import ViTForImageClassification
from utils.helper import get_device

device = get_device()

# Load the checkpoint stored under the "c100" experiment's OUTPUT_DIR for k=0,
# move it to the selected device and switch it to evaluation mode.
pretrained_dir = ViTExperiment.c100.OUTPUT_DIR.format(k=0)
model = ViTForImageClassification.from_pretrained(pretrained_dir).to(device)
model.eval();  # trailing ';' suppresses the module repr in the cell output

Device: cuda


Some weights of ViTForImageClassification were not initialized from the model checkpoint at /src/src/out_vit_c100_fold0 and are newly initialized: ['vit.encoder.layer.0.intermediate.repair.weight', 'vit.encoder.layer.3.intermediate.repair.weight', 'vit.encoder.layer.4.intermediate.repair.weight', 'vit.encoder.layer.6.intermediate.repair.weight', 'vit.encoder.layer.11.intermediate.repair.weight', 'vit.encoder.layer.7.intermediate.repair.weight', 'vit.encoder.layer.1.intermediate.repair.weight', 'vit.encoder.layer.10.intermediate.repair.weight', 'vit.encoder.layer.9.intermediate.repair.weight', 'vit.encoder.layer.8.intermediate.repair.weight', 'vit.encoder.layer.5.intermediate.repair.weight', 'vit.encoder.layer.2.intermediate.repair.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Show the `intermediate` sub-module of the second-to-last and the last encoder layers.
for layer_idx in (-2, -1):
    print(model.vit.encoder.layer[layer_idx].intermediate)

ViTIntermediate(
  (dense): Linear(in_features=768, out_features=3072, bias=True)
  (repair): Linear(in_features=3072, out_features=3072, bias=False)
  (intermediate_act_fn): GELUActivation()
)
ViTIntermediate(
  (dense): Linear(in_features=768, out_features=3072, bias=True)
  (repair): Linear(in_features=3072, out_features=3072, bias=False)
  (intermediate_act_fn): GELUActivation()
)


In [4]:
from peft import LoraConfig, get_peft_model

# Load the base checkpoint and put it in evaluation mode before wrapping it.
model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224-in21k")
model.eval()

# Rank-1 LoRA adapter attached to a single module: the `repair` linear of encoder layer 11.
lora_cfg = LoraConfig(
    target_modules=["vit.encoder.layer.11.intermediate.repair"],  # only layer 11, named explicitly
    r=1,
    lora_alpha=1,
    lora_dropout=0.0,
    bias="none",
)

# Wrap the model with the adapter and report how many parameters remain trainable.
lora_model = get_peft_model(model, lora_cfg)
lora_model.print_trainable_parameters()

Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['encoder.layer.5.intermediate.repair.weight', 'encoder.layer.2.intermediate.repair.weight', 'encoder.layer.9.intermediate.repair.weight', 'encoder.layer.1.intermediate.repair.weight',

trainable params: 6144 || all params: 199052546 || trainable%: 0.003086622162572088


In [5]:
# Compare the `intermediate` sub-module of the second-to-last and last encoder
# layers inside the PEFT-wrapped model.
for layer_idx in (-2, -1):
    print(lora_model.base_model.model.vit.encoder.layer[layer_idx].intermediate)

ViTIntermediate(
  (dense): Linear(in_features=768, out_features=3072, bias=True)
  (repair): Linear(in_features=3072, out_features=3072, bias=False)
  (intermediate_act_fn): GELUActivation()
)
ViTIntermediate(
  (dense): Linear(in_features=768, out_features=3072, bias=True)
  (repair): Linear(
    in_features=3072, out_features=3072, bias=False
    (lora_dropout): ModuleDict(
      (default): Identity()
    )
    (lora_A): ModuleDict(
      (default): Linear(in_features=3072, out_features=1, bias=False)
    )
    (lora_B): ModuleDict(
      (default): Linear(in_features=1, out_features=3072, bias=False)
    )
    (lora_embedding_A): ParameterDict()
    (lora_embedding_B): ParameterDict()
  )
  (intermediate_act_fn): GELUActivation()
)


In [6]:
# Check which parameters are marked as trainable, printing each name with its shape.
for name, param in lora_model.named_parameters():
    if not param.requires_grad:
        continue  # frozen parameter, nothing to report
    print(f"{name:60s}  {tuple(param.shape)}")

base_model.model.vit.encoder.layer.11.intermediate.repair.lora_A.default.weight  (1, 3072)
base_model.model.vit.encoder.layer.11.intermediate.repair.lora_B.default.weight  (3072, 1)


In [7]:
# Check which modules had LoRA inserted: report every module exposing a `lora_A` attribute.
for name, module in lora_model.named_modules():
    if not hasattr(module, "lora_A"):
        continue
    print(name, "→ LoRA injected")

base_model.model.vit.encoder.layer.11.intermediate.repair → LoRA injected


# ViTモジュール変更後の学習が問題ないかチェック

In [8]:
from datasets import load_from_disk
from transformers import DefaultDataCollator, ViTForImageClassification, TrainingArguments, Trainer
from utils.vit_util import processor, transforms, transforms_c100, compute_metrics
import torch

# Transform to apply and the name of the label column.
tf_func = transforms_c100
label_col = "fine_label"

# Load the fold-0 dataset from disk.
dataset_dir = ViTExperiment.DATASET_DIR
ds = load_from_disk(os.path.join(dataset_dir, "c100_fold0"))

# Apply the preprocessing on the fly, at the moment samples are read.
ds_preprocessed = ds.with_transform(tf_func)
# Data collator used to assemble samples into batches.
data_collator = DefaultDataCollator()
# List of label names (strings) taken from the label column's feature.
labels = ds_preprocessed["train"].features[label_col].names

# Training settings: batch size and the number of full batches in the train split.
batch_size = ViTExperiment.BATCH_SIZE
logging_steps = len(ds_preprocessed["train"]) // batch_size

2025-04-25 13:03:12.871228: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-04-25 13:03:13.073950: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-25 13:03:13.728784: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2025-04-25 13:03:13.728861: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not l

In [9]:
# Load the pretrained model, configured with this dataset's label names.
model = ViTForImageClassification.from_pretrained(
    ViTExperiment.ViT_PATH,
    num_labels=len(labels),
    id2label={str(i): c for i, c in enumerate(labels)},
    label2id={c: str(i) for i, c in enumerate(labels)}
).to(device)

# Explicitly redo the identity initialization of every intermediate.repair layer
# and freeze it. The checkpoint was trained before `repair` existed, so
# from_pretrained reports these weights as newly initialized.
# torch.nn.init.eye_ fills the existing weight in place (without recording
# gradients), so the matrix size follows the layer itself instead of a
# hard-coded 3072 and no temporary CPU identity matrix is built per layer.
for layer in model.vit.encoder.layer:
    repair_weight = layer.intermediate.repair.weight
    torch.nn.init.eye_(repair_weight)
    repair_weight.requires_grad_(False)

state_dict = model.state_dict()

# Collect the state-dict entries whose name contains 'repair' ...
repair_keys = [k for k in state_dict if 'repair' in k]

# ... and display each of them to confirm it is now the identity matrix.
for key in repair_keys:
    print(f"🔑 Key: {key}")
    print(f"🧠 Weight Tensor (shape={state_dict[key].shape}):")
    print(state_dict[key])
    print("===" * 20)

Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['encoder.layer.5.intermediate.repair.weight', 'encoder.layer.2.intermediate.repair.weight', 'encoder.layer.9.intermediate.repair.weight', 'encoder.layer.1.intermediate.repair.weight',

🔑 Key: vit.encoder.layer.0.intermediate.repair.weight
🧠 Weight Tensor (shape=torch.Size([3072, 3072])):
tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.]], device='cuda:0')
🔑 Key: vit.encoder.layer.1.intermediate.repair.weight
🧠 Weight Tensor (shape=torch.Size([3072, 3072])):
tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.]], device='cuda:0')
🔑 Key: vit.encoder.layer.2.intermediate.repair.weight
🧠 Weight Tensor (shape=torch.Size([3072, 3072])):
tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 

In [10]:
model.train()
# Share of parameters that are trainable, accumulated in a single pass.
trainable_params = 0
total_params = 0
for p in model.parameters():
    count = p.numel()
    total_params += count
    if p.requires_grad:
        trainable_params += count
print(f"Trainable params: {trainable_params / total_params:.6%} of total params ({trainable_params} / {total_params})")

Trainable params: 43.127157% of total params (85875556 / 199121764)


In [11]:
# Print one line per parameter (trainable or frozen), followed by overall totals.
print("📋 パラメータ一覧（trainable / untrainable 含む）")

trainable_elements = 0
total_elements = 0

for name, param in model.named_parameters():
    numel = param.numel()
    status = "✅ trainable" if param.requires_grad else "❌ frozen"
    # Only parameters that require gradients contribute to the trainable count.
    trainable_elements += numel if param.requires_grad else 0
    total_elements += numel
    print(f"{status:12} | {name:60} | shape: {str(tuple(param.shape)):25} | #params: {numel}")

print("\n📊 Summary")
print(f"Trainable: {trainable_elements:,} / {total_elements:,} ({100 * trainable_elements / total_elements:.2f}%)")


📋 パラメータ一覧（trainable / untrainable 含む）
✅ trainable  | vit.embeddings.cls_token                                     | shape: (1, 1, 768)               | #params: 768
✅ trainable  | vit.embeddings.position_embeddings                           | shape: (1, 197, 768)             | #params: 151296
✅ trainable  | vit.embeddings.patch_embeddings.projection.weight            | shape: (768, 3, 16, 16)          | #params: 589824
✅ trainable  | vit.embeddings.patch_embeddings.projection.bias              | shape: (768,)                    | #params: 768
✅ trainable  | vit.encoder.layer.0.attention.attention.query.weight         | shape: (768, 768)                | #params: 589824
✅ trainable  | vit.encoder.layer.0.attention.attention.query.bias           | shape: (768,)                    | #params: 768
✅ trainable  | vit.encoder.layer.0.attention.attention.key.weight           | shape: (768, 768)                | #params: 589824
✅ trainable  | vit.encoder.layer.0.attention.attention.key.bias     

In [12]:
# Trainer configuration for the 2-epoch run.
training_args = TrainingArguments(
    # Output and reporting.
    output_dir="./",
    report_to="tensorboard",
    push_to_hub=False,
    disable_tqdm=False,
    log_level="error",
    # Optimization.
    num_train_epochs=2,
    learning_rate=2e-4,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Data handling: needed because an error is raised when the img column is missing.
    remove_unused_columns=False,
    # Evaluate on eval_dataset, log and save at the end of every epoch.
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [13]:
# Number of samples in the train and test splits, printed on one line.
print(*(len(ds_preprocessed[split]) for split in ("train", "test")))

40000 10000


In [18]:
# Run training.
# SMOKE_TEST_SAMPLES controls how much data the Trainer sees:
#   None -> the full train/test splits (this is what the recorded results use)
#   int  -> only the first N samples of each split, e.g. 100 for a quick smoke test
# The Trainer always receives `ds_limited`, so this switch is what actually
# decides the data size (a subset that is built but never passed on has no effect).
SMOKE_TEST_SAMPLES = None
ds_limited = ds_preprocessed.copy()
if SMOKE_TEST_SAMPLES is not None:
    for split in ("train", "test"):
        # Never request more samples than the split contains.
        n_keep = min(SMOKE_TEST_SAMPLES, len(ds_preprocessed[split]))
        ds_limited[split] = ds_preprocessed[split].select(range(n_keep))
# NOTE: the denominator shown in the progress bar is num_epoch * num_sample / batch_size
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=ds_limited["train"],
    eval_dataset=ds_limited["test"],
    tokenizer=processor,
)
train_results = trainer.train()

ViTForImageClassification




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.1907,0,{'accuracy': 0.8644},{'f1': 0.864050745822665}
2,0.2307,0,{'accuracy': 0.9106},{'f1': 0.91076712742873}


# `get_peft_model()` でラップして実行

ここまでで，`ViTIntermediate` の実装をLoRA用にカスタマイズしても，*LoRAを使用しない場合には* 訓練・推論に問題がなさそうなことがわかった．

次は3072x3072の行列にLoRAを適用してどうなるかチェック

In [14]:
# Load the pretrained model, configured with this dataset's label names.
model = ViTForImageClassification.from_pretrained(
    ViTExperiment.ViT_PATH,
    num_labels=len(labels),
    id2label={str(i): c for i, c in enumerate(labels)},
    label2id={c: str(i) for i, c in enumerate(labels)}
).to(device)

# Explicitly redo the identity initialization of every intermediate.repair layer
# and freeze it (the checkpoint was trained before `repair` existed).
# torch.nn.init.eye_ writes the identity into the existing weight in place, so
# the size follows the layer rather than a hard-coded 3072.
for layer in model.vit.encoder.layer:
    repair_weight = layer.intermediate.repair.weight
    torch.nn.init.eye_(repair_weight)
    repair_weight.requires_grad_(False)

# LoRA settings: rank-1 adapter on the `repair` linear of encoder layer 11 only.
lora_cfg = LoraConfig(
    r=1,
    lora_alpha=1,
    lora_dropout=0.0,
    bias="none",
    target_modules=["vit.encoder.layer.11.intermediate.repair"],  # only layer 11, named explicitly
)
model = get_peft_model(model, lora_cfg)
model.train()
# The adapter adds lora_A (r x in_features) and lora_B (out_features x r);
# for the 3072x3072 repair layer with r=1 that is 3072 * 1 * 2 = 6,144 weights.
model.print_trainable_parameters()

# Fraction of parameters that are trainable (manual cross-check of the report above).
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f"Trainable params: {trainable_params / total_params:.6%} of total params ({trainable_params} / {total_params})")

Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['encoder.layer.5.intermediate.repair.weight', 'encoder.layer.2.intermediate.repair.weight', 'encoder.layer.9.intermediate.repair.weight', 'encoder.layer.1.intermediate.repair.weight',

trainable params: 6144 || all params: 199127908 || trainable%: 0.0030854539987433603
Trainable params: 0.003085% of total params (6144 / 199127908)


In [15]:
# Print one line per parameter of the PEFT-wrapped model, followed by overall totals.
print("📋 パラメータ一覧（trainable / untrainable 含む）")

trainable_elements = 0
total_elements = 0

for name, param in model.named_parameters():
    numel = param.numel()
    status = "✅ trainable" if param.requires_grad else "❌ frozen"
    # Only parameters that require gradients contribute to the trainable count.
    trainable_elements += numel if param.requires_grad else 0
    total_elements += numel
    print(f"{status:12} | {name:60} | shape: {str(tuple(param.shape)):25} | #params: {numel}")

print("\n📊 Summary")
print(f"Trainable: {trainable_elements:,} / {total_elements:,} ({100 * trainable_elements / total_elements:.2f}%)")
# The listing above is meant to confirm that only the LoRA update of the last
# layer's intermediate state (the `repair` adapter) is trainable.
# Number of trained weights = 3072 x r x 2

📋 パラメータ一覧（trainable / untrainable 含む）
❌ frozen     | base_model.model.vit.embeddings.cls_token                    | shape: (1, 1, 768)               | #params: 768
❌ frozen     | base_model.model.vit.embeddings.position_embeddings          | shape: (1, 197, 768)             | #params: 151296
❌ frozen     | base_model.model.vit.embeddings.patch_embeddings.projection.weight | shape: (768, 3, 16, 16)          | #params: 589824
❌ frozen     | base_model.model.vit.embeddings.patch_embeddings.projection.bias | shape: (768,)                    | #params: 768
❌ frozen     | base_model.model.vit.encoder.layer.0.attention.attention.query.weight | shape: (768, 768)                | #params: 589824
❌ frozen     | base_model.model.vit.encoder.layer.0.attention.attention.query.bias | shape: (768,)                    | #params: 768
❌ frozen     | base_model.model.vit.encoder.layer.0.attention.attention.key.weight | shape: (768, 768)                | #params: 589824
❌ frozen     | base_model.model.vit

In [16]:
# Trainer configuration for the 5-epoch run.
training_args = TrainingArguments(
    # Output and reporting.
    output_dir="./",
    report_to="tensorboard",
    push_to_hub=False,
    disable_tqdm=False,
    log_level="error",
    # Optimization.
    num_train_epochs=5,
    learning_rate=2e-4,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Data handling: needed because an error is raised when the img column is missing.
    remove_unused_columns=False,
    # Evaluate on eval_dataset, log and save at the end of every epoch.
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [19]:
# Run training.
# SMOKE_TEST_SAMPLES controls how much data the Trainer sees:
#   None -> the full train/test splits
#   int  -> only the first N samples of each split, e.g. 100 for a quick smoke test
# The Trainer always receives `ds_limited`, so this switch is what actually
# decides the data size (a subset that is built but never passed on has no effect).
SMOKE_TEST_SAMPLES = None
ds_limited = ds_preprocessed.copy()
if SMOKE_TEST_SAMPLES is not None:
    for split in ("train", "test"):
        # Never request more samples than the split contains.
        n_keep = min(SMOKE_TEST_SAMPLES, len(ds_preprocessed[split]))
        ds_limited[split] = ds_preprocessed[split].select(range(n_keep))
# NOTE: the denominator shown in the progress bar is num_epoch * num_sample / batch_size
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=ds_limited["train"],
    eval_dataset=ds_limited["test"],
    tokenizer=processor,
)
train_results = trainer.train()



Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

訓練するパラメータ数は全然違う割に，訓練時間はあまり変わらないのでは...？

LoRAの論文でも変更するパラメータ数の割に精度が高いことを言ってるだけで，別に訓練時間の削減はいってなかったように見える．

結論：LoRA (r=1) は，少なくとも訓練時間の面ではfine-tuneを置き換えられるものではない（上の実行は途中で中断したため，精度は未確認）．

# r=1以外でもLoRAをやってみる

In [22]:
# Restrict both splits to their first 5000 samples for this run.
ds_limited = ds_preprocessed.copy()
ds_limited["train"] = ds_preprocessed["train"].select(range(5000))
ds_limited["test"] = ds_preprocessed["test"].select(range(5000))

# Load the pretrained model, configured with this dataset's label names.
model = ViTForImageClassification.from_pretrained(
    ViTExperiment.ViT_PATH,
    num_labels=len(labels),
    id2label={str(i): c for i, c in enumerate(labels)},
    label2id={c: str(i) for i, c in enumerate(labels)}
).to(device)

# Explicitly redo the identity initialization of every intermediate.repair layer
# and freeze it (the checkpoint was trained before `repair` existed).
# torch.nn.init.eye_ writes the identity into the existing weight in place, so
# the size follows the layer rather than a hard-coded 3072.
for layer in model.vit.encoder.layer:
    repair_weight = layer.intermediate.repair.weight
    torch.nn.init.eye_(repair_weight)
    repair_weight.requires_grad_(False)

# No LoRA wrapping here: every parameter except the frozen repair weights is trained.
model.train()
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=ds_limited["train"],
    eval_dataset=ds_limited["test"],
    tokenizer=processor,
)
train_results = trainer.train()



Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Sweep the LoRA rank: for each r, reload the model, attach an adapter to the
# layer-11 `repair` linear and train it.
rs = [2, 4, 8, 16, 32, 64]
# rank -> value returned by trainer.train(); kept so that the runs can be compared
# after the loop (train_results alone only holds the last rank's result).
lora_results = {}
for r in rs:
    lora_cfg = LoraConfig(
        r=r,
        # NOTE(review): PEFT scales the LoRA update by lora_alpha / r by default, so
        # keeping lora_alpha=1 shrinks the scaling as r grows — confirm this is intended.
        lora_alpha=1,
        lora_dropout=0.0,
        bias="none",
        target_modules=["vit.encoder.layer.11.intermediate.repair"],  # only layer 11, named explicitly
    )
    # Load the pretrained model, configured with this dataset's label names.
    model = ViTForImageClassification.from_pretrained(
        ViTExperiment.ViT_PATH,
        num_labels=len(labels),
        id2label={str(i): c for i, c in enumerate(labels)},
        label2id={c: str(i) for i, c in enumerate(labels)}
    ).to(device)
    # Explicitly redo the identity initialization of every intermediate.repair layer
    # and freeze it (the checkpoint was trained before `repair` existed).
    # torch.nn.init.eye_ writes the identity into the existing weight in place, so
    # the size follows the layer rather than a hard-coded 3072.
    for layer in model.vit.encoder.layer:
        repair_weight = layer.intermediate.repair.weight
        torch.nn.init.eye_(repair_weight)
        repair_weight.requires_grad_(False)
    model = get_peft_model(model, lora_cfg)
    model.train()
    # The adapter adds lora_A (r x in_features) and lora_B (out_features x r) weights.
    model.print_trainable_parameters()
    # NOTE(review): this trains on the full splits, while the baseline cell uses the
    # 5000-sample `ds_limited` — verify which dataset the comparison should use.
    # NOTE(review): all runs share training_args (same output_dir), so checkpoints
    # from different ranks presumably land in the same directory — confirm.
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        train_dataset=ds_preprocessed["train"],
        eval_dataset=ds_preprocessed["test"],
        tokenizer=processor,
    )
    train_results = trainer.train()
    lora_results[r] = train_results