In [3]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
import os, sys

# Make the project's ``src`` directory importable from this notebook.
# ``sys.path`` is used directly instead of ``os.sys.path`` (the latter only
# works because ``os`` happens to import ``sys``), and the membership check
# keeps re-runs of this cell from appending the same entry again and again.
if "../src" not in sys.path:
    sys.path.append("../src")

from utils.constant import ViTExperiment
from transformers import ViTForImageClassification
from utils.helper import get_device
from utils.vit_util import maybe_initialize_repair_weights_

# Device used for every model in this notebook (chosen by the project helper).
device = get_device()
# OUTPUT_DIR template of the "c100" experiment settings, filled in with k=0
# (presumably the fold index — TODO confirm against utils.constant).
pretrained_dir = ViTExperiment.c100.OUTPUT_DIR.format(k=0)

2025-04-26 22:09:49.849580: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-04-26 22:09:50.052262: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-26 22:09:50.717339: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2025-04-26 22:09:50.717417: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not l

Device: cuda


# データセット周りの処理

In [5]:

from datasets import load_from_disk
from transformers import DefaultDataCollator, ViTForImageClassification, TrainingArguments, Trainer
from utils.vit_util import processor, transforms, transforms_c100, compute_metrics
import torch

# Dataset-specific choices: the transform function and the label column name.
label_col = "fine_label"
tf_func = transforms_c100

# Load the dataset stored under <DATASET_DIR>/c100_fold0.
dataset_dir = ViTExperiment.DATASET_DIR
ds = load_from_disk(os.path.join(dataset_dir, "c100_fold0"))

# Collator used to assemble individual samples into batches.
data_collator = DefaultDataCollator()

# Attach the preprocessing so it is applied on the fly when samples are read.
ds_preprocessed = ds.with_transform(tf_func)

# List of label-name strings taken from the label column's feature definition.
labels = ds_preprocessed["train"].features[label_col].names

# Training settings.
batch_size = ViTExperiment.BATCH_SIZE
logging_steps = len(ds_preprocessed["train"]) // batch_size

In [6]:
# Build a reduced copy of the dataset for quick experiments: the first 1000
# training samples and the first 100 test samples.
ds_limited = ds_preprocessed.copy()
for split, n_samples in (("train", 1000), ("test", 100)):
    ds_limited[split] = ds_preprocessed[split].select(range(n_samples))

# 初期モデルロード

In [7]:
# Load the pretrained ViT with a classification head sized to the dataset's
# labels. output_loading_info=True makes from_pretrained also return a dict
# that includes the list of weights missing from the checkpoint.
model, loading_info = ViTForImageClassification.from_pretrained(
    ViTExperiment.ViT_PATH,
    output_loading_info=True,
    num_labels=len(labels),
    label2id={label_name: str(idx) for idx, label_name in enumerate(labels)},
    id2label={str(idx): label_name for idx, label_name in enumerate(labels)},
)
# Move the model to the target device, then hand it and the missing-key list
# to the project helper that deals with the uninitialised repair weights.
model = maybe_initialize_repair_weights_(model.to(device), loading_info["missing_keys"])

Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['encoder.layer.10.intermediate.repair.weight', 'encoder.layer.5.intermediate.repair.weight', 'encoder.layer.0.intermediate.repair.weight', 'encoder.layer.2.intermediate.repair.weight'

🛠️ Initializing intermediate.repair.weight as identity matrix (for missing weights)


# 訓練周りの設定

In [8]:
# Shared training configuration, reused by every Trainer in this notebook.
# NOTE(review): because all experiments share this object, they also share
# output_dir="./" for their per-epoch checkpoints — confirm this is intended.
training_args = TrainingArguments(
    # --- where results go ---
    output_dir="./",
    push_to_hub=False,
    report_to="tensorboard",
    # --- optimisation ---
    num_train_epochs=5,
    learning_rate=2e-4,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # --- data handling ---
    remove_unused_columns=False,  # required: training errors out if the img column is missing
    # --- per-epoch evaluation / logging / checkpointing ---
    evaluation_strategy="epoch",  # evaluate on eval_dataset at the end of every epoch
    logging_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # --- console output ---
    disable_tqdm=False,
    log_level="error",
)

In [9]:
# Show the number of samples in the reduced train and test splits.
print(*(len(ds_limited[split_name]) for split_name in ("train", "test")))

1000 100


# full fine-tuning (一部のデータ)

In [43]:
# Print one row per model parameter (trainable or frozen, name, shape, size)
# followed by a summary of how many elements are trainable.
print("📋 パラメータ一覧（trainable / untrainable 含む）")

total_elements = trainable_elements = 0

for name, param in model.named_parameters():
    numel = param.numel()
    shape_text = str(tuple(param.shape))
    status = "✅ trainable" if param.requires_grad else "❌ frozen"

    # Every parameter counts towards the total; only those requiring
    # gradients count towards the trainable subtotal.
    total_elements += numel
    trainable_elements += numel if param.requires_grad else 0

    print(f"{status:12} | {name:60} | shape: {shape_text:25} | #params: {numel}")

trainable_pct = 100 * trainable_elements / total_elements
print("\n📊 Summary")
print(f"Trainable: {trainable_elements:,} / {total_elements:,} ({trainable_pct:.2f}%)")


📋 パラメータ一覧（trainable / untrainable 含む）
✅ trainable  | vit.embeddings.cls_token                                     | shape: (1, 1, 768)               | #params: 768
✅ trainable  | vit.embeddings.position_embeddings                           | shape: (1, 197, 768)             | #params: 151296
✅ trainable  | vit.embeddings.patch_embeddings.projection.weight            | shape: (768, 3, 16, 16)          | #params: 589824
✅ trainable  | vit.embeddings.patch_embeddings.projection.bias              | shape: (768,)                    | #params: 768
✅ trainable  | vit.encoder.layer.0.attention.attention.query.weight         | shape: (768, 768)                | #params: 589824
✅ trainable  | vit.encoder.layer.0.attention.attention.query.bias           | shape: (768,)                    | #params: 768
✅ trainable  | vit.encoder.layer.0.attention.attention.key.weight           | shape: (768, 768)                | #params: 589824
✅ trainable  | vit.encoder.layer.0.attention.attention.key.bias     

In [44]:
# Full fine-tuning on the reduced dataset.
# NOTE: the denominator shown in the progress bar is num_epoch * num_sample / batch_size.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=processor,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    eval_dataset=ds_limited["test"],
    train_dataset=ds_limited["train"],
)
train_results = trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,4.0475,3.857394,{'accuracy': 0.48},{'f1': 0.36001564945226916}
2,3.3162,3.419569,{'accuracy': 0.56},{'f1': 0.3906186406186406}
3,2.7098,3.124991,{'accuracy': 0.61},{'f1': 0.4697434224831485}
4,2.2836,2.943212,{'accuracy': 0.63},{'f1': 0.4822179322179322}
5,2.0319,2.883371,{'accuracy': 0.67},{'f1': 0.5060774081322026}


# full fine-tuning (全部のデータ)

In [48]:
# Show the number of samples in the full train and test splits.
print(*(len(ds_preprocessed[split_name]) for split_name in ("train", "test")))

40000 10000


In [49]:
# Start again from a freshly loaded pretrained ViT so this run does not
# continue from the weights trained in the previous experiment.
model, loading_info = ViTForImageClassification.from_pretrained(
    ViTExperiment.ViT_PATH,
    output_loading_info=True,
    num_labels=len(labels),
    label2id={label_name: str(idx) for idx, label_name in enumerate(labels)},
    id2label={str(idx): label_name for idx, label_name in enumerate(labels)},
)
# Move to the target device, then let the project helper handle the weights
# that were reported as missing from the checkpoint.
model = maybe_initialize_repair_weights_(model.to(device), loading_info["missing_keys"])

# Full fine-tuning on the complete train/test splits.
# NOTE: the denominator shown in the progress bar is num_epoch * num_sample / batch_size.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=processor,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    eval_dataset=ds_preprocessed["test"],
    train_dataset=ds_preprocessed["train"],
)
train_results = trainer.train()

🛠️ Initializing intermediate.repair.weight as identity matrix (for missing weights)




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.2245,0.670846,{'accuracy': 0.819},{'f1': 0.8179123374587242}
2,0.3556,0.529411,{'accuracy': 0.8502},{'f1': 0.8508037342498159}
3,0.1708,0.473328,{'accuracy': 0.8758},{'f1': 0.8764186321518354}
4,0.0688,0.45563,{'accuracy': 0.8919},{'f1': 0.8919041060700329}
5,0.0193,0.44479,{'accuracy': 0.8978},{'f1': 0.8980407835784112}


# 中間状態のアップデートのLoRA化 (一部のデータ)

In [50]:
from peft import LoraConfig, get_peft_model

# Reload the pretrained ViT so the LoRA experiment starts from fresh weights.
model, loading_info = ViTForImageClassification.from_pretrained(
    ViTExperiment.ViT_PATH,
    output_loading_info=True,
    num_labels=len(labels),
    label2id={label_name: str(idx) for idx, label_name in enumerate(labels)},
    id2label={str(idx): label_name for idx, label_name in enumerate(labels)},
)
# Move to the target device, then let the project helper handle the weights
# that were reported as missing from the checkpoint.
model = maybe_initialize_repair_weights_(model.to(device), loading_info["missing_keys"])

# Rank-16 LoRA adapter attached only to the intermediate.repair module of
# encoder layer 11; the classifier is listed in modules_to_save.
config = LoraConfig(
    target_modules=["vit.encoder.layer.11.intermediate.repair"],
    modules_to_save=["classifier"],
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)
lora_model = get_peft_model(model, config)

🛠️ Initializing intermediate.repair.weight as identity matrix (for missing weights)


In [51]:
# Train the LoRA-wrapped model on the reduced dataset.
trainer = Trainer(
    args=training_args,
    model=lora_model,
    tokenizer=processor,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    eval_dataset=ds_limited["test"],
    train_dataset=ds_limited["train"],
)
train_results = trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,4.5656,4.476988,{'accuracy': 0.1},{'f1': 0.07357339630066903}
2,4.3867,4.389188,{'accuracy': 0.24},{'f1': 0.1651055611055611}
3,4.2957,4.333957,{'accuracy': 0.3},{'f1': 0.20384785493144625}
4,4.2333,4.303448,{'accuracy': 0.33},{'f1': 0.2448888888888889}
5,4.1963,4.291885,{'accuracy': 0.36},{'f1': 0.27449982449982446}


# 中間状態のアップデートのLoRA化 (全部のデータ)

## r=16

In [52]:
from peft import LoraConfig, get_peft_model

# Reload the pretrained ViT so this run starts from fresh weights.
model, loading_info = ViTForImageClassification.from_pretrained(
    ViTExperiment.ViT_PATH,
    output_loading_info=True,
    num_labels=len(labels),
    label2id={label_name: str(idx) for idx, label_name in enumerate(labels)},
    id2label={str(idx): label_name for idx, label_name in enumerate(labels)},
)
# Move to the target device, then let the project helper handle the weights
# that were reported as missing from the checkpoint.
model = maybe_initialize_repair_weights_(model.to(device), loading_info["missing_keys"])

# LoRA with r=16 and lora_alpha=16 on the layer-11 intermediate.repair module;
# the classifier is listed in modules_to_save.
config = LoraConfig(
    target_modules=["vit.encoder.layer.11.intermediate.repair"],
    modules_to_save=["classifier"],
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)
lora_model = get_peft_model(model, config)

🛠️ Initializing intermediate.repair.weight as identity matrix (for missing weights)


In [53]:
# Train the r=16 LoRA-wrapped model on the complete train/test splits.
trainer = Trainer(
    args=training_args,
    model=lora_model,
    tokenizer=processor,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    eval_dataset=ds_preprocessed["test"],
    train_dataset=ds_preprocessed["train"],
)
train_results = trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,2.5787,1.321019,{'accuracy': 0.805},{'f1': 0.8028772556037737}
2,1.0091,0.854288,{'accuracy': 0.8224},{'f1': 0.8225894115913506}
3,0.7679,0.744461,{'accuracy': 0.8291},{'f1': 0.8293411052414106}
4,0.6879,0.701357,{'accuracy': 0.8299},{'f1': 0.829816059011502}
5,0.6547,0.689926,{'accuracy': 0.832},{'f1': 0.8320781254649822}


## r=1

In [10]:
from peft import LoraConfig, get_peft_model

# Reload the pretrained ViT so this run starts from fresh weights.
model, loading_info = ViTForImageClassification.from_pretrained(
    ViTExperiment.ViT_PATH,
    output_loading_info=True,
    num_labels=len(labels),
    label2id={label_name: str(idx) for idx, label_name in enumerate(labels)},
    id2label={str(idx): label_name for idx, label_name in enumerate(labels)},
)
# Move to the target device, then let the project helper handle the weights
# that were reported as missing from the checkpoint.
model = maybe_initialize_repair_weights_(model.to(device), loading_info["missing_keys"])

# Same LoRA setup as the r=16 experiment, but with r=1 and lora_alpha=1.
config = LoraConfig(
    target_modules=["vit.encoder.layer.11.intermediate.repair"],
    modules_to_save=["classifier"],
    r=1,
    lora_alpha=1,
    lora_dropout=0.1,
    bias="none",
)
lora_model = get_peft_model(model, config)

Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['encoder.layer.10.intermediate.repair.weight', 'encoder.layer.5.intermediate.repair.weight', 'encoder.layer.0.intermediate.repair.weight', 'encoder.layer.2.intermediate.repair.weight'

🛠️ Initializing intermediate.repair.weight as identity matrix (for missing weights)


In [None]:
# Train the r=1 LoRA-wrapped model on the complete train/test splits.
trainer = Trainer(
    args=training_args,
    model=lora_model,
    tokenizer=processor,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    eval_dataset=ds_preprocessed["test"],
    train_dataset=ds_preprocessed["train"],
)
train_results = trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,2.9254,1.790163,{'accuracy': 0.7996},{'f1': 0.7965313283474446}
2,1.3354,1.075958,{'accuracy': 0.8165},{'f1': 0.8157431450663465}
3,0.9415,0.881814,{'accuracy': 0.8224},{'f1': 0.8221619564406377}
4,0.8149,0.812313,{'accuracy': 0.8247},{'f1': 0.8243620530933536}
5,0.7682,0.793205,{'accuracy': 0.825},{'f1': 0.8246330669064692}


: 

r=16 (accuracy 0.832) と r=1 (accuracy 0.825) の場合では，accuracy の違いは 0.007 程度 (0.01くらい) しかない．
思ったより変わらない．

## ここまででわかったこと

FFN中間状態のアップデートのLoRA化でも，full fine-tuningほどではないが高い精度を達成できる (全データでの accuracy: full fine-tuning 0.898 vs LoRA r=16 0.832)．

-> 全データサンプルではなく直したいサンプルセット (+副作用抑制セット) を学習させたらどうなるか？

# あと調べたいこと
- LoRA の r を変えたらどうなる？
- LoRA の A の受付の形状をVdiffによって小さくしたらどうなる？