In [1]:
import os
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from transformers import ViTForImageClassification, ViTImageProcessor
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


### GPU 可用性のテスト

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### イメージフォルダを処理

In [3]:
# データの前処理
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

# CIFAR-10データセットの導入
train_dataset = datasets.CIFAR10(root='./dataset_cifar10', train=True, download=True, transform=transform)
test_dataset = datasets.CIFAR10(root='./dataset_cifar10', train=False, download=True, transform=transform)

Files already downloaded and verified
Files already downloaded and verified


### HunggingFaceからモデルを導入

In [4]:
# モデルの名前と保存先のパス
model_name = "google/vit-base-patch16-224"
model_path = "google_vit"

if os.path.exists(model_path):
    model = ViTForImageClassification.from_pretrained(model_path)
    print("Loaded model from vit-cifar10")
else:
    model = ViTForImageClassification.from_pretrained(model_name, num_labels=10, ignore_mismatched_sizes=True)
    print("Using default pretrained model with custom classifier")

Loaded model from vit-cifar10


導入されたモデルはGPUへ移動してみます。

In [5]:
model.to(device)

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fe

### ラベルをモデルの入力形式に変換

`ViTImageProcessor.from_pretrained` 入力埋め込みの勾配を有効にします。これは、モデルの重みを固定したままアダプターの重みを微調整するのに役立ちます。

In [6]:
# rescaleはいらない、その画像もう正規化されていた。
# processor = ViTImageProcessor.from_pretrained(model_name, do_rescale=False)  # Googleへの接続が必要ですので、エラーが発生した場合は、以下の部分を利用してください。
processor = ViTImageProcessor.from_pretrained(model_name, do_rescale=False, use_auth_token=False, trust_remote_code=False, verify=False)



以下の内容はデータの前処理です。

In [7]:
def collate_fn(batch):
    images, labels = zip(*batch)

    # 画像をリサイズと正規化、テンソル化などの前処理
    images = [img.numpy().transpose((1, 2, 0)) for img in images] # 画像の次元を変更
    images = [torch.tensor(img) for img in images] # テンソル化
    encodings = processor(images=images, return_tensors="pt") # モデルに入力する形に変換
    encodings["labels"] = torch.tensor(labels) # ラベルをテンソル化
    return encodings

### 学習と評価

訓練用パラメータの設定はこちらから

In [8]:
batch_size = 64
accumulation_steps = 64  
epochs = 10

オープンソースのデータセットからデータを読み込み、そのデータをトレーニングとテストの二つの部分に分割します。

In [9]:
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

オプティマイザーの定義

In [10]:
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

つぎからはモデルの学習など

In [11]:
# モデルの学習
model.train()
for epoch in range(epochs): 
    total_loss = 0
    optimizer.zero_grad()

    for i, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}")):
        batch = {k: v.to(device) for k, v in batch.items()} 
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        if (i + 1) % accumulation_steps == 0:  # 64バッチごとに勾配を更新
            optimizer.step()
            optimizer.zero_grad()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch: {epoch + 1} Loss: {avg_loss:.4f}")

  context_layer = torch.nn.functional.scaled_dot_product_attention(
Epoch 1: 100%|██████████| 782/782 [14:19<00:00,  1.10s/it]


Epoch: 1 Loss: 1.3134


Epoch 2: 100%|██████████| 782/782 [14:02<00:00,  1.08s/it]


Epoch: 2 Loss: 0.2975


Epoch 3: 100%|██████████| 782/782 [14:07<00:00,  1.08s/it]


Epoch: 3 Loss: 0.0996


Epoch 4: 100%|██████████| 782/782 [14:19<00:00,  1.10s/it]


Epoch: 4 Loss: 0.0503


Epoch 5: 100%|██████████| 782/782 [14:12<00:00,  1.09s/it]


Epoch: 5 Loss: 0.0275


Epoch 6: 100%|██████████| 782/782 [16:01<00:00,  1.23s/it]


Epoch: 6 Loss: 0.0166


Epoch 7: 100%|██████████| 782/782 [18:57<00:00,  1.45s/it]


Epoch: 7 Loss: 0.0108


Epoch 8: 100%|██████████| 782/782 [18:35<00:00,  1.43s/it]


Epoch: 8 Loss: 0.0078


Epoch 9: 100%|██████████| 782/782 [19:09<00:00,  1.47s/it]


Epoch: 9 Loss: 0.0059


Epoch 10: 100%|██████████| 782/782 [16:05<00:00,  1.23s/it]

Epoch: 10 Loss: 0.0047





訓練が終わったら、モデルの精度を評価します。

In [12]:
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        preds = torch.argmax(outputs.logits, dim=-1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch["labels"].cpu().numpy())

# テストデータでの精度を計算
accuracy = accuracy_score(all_labels, all_preds)
report = classification_report(all_labels, all_preds, target_names=test_dataset.classes)

print(f"Test Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(report)

Evaluating: 100%|██████████| 157/157 [02:08<00:00,  1.22it/s]

Test Accuracy: 0.9814
Classification Report:
              precision    recall  f1-score   support

    airplane       0.98      0.98      0.98      1000
  automobile       0.98      0.99      0.98      1000
        bird       0.99      0.98      0.99      1000
         cat       0.96      0.96      0.96      1000
        deer       0.98      0.99      0.98      1000
         dog       0.97      0.96      0.97      1000
        frog       0.99      1.00      0.99      1000
       horse       0.99      0.99      0.99      1000
        ship       0.99      0.99      0.99      1000
       truck       0.98      0.98      0.98      1000

    accuracy                           0.98     10000
   macro avg       0.98      0.98      0.98     10000
weighted avg       0.98      0.98      0.98     10000






### モデルの保存

In [13]:
model_path_custom = model_path + "_cifar10"
model.save_pretrained(model_path_custom)