# C10でFine-tuningしたViTモデルのフォワードパスを実行する

## ライブラリのインポート
transformersに関しては，.vscode/settings.jsonのextraPathsにパスを記載したらうまくインポートできた．

In [1]:
import os, sys, math
sys.path.append("../src")
import numpy as np
import torch
from datasets import load_from_disk
from transformers import DefaultDataCollator, ViTForImageClassification, Trainer
from utils.helper import get_device
from utils.vit_util import processor, transforms, compute_metrics
from utils.constant import ViTExperiment

2024-04-23 19:44:17.191574: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-23 19:44:18.827673: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-04-23 19:44:18.827829: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
  met_acc = load_metric("accuracy")


## 初期設定

In [2]:
# Get the device to run on (cuda or cpu).
device = get_device()

# Load the dataset from disk (only the first load is somewhat slow).
dataset_dir = ViTExperiment.DATASET_DIR
cifar10 = load_from_disk(os.path.join(dataset_dir, "c10"))
# Have the preprocessing applied on the fly whenever samples are read.
cifar10_preprocessed = cifar10.with_transform(transforms)
# List of strings naming each label.
labels = cifar10_preprocessed["train"].features["label"].names

# Directory that holds the fine-tuned model and its training settings.
pretrained_dir = ViTExperiment.OUTPUT_DIR
# Load the settings that were used at training time.
# NOTE(review): torch.load unpickles the file, so only point this at a
# training_args.bin you produced yourself.
training_args = torch.load(os.path.join(pretrained_dir, "training_args.bin"))

# Load the fine-tuned model, move it to the device and switch to eval mode.
model = ViTForImageClassification.from_pretrained(pretrained_dir)
model = model.to(device)
model.eval()

# Collator used to assemble individual samples into batches.
data_collator = DefaultDataCollator()
# Build the Trainer object that drives inference.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=cifar10_preprocessed["train"],
    eval_dataset=cifar10_preprocessed["test"],
    data_collator=data_collator,
    tokenizer=processor,
    compute_metrics=compute_metrics,
)

Device: cuda


## 推論の実行

In [5]:
# Work out how many iterations each predict() call will take.
# The per-device values are kept because other cells may still read them.
training_args_dict = training_args.to_dict()
train_batch_size = training_args_dict["per_device_train_batch_size"]
eval_batch_size = training_args_dict["per_device_eval_batch_size"]
# Trainer.predict() always batches with the evaluation batch size, even when
# it is handed the training split, so both iteration counts must be derived
# from it. TrainingArguments.eval_batch_size also accounts for the number of
# GPUs in use.
predict_batch_size = training_args.eval_batch_size
n_train = len(cifar10_preprocessed["train"])
n_test = len(cifar10_preprocessed["test"])
train_iter = math.ceil(n_train / predict_batch_size)
eval_iter = math.ceil(n_test / predict_batch_size)

# Run inference on the training and test data.
print(f"predict training data... #iter = {train_iter} ({n_train} samples / {predict_batch_size} batches)")
train_pred = trainer.predict(cifar10_preprocessed["train"])
print(f"predict evaluation data... #iter = {eval_iter} ({n_test} samples / {predict_batch_size} batches)")
test_pred = trainer.predict(cifar10_preprocessed["test"])

predict training data... #iter = 1563 (50000 samples / 32 batches)


predict evaluation data... #iter = 313 (10000 samples / 32 batches)


## 推論結果をnpyで保存する

In [6]:
# Sanity check only: show the distinct label values in the training split
# and how many samples carry each of them.
np.unique(np.array(cifar10["train"]["label"]), return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000]))

In [7]:
# Label column of the training split as a NumPy array, so it can be compared
# element-wise against a class index.
train_labels = np.array(cifar10["train"]["label"])

In [19]:
# Pass train_pred.predictions[1] through softmax to turn it into probabilities.
# NOTE(review): index 1 is assumed to hold the class logits - confirm against
# the model's output order.
train_pred_proba = torch.nn.functional.softmax(torch.tensor(train_pred.predictions[1]), dim=-1)
# Convert train_pred_proba to a NumPy array.
train_pred_proba = train_pred_proba.cpu().numpy()
# np.save does not create missing directories, so make sure the output
# directory exists before writing into it.
pred_results_dir = os.path.join(pretrained_dir, "pred_results")
os.makedirs(pred_results_dir, exist_ok=True)
# Save the probabilities to a separate file per ground-truth label.
for c in range(len(labels)):
    # Rows belonging to the samples whose label is c.
    tgt_proba = train_pred_proba[train_labels == c]
    np.save(os.path.join(pred_results_dir, f"train_proba_{c}.npy"), tgt_proba)
    print(f"train_proba_{c}.npy ({tgt_proba.shape}) saved")

train_proba_0.npy ((5000, 10)) saved
train_proba_1.npy ((5000, 10)) saved
train_proba_2.npy ((5000, 10)) saved
train_proba_3.npy ((5000, 10)) saved
train_proba_4.npy ((5000, 10)) saved
train_proba_5.npy ((5000, 10)) saved
train_proba_6.npy ((5000, 10)) saved
train_proba_7.npy ((5000, 10)) saved
train_proba_8.npy ((5000, 10)) saved
train_proba_9.npy ((5000, 10)) saved


In [20]:
# Sanity check only: show the distinct label values in the test split and
# how many samples carry each of them.
np.unique(np.array(cifar10["test"]["label"]), return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000]))

In [21]:
# Label column of the test split as a NumPy array, so it can be compared
# element-wise against a class index.
test_labels = np.array(cifar10["test"]["label"])

In [23]:
# Pass test_pred.predictions[1] through softmax to turn it into probabilities.
# NOTE(review): index 1 is assumed to hold the class logits - confirm against
# the model's output order.
test_pred_proba = torch.nn.functional.softmax(torch.tensor(test_pred.predictions[1]), dim=-1)
# Convert test_pred_proba to a NumPy array.
test_pred_proba = test_pred_proba.cpu().numpy()
# np.save does not create missing directories, so make sure the output
# directory exists before writing into it.
pred_results_dir = os.path.join(pretrained_dir, "pred_results")
os.makedirs(pred_results_dir, exist_ok=True)
# Save the probabilities to a separate file per ground-truth label.
for c in range(len(labels)):
    # Rows belonging to the samples whose label is c.
    tgt_proba = test_pred_proba[test_labels == c]
    print(tgt_proba.shape)
    # Save this class's slice of test_pred_proba.
    np.save(os.path.join(pred_results_dir, f"test_proba_{c}.npy"), tgt_proba)
    print(f"test_proba_{c}.npy ({tgt_proba.shape}) saved")

(1000, 10)
test_proba_0.npy ((1000, 10)) saved
(1000, 10)
test_proba_1.npy ((1000, 10)) saved
(1000, 10)
test_proba_2.npy ((1000, 10)) saved
(1000, 10)
test_proba_3.npy ((1000, 10)) saved
(1000, 10)
test_proba_4.npy ((1000, 10)) saved
(1000, 10)
test_proba_5.npy ((1000, 10)) saved
(1000, 10)
test_proba_6.npy ((1000, 10)) saved
(1000, 10)
test_proba_7.npy ((1000, 10)) saved
(1000, 10)
test_proba_8.npy ((1000, 10)) saved
(1000, 10)
test_proba_9.npy ((1000, 10)) saved
