Google Colabで実装を行ったため、main.pyは使用していません。実行する場合は、ファイルのパスなどを適宜変更してください。また、本ノートブックでは9個のモデルの出力結果を結合していますが、各モデルのファインチューニングと出力にはそれぞれ5〜8時間ほどかかると思われます。自身の実装環境では各モデルを別々に訓練しており、時間が足りなかったため、本ノートブックを通しで実行して検証することはできていません。パラメータなどは再確認したためエラーはないと思いますが、万が一エラーが発生した場合はお手数をおかけします。

# **1. データ読み込み**

In [None]:
# Install the third-party packages used by the rest of this notebook
# (IPython shell commands; bitsandbytes is pulled from the official PyPI index).
! pip install accelerate
! pip install --upgrade bitsandbytes -i https://pypi.org/simple/
! pip install --upgrade transformers
! pip install datasets
! pip install peft
! pip install wandb

In [None]:
import zipfile
from google.colab import drive

# Mount Google Drive so the dataset archive stored there becomes readable.
drive.mount('/content/drive/')

# Unpack the training archive to local disk. The context manager closes the
# archive even when extraction raises.
# NOTE(review): inference later reads images from /tmp/valid/, which nothing
# in this cell creates - presumably a validation archive has to be extracted
# as well; confirm before running end to end.
with zipfile.ZipFile("/content/drive/MyDrive/Colab Notebooks/VQA_final/VQA/train.zip", 'r') as zip_ref:
    zip_ref.extractall("/tmp")

In [None]:
import re
import random
import time
from statistics import mode

import os
import numpy as np
import pandas
import torch
import torch.nn as nn
import torchvision
from torchvision import transforms
from PIL import Image
from tqdm.notebook import tqdm
import pandas as pd
import json


def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU, the current CUDA
    device and all CUDA devices), and puts cuDNN into deterministic mode with
    benchmarking switched off.

    Args:
        seed: Integer seed handed to every generator.
    """
    # The cuDNN flags are independent of the seeding calls below.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)


# Fix the seed once for the whole notebook.
set_seed(83)

In [None]:
import json
import os
import zipfile
import torch
from PIL import Image
import io
from torchvision import transforms
from collections import Counter
from datasets import Dataset, Features, Value, Image
from tqdm.notebook import tqdm
from collections import Counter


def createDataset(image_folder, json_path):
    """Build a Hugging Face ``Dataset`` of (answer, question, image) rows.

    Args:
        image_folder: Prefix that is concatenated directly with each image
            file name, so it must already end with a path separator.
        json_path: Path to the annotation JSON. It is read as a mapping with
            ``'image'``, ``'question'`` and ``'answers'`` entries, each keyed
            by the stringified sample index (``"0"``, ``"1"``, ...).

    Returns:
        A ``datasets.Dataset`` with string columns ``answer`` and
        ``question`` and an ``image`` column built from the image paths.
    """
    # The file is only needed while parsing; `with` closes it afterwards.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    trainList = []
    for k in tqdm(range(len(data['image']))):
        key = str(k)
        # Training target = the most frequent annotator answer; on a tie the
        # answer that appears first in the annotation list wins.
        options = [entry["answer"] for entry in data['answers'][key]]
        mode_answer = Counter(options).most_common(1)[0][0]
        trainList.append([mode_answer, data['question'][key], image_folder + data['image'][key]])

    labels = ['answer', 'question', 'image']
    df = pd.DataFrame.from_records(trainList, columns=labels)
    # `Image` here is the `datasets` feature type (imported from `datasets`
    # in this cell), not the PIL module.
    features = Features({
      'answer': Value('string'),
      'question': Value('string'),
      'image': Image()
    })

    return Dataset.from_pandas(df, features=features)

# Build the training dataset from the /tmp/train/ image folder and the
# train.json annotations stored on Drive.
train_dataset = createDataset("/tmp/train/", '/content/drive/MyDrive/Colab Notebooks/VQA_final/VQA/train.json')

In [None]:
# Encode a batch of examples with the PaliGemmaProcessor.
def collate_fn(examples):
    """Turn a list of dataset rows into one processor-encoded batch.

    Every question is prefixed with ``"answer en "`` and passed as the text
    prompt, the answer is passed as ``suffix`` and the image is converted to
    RGB. The encoded batch is cast to bfloat16 and moved to ``device``.

    Relies on the module-level ``processor`` and ``device`` being defined
    before the first call.

    Args:
        examples: Rows with ``"question"``, ``"answer"`` and ``"image"``
            entries.

    Returns:
        The processor output after the dtype cast and device move.
    """
    prompts, targets, pictures = [], [], []
    for row in examples:
        prompts.append("answer en " + row["question"])
        targets.append(row['answer'])
        pictures.append(row["image"].convert("RGB"))

    batch = processor(
        text=prompts,
        images=pictures,
        suffix=targets,
        return_tensors="pt",
        padding="longest",
        tokenize_newline_separately=False,
    )
    return batch.to(torch.bfloat16).to(device)

# **2. モデルのパラメータ定義**

In [None]:
from huggingface_hub import notebook_login


# Register your Hugging Face read and write tokens here; they are needed to
# access the PaliGemma models.
notebook_login()

In [None]:
from peft import get_peft_model, LoraConfig

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Checkpoint variants that make up the ensemble. The order matters: the LoRA
# configs and training arguments are matched to these entries by index.
_paligemma_variants = (
    "ft-vqav2",
    "ft-gqa",
    "ft-tallyqa",
    "ft-stvqa",
    "ft-textvqa",
    "ft-aokvqa-da",
    "pt",
    "ft-ocrvqa",
    "ft-okvqa",
)
model_ids = [f"google/paligemma-3b-{variant}-224" for variant in _paligemma_variants]

# LoRA fine-tuning parameters: one config per entry of `model_ids`, matched
# by index.
def _lora_config(**overrides):
    """Return a rank-8 ``CAUSAL_LM`` LoraConfig over the listed projections.

    Keyword arguments are forwarded to ``LoraConfig`` on top of the shared
    settings. A fresh ``target_modules`` list is created on every call so the
    configs never share a mutable object.
    """
    return LoraConfig(
        r=8,
        target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
        task_type="CAUSAL_LM",
        **overrides,
    )


lora_configs = [
    _lora_config(lora_dropout=0.05, use_rslora=True, lora_alpha=16, bias="none"),
    _lora_config(),
    _lora_config(),
    _lora_config(),
    _lora_config(),
    _lora_config(),
    _lora_config(lora_dropout=0.05),
    _lora_config(),
    _lora_config(),
]

In [None]:
from transformers import Trainer, TrainingArguments

lr = 1e-5


# Trainer parameters used during training: one TrainingArguments per entry of
# `model_ids`, matched by index.
def _training_arguments(num_train_epochs, **overrides):
    """Return TrainingArguments with the settings shared by every model.

    Args:
        num_train_epochs: Number of epochs for this model.
        **overrides: Optimizer / regularisation settings that differ between
            models; forwarded to ``TrainingArguments`` unchanged.
    """
    return TrainingArguments(
        num_train_epochs=num_train_epochs,
        remove_unused_columns=False,
        per_device_train_batch_size=10,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        learning_rate=lr,
        logging_steps=100,
        optim="adamw_hf",
        push_to_hub=False,
        dataloader_pin_memory=False,
        lr_scheduler_type="linear",
        **overrides,
    )


# Optimizer settings shared by six of the nine runs.
# NOTE(review): adam_beta2=0 is reproduced exactly as written in the original
# settings - confirm it is intentional.
_zero_decay = {"weight_decay": 0, "adam_beta2": 0}

training_args = [
    _training_arguments(6, weight_decay=1e-4, adam_epsilon=1e-8, adam_beta2=0.999, max_grad_norm=3.0),
    _training_arguments(6, weight_decay=1e-4, adam_epsilon=1e-8, adam_beta2=0),
    _training_arguments(6, **_zero_decay),
    _training_arguments(6, **_zero_decay),
    _training_arguments(7, **_zero_decay),
    _training_arguments(7, **_zero_decay),
    _training_arguments(10, weight_decay=1e-5, max_grad_norm=10.0),
    _training_arguments(8, **_zero_decay),
    _training_arguments(6, **_zero_decay),
]

# **3. ファインチューニングと推論**

In [None]:
from transformers import BitsAndBytesConfig, PaliGemmaForConditionalGeneration, PaliGemmaProcessor
from peft import get_peft_model

def load_model(model_id, lora_config):
    """Load a 4-bit quantized PaliGemma checkpoint with LoRA adapters attached.

    Args:
        model_id: Hub id of the PaliGemma checkpoint to load.
        lora_config: ``LoraConfig`` passed to ``get_peft_model``.

    Returns:
        ``(model, processor)``: the PEFT-wrapped quantized model and the
        ``PaliGemmaProcessor`` loaded from the same checkpoint.
    """
    processor = PaliGemmaProcessor.from_pretrained(model_id)

    # The earlier version first loaded a full bfloat16 copy onto the device
    # and froze its vision tower and projector, then replaced it with the
    # quantized model below. That copy never reached the caller, so the
    # freezes had no effect and two models sat on the GPU at once; the
    # throwaway load is gone.
    # NOTE(review): target names such as "q_proj" may also match modules
    # inside the vision tower - confirm whether adapters there are intended.

    # QLoRA setup: 4-bit NF4 weights with double quantization and bfloat16
    # compute.
    bnb_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_use_double_quant=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_compute_dtype=torch.bfloat16
    )
    model = PaliGemmaForConditionalGeneration.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map={"": 0},
        torch_dtype=torch.bfloat16,
    )
    model = get_peft_model(model, lora_config)
    return model, processor


import gc

# `Image` in this notebook is rebound to `datasets.Image` by a later import,
# and that class has no `open`; use PIL's module under an unambiguous alias.
from PIL import Image as PILImage

# Same prefix that collate_fn puts in front of every training question, so
# the model is queried with the prompt format it was fine-tuned on.
PROMPT_PREFIX = "answer en "
VALID_JSON_PATH = '/content/drive/MyDrive/Colab Notebooks/VQA_final/VQA/valid.json'
# NOTE(review): nothing visible in this notebook extracts images to this
# folder - confirm it is populated before running.
VALID_IMAGE_DIR = '/tmp/valid/'
batch_size = 16


def extract_after_newline(item):
    """Return the text after the last newline of a decoded generation."""
    return item.split('\n')[-1]


def _load_validation_inputs(json_path, image_dir):
    """Return ``(image_paths, questions)`` from the JSON, in index order."""
    with open(json_path, 'r') as f:
        data = json.load(f)
    keys = [str(k) for k in range(len(data['image']))]
    image_paths = [image_dir + data['image'][key] for key in keys]
    questions = [data['question'][key] for key in keys]
    return image_paths, questions


def _load_rgb(path):
    """Open an image file, convert it to RGB and close the file handle."""
    with PILImage.open(path) as img:
        return img.convert("RGB")


def _generate_answers(model, processor, image_paths, questions):
    """Run batched generation and return one answer string per question."""
    # Leave training mode so dropout (e.g. lora_dropout) is disabled.
    model.eval()

    decoded = []
    for start in tqdm(range(0, len(image_paths), batch_size)):
        stop = start + batch_size
        images = [_load_rgb(path) for path in image_paths[start:stop]]
        prompts = [PROMPT_PREFIX + question for question in questions[start:stop]]

        # Keyword arguments, as in collate_fn, so the call does not depend on
        # the processor's positional parameter order.
        inputs = processor(text=prompts, images=images, return_tensors="pt", padding=True).to(device)

        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=20)

        decoded.extend(processor.batch_decode(outputs, skip_special_tokens=True))

    # Keep only the text after the last newline of each decoded sequence.
    return np.array([extract_after_newline(item) for item in decoded])


predictions = []

# The validation inputs are the same for every model, so read them once.
image_paths, questions = _load_validation_inputs(VALID_JSON_PATH, VALID_IMAGE_DIR)

# Fine-tune each model, then run inference with it.
for i, model_id in enumerate(model_ids):
    # `processor` must stay a module-level name: collate_fn reads it.
    model, processor = load_model(model_id, lora_configs[i])
    trainer = Trainer(
        model=model,
        train_dataset=train_dataset,
        data_collator=collate_fn,
        args=training_args[i],
    )
    trainer.train()

    submission = _generate_answers(model, processor, image_paths, questions)
    predictions.append(submission)

    # Release GPU memory before the next model is loaded.
    del model
    del trainer
    gc.collect()
    torch.cuda.empty_cache()

# **4. アンサンブル**

In [None]:
from collections import Counter
from collections import defaultdict

def weighted_ensemble(predictions, weights):
    """Combine per-model predictions by weighted voting.

    Args:
        predictions: Sequence of per-model prediction sequences, all of the
            same length; element ``i`` of each is that model's answer for
            sample ``i``.
        weights: One vote weight per model, aligned with ``predictions``.

    Returns:
        ``np.ndarray`` holding, for every sample, the answer with the largest
        summed weight. On a tie the answer that received its first vote
        earliest (i.e. from the earliest model) wins.

    Raises:
        ValueError: If ``predictions`` is empty, if ``weights`` does not have
            one entry per model, or if the models disagree on the number of
            samples.
    """
    # `len()` instead of truthiness so NumPy arrays are accepted as well.
    if len(predictions) == 0:
        raise ValueError("predictions must contain at least one model output")
    if len(predictions) != len(weights):
        # zip() would silently drop the surplus models or weights.
        raise ValueError(
            f"got {len(predictions)} prediction sets but {len(weights)} weights"
        )
    num_samples = len(predictions[0])
    if any(len(pred) != num_samples for pred in predictions):
        raise ValueError("all prediction sets must have the same number of samples")

    ensembled_predictions = []
    for votes in zip(*predictions):
        # Add each model's weight to the answer it voted for.
        counter = Counter()
        for answer, weight in zip(votes, weights):
            counter[answer] += weight

        # Pick the answer with the highest total weight.
        ensembled_predictions.append(counter.most_common(1)[0][0])

    return np.array(ensembled_predictions)


# Vote weights, one per entry of `predictions` (same order as `model_ids`).
# They sum to 1.2 rather than 1; that is harmless because only the answer
# with the largest weighted total is kept for each sample.
weights = [0.2, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
ensembled_result = weighted_ensemble(predictions, weights)
# Write the final answers to submission.npy in the current working directory.
np.save('submission.npy', ensembled_result)