## Step 1. Import Libraries

In [1]:
from datasets import load_dataset

## Step 2. Load Dataset

In [2]:
# Load every record of the local JSON annotation file as a single split, then
# hold out 2% of it for evaluation. The fixed seed keeps the train/test
# partition identical across kernel restarts, so evaluation numbers from
# different runs are comparable.
# For a quick smoke run, load a subset instead, e.g. split="train[:5%]".
dataset = load_dataset("json", data_files="./image_gt.json", split="train")
datasets = dataset.train_test_split(test_size=0.02, seed=42)
datasets

DatasetDict({
    train: Dataset({
        features: ['image1', 'image2', 'class'],
        num_rows: 4508
    })
    test: Dataset({
        features: ['image1', 'image2', 'class'],
        num_rows: 92
    })
})

In [3]:
# Pull both splits out of the DatasetDict under the names used by the rest of
# the notebook, then display the training split.
train_ds, test_ds = (datasets[split_name] for split_name in ("train", "test"))
train_ds

Dataset({
    features: ['image1', 'image2', 'class'],
    num_rows: 4508
})

In [4]:
from textwrap import wrap
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image


def plot_images(images1, images2, captions, root="./heatmapOn_trajOn/"):
    """Plot image pairs in one row, each pair stacked vertically under its caption.

    For every index ``i`` the image ``images1[i]`` is drawn on top and
    ``images2[i]`` (resized to the size of the first image) directly below it.

    Args:
        images1: File names, relative to ``root``, of the upper images.
        images2: File names, relative to ``root``, of the lower images.
        captions: One caption per pair; wrapped at 12 characters for the title.
        root: Directory containing the image files.

    Raises:
        FileNotFoundError: If an image file does not exist under ``root``.
    """
    import os

    fig = plt.figure(figsize=(20, 20))
    n_pairs = len(images1)
    for i in range(n_pairs):
        # Context managers release the file handles once the pixels have been
        # copied onto the canvas.
        with Image.open(os.path.join(root, images1[i])) as top, \
                Image.open(os.path.join(root, images2[i])) as bottom_src:
            bottom = bottom_src.resize((top.width, top.height))

            # The canvas is as wide as one image and as tall as both together
            # (the pair is stacked vertically, not side by side).
            canvas = Image.new('RGB', (top.width, top.height + bottom.height))
            canvas.paste(top, (0, 0))
            canvas.paste(bottom, (0, top.height))

        ax = fig.add_subplot(1, n_pairs, i + 1)
        # str() lets non-string class values (e.g. integers) be used as captions.
        ax.set_title("\n".join(wrap(str(captions[i]), 12)))
        ax.imshow(canvas)
        ax.axis("off")


# Collect the file names and class strings of the first five training rows,
# one list per column, print the file names, and plot the pairs.
sample_images1_to_visualize, sample_images2_to_visualize, sample_captions = (
    [train_ds[i][column] for i in range(5)]
    for column in ("image1", "image2", "class")
)
print(sample_images1_to_visualize)
print(sample_images2_to_visualize)
plot_images(sample_images1_to_visualize, sample_images2_to_visualize, sample_captions)

['Pair-25-B-Single-EYE_trial21_player.jpg', 'Pair-25-Coop-EYE_trial39_playerA.jpg', 'Pair-25-A-Single-EYE_trial16_player.jpg', 'Pair-26-Coop-EYE_trial28_playerA.jpg', 'Pair-27-Comp-EYE_trial07_playerA.jpg']
['Pair-25-B-Single-EYE_trial21_observer.jpg', 'Pair-25-Coop-EYE_trial39_playerB.jpg', 'Pair-25-A-Single-EYE_trial16_observer.jpg', 'Pair-26-Coop-EYE_trial28_playerB.jpg', 'Pair-27-Comp-EYE_trial07_playerB.jpg']


FileNotFoundError: [Errno 2] No such file or directory: './heatmapOn_trajOn/Pair-25-B-Single-EYE_trial21_player.jpg'

<Figure size 2000x2000 with 0 Axes>

## Step 3. Preprocess Data 

In [9]:
from PIL import Image
from transformers import AutoProcessor, AutoModel

# Processor for the "microsoft/git-base-vatex" checkpoint; `transforms` below
# uses it to turn image pairs and class strings into tensors.
# Alternative tried earlier: the "microsoft/git-base" checkpoint
# (via AutoProcessor or GitProcessor).
processor = AutoProcessor.from_pretrained("microsoft/git-base-vatex")


def transforms(example_batch):
    """Turn a batch of image-pair records into model-ready tensors.

    Each record names two image files (``image1`` / ``image2``) and a class
    string (``class``). Both images of a record are handed to the processor as
    one two-element list, and the class string is tokenized and reused as the
    training labels.

    Records whose images cannot be opened are reported and skipped together
    with their caption, so images and captions always stay aligned.

    Args:
        example_batch: Mapping with equally long lists under the keys
            ``"image1"``, ``"image2"`` and ``"class"``.

    Returns:
        The processor output (``input_ids``, ``attention_mask``,
        ``pixel_values``) with an added ``labels`` entry that aliases
        ``input_ids``.

    Raises:
        ValueError: If none of the image pairs in the batch could be loaded.
    """
    root = "./heatmapOn_trajOn/"

    image_pairs = []
    captions = []

    for img_path1, img_path2, caption in zip(
        example_batch["image1"], example_batch["image2"], example_batch["class"]
    ):
        try:
            image1 = Image.open(root + img_path1)
            # Image.open is lazy; decode now so that a corrupt file is
            # reported here rather than later inside the processor.
            image1.load()
            image2 = Image.open(root + img_path2)
            # Give both images of a pair the same size (resize also decodes).
            image2 = image2.resize((image1.width, image1.height))
        except Exception as e:
            print(f"Error loading images {img_path1} and {img_path2}: {e}")
            continue

        # Append image pair and caption together so a skipped record never
        # shifts the captions relative to the images.
        image_pairs.append([image1, image2])
        captions.append(caption)

    if not image_pairs:
        raise ValueError(
            f"None of the {len(example_batch['image1'])} image pair(s) in this "
            f"batch could be loaded from {root!r}"
        )

    # Tokenize the class strings and preprocess the image pairs in one call.
    inputs = processor(images=image_pairs, text=captions, return_tensors="pt", padding="max_length", max_length=32, truncation=True)

    # NOTE(review): the labels are the padded input_ids, so padding positions
    # presumably contribute to the loss. Consider masking them with -100 --
    # verify against the model's loss computation before changing.
    inputs.update({"labels": inputs["input_ids"]})
    return inputs



# Attach `transforms` to the train and test datasets; it is applied on the fly
# whenever rows are accessed.
train_ds.set_transform(transforms)
test_ds.set_transform(transforms)

# Sanity check: fetch one row through the transform and show the result.
print(train_ds[2])

{'input_ids': tensor([ 101, 2309,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0]), 'attention_mask': tensor([1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0]), 'pixel_values': tensor([[[[1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          ...,
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.]],

         [[1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          ...,
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.]],

         [[1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.]

In [16]:
from transformers import AutoModelForCausalLM, GitConfig


# Load the checkpoint's configuration with the number of images per example
# overridden to 2 (one per image of a pair), then build the model from it.
# The checkpoint's remaining img_temperal_embedding entries are therefore not
# used, which is what the load warning reports.
configuration = GitConfig.from_pretrained(
    "microsoft/git-base-vatex",
    num_image_with_embedding=2,
)
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/git-base-vatex",
    config=configuration,
)
print(model.config)
print(model)

Some weights of the model checkpoint at microsoft/git-base-vatex were not used when initializing GitForCausalLM: ['git.img_temperal_embedding.2', 'git.img_temperal_embedding.3', 'git.img_temperal_embedding.4', 'git.img_temperal_embedding.5']
- This IS expected if you are initializing GitForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GitForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


GitConfig {
  "_name_or_path": "microsoft/git-base-vatex",
  "architectures": [
    "GitForCausalLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 101,
  "classifier_dropout": null,
  "eos_token_id": 102,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 1024,
  "model_type": "git",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "num_image_with_embedding": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.45.2",
  "use_cache": true,
  "vision_config": {
    "dropout": 0.0,
    "initializer_factor": 1.0,
    "model_type": "git_vision_model",
    "projection_dim": 512
  },
  "vocab_size": 30522
}

GitForCausalLM(
  (git): GitModel(
    (embeddings): GitEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=

In [17]:
from evaluate import load
import torch

# Load the four classification metrics used by compute_metrics, in the order
# accuracy, precision, recall, F1.
accuracy_metric, precision_metric, recall_metric, f1_metric = (
    load(metric_name) for metric_name in ("accuracy", "precision", "recall", "f1")
)



def compute_metrics(eval_pred):
    """Score generated class words against the reference class words.

    The predicted token ids and the label token ids are decoded to text,
    mapped to class indices ("cooperation", "single", "competition"; anything
    else becomes the catch-all index 4) and scored with accuracy and weighted
    precision / recall / F1.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` may be a tuple whose
            first element is the logits array, an array of shape
            ``(batch, positions, vocab)``, or already-argmaxed token ids of
            shape ``(batch, positions)``. ``labels`` has shape
            ``(batch, text_len)``.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and
        ``f1_score``.
    """
    import numpy as np

    torch.cuda.empty_cache()

    logits, labels = eval_pred
    # Some models return several outputs; the logits come first.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    logits = np.asarray(logits)
    labels = np.asarray(labels)

    tokenizer = processor.tokenizer
    pad_id = tokenizer.pad_token_id
    end_id = tokenizer.sep_token_id  # the tokenizer closes every caption with [SEP]

    predicted = logits.argmax(-1) if logits.ndim == 3 else logits

    # GIT emits logits for the image-patch positions followed by the text
    # positions (see the GitForCausalLM documentation); keep only the text
    # part. This is a no-op when the logits already match the label length.
    predicted = predicted[:, -labels.shape[-1]:]

    # Causal LM: the output at position t predicts the token at t + 1, so
    # drop the last prediction and the first label to line the two up.
    predicted = predicted[:, :-1].copy()
    references = labels[:, 1:].copy()

    # Ignored label positions (-100) cannot be decoded; treat them as padding.
    references[references < 0] = pad_id

    # Anything predicted after the first end-of-sequence token is noise.
    for row in predicted:
        end_positions = np.flatnonzero(row == end_id)
        if end_positions.size:
            row[end_positions[0]:] = pad_id

    decoded_labels = [
        text.lower().strip()
        for text in processor.batch_decode(references, skip_special_tokens=True)
    ]
    decoded_predictions = [
        text.lower().strip()
        for text in processor.batch_decode(predicted, skip_special_tokens=True)
    ]

    label_mapping = {"cooperation": 0, "single": 1, "competition": 2}
    encoded_labels = [label_mapping.get(label, 4) for label in decoded_labels]
    encoded_predictions = [label_mapping.get(pred, 4) for pred in decoded_predictions]

    # Calculate each metric.
    accuracy = accuracy_metric.compute(predictions=encoded_predictions, references=encoded_labels)
    precision = precision_metric.compute(predictions=encoded_predictions, references=encoded_labels, average="weighted")
    recall = recall_metric.compute(predictions=encoded_predictions, references=encoded_labels, average="weighted")
    f1_score = f1_metric.compute(predictions=encoded_predictions, references=encoded_labels, average="weighted")

    torch.cuda.empty_cache()
    return {
        "accuracy": accuracy['accuracy'],
        "precision": precision['precision'],
        "recall": recall['recall'],
        "f1_score": f1_score['f1']
    }


In [18]:
from transformers import TrainingArguments, Trainer


training_args = TrainingArguments(
    # Checkpointing: write to ./vqa_check every 50 steps, cap the number kept.
    output_dir="./vqa_check",
    save_strategy="steps",
    save_steps=50,
    save_total_limit=3,
    load_best_model_at_end=False,
    push_to_hub=False,
    # Optimisation: mixed precision, 8 samples per device per forward pass,
    # gradients accumulated over 4 passes.
    learning_rate=5e-5,
    num_train_epochs=10,
    fp16=True,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    # Evaluation: every 200 steps, one sample at a time.
    eval_strategy="steps",
    eval_steps=200,
    per_device_eval_batch_size=1,
    eval_accumulation_steps=16,
    # Logging: every 50 steps, no external reporting integration.
    logging_steps=50,
    report_to="none",
    # Data plumbing: the dataset transform reads the raw image1/image2/class
    # columns, so they must not be dropped before it runs.
    remove_unused_columns=False,
    label_names=["labels"],
)

In [19]:
# Wire model, arguments and both splits together. No metric callback is
# attached, so evaluation reports the loss only; pass
# compute_metrics=compute_metrics to enable the classification metrics.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_ds,
    train_dataset=train_ds,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [20]:
# Run the fine-tuning loop; the returned TrainOutput is displayed as the cell result.
trainer.train()


  0%|          | 0/1370 [00:00<?, ?it/s]

{'loss': 6.9479, 'grad_norm': 38.22489929199219, 'learning_rate': 4.821167883211679e-05, 'epoch': 0.36}
{'loss': 1.9457, 'grad_norm': 10.135743141174316, 'learning_rate': 4.6386861313868616e-05, 'epoch': 0.73}
{'loss': 0.0919, 'grad_norm': 0.204263836145401, 'learning_rate': 4.456204379562044e-05, 'epoch': 1.09}
{'loss': 0.0378, 'grad_norm': 0.21828122437000275, 'learning_rate': 4.273722627737227e-05, 'epoch': 1.46}


  0%|          | 0/90 [00:00<?, ?it/s]

{'eval_loss': 0.03540249168872833, 'eval_runtime': 5.8601, 'eval_samples_per_second': 15.358, 'eval_steps_per_second': 15.358, 'epoch': 1.46}
{'loss': 0.0361, 'grad_norm': 0.4641824960708618, 'learning_rate': 4.091240875912409e-05, 'epoch': 1.82}
{'loss': 0.0368, 'grad_norm': 0.3566315174102783, 'learning_rate': 3.908759124087591e-05, 'epoch': 2.19}
{'loss': 0.0364, 'grad_norm': 0.2587302327156067, 'learning_rate': 3.726277372262774e-05, 'epoch': 2.55}
{'loss': 0.0352, 'grad_norm': 0.06819877028465271, 'learning_rate': 3.5437956204379565e-05, 'epoch': 2.91}


  0%|          | 0/90 [00:00<?, ?it/s]

{'eval_loss': 0.03343524783849716, 'eval_runtime': 5.9212, 'eval_samples_per_second': 15.2, 'eval_steps_per_second': 15.2, 'epoch': 2.91}
{'loss': 0.0346, 'grad_norm': 0.27490970492362976, 'learning_rate': 3.361313868613139e-05, 'epoch': 3.28}
{'loss': 0.0342, 'grad_norm': 0.39670664072036743, 'learning_rate': 3.178832116788321e-05, 'epoch': 3.64}
{'loss': 0.0342, 'grad_norm': 0.09060075134038925, 'learning_rate': 2.996350364963504e-05, 'epoch': 4.01}
{'loss': 0.0347, 'grad_norm': 0.14605297148227692, 'learning_rate': 2.813868613138686e-05, 'epoch': 4.37}


  0%|          | 0/90 [00:00<?, ?it/s]

{'eval_loss': 0.03389500454068184, 'eval_runtime': 5.8161, 'eval_samples_per_second': 15.474, 'eval_steps_per_second': 15.474, 'epoch': 4.37}
{'loss': 0.0344, 'grad_norm': 0.19812078773975372, 'learning_rate': 2.6313868613138688e-05, 'epoch': 4.74}
{'loss': 0.0346, 'grad_norm': 0.1126413494348526, 'learning_rate': 2.448905109489051e-05, 'epoch': 5.1}
{'loss': 0.0346, 'grad_norm': 0.29561519622802734, 'learning_rate': 2.2664233576642337e-05, 'epoch': 5.46}
{'loss': 0.0342, 'grad_norm': 0.23871533572673798, 'learning_rate': 2.0839416058394163e-05, 'epoch': 5.83}


  0%|          | 0/90 [00:00<?, ?it/s]

{'eval_loss': 0.03387824073433876, 'eval_runtime': 6.0042, 'eval_samples_per_second': 14.99, 'eval_steps_per_second': 14.99, 'epoch': 5.83}
{'loss': 0.0349, 'grad_norm': 0.10572350025177002, 'learning_rate': 1.9014598540145986e-05, 'epoch': 6.19}
{'loss': 0.0337, 'grad_norm': 0.17494644224643707, 'learning_rate': 1.718978102189781e-05, 'epoch': 6.56}
{'loss': 0.0346, 'grad_norm': 0.28352829813957214, 'learning_rate': 1.5364963503649634e-05, 'epoch': 6.92}
{'loss': 0.0345, 'grad_norm': 0.19742584228515625, 'learning_rate': 1.3540145985401462e-05, 'epoch': 7.29}


  0%|          | 0/90 [00:00<?, ?it/s]

{'eval_loss': 0.033545803278684616, 'eval_runtime': 5.8092, 'eval_samples_per_second': 15.493, 'eval_steps_per_second': 15.493, 'epoch': 7.29}
{'loss': 0.0338, 'grad_norm': 0.13380731642246246, 'learning_rate': 1.1715328467153286e-05, 'epoch': 7.65}
{'loss': 0.0339, 'grad_norm': 0.2839307188987732, 'learning_rate': 9.89051094890511e-06, 'epoch': 8.01}
{'loss': 0.0322, 'grad_norm': 0.3772124648094177, 'learning_rate': 8.065693430656935e-06, 'epoch': 8.38}
{'loss': 0.0321, 'grad_norm': 0.27192819118499756, 'learning_rate': 6.240875912408759e-06, 'epoch': 8.74}


  0%|          | 0/90 [00:00<?, ?it/s]

{'eval_loss': 0.03194621577858925, 'eval_runtime': 6.0193, 'eval_samples_per_second': 14.952, 'eval_steps_per_second': 14.952, 'epoch': 8.74}
{'loss': 0.0321, 'grad_norm': 0.23210279643535614, 'learning_rate': 4.416058394160584e-06, 'epoch': 9.11}
{'loss': 0.0316, 'grad_norm': 0.3105209767818451, 'learning_rate': 2.591240875912409e-06, 'epoch': 9.47}
{'loss': 0.0317, 'grad_norm': 0.4236796498298645, 'learning_rate': 7.664233576642336e-07, 'epoch': 9.84}
{'train_runtime': 3546.8358, 'train_samples_per_second': 12.377, 'train_steps_per_second': 0.386, 'train_loss': 0.35842943587442383, 'epoch': 9.98}


TrainOutput(global_step=1370, training_loss=0.35842943587442383, metrics={'train_runtime': 3546.8358, 'train_samples_per_second': 12.377, 'train_steps_per_second': 0.386, 'total_flos': 1280937208303872.0, 'train_loss': 0.35842943587442383, 'epoch': 9.981785063752277})