In [3]:
# pip install datasets

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from scipy import stats
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import ViTFeatureExtractor
from tabulate import tabulate
from transformers import AutoImageProcessor
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer
import cv2 as cv
from google.colab.patches import cv2_imshow
from scipy.stats import pearsonr
import tensorflow as tf
from tensorflow import keras
import torch
import datasets
from tensorflow.keras import layers, models
import os
from sklearn.model_selection import train_test_split
from scipy.stats import ortho_group
from scipy.spatial import procrustes
from tensorflow.keras.models import Model, load_model
import os
import torch.nn as nn
from torchvision import models
import time
import warnings
warnings.filterwarnings("ignore")

Fine tuning Vision transformer from Hugging Face

In [2]:
# Accuracy metric object; `compute_metrics` below calls `metric.compute(...)` on it.
# NOTE(review): `datasets.load_metric` is reported as deprecated in newer `datasets`
# releases (metrics moved to the separate `evaluate` package) — confirm against the
# installed version before re-running this notebook.
from datasets import load_metric

metric = load_metric("accuracy")

In [4]:
def create_image_folder_dataset(root_path):
    """Build a Hugging Face ``Dataset`` from a folder-per-class image tree.

    Expected layout: ``root_path/<class name>/<image file>``. Each immediate
    sub-directory of ``root_path`` becomes one class label and every regular
    file inside it becomes one example.

    Class folders and the files inside them are visited in sorted order, so
    the label ids and the row order are the same on every run and on every
    filesystem (``os.listdir`` alone makes no ordering guarantee). Hidden
    entries (names starting with ".") and stray non-directory entries at the
    root are ignored instead of being treated as classes.

    Args:
        root_path: Directory containing one sub-directory per class.

    Returns:
        A ``datasets.Dataset`` with an ``img`` column (``datasets.Image``,
        built from the file paths) and a ``label`` column (``ClassLabel``
        whose names are the sorted class folder names).
    """

    def _visible_sorted(directory):
        # Sorted listing without hidden entries such as .DS_Store or .ipynb_checkpoints.
        return sorted(name for name in os.listdir(directory) if not name.startswith("."))

    # Only real sub-directories are classes; a loose file at the root is not a label.
    _CLASS_NAMES = [
        name
        for name in _visible_sorted(root_path)
        if os.path.isdir(os.path.join(root_path, name))
    ]

    features = datasets.Features({
        "img": datasets.Image(),
        "label": datasets.features.ClassLabel(names=_CLASS_NAMES),
    })

    img_data_files = []
    label_data_files = []

    # Reuse the class list computed above so labels and features cannot disagree.
    for img_class in _CLASS_NAMES:
        class_dir = os.path.join(root_path, img_class)
        for img in _visible_sorted(class_dir):
            path_ = os.path.join(class_dir, img)
            if not os.path.isfile(path_):
                # Nested folders are not images; skip them.
                continue
            img_data_files.append(path_)
            label_data_files.append(img_class)

    ds = datasets.Dataset.from_dict(
        {"img": img_data_files, "label": label_data_files},
        features=features,
    )
    return ds

In [5]:
# Build the main dataset from the "360 Rocks" folder (one sub-folder per rock class).
rock_ds = create_image_folder_dataset("/content/drive/MyDrive/AML/360 Rocks")

In [249]:
# Build the separate held-out test dataset from the "120 Rocks" folder.
test_ds = create_image_folder_dataset("/content/drive/MyDrive/AML/120 Rocks")

In [250]:
# Show the test dataset summary (column names and row count).
test_ds

Dataset({
    features: ['img', 'label'],
    num_rows: 120
})

In [6]:
# Show the 360-rock dataset summary (column names and row count).
rock_ds

Dataset({
    features: ['img', 'label'],
    num_rows: 360
})

In [251]:
# Class-name lists as stored in each dataset's `label` feature.
class_labels = rock_ds.features["label"].names
class_labels_test = test_ds.features['label'].names

In [8]:
# Show the class names of the 360-rock dataset.
class_labels

['igneous', 'metamorphic', 'sedimentary']

In [9]:
# Build the two lookup tables between class names and integer ids; they are passed
# to the model below as `label2id` / `id2label`.
labels = rock_ds.features["label"].names
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = i
    id2label[i] = label

# Sanity check: show the class name stored for id 2.
id2label[2]

'sedimentary'

In [10]:
# Configuration: pretrained checkpoint to fine-tune and the per-device batch size
# used for both training and evaluation in `TrainingArguments`.
model_checkpoint = "google/vit-base-patch16-224-in21k"
batch_size = 32

In [11]:
# Load the image processor that ships with the checkpoint; its mean/std and target
# size are reused below to build the torchvision transforms.
image_processor  = AutoImageProcessor.from_pretrained(model_checkpoint)
# Display its configuration.
image_processor

ViTImageProcessor {
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_processor_type": "ViTImageProcessor",
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 224,
    "width": 224
  }
}

In [12]:
from torchvision.transforms import (
    CenterCrop,
    Compose,
    Normalize,
    RandomHorizontalFlip,
    RandomResizedCrop,
    Resize,
    ToTensor,
)

# Per-channel normalisation using the mean/std stored on the checkpoint's processor.
normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)

# The processor describes its target size either as an explicit height/width pair or
# as a shortest-edge length; derive the resize and crop sizes for either layout.
if "height" in image_processor.size:
    size = (image_processor.size["height"], image_processor.size["width"])
    crop_size = size
    max_size = None
elif "shortest_edge" in image_processor.size:
    size = image_processor.size["shortest_edge"]
    crop_size = (size, size)
    max_size = image_processor.size.get("longest_edge")
else:
    # Fail here with a clear message instead of a NameError on `size`/`crop_size`
    # when the transforms are built further down.
    raise ValueError(
        f"Unsupported image_processor.size layout: {image_processor.size!r}"
    )

# Training pipeline: random crop resized to the model input size plus a random
# horizontal flip (data augmentation), then tensor conversion and normalisation.
train_transforms = Compose(
        [
            RandomResizedCrop(crop_size),
            RandomHorizontalFlip(),
            ToTensor(),
            normalize,
        ]
    )

# Evaluation pipeline: deterministic resize and centre crop (no augmentation),
# then the same tensor conversion and normalisation as for training.
val_transforms = Compose(
        [
            Resize(size),
            CenterCrop(crop_size),
            ToTensor(),
            normalize,
        ]
    )

def preprocess_train(example_batch):
    """Add a ``pixel_values`` entry holding the training-transformed images.

    Every image under ``example_batch["img"]`` is converted to RGB and passed
    through ``train_transforms``. The batch is modified in place and returned.
    """
    augmented = []
    for picture in example_batch["img"]:
        rgb_picture = picture.convert("RGB")
        augmented.append(train_transforms(rgb_picture))
    example_batch["pixel_values"] = augmented
    return example_batch

def preprocess_val(example_batch):
    """Add a ``pixel_values`` entry holding the evaluation-transformed images.

    Every image under ``example_batch["img"]`` is converted to RGB and passed
    through ``val_transforms``. The batch is modified in place and returned.
    """
    rgb_images = (picture.convert("RGB") for picture in example_batch["img"])
    example_batch["pixel_values"] = list(val_transforms(rgb) for rgb in rgb_images)
    return example_batch

In [13]:
# Show the dataset summary again before splitting it.
rock_ds

Dataset({
    features: ['img', 'label'],
    num_rows: 360
})

In [14]:
# split up training into training and validation
splits = rock_ds.train_test_split(test_size=0.333)
train_ds = splits['train']
val_ds = splits['test']

In [15]:
# Show the training split summary.
train_ds

Dataset({
    features: ['img', 'label'],
    num_rows: 240
})

In [16]:
# Show the validation split summary.
val_ds

Dataset({
    features: ['img', 'label'],
    num_rows: 120
})

In [17]:
# Attach the preprocessing functions: augmented pipeline for the training split,
# deterministic pipeline for the validation split.
train_ds.set_transform(preprocess_train)
val_ds.set_transform(preprocess_val)

In [252]:
# The held-out test set uses the deterministic evaluation pipeline.
test_ds.set_transform(preprocess_val)

In [295]:
# The full 360-image dataset also gets the deterministic evaluation pipeline; it is
# only used later for feature extraction, not for training.
rock_ds.set_transform(preprocess_val)

In [18]:
# Load the pretrained ViT checkpoint with an image-classification head whose label
# mappings come from the rock dataset.
model_new = AutoModelForImageClassification.from_pretrained(
    model_checkpoint,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes = True,  # tolerate a head whose shape differs from the checkpoint's
)


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# !pip install accelerate -U
# !pip install transformers[torch]

In [19]:
# Use the checkpoint's short name (text after the last "/") in the output directory name.
model_name = model_checkpoint.split("/")[-1]

# Training configuration: evaluate and checkpoint once per epoch, and reload the
# checkpoint with the best validation accuracy when training finishes.
# NOTE(review): with 240 training rows, a batch size of 32 and 4 accumulation steps
# there appear to be only about two optimizer updates per epoch — fewer than
# `logging_steps=10`, which is consistent with the "No log" training-loss column in
# the recorded output. Consider more epochs and/or less accumulation; confirm.
args = TrainingArguments(
    f"{model_name}-finetuned-vitrock",
    remove_unused_columns=False,  # presumably keeps the `img` column for the on-the-fly transforms — confirm
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

In [20]:
def compute_metrics(eval_pred):
    """Return the accuracy metric for one evaluation pass.

    The predicted class of each row is the index of its largest score in
    ``eval_pred.predictions``; these are compared with ``eval_pred.label_ids``
    by the module-level ``metric`` object.
    """
    raw_scores = eval_pred.predictions
    predicted_ids = np.argmax(raw_scores, axis=1)
    gold_ids = eval_pred.label_ids
    return metric.compute(predictions=predicted_ids, references=gold_ids)

In [21]:
def collate_fn(examples):
    """Collate dataset rows into the batch dict passed to the model.

    Stacks each row's ``pixel_values`` tensor along a new leading batch
    dimension and gathers the ``label`` values into a single tensor.

    Returns:
        ``{"pixel_values": <stacked tensor>, "labels": <label tensor>}``
    """
    images, targets = [], []
    for sample in examples:
        images.append(sample["pixel_values"])
        targets.append(sample["label"])
    return {"pixel_values": torch.stack(images), "labels": torch.tensor(targets)}

In [24]:
# Attach an 8-unit linear layer to the ViT backbone under the name `intermediate_layer`.
# NOTE(review): this only stores the layer on the module; nothing in this notebook
# routes activations through it, and the classifier assigned in the following cell
# takes 768 inputs directly. It therefore looks like this layer is never part of the
# forward pass and is never trained — confirm. If an 8-d bottleneck before the
# 3-way output is intended, it has to be wired into the head, e.g.
# nn.Sequential(nn.Linear(768, 8), nn.Linear(8, 3)), with the later cells adapted.
model_new.vit.intermediate_layer = nn.Linear(768, 8)

In [25]:
# Replace the classification head with a freshly initialised 768 -> 3 linear layer.
# NOTE(review): the head consumes the 768-d backbone output directly, so it does not
# sit after the 8-unit `intermediate_layer` defined in the previous cell — confirm
# this is intended.
model_new.classifier = nn.Linear(768, 3)

In [26]:
# Print the module tree of the modified model.
model_new

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=7

In [27]:
# Trainer for fine-tuning: trains on the 240-image split and evaluates on the
# 120-image validation split using the accuracy metric and the custom collator.
trainer = Trainer(
    model_new,
    args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=image_processor,  # the image processor is passed through the `tokenizer` argument
    compute_metrics=compute_metrics,
    data_collator=collate_fn,
)

In [28]:
# Run fine-tuning; the returned object carries the training metrics logged below.
train_results = trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.074264,0.458333
2,No log,1.053638,0.483333
3,No log,1.043774,0.525


Validation accuracy is 0.525 for this model. Let's save the model and evaluate it.

In [29]:
# Persist the model, then print and save the training metrics and the trainer state.
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

***** train metrics *****
  epoch                    =        3.0
  total_flos               = 51966619GF
  train_loss               =     1.0556
  train_runtime            = 0:32:06.05
  train_samples_per_second =      0.374
  train_steps_per_second   =      0.003


In [30]:
# Evaluate on the validation split passed to the Trainer, then print and save the metrics.
metrics = trainer.evaluate()
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

***** eval metrics *****
  epoch                   =        3.0
  eval_accuracy           =      0.525
  eval_loss               =     1.0438
  eval_runtime            = 0:02:31.46
  eval_samples_per_second =      0.792
  eval_steps_per_second   =      0.026


Fine-tuned the Vision Transformer model to output 3 categories; it has 8 neurons in the layer before the last one, which is used for the Procrustes analysis (the `model_new` printout above shows this).

Making a copy of the model to extract the weights of the layer before the last one (8 neurons).

In [207]:
# Deep-copy the trained model so its head can be swapped for feature extraction
# without modifying `model_new`.
import copy
model_with_intermediate = copy.deepcopy(model_new)

In [232]:
# Grab the weight tensor of the 8-unit layer stored on the trained model's backbone.
# NOTE(review): `.data` presumably returns a tensor that shares storage with the
# layer rather than an independent copy — confirm before modifying it.
intermediate_layer_weights = model_new.vit.intermediate_layer.weight.data

In [233]:
# Check the weight shape: (8 output units, 768 input features).
intermediate_layer_weights.shape

torch.Size([8, 768])

In [212]:
# Remove the 8-unit layer attribute from the copy only; `model_new` keeps its own.
del model_with_intermediate.vit.intermediate_layer

In [234]:
# Replace the copy's 3-way head with a new 768 -> 8 linear layer so that the model's
# `.logits` output has 8 values per image. Its parameters are freshly initialised here.
model_with_intermediate.classifier = nn.Linear(in_features=768, out_features=8, bias=True)

In [235]:
# Transfer the 8-unit layer's parameters into the new 8-output head.
# Both weight and bias are copied: leaving the bias at the head's fresh random
# initialisation would make the outputs differ from the 8-unit layer's activations.
# `copy_` writes the values into the head's own tensors, so the head does not share
# storage with `model_new` and later changes to either model cannot leak across.
with torch.no_grad():
    model_with_intermediate.classifier.weight.copy_(intermediate_layer_weights)
    model_with_intermediate.classifier.bias.copy_(model_new.vit.intermediate_layer.bias)

In [236]:
# Wrap the 8-output model in a second Trainer.
# NOTE(review): no train/eval datasets are passed and `trainer_new` is not referenced
# again in the cells visible here; the feature extraction below calls the model
# directly — confirm whether this Trainer is still needed.
trainer_new = Trainer(
    model_with_intermediate,
    args,
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
    data_collator=collate_fn,
)

In [228]:
# Inspect the 8-unit layer's weights on the model held by the original trainer.
trainer.model.vit.intermediate_layer.weight.data

tensor([[-0.0106,  0.0056, -0.0242,  ...,  0.0201, -0.0289, -0.0159],
        [-0.0207, -0.0104,  0.0217,  ..., -0.0206, -0.0259, -0.0005],
        [-0.0054, -0.0003,  0.0286,  ...,  0.0143, -0.0064,  0.0283],
        ...,
        [ 0.0237, -0.0086,  0.0283,  ..., -0.0039,  0.0248,  0.0236],
        [-0.0074,  0.0313,  0.0097,  ...,  0.0281, -0.0201, -0.0326],
        [-0.0333,  0.0196,  0.0177,  ..., -0.0134, -0.0212, -0.0021]])

In [310]:
# Inspect the weights of the 8-output head; they should match the 8-unit layer's
# weights on the trained model.
model_with_intermediate.classifier.weight.data

tensor([[-0.0106,  0.0056, -0.0242,  ...,  0.0201, -0.0289, -0.0159],
        [-0.0207, -0.0104,  0.0217,  ..., -0.0206, -0.0259, -0.0005],
        [-0.0054, -0.0003,  0.0286,  ...,  0.0143, -0.0064,  0.0283],
        ...,
        [ 0.0237, -0.0086,  0.0283,  ..., -0.0039,  0.0248,  0.0236],
        [-0.0074,  0.0313,  0.0097,  ...,  0.0281, -0.0201, -0.0326],
        [-0.0333,  0.0196,  0.0177,  ..., -0.0134, -0.0212, -0.0021]])

In [284]:
# test data- 120 images
test_procs = []
with torch.no_grad():
    for i in range(test_ds.num_rows):
      logits = model_with_intermediate(test_ds[i]['pixel_values'].reshape(1,3,224,224)).logits.numpy()
      test_procs.append(logits)

In [285]:
# val data- 120 images split from 360 images
val_procs = []
with torch.no_grad():
    for i in range(val_ds.num_rows):
      logits = model_with_intermediate(val_ds[i]['pixel_values'].reshape(1,3,224,224)).logits.numpy()
      val_procs.append(logits)

In [296]:
# train data + val data - 360 images in total to compare with human_mds_360 data
# Collect the 8-d model output for every image of the full set, one (1, 8) array per image.
rock_ds_procs = []
model_with_intermediate.eval()  # inference mode for layers such as dropout
# Run on whatever device the model lives on (a Trainer may have moved it to a GPU).
device = next(model_with_intermediate.parameters()).device
with torch.no_grad():
    for i in range(rock_ds.num_rows):
        # unsqueeze(0) adds the batch dimension without hard-coding the image size.
        pixel_values = rock_ds[i]['pixel_values'].unsqueeze(0).to(device)
        # Bring the result back to the CPU before converting it to NumPy.
        logits = model_with_intermediate(pixel_values).logits.cpu().numpy()
        rock_ds_procs.append(logits)

In [297]:
# Stack the per-image outputs into one (n_images, n_features) matrix per split.
# For the lists of (1, d) arrays collected by the extraction loops this gives the
# same result as concatenating along axis 0, but it also leaves an already stacked
# 2-D array unchanged, so re-running this cell does not corrupt the shapes
# (np.concatenate on a 2-D array would flatten it).
test_procs = np.asarray(test_procs).reshape(-1, np.shape(test_procs)[-1])
val_procs = np.asarray(val_procs).reshape(-1, np.shape(val_procs)[-1])
rock_ds_procs = np.asarray(rock_ds_procs).reshape(-1, np.shape(rock_ds_procs)[-1])

In [298]:
# Check the stacked shapes: one row per image, 8 columns each.
val_procs.shape, test_procs.shape, rock_ds_procs.shape

((120, 8), (120, 8), (360, 8))

In [272]:
# Load the human MDS coordinates for the 360-rock set from a plain-text matrix file.
matrix_with_human_data =np.loadtxt('/content/drive/MyDrive/AML/mds_360.txt')

In [273]:
# Check the shape: one row per rock, 8 MDS dimensions.
matrix_with_human_data.shape

(360, 8)

In [274]:
# Load the human MDS coordinates for the 120-rock set from a plain-text matrix file.
matrix_with_human_data_120 =np.loadtxt('/content/drive/MyDrive/AML/mds_120.txt')

In [275]:
# Check the shape: one row per rock, 8 MDS dimensions.
matrix_with_human_data_120.shape

(120, 8)

In [299]:
# procrustes analysis
human_data_360, rock_ds, disparity = procrustes(matrix_with_human_data, rock_ds_procs)# for full data 360 images - using 360 mds human data and rock_ds(360 data)
human_data_120, val, disparity_val = procrustes(matrix_with_human_data_120, val_procs) # for validation 120 data using 120 human data and 120 val data
human_data_120, test, disparity_test = procrustes(matrix_with_human_data_120, test_procs) # for test 120 data 120 human data and 120 test data

In [301]:
# Report the Procrustes disparity of each comparison.
print(f'Disparity for test  data {disparity_test}')
print(f'Disparity for validation data {disparity_val}')
print(f'Disparity for train data(240) plus validation(120) = (360): {disparity}')

Disparity for test  data 0.9304034038527171
Disparity for validation data 0.9466334812010115
Disparity for train data(240) plus validation(120) = (360): 0.9790258189586298


In [302]:
def compute_corr(mtx1, mtx2):
    """Return the Pearson correlation between matching columns of two matrices.

    Args:
        mtx1: 2-D array; its number of columns decides how many correlations
            are computed.
        mtx2: 2-D array with the same number of rows and at least as many
            columns as ``mtx1``.

    Returns:
        A list with one correlation coefficient per column of ``mtx1``, in
        column order.
    """
    n_columns = mtx1.shape[1]
    # pearsonr yields (coefficient, p-value); only the coefficient is kept.
    return [pearsonr(mtx1[:, col], mtx2[:, col])[0] for col in range(n_columns)]

In [303]:
# Per-dimension Pearson correlation between the standardised human matrix and the
# Procrustes-aligned model matrix for each comparison.
# NOTE(review): `human_data_120` holds the value from the last procrustes call (the
# test comparison) and is used for both the test and the validation correlations.
correlations_test = compute_corr(human_data_120, test)
correlations_val = compute_corr(human_data_120, val)
correlations_360 = compute_corr(human_data_360, rock_ds)

In [318]:
# Build a table with one row per feature: validation and test correlations, their
# average, and the correlation on the full 360-image set.
header = ["8 Features", 'Validation data', 'Test data', 'Avg_corr_val_test', '360_rocks_train_val']
table_data = []

for i in range(len(correlations_test)):
    avg_corr_val_test = (correlations_val[i] + correlations_test[i]) / 2
    row = [f"Feature {i + 1}", correlations_val[i], correlations_test[i], avg_corr_val_test, correlations_360[i]]
    table_data.append(row)

# Final row: column-wise mean over all features (column 0 is the row label, so skip it).
mean_row = ['Mean'] + [np.mean([row[j] for row in table_data]) for j in range(1, len(header))]
table_data.append(mean_row)

# Render the table as a text grid and print it under two explanatory captions.
table = tabulate(table_data, headers=header, tablefmt="grid")
print("Pearson correlation between each dimension of  validation data, test data with the human data-120_mds and the avg of test and val correlations.")
print('Correlation between 8 features of human data (360_mds) and 360 images(whole data)')
print(table)


Pearson correlation between each dimension of  validation data, test data with the human data-120_mds and the avg of test and val correlations.
Correlation between 8 features of human data (360_mds) and 360 images(whole data)
+--------------+-------------------+-------------+---------------------+-----------------------+
| 8 Features   |   Validation data |   Test data |   Avg_corr_val_test |   360_rocks_train_val |
| Feature 1    |          0.17365  |    0.241073 |            0.207361 |             0.161476  |
+--------------+-------------------+-------------+---------------------+-----------------------+
| Feature 2    |          0.268372 |    0.2488   |            0.258586 |             0.228906  |
+--------------+-------------------+-------------+---------------------+-----------------------+
| Feature 3    |          0.238778 |    0.2223   |            0.230539 |             0.0927323 |
+--------------+-------------------+-------------+---------------------+-----------------------

Computed the Procrustes analysis separately for the test data (120 images) and the validation data (120 images split from the 360 images) against the human data in mds_120.txt; the disparities are reported above.

Also computed the Procrustes analysis for the whole 360-image rock_ds data against the human data (mds_360.txt) for comparison.

As the question asks for the correlation values on the test and validation datasets, I have reported them in the table, along with the correlations for the full 360 images (train + val).
I also computed the average of the test and validation correlations in a separate column, Avg_corr_val_test.



 **The average correlation coefficient for test and validation data with human_120_mds data for 8 features is
0.207361,
0.258586,
0.230539,
0.23292,
0.369441,
0.183765,
0.282641,
0.23723**

# References:



1.   https://huggingface.co/google/vit-base-patch16-224-in21k
2.   https://github.com/NielsRogge/Transformers-Tutorials/tree/master/VisionTransformer
3. https://huggingface.co/docs/transformers/main_classes/trainer
4. https://github.com/huggingface/notebooks/blob/main/examples/image_classification.ipynb
5. https://github.com/rajshah4/huggingface-demos/blob/main/FoodApp/Indian_food_image_classification_fine-tuning.ipynb

