In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# !python3 -m pip install einops
# !python3 -m pip install facenet-pytorch
# !python3 -m pip install face_alignment
# !python3 -m pip install self_attention_cv

In [3]:
import cv2
import os
import sys
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision import models
from PIL import Image
from einops.layers.torch import Rearrange
from torchvision import datasets
from torch.utils.data import DataLoader
from transformers import ViTModel, ViTConfig
from einops import rearrange
from facenet_pytorch import MTCNN
from transformers.modeling_outputs import ImageClassifierOutput  # Corrected import
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm
import face_alignment
from transformers.models.vit.modeling_vit import ViTForImageClassification
#import dlib
import requests
from torchvision.transforms import functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def extract_frame(video_path):
    cap = cv2.VideoCapture(video_path)
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    mid_frame_index = frame_count // 2  # Index of the frame in the middle of the video
    cap.set(cv2.CAP_PROP_POS_FRAMES, mid_frame_index)
    ret, frame = cap.read()
    if ret:
        cap.release()
        return frame
    else:
        cap.release()
        return None

In [5]:
def detect_face(frame):
    mtcnn = MTCNN()
    boxes, _ = mtcnn.detect(frame)
    if boxes is not None:
        # Assuming only one face in the frame
        box = boxes[0]
        x1, y1, x2, y2 = box
        # Crop the frame to the detected face
        cropped_frame = frame[int(y1):int(y2), int(x1):int(x2)]
        return cropped_frame
    else:
        return None

In [6]:
# Function to download the pretrained face alignment model if it doesn't exist
def download_face_alignment_model(url, save_path):
    if not os.path.exists(save_path):
        print("Downloading pretrained face alignment model...")
        response = requests.get(url)
        with open(save_path, 'wb') as f:
            f.write(response.content)
        print("Download complete.")

# Specify the URL of the pretrained face alignment model
face_alignment_model_url = "https://github.com/1adrianb/face-alignment-models/releases/download/2.0.1/2DFAN4-11f355bf06.pth.tar"

# Download the pretrained face alignment model if it doesn't exist
face_alignment_model_path = os.path.abspath("2DFAN4-11f355bf06.pth.tar")
download_face_alignment_model(face_alignment_model_url, face_alignment_model_path)

# Initialize face alignment model
fa = face_alignment.FaceAlignment(2, flip_input=False)  # 2 corresponds to 2D landmarks

def align_face(frame):
    # Perform face alignment
    aligned_faces = fa.get_landmarks(frame)
    if aligned_faces is not None:
        aligned_face = aligned_faces[0]  # Assuming only one face in the frame
        return aligned_face
    else:
        return None


In [7]:
def preprocess_image(frame):
    frame_pil = Image.fromarray(frame.astype('uint8'))
    frame_pil = frame_pil.convert('RGB')  # Convert to RGB format
    frame_np = np.array(frame_pil)
    print(frame_np.shape)
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # Normalize using ImageNet mean and std
    ])
    frame_tensor = transform(frame_pil)
    return frame_tensor


In [8]:
def preprocess_spectrogram(image_path):
    img = Image.open(image_path).convert('RGB')  # Convert to grayscale
    transform = transforms.Compose([
        transforms.Resize((224, 224)),  # Resize to match ViT input size
        transforms.ToTensor(),           # Convert to tensor
    ])
    img_tensor = transform(img)
    return img_tensor

In [9]:
def load_spectrogram_dataset(spectrograms_folder, skipped_files):
    X = []
    y = []
    # List all files in the input folder
    files = sorted(os.listdir(spectrograms_folder))
    # Iterate over files in the folder
    for filename in tqdm(files):
        if filename.endswith(".png") and filename[:-3] not in skipped_files:  # Assuming mel spectrograms are stored as PNG files
            input_path = os.path.join(spectrograms_folder, filename)
            img_tensor = preprocess_spectrogram(input_path)
            X.append(img_tensor)
            # Extract label from filename (assuming filename is in format "abc_IEO_label_xyz.png")
            label = filename.split("_")[2]
            if label == "HAP":
                y.append(0)
            elif label == "SAD":
                y.append(1)
            elif label == "ANG":
                y.append(2)
            elif label == "DIS":
                y.append(3)
            elif label == "FEA":
                y.append(4)
            elif label == "NEU":
                y.append(5)
    return X, y

In [10]:
def load_dataset(videos_folder):
    X = []
    y = []
    skipped_files = []
    video_files = [file for file in sorted(os.listdir(videos_folder)) if file.endswith(".flv")]
    for video_file in tqdm(video_files):
        video_path = os.path.join(videos_folder, video_file)
        frame = extract_frame(video_path)
        if frame is not None:
            cropped_face = detect_face(frame)
            if cropped_face is not None:
                preprocessed_face = preprocess_image(cropped_face)
                X.append(preprocessed_face)
                label = video_file.split("_")[2].split(".")[0]  # Adjusted to handle different file extensions
                if label == "HAP":
                    y.append(0)
                elif label == "SAD":
                    y.append(1)
                elif label == "ANG":
                    y.append(2)
                elif label == "DIS":
                    y.append(3)
                elif label == "FEA":
                    y.append(4)
                elif label == "NEU":
                    y.append(5)
            else:
                print(f"No face detected in {video_file}. Skipping.")
                skipped_files.append(video_file[:-3])
        else:
            print(f"Failed to extract frame from {video_file}. Skipping.")
            skipped_files.append(video_file[:-3])
    return X, y, skipped_files

In [11]:
# Define the ConcatDataset class to concatenate video frame and spectrogram tensors
class ConcatDataset(torch.utils.data.Dataset):
    def __init__(self, X1, X2, y, modality='multimodal', fullscale=False, presaved=False, target_size=(224,224)):
        self.X1 = X1
        self.X2 = X2
        self.y = y
        self.modality = modality
        self.presaved = presaved
        self.fullscale = fullscale
        self.target_size = target_size
        
    def __len__(self):
        if self.modality == 'audio':
          return len(self.X2)
        if self.modality == 'visual':
          return len(self.X1)
        return len(self.y)

    def __getitem__(self, idx):
        if not self.fullscale:
          img1 = self.X1[idx]
          img2 = self.X2[idx]
          label = self.y[idx]
        else:
          img1 = torch.from_numpy(self.X1[idx]).float()  # Convert numpy array to torch tensor
          img2 = torch.from_numpy(self.X2[idx]).float()  # Convert numpy array to torch tensor
          label = torch.tensor(self.y[idx])  # Convert numpy array to torch tensor

        concatenated_img = torch.cat((img1, img2), dim=1)  # Concatenate along 1 dimension
        concatenated_img = F.resize(concatenated_img, self.target_size)
        #print(concatenated_img.shape)
        
        if self.modality == 'visual':
          return img1, label
        if self.modality == 'audio':
          return img2, label
        return concatenated_img, label # concatenate modalities

In [12]:
def train_model(model, criterion, optimizer, train_loader, device):
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)

        # Extract logits if the model output is wrapped in an ImageClassifierOutput object
        if isinstance(outputs, ImageClassifierOutput):
            logits = outputs.logits
        else:
            # Handle other cases
            pass
        
        # Print the size of the logits tensor
        #print("Logits size:", logits.size())
        
        labels = labels.long()  # Convert labels to long type for CrossEntropyLoss
        loss = criterion(logits, labels)  # Calculate loss using entire logits
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        predicted = torch.argmax(logits, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    return total_loss / len(train_loader), correct / total


In [13]:
def test_model(model, criterion, test_loader, device):
    model.eval()
    running_loss = 0.0
    correct_preds = 0
    total_preds = 0
    
    with torch.no_grad():
        for inputs, labels in tqdm(test_loader):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            labels = labels.long()
            loss = criterion(outputs.logits, labels)  # Calculate loss using entire logits
            running_loss += loss.item() * inputs.size(0)
            predicted = torch.argmax(outputs.logits, 1)
            correct_preds += (predicted == labels).sum().item()
            total_preds += labels.size(0)
    
    epoch_loss = running_loss / len(test_loader.dataset)
    accuracy = correct_preds / total_preds
    return epoch_loss, accuracy

In [14]:
# class ViT(nn.Module):
#     # ViT architecture adapted from here - https://theaisummer.com/vision-transformer/
#     def __init__(self, *,
#                  img_dim,
#                  in_channels=3,
#                  patch_dim=16,
#                  num_classes=6, # full-scale CREMA-D
#                  dim=512,
#                  blocks=6,
#                  heads=4,
#                  dim_linear_block=1024,
#                  dim_head=None,
#                  dropout=0, transformer=None, classification=True):
#         """
#         Args:
#             img_dim: the spatial image size
#             in_channels: number of img channels
#             patch_dim: desired patch dim
#             num_classes: classification task classes
#             dim: the linear layer's dim to project the patches for MHSA
#             blocks: number of transformer blocks
#             heads: number of heads
#             dim_linear_block: inner dim of the transformer linear block
#             dim_head: dim head in case you want to define it. defaults to dim/heads
#             dropout: for pos emb and transformer
#             transformer: in case you want to provide another transformer implementation
#             classification: creates an extra CLS token
#         """
#         super().__init__()
#         assert img_dim % patch_dim == 0, f'patch size {patch_dim} not divisible'
#         self.p = patch_dim
#         self.classification = classification
#         tokens = (img_dim // patch_dim) ** 2
#         self.token_dim = in_channels * (patch_dim ** 2)
#         self.dim = dim
#         self.dim_head = (int(dim / heads)) if dim_head is None else dim_head
#         self.project_patches = nn.Linear(self.token_dim, dim)

#         self.emb_dropout = nn.Dropout(dropout)
#         if self.classification:
#             self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
#             self.pos_emb1D = nn.Parameter(torch.randn(tokens + 1, dim))
#             self.mlp_head = nn.Linear(dim, num_classes)
#         else:
#             self.pos_emb1D = nn.Parameter(torch.randn(tokens, dim))

#         if transformer is None:
#             self.transformer = TransformerEncoder(dim, blocks=blocks, heads=heads,
#                                                   dim_head=self.dim_head,
#                                                   dim_linear_block=dim_linear_block,
#                                                   dropout=dropout)
#         else:
#             self.transformer = transformer

#     def expand_cls_to_batch(self, batch):
#         """
#         Args:
#             batch: batch size
#         Returns: cls token expanded to the batch size
#         """
#         return self.cls_token.expand([batch, -1, -1])

#     def forward(self, img, mask=None):
#         batch_size = img.shape[0]
#         img_patches = rearrange(
#         img, 'b c (patch_x x) (patch_y y) -> b (x y) (patch_x patch_y c)',
#         patch_x=self.p, patch_y=self.p
#     )
#     # project patches with linear layer + add pos emb
#         img_patches = self.project_patches(img_patches)

#         if self.classification:
#             img_patches = torch.cat(
#             (self.expand_cls_to_batch(batch_size), img_patches), dim=1
#         )

#         patch_embeddings = self.emb_dropout(img_patches + self.pos_emb1D)

#     # feed patch_embeddings and output of transformer. shape: [batch, tokens, dim]
#         y = self.transformer(patch_embeddings, mask)

#         if self.classification:
#         # we index only the cls token for classification.
#             return self.mlp_head(y[:, 0, :])
#         else:
#             return y  # Return logits directly
#     # def forward(self, img, mask=None):
#     #     batch_size = img.shape[0]
#     #     img_patches = rearrange(
#     #         img, 'b c (patch_x x) (patch_y y) -> b (x y) (patch_x patch_y c)',
#     #                             patch_x=self.p, patch_y=self.p)
#     #     # project patches with linear layer + add pos emb
#     #     img_patches = self.project_patches(img_patches)

#     #     if self.classification:
#     #         img_patches = torch.cat(
#     #             (self.expand_cls_to_batch(batch_size), img_patches), dim=1)

#     #     patch_embeddings = self.emb_dropout(img_patches + self.pos_emb1D)

#     #     # feed patch_embeddings and output of transformer. shape: [batch, tokens, dim]
#     #     y = self.transformer(patch_embeddings, mask)

#     #     if self.classification:
#     #         # we index only the cls token for classification.
#     #         return self.mlp_head(y[:, 0, :])
#     #     else:
#     #         return y

In [15]:
_fullscale = True # Run fullscale experiment?
_presaved = True # Use presaved data .npy files?

# Define input_folder and input_folder_spec
if _fullscale:
  input_folder = '/home1/riyaranj/riya/videos_fullscale'
  input_folder_spec = '/home1/riyaranj/riya/melspec_fullscale'
else:
  input_folder = '/home1/riyaranj/riya/videos'
  input_folder_spec = '/home1/riyaranj/riya/melspec'

# Check if input folder exists
if not os.path.exists(input_folder):
    print("Input folder does not exist.")
    sys.exit(1)
# Check if input folder exists
if not os.path.exists(input_folder_spec):
    print("Input folder does not exist.")
    sys.exit(1)

#### Check that the videos and spectrograms are in one-to-one correspondence

In [16]:
assert([file[:-3] for file in sorted(os.listdir(input_folder))] == [file[:-3] for file in sorted(os.listdir(input_folder_spec))])

In [17]:
if _presaved:
  X = np.load('/home1/riyaranj/riya/X_7439.npy', mmap_mode='r')
  y = np.load('/home1/riyaranj/riya/y_7439.npy', mmap_mode='r')
  X_spec = np.load('/home1/riyaranj/riya/X_spec_7439.npy', mmap_mode='r')
  y_spec = np.load('/home1/riyaranj/riya/y_spec_7439.npy', mmap_mode='r')
else:
    X, y, skipped_files = load_dataset(input_folder)
    X_spec, y_spec = load_spectrogram_dataset(input_folder_spec, skipped_files)

# Split the data into train and test sets
print(f"Total number of samples: {len(X)}")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(f"Number of train samples (video): {len(X_train)}", f"Number of test samples: {len(X_test)}")
X_train_spec, X_test_spec, y_train_spec, y_test_spec = train_test_split(X_spec, y_spec, test_size=0.3, random_state=42)
print(f"Number of train samples (audio): {len(X_train_spec)}", f"Number of test samples: {len(X_test_spec)}")

Total number of samples: 7429
Number of train samples (video): 5200 Number of test samples: 2229
Number of train samples (audio): 5200 Number of test samples: 2229


#### Save data as .npy files (if needed)

In [18]:
# # Save X, y, X_spec, y_spec
# np.save('X_7439.npy', np.array(X))
# np.save('y_7439.npy', np.array(y))
# np.save('X_spec_7439.npy', np.array(X_spec))
# np.save('y_spec_7439.npy', np.array(y_spec))

#### Check that the number of frames and spectrograms are equal and that the train + test samples add up to total samples

In [19]:
assert(len(X_train) == len(X_train_spec) and len(X_test) == len(X_test_spec))

In [20]:
def train_ViT(_modality):
    best_test_accuracy = 0.0
    best_results = None
    best_model_state = None

    batch_sizes = [32]
    learning_rates = [0.0001]

    for _bs in batch_sizes:
        for _lr in learning_rates:
            # Adjust input channels as per modality
            if _modality == 'multimodal':
                _input_channels = 2
            else:
                _input_channels = 1

            # Initialize the ViT model
            config = ViTConfig.from_pretrained('google/vit-base-patch16-224-in21k')
            config.num_labels = 6
            model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224-in21k', config=config, ignore_mismatched_sizes=True)

            # Define device
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            model.to(device)

            # Define loss function and optimizer
            criterion = nn.CrossEntropyLoss()
            optimizer = optim.Adam(model.parameters(), lr=_lr)

            # Concatenate datasets if multimodal
            train_dataset = ConcatDataset(X_train, X_train_spec, y_train, modality=_modality, fullscale = _fullscale)
            test_dataset = ConcatDataset(X_test, X_test_spec, y_test, modality=_modality, fullscale = _fullscale)

            # Create data loaders
            train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=_bs, shuffle=True)
            test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=_bs)

            print(f"\n\nBatch size: {_bs}, lr: {_lr}")

            # Training loop
            num_epochs = 30
            print(f"Training ViT for \"{_modality}\" pipeline ...\n------------------------------------------------\n")
            for epoch in range(num_epochs):
                print("Epoch " + str(epoch))
                train_loss, train_accuracy = train_model(model, criterion, optimizer, train_loader, device)
                test_loss, test_accuracy = test_model(model, criterion, test_loader, device)
                print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

                # Update best model and results if test accuracy improves
                if test_accuracy > best_test_accuracy:
                    best_test_accuracy = test_accuracy
                    best_results = (train_loss, train_accuracy, test_loss, test_accuracy, _bs, _lr)
                    best_model_state = model.state_dict()

            # Save the model
            if _modality == 'multimodal':
                torch.save(model.state_dict(), f'HF_ViT_audio_video_fullscale_DIM1_{num_epochs}_{_bs}_{_lr}.pt')
            elif _modality == 'audio':
                torch.save(model.state_dict(), f'HF_ViT_audio_fullscale_DIM1_{num_epochs}_{_bs}_{_lr}.pt')
            elif _modality == 'visual':
                torch.save(model.state_dict(), f'HF_ViT_video_fullscale_DIM1_{num_epochs}_{_bs}_{_lr}.pt')
            else:
                print("Improper modality provided!")

    print("Best Results:")
    print(f"Train Loss: {best_results[0]}, Train Accuracy: {best_results[1]}, Test Loss: {best_results[2]}, Test Accuracy: {best_results[3]}, Batch Size: {best_results[4]}, Learning Rate: {best_results[5]}")

    return best_results



#### Choose modalities

In [21]:
# # Define modalities
_modality = ['visual', 'audio', 'multimodal']

In [22]:
#### Train ViT

In [23]:

# Train ViT
for _m in _modality:
    train_ViT(_m)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




Batch size: 32, lr: 0.0001
Training ViT for "visual" pipeline ...
------------------------------------------------

Epoch 0


100%|██████████| 70/70 [00:14<00:00,  4.88it/s]


Epoch 1/30, Train Loss: 1.3392, Train Accuracy: 0.4663, Test Loss: 1.0887, Test Accuracy: 0.5617
Epoch 1


100%|██████████| 70/70 [00:14<00:00,  4.90it/s]


Epoch 2/30, Train Loss: 0.9266, Train Accuracy: 0.6542, Test Loss: 0.9846, Test Accuracy: 0.6223
Epoch 2


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 3/30, Train Loss: 0.6780, Train Accuracy: 0.7560, Test Loss: 1.0062, Test Accuracy: 0.6335
Epoch 3


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 4/30, Train Loss: 0.4935, Train Accuracy: 0.8262, Test Loss: 0.9056, Test Accuracy: 0.6685
Epoch 4


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 5/30, Train Loss: 0.3065, Train Accuracy: 0.9037, Test Loss: 0.9413, Test Accuracy: 0.6886
Epoch 5


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 6/30, Train Loss: 0.2363, Train Accuracy: 0.9219, Test Loss: 1.1432, Test Accuracy: 0.6546
Epoch 6


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 7/30, Train Loss: 0.1878, Train Accuracy: 0.9379, Test Loss: 1.1379, Test Accuracy: 0.6761
Epoch 7


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 8/30, Train Loss: 0.1585, Train Accuracy: 0.9471, Test Loss: 1.1409, Test Accuracy: 0.6783
Epoch 8


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 9/30, Train Loss: 0.0943, Train Accuracy: 0.9729, Test Loss: 1.1985, Test Accuracy: 0.6716
Epoch 9


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 10/30, Train Loss: 0.1045, Train Accuracy: 0.9679, Test Loss: 1.2460, Test Accuracy: 0.6765
Epoch 10


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 11/30, Train Loss: 0.0798, Train Accuracy: 0.9742, Test Loss: 1.3838, Test Accuracy: 0.6406
Epoch 11


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 12/30, Train Loss: 0.1018, Train Accuracy: 0.9667, Test Loss: 1.2686, Test Accuracy: 0.6761
Epoch 12


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 13/30, Train Loss: 0.0638, Train Accuracy: 0.9794, Test Loss: 1.3436, Test Accuracy: 0.6783
Epoch 13


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 14/30, Train Loss: 0.0758, Train Accuracy: 0.9756, Test Loss: 1.5786, Test Accuracy: 0.6276
Epoch 14


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 15/30, Train Loss: 0.0566, Train Accuracy: 0.9827, Test Loss: 1.4364, Test Accuracy: 0.6599
Epoch 15


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 16/30, Train Loss: 0.0454, Train Accuracy: 0.9858, Test Loss: 1.3825, Test Accuracy: 0.6895
Epoch 16


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 17/30, Train Loss: 0.0383, Train Accuracy: 0.9896, Test Loss: 1.3634, Test Accuracy: 0.6815
Epoch 17


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 18/30, Train Loss: 0.0631, Train Accuracy: 0.9800, Test Loss: 1.5137, Test Accuracy: 0.6590
Epoch 18


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 19/30, Train Loss: 0.0533, Train Accuracy: 0.9825, Test Loss: 1.4368, Test Accuracy: 0.6631
Epoch 19


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 20/30, Train Loss: 0.0430, Train Accuracy: 0.9858, Test Loss: 1.4104, Test Accuracy: 0.6738
Epoch 20


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 21/30, Train Loss: 0.0583, Train Accuracy: 0.9808, Test Loss: 1.4161, Test Accuracy: 0.6729
Epoch 21


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 22/30, Train Loss: 0.0477, Train Accuracy: 0.9842, Test Loss: 1.3953, Test Accuracy: 0.6725
Epoch 22


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 23/30, Train Loss: 0.0368, Train Accuracy: 0.9892, Test Loss: 1.5929, Test Accuracy: 0.6631
Epoch 23


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 24/30, Train Loss: 0.0461, Train Accuracy: 0.9838, Test Loss: 1.4749, Test Accuracy: 0.6756
Epoch 24


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 25/30, Train Loss: 0.0292, Train Accuracy: 0.9912, Test Loss: 1.5896, Test Accuracy: 0.6712
Epoch 25


100%|██████████| 70/70 [00:14<00:00,  4.91it/s]


Epoch 26/30, Train Loss: 0.0373, Train Accuracy: 0.9906, Test Loss: 1.5287, Test Accuracy: 0.6496
Epoch 26


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 27/30, Train Loss: 0.0366, Train Accuracy: 0.9881, Test Loss: 1.4479, Test Accuracy: 0.6703
Epoch 27


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 28/30, Train Loss: 0.0549, Train Accuracy: 0.9835, Test Loss: 1.3764, Test Accuracy: 0.6756
Epoch 28


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 29/30, Train Loss: 0.0385, Train Accuracy: 0.9879, Test Loss: 1.6881, Test Accuracy: 0.6492
Epoch 29


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 30/30, Train Loss: 0.0277, Train Accuracy: 0.9917, Test Loss: 1.5532, Test Accuracy: 0.6716
Best Results:
Train Loss: 0.0453882498746635, Train Accuracy: 0.9857692307692307, Test Loss: 1.3825240892484725, Test Accuracy: 0.6895468820098699, Batch Size: 32, Learning Rate: 0.0001


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




Batch size: 32, lr: 0.0001
Training ViT for "audio" pipeline ...
------------------------------------------------

Epoch 0


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 1/30, Train Loss: 1.4547, Train Accuracy: 0.4115, Test Loss: 1.3350, Test Accuracy: 0.4625
Epoch 1


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 2/30, Train Loss: 1.1529, Train Accuracy: 0.5635, Test Loss: 1.1059, Test Accuracy: 0.5765
Epoch 2


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 3/30, Train Loss: 0.9150, Train Accuracy: 0.6696, Test Loss: 1.0073, Test Accuracy: 0.6160
Epoch 3


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 4/30, Train Loss: 0.6783, Train Accuracy: 0.7619, Test Loss: 1.1671, Test Accuracy: 0.5980
Epoch 4


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 5/30, Train Loss: 0.4892, Train Accuracy: 0.8346, Test Loss: 1.2639, Test Accuracy: 0.5716
Epoch 5


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 6/30, Train Loss: 0.3420, Train Accuracy: 0.8887, Test Loss: 1.2520, Test Accuracy: 0.5935
Epoch 6


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 7/30, Train Loss: 0.2307, Train Accuracy: 0.9271, Test Loss: 1.3478, Test Accuracy: 0.6003
Epoch 7


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 8/30, Train Loss: 0.1738, Train Accuracy: 0.9471, Test Loss: 1.5334, Test Accuracy: 0.5900
Epoch 8


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 9/30, Train Loss: 0.1659, Train Accuracy: 0.9421, Test Loss: 1.4022, Test Accuracy: 0.6092
Epoch 9


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 10/30, Train Loss: 0.1097, Train Accuracy: 0.9654, Test Loss: 1.6525, Test Accuracy: 0.5922
Epoch 10


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 11/30, Train Loss: 0.0749, Train Accuracy: 0.9771, Test Loss: 1.6061, Test Accuracy: 0.6146
Epoch 11


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 12/30, Train Loss: 0.0777, Train Accuracy: 0.9773, Test Loss: 1.6785, Test Accuracy: 0.6074
Epoch 12


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 13/30, Train Loss: 0.0792, Train Accuracy: 0.9773, Test Loss: 1.6596, Test Accuracy: 0.6101
Epoch 13


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 14/30, Train Loss: 0.1118, Train Accuracy: 0.9629, Test Loss: 1.7572, Test Accuracy: 0.5882
Epoch 14


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 15/30, Train Loss: 0.0508, Train Accuracy: 0.9840, Test Loss: 1.7703, Test Accuracy: 0.6052
Epoch 15


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 16/30, Train Loss: 0.0546, Train Accuracy: 0.9846, Test Loss: 1.7709, Test Accuracy: 0.6231
Epoch 16


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 17/30, Train Loss: 0.0442, Train Accuracy: 0.9877, Test Loss: 1.9515, Test Accuracy: 0.5774
Epoch 17


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 18/30, Train Loss: 0.0739, Train Accuracy: 0.9752, Test Loss: 1.8832, Test Accuracy: 0.5742
Epoch 18


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 19/30, Train Loss: 0.0544, Train Accuracy: 0.9808, Test Loss: 1.7781, Test Accuracy: 0.6160
Epoch 19


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 20/30, Train Loss: 0.0194, Train Accuracy: 0.9948, Test Loss: 1.9284, Test Accuracy: 0.6079
Epoch 20


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 21/30, Train Loss: 0.0439, Train Accuracy: 0.9850, Test Loss: 1.8517, Test Accuracy: 0.5904
Epoch 21


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 22/30, Train Loss: 0.0572, Train Accuracy: 0.9825, Test Loss: 1.8834, Test Accuracy: 0.5891
Epoch 22


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 23/30, Train Loss: 0.0520, Train Accuracy: 0.9831, Test Loss: 2.0403, Test Accuracy: 0.5787
Epoch 23


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 24/30, Train Loss: 0.0463, Train Accuracy: 0.9844, Test Loss: 2.0163, Test Accuracy: 0.5935
Epoch 24


100%|██████████| 70/70 [00:14<00:00,  4.93it/s]


Epoch 25/30, Train Loss: 0.0537, Train Accuracy: 0.9829, Test Loss: 1.9283, Test Accuracy: 0.6070
Epoch 25


100%|██████████| 70/70 [00:14<00:00,  4.93it/s]


Epoch 26/30, Train Loss: 0.0178, Train Accuracy: 0.9954, Test Loss: 1.9243, Test Accuracy: 0.6097
Epoch 26


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 27/30, Train Loss: 0.0313, Train Accuracy: 0.9908, Test Loss: 1.9755, Test Accuracy: 0.6061
Epoch 27


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 28/30, Train Loss: 0.0857, Train Accuracy: 0.9729, Test Loss: 1.9046, Test Accuracy: 0.5877
Epoch 28


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 29/30, Train Loss: 0.0213, Train Accuracy: 0.9935, Test Loss: 1.9126, Test Accuracy: 0.6178
Epoch 29


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 30/30, Train Loss: 0.0137, Train Accuracy: 0.9960, Test Loss: 2.1105, Test Accuracy: 0.5967
Best Results:
Train Loss: 0.05455524961755502, Train Accuracy: 0.9846153846153847, Test Loss: 1.77093934943292, Test Accuracy: 0.6231493943472409, Batch Size: 32, Learning Rate: 0.0001


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




Batch size: 32, lr: 0.0001
Training ViT for "multimodal" pipeline ...
------------------------------------------------

Epoch 0


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 1/30, Train Loss: 1.2925, Train Accuracy: 0.5013, Test Loss: 1.0592, Test Accuracy: 0.5976
Epoch 1


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 2/30, Train Loss: 0.7868, Train Accuracy: 0.7133, Test Loss: 0.7210, Test Accuracy: 0.7376
Epoch 2


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 3/30, Train Loss: 0.5444, Train Accuracy: 0.8108, Test Loss: 0.6691, Test Accuracy: 0.7649
Epoch 3


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 4/30, Train Loss: 0.3775, Train Accuracy: 0.8777, Test Loss: 0.6238, Test Accuracy: 0.7802
Epoch 4


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 5/30, Train Loss: 0.2442, Train Accuracy: 0.9256, Test Loss: 0.6792, Test Accuracy: 0.7721
Epoch 5


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 6/30, Train Loss: 0.1795, Train Accuracy: 0.9440, Test Loss: 0.6467, Test Accuracy: 0.7896
Epoch 6


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 7/30, Train Loss: 0.1467, Train Accuracy: 0.9496, Test Loss: 0.7191, Test Accuracy: 0.7761
Epoch 7


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 8/30, Train Loss: 0.0714, Train Accuracy: 0.9806, Test Loss: 0.8050, Test Accuracy: 0.7824
Epoch 8


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 9/30, Train Loss: 0.1168, Train Accuracy: 0.9604, Test Loss: 0.7795, Test Accuracy: 0.7945
Epoch 9


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 10/30, Train Loss: 0.1225, Train Accuracy: 0.9608, Test Loss: 0.7036, Test Accuracy: 0.8066
Epoch 10


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 11/30, Train Loss: 0.0560, Train Accuracy: 0.9829, Test Loss: 0.7585, Test Accuracy: 0.8008
Epoch 11


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 12/30, Train Loss: 0.0689, Train Accuracy: 0.9781, Test Loss: 0.6896, Test Accuracy: 0.8174
Epoch 12


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 13/30, Train Loss: 0.0542, Train Accuracy: 0.9829, Test Loss: 0.8426, Test Accuracy: 0.7824
Epoch 13


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 14/30, Train Loss: 0.0387, Train Accuracy: 0.9888, Test Loss: 0.9258, Test Accuracy: 0.7685
Epoch 14


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 15/30, Train Loss: 0.0393, Train Accuracy: 0.9894, Test Loss: 0.8212, Test Accuracy: 0.8107
Epoch 15


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 16/30, Train Loss: 0.0284, Train Accuracy: 0.9921, Test Loss: 0.9268, Test Accuracy: 0.7770
Epoch 16


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 17/30, Train Loss: 0.0691, Train Accuracy: 0.9788, Test Loss: 0.7669, Test Accuracy: 0.7963
Epoch 17


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 18/30, Train Loss: 0.0218, Train Accuracy: 0.9946, Test Loss: 0.8964, Test Accuracy: 0.7918
Epoch 18


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 19/30, Train Loss: 0.0693, Train Accuracy: 0.9769, Test Loss: 0.8952, Test Accuracy: 0.7730
Epoch 19


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 20/30, Train Loss: 0.0397, Train Accuracy: 0.9869, Test Loss: 0.8376, Test Accuracy: 0.7986
Epoch 20


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 21/30, Train Loss: 0.0398, Train Accuracy: 0.9867, Test Loss: 0.7543, Test Accuracy: 0.8291
Epoch 21


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 22/30, Train Loss: 0.0294, Train Accuracy: 0.9906, Test Loss: 0.8186, Test Accuracy: 0.7950
Epoch 22


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 23/30, Train Loss: 0.0471, Train Accuracy: 0.9844, Test Loss: 0.9005, Test Accuracy: 0.8017
Epoch 23


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 24/30, Train Loss: 0.0342, Train Accuracy: 0.9904, Test Loss: 0.8930, Test Accuracy: 0.7954
Epoch 24


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 25/30, Train Loss: 0.0433, Train Accuracy: 0.9852, Test Loss: 0.8898, Test Accuracy: 0.7932
Epoch 25


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 26/30, Train Loss: 0.0191, Train Accuracy: 0.9952, Test Loss: 0.8928, Test Accuracy: 0.7954
Epoch 26


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 27/30, Train Loss: 0.0250, Train Accuracy: 0.9919, Test Loss: 0.9407, Test Accuracy: 0.7820
Epoch 27


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 28/30, Train Loss: 0.0254, Train Accuracy: 0.9913, Test Loss: 0.8325, Test Accuracy: 0.7963
Epoch 28


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 29/30, Train Loss: 0.0193, Train Accuracy: 0.9933, Test Loss: 0.9226, Test Accuracy: 0.8008
Epoch 29


100%|██████████| 70/70 [00:14<00:00,  4.92it/s]


Epoch 30/30, Train Loss: 0.0337, Train Accuracy: 0.9890, Test Loss: 1.0044, Test Accuracy: 0.7707
Best Results:
Train Loss: 0.03979573138715253, Train Accuracy: 0.9867307692307692, Test Loss: 0.7543248275621636, Test Accuracy: 0.82907133243607, Batch Size: 32, Learning Rate: 0.0001
