<a href="https://www.kaggle.com/code/riturajpradhan/temporal-intro?scriptVersionId=191485207" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [58]:
import os
import json

from PIL import Image
from tqdm.notebook import tqdm
import gc
import numpy as np
# from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import GradScaler, autocast

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

from torchvision import transforms
# from transformers import AutoImageProcessor, ResNetModel
from transformers import ViTFeatureExtractor, ViTForImageClassification
# from transformers.image_processing_base import BatchFeature

In [59]:
image_paths = []
labels = []
label_path = '/kaggle/input/abaw-7-dataset/training_set_annotations.txt'
data_path = '/kaggle/input/abaw-7-dataset/cropped_aligned'
with open(label_path, 'r') as f:
    f.readline()
    data = f.readlines()

for d in data:
    line = d.split(',')
    image_name = line[0]
    image_label = line[3]
    if int(image_label) == -1:
        continue
    image_path = os.path.join(data_path, image_name)
    image_paths.append(image_path)
    labels.append(image_label)
    
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [90]:
os.rmdir('val')

In [79]:
# run cell to extract image features and store them
image_preprocessor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
feature_extractor = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224', output_hidden_states = True, return_dict = True).to(device)
    
# Batch size for processing images
batch_size = 500

batch_count = 0

# Initialize lists to store extracted features
all_features = []

# Process images in batches
for batch_start in tqdm(range(0, len(image_paths), batch_size), desc='Extracting Features'):
    batch_end = min(batch_start + batch_size, len(image_paths))
    batch_images = [Image.open(image_path) for image_path in image_paths[batch_start:batch_end]]
    batch_labels = torch.tensor([float(label) for label in labels[batch_start:batch_end]]).to(device)
    # Tokenize and extract features
    inputs = image_preprocessor(images = batch_images, return_tensors="pt").to(device)
    with torch.no_grad():
        output = feature_extractor(**inputs)
    t = torch.column_stack([output.hidden_states[-1][:,0,:], torch.tensor(batch_labels)])

    # Append features to the list
    all_features.extend(t)
    break

# Save features to a file (e.g., as a PyTorch tensor)
    # Save features every 1000 images
    if batch_count%50 == 0:
        output_file = f"./training_features/image_features_val_{batch_count}.pt"
        torch.save(torch.stack(all_features).cpu(), output_file)
        print(f"Features saved to {output_file}")
        
        # Clear memory by resetting the list
        all_features = []
    batch_count += 1
    with open('counter.json', 'w') as f:
        json.dump({'batch_count' : batch_count}, f)

# Save any remaining features
if all_features:
    output_file = f"./training_features/image_features_val_{batch_count}.pt"
    torch.save(torch.stack(all_features).cpu(), output_file)
    print(f"Remaining features saved to {output_file}")


Extracting Features:   0%|          | 0/182 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.13 GiB. GPU 0 has a total capacty of 15.89 GiB of which 511.12 MiB is free. Process 2424 has 15.39 GiB memory in use. Of the allocated memory 14.19 GiB is allocated by PyTorch, and 925.76 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [81]:
len(all_features)

0

In [30]:
# # loading image features to RAM
# # just load it from ABAW_training_features.pt
# t = []

# # Define the file indices
# file_indices = [0, 50, 100, 150, 182]

# # Load the .pt files
# for i in file_indices:
#     file_path = f'/kaggle/working/val/image_features_val_{str(i)}.pt'
#     try:
#         t.append(torch.load(file_path, map_location=torch.device('cpu')))  # Use map_location if necessary
#     except RuntimeError as e:
#         print(f"Failed to load {file_path}: {e}")

# # Stack the tensors if they were loaded successfully
# if t:
#     image_features = torch.row_stack(t)
#     image_features.requires_grad = True
# else:
#     print("No tensors were loaded successfully.")


In [108]:
class ImageTransformer(nn.Module):
    def __init__(self, feature_dim, num_classes, num_heads=4, num_layers=6, dropout=0.1, sequence_length=64):
        super(ImageTransformer, self).__init__()
        self.feature_dim = feature_dim
        self.num_classes = num_classes
        
        # Positional encoding
        self.positional_encoding = nn.Parameter(torch.zeros(1, sequence_length, feature_dim))
        
        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(d_model=feature_dim, nhead=num_classes, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
        # Classification head
        self.fc1 = nn.Linear(feature_dim, 256)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(256, 64)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(64, num_classes)
    
    def forward(self, pixel_values):
        # Add positional encoding
        pixel_values = pixel_values + self.positional_encoding
        
        # Transformer encoder
        x = self.transformer_encoder(pixel_values)
        # Pooling: take the mean of the sequence
        x = x.mean(dim=1)
        # Classification
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        
        return x

class ABAWFeatureDataset(Dataset):
    def __init__(self, features, labels, sequence_length):
        self.features = features
        self.labels = labels
        self.sequence_length = sequence_length
        self.seq_start = 0
        self.seq_end = sequence_length
        self.length = labels.shape[0]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        if idx > (self.length - self.sequence_length):
            feature = self.features[self.length - self.sequence_length:, : ]
            label = self.labels[self.length - 1]
        else:
            feature = self.features[idx:idx + self.sequence_length, :]
            label = self.labels[idx + self.sequence_length - 1]
        return feature, label

In [118]:
batch_size = 64
sequence_length = 32
feature_dim = 2048
num_classes = 8
num_epochs = 10
learning_rate = 1e-7

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

val_label_path = '/kaggle/input/abaw7-extracted-features/validation_set_annotations.txt'
val_labels = []
with open(val_label_path, 'r') as f:
    f.readline()
    data = f.readlines()

for d in data:
    line = d.split(',')
    image_label = line[3]
    if int(image_label) == -1:
        continue
    val_labels.append(image_label)
    
train_label_path = '/kaggle/input/abaw7-extracted-features/training_set_annotations.txt'
train_labels = []
with open(train_label_path, 'r') as f:
    f.readline()
    data = f.readlines()

for d in data:
    line = d.split(',')
    image_label = line[3]
    if int(image_label) == -1:
        continue
    train_labels.append(image_label)
    
# Create dataset and dataloader
train_image_features = torch.load('/kaggle/input/abaw7-extracted-features/ABAW_training_features.pt')
temp = train_image_features.detach().numpy()
train_image_features = torch.tensor(temp)

train_labels = [int(x) for x in train_labels]
train_label_tensor = torch.tensor(train_labels)
train_dataset = ABAWFeatureDataset(train_image_features, train_label_tensor, sequence_length = sequence_length)
data_loader_train = DataLoader(train_dataset, batch_size=batch_size, drop_last = True, pin_memory=True)

train_length = len(train_dataset)
    
val_image_features = torch.load('/kaggle/input/abaw7-extracted-features/ABAW_validation_features.pt')
temp = val_image_features.detach().numpy()
val_image_features = torch.tensor(temp)
val_labels = [int(x) for x in val_labels]
val_label_tensor = torch.tensor(val_labels)
val_dataset = ABAWFeatureDataset(val_image_features, val_label_tensor, sequence_length = sequence_length)
data_loader_val = DataLoader(dataset, batch_size=batch_size, drop_last = True, pin_memory=True)#, collate_fn=custom_collate_fn)

val_length = len(val_dataset)

In [None]:
# Model, loss function, optimizer
# model = ImageTransformer(feature_dim=feature_dim, num_classes=num_classes, sequence_length = sequence_length).to(device)
# model.load_state_dict(torch.load('/kaggle/working/transformer_on_resnet_50_e0.pth'))
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
scaler = GradScaler()

# training_loss_list = []
# validation_loss_list = []
# loss_list = []
# Training loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    progress_bar = tqdm(total=len(data_loader_train), desc=f'Epoch {epoch+1}/{num_epochs}', unit='batch')
    
    for features, targets in data_loader_train:
        features, targets = features.to(device, non_blocking=True), targets.to(device, non_blocking=True)
        optimizer.zero_grad()

        with autocast():
            outputs = model(features)
            # Reshape outputs and targets to be compatible with the loss function
#             outputs = outputs.view(-1, num_classes)
#             targets = targets.view(-1)
            # Calculate loss
            loss = criterion(outputs, targets)
        
        
#         print('before back')
        scaler.scale(loss).backward()
#         print('after back')
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item() * features.size(0)
        loss_list.append(loss.item())
        # Update progress bar with current loss
        progress_bar.set_postfix(loss=loss.item())
        progress_bar.update(1)
        
    epoch_loss = running_loss / train_length
    torch.save(model.state_dict(), f'transformer_on_resnet_50_e{epoch}.pth')
    model.eval()
    running_loss = 0.0
    with torch.no_grad():
        for features, targets in data_loader_val:
            features, targets = features.cuda(), targets.cuda() 
            outputs = model(features)
            # Reshape outputs and targets to be compatible with the loss function
            outputs = outputs.view(-1, num_classes)
            # Calculate loss
            validation_loss = criterion(outputs, targets)
            running_loss += validation_loss.item() * features.size(0)
    
    validation_loss = running_loss / val_length
    
    training_loss_list.append(epoch_loss)
    validation_loss_list.append(validation_loss)
    
    print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {epoch_loss:.4f}, validation Loss: {validation_loss:.4f}')
    progress_bar.close()

print("Training complete!")

## Testing new feature extractor

In [5]:
from transformers import ViTFeatureExtractor, ViTModel
from PIL import Image
import requests

url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
image = Image.open(requests.get(url, stream=True).raw)

feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')

inputs = feature_extractor(images=image, return_tensors="pt")
outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state,

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

In [50]:
from transformers import ViTFeatureExtractor, ViTForImageClassification
from PIL import Image
import requests

url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
image = Image.open(requests.get(url, stream=True).raw)
images = [image for i in range(1,32)]
labels = [i for i in range(1,32)]

feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224', output_hidden_states = True, return_dict = True)

inputs = feature_extractor(images=images, return_tensors="pt")
outputs = model(**inputs)
logits = outputs.logits
# model predicts one of the 1000 ImageNet classes
# predicted_class_idx = logits.argmax(-1).item()
# print("Predicted class:", model.config.id2label[predicted_class_idx]),

In [53]:
# outputs.hidden_states[-1][:,0,:]
import torch
t = torch.column_stack([outputs.hidden_states[-1][:,0,:], torch.tensor(labels)])

In [57]:
t[0,:]

tensor([ 2.3126e+00,  5.5116e+00,  1.1788e+01,  5.7725e-01,  6.5475e+00,
        -2.9125e+00,  4.5668e+00, -1.3786e+00,  6.1539e+00, -5.1831e+00,
         4.9711e+00,  8.4007e-01,  7.7080e+00, -3.0897e+00, -3.2443e+00,
         9.7258e+00,  1.1146e+00, -2.1286e+00,  8.7621e+00,  1.6315e+00,
        -8.4574e+00,  1.8422e+00,  1.4254e+00,  6.5619e+00, -1.0730e+01,
         6.0743e+00,  4.2650e+00,  6.0531e+00,  8.9479e+00,  2.5177e+00,
         5.4446e-01,  1.4944e+00,  4.0779e+00,  1.0520e+01, -2.9379e+00,
         6.8438e+00, -4.6463e+00, -3.0405e+00,  1.0135e+00,  8.2927e+00,
         9.9011e+00, -3.0263e+00,  7.5373e-01, -4.1869e+00,  2.0160e+00,
         7.0067e+00,  7.9857e-02, -2.2949e+00,  2.2206e+00, -4.4853e+00,
         1.8516e+00, -7.8394e-01,  4.4800e+00, -3.8795e+00, -1.1654e+01,
        -2.1962e+00,  2.5946e-01,  1.0626e+01, -5.4844e-01, -2.7896e+00,
         1.3051e+01,  8.1810e+00,  4.4351e+00, -6.2068e+00,  1.0071e+00,
        -4.3451e+00,  2.8967e+00, -1.7342e+00,  5.2

In [78]:
gc.collect()
torch.cuda.empty_cache()

In [18]:
# Training with custom feature loader

# Hyperparameters
feature_dim = 2048
num_classes = 8
batch_size = 128
num_epochs = 10
learning_rate = 1e-4
sequence_length = 64

# Model, loss function, optimizer
model = ImageTransformer(feature_dim=feature_dim, num_classes=num_classes, sequence_length = sequence_length).to('cuda')
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
# try SGD Optimizer 
 
loss_list = []

image_features = image_features.cuda()
label_tensor = label_tensor.cuda()

# Training loop
for epoch in range(1, num_epochs):
    model.train()
    running_loss = 0.0
    seq_start = 0
    seq_end = sequence_length
    progress_bar = tqdm(total = len(labels))
    while seq_end < label_tensor.shape[0]:
        optimizer.zero_grad()
        out = []
        targets = []
        for i in range(batch_size):
            if seq_end >= label_tensor.shape[0]:
                break
            inputs = image_features[seq_start:seq_start + sequence_length, : ]
            target = label_tensor[seq_end]
#             print('hello',inputs.shape)
#             print(targets.shape)
            # Forward pass
            outputs = model(inputs)

            # Reshape outputs and targets to be compatible with the loss function
            outputs = outputs.view(-1, num_classes)
            out.append(outputs)
            target = target.view(-1)
            targets.append(target)
            seq_start += 1
            seq_end += 1
            if seq_end % 100 == 0:
                progress_bar.update(100)
                    # Calculate loss
        pred = torch.row_stack(out)
        truth = torch.tensor(targets).cuda()
        loss = criterion(pred, truth)
        # Backward pass and optimize
#             print(loss)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
        loss_list.append(loss.item())
        # Update progress bar with current loss
#         data_loader.set_postfix(loss=loss.item())
    
    epoch_loss = running_loss / len(dataset)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')
    torch.save(model.state_dict(), f'transformer_on_resnet_50_e{epoch}.pth')
    progress_bar.close()

print("Training complete!")


  0%|          | 0/15440 [00:00<?, ?it/s]

After transformer encoding:  torch.Size([64, 2048])
Take mean:  torch.Size([64])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x64 and 2048x256)

In [None]:
len(labels) - sequence_length

In [10]:
# Validation

model = ImageTransformer(feature_dim=feature_dim, num_classes=num_classes, sequence_length = sequence_length).to('cuda')
model.load_state_dict(torch.load('/kaggle/working/transformer_on_resnet_50_e0.pth'))
model.eval()
seq_start = 0
seq_end = sequence_length
progress_bar = tqdm(total = len(labels))
preds = []
with torch.no_grad():
    while seq_end < label_tensor.shape[0]:

        inputs = image_features[seq_start:seq_start + sequence_length, : ].cuda()
        target = label_tensor[seq_end]
    #             print('hello',inputs.shape)
    #             print(targets.shape)
        # Forward pass
        outputs = model(inputs)
        preds.append(torch.argmax(outputs).item())
        seq_start += 1
        seq_end += 1
        progress_bar.update(1)

print(len(preds))

In [114]:
# Validation
# feature_dim = 2048
# num_classes = 8
# batch_size = 128
# num_epochs = 10
# learning_rate = 1e-4
# sequence_length = 64

model = ImageTransformer(feature_dim=feature_dim, num_classes=num_classes, sequence_length = sequence_length).to(device)
model.load_state_dict(torch.load('/kaggle/working/transformer_on_resnet_50_e9.pth'))
criterion = nn.CrossEntropyLoss()

model.eval()

all_preds = []
all_targets = []
running_loss = 0.0
with torch.no_grad():
    progress_bar = tqdm(total=len(data_loader_val), desc='Validation', unit='batch')
    for features, targets in data_loader_val:
        features, targets = features.cuda(), targets.cuda() 
        outputs = model(features)
        # Reshape outputs and targets to be compatible with the loss function
        outputs = outputs.view(-1, num_classes)
        # Calculate loss
        validation_loss = criterion(outputs, targets)
        running_loss += validation_loss.item() * features.size(0)
        # Collect predictions and true labels
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_targets.extend(targets.cpu().numpy())
        
        progress_bar.update(1)
        progress_bar.set_postfix(loss=validation_loss.item())
    
    progress_bar.close()

validation_loss = running_loss / val_length

# Calculate metrics
all_preds = np.array(all_preds)
all_targets = np.array(all_targets)

f1 = f1_score(all_targets, all_preds, average='weighted')
precision = precision_score(all_targets, all_preds, average='weighted')
recall = recall_score(all_targets, all_preds, average='weighted')
accuracy = accuracy_score(all_targets, all_preds)

print(f'Validation Loss: {validation_loss:.4f}')
print(f'F1 Score: {f1:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'Accuracy: {accuracy:.4f}')

Validation:   0%|          | 0/241 [00:00<?, ?batch/s]

Validation Loss: 1.9743
F1 Score: 0.1377
Precision: 0.0894
Recall: 0.2991
Accuracy: 0.2991


  _warn_prf(average, modifier, msg_start, len(result))


In [116]:
from scipy.ndimage import gaussian_filter1d
import numpy as np

preds = torch.tensor(preds)
preds = preds.to('cpu')

sigma = 1.0  # Standard deviation of the Gaussian kernel
filtered_array = gaussian_filter1d(preds, sigma=sigma)

# Discretize the filtered values
discretized_array = np.round(filtered_array).astype(int)  # Round and convert to integers

# Convert the result back to a PyTorch tensor
filtered_tensor = torch.tensor(discretized_array, dtype=torch.int)


filtered_tensor.shape

  preds = torch.tensor(preds)


torch.Size([64])

In [117]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Example ground truth and predictions
y_true = label_tensor[sequence_length:]  # True labels
y_pred = preds  # Predicted labels

# Calculate precision, recall, and F1 score for different averaging methods
precision_macro = precision_score(y_true, y_pred, average='macro')
recall_macro = recall_score(y_true, y_pred, average='macro')
f1_macro = f1_score(y_true, y_pred, average='macro')

precision_micro = precision_score(y_true, y_pred, average='micro')
recall_micro = recall_score(y_true, y_pred, average='micro')
f1_micro = f1_score(y_true, y_pred, average='micro')

precision_weighted = precision_score(y_true, y_pred, average='weighted')
recall_weighted = recall_score(y_true, y_pred, average='weighted')
f1_weighted = f1_score(y_true, y_pred, average='weighted')

print(f'Precision (macro): {precision_macro:.2f}')
print(f'Recall (macro): {recall_macro:.2f}')
print(f'F1 Score (macro): {f1_macro:.2f}')

print(f'Precision (micro): {precision_micro:.2f}')
print(f'Recall (micro): {recall_micro:.2f}')
print(f'F1 Score (micro): {f1_micro:.2f}')

print(f'Precision (weighted): {precision_weighted:.2f}')
print(f'Recall (weighted): {recall_weighted:.2f}')
print(f'F1 Score (weighted): {f1_weighted:.2f}')

accuracy = accuracy_score(y_true, y_pred)

print(f'Accuracy: {accuracy:.2f}')


ValueError: Found input variables with inconsistent numbers of samples: [15408, 64]

In [None]:
torch.tensor(float(outputs.argmax().item()))

In [None]:
img = Image.open('/kaggle/input/abaw-7-dataset/cropped_aligned/1-30-1280x720/00001.jpg').convert('RGB')
transform = transforms.Compose([
            transforms.Resize([224, 224])
        ])

transform(img)