In [1]:
import os
import pandas as pd 
import numpy as np 
from tqdm import tqdm
import cv2
from collections import Counter
from sklearn.metrics import accuracy_score

In [2]:
folder_path = '/kaggle/input/frames-10fps/frames'  # Replace with your folder path


In [3]:
folders=sorted(os.listdir(folder_path))
train_folder=folders[:22]
test_folder=folders[22:]

In [4]:
train_paths=[]
for i in tqdm(range(len(train_folder))):
    files=os.listdir(os.path.join(folder_path,folders[i]))
    files.sort(key=lambda x: int(x.split('.')[0]))
    files=[os.path.join(folder_path,folders[i],files[j]) for j in range(len(files))]
    train_paths=train_paths+files
test_paths=[]
for i in tqdm(range(len(test_folder))):
    files=os.listdir(os.path.join(folder_path,folders[i]))
    files.sort(key=lambda x: int(x.split('.')[0]))
    files=[os.path.join(folder_path,folders[i],files[j]) for j in range(len(files))]
    test_paths=test_paths+files

100%|██████████| 22/22 [00:11<00:00,  1.89it/s]
100%|██████████| 5/5 [00:00<00:00, 99.98it/s]


In [5]:
def get_csv_id(file_path):

    # Extract the filename without extension
    filename = os.path.basename(file_path)  # RightVideoSN001.csv

    # Extract the identifier ('1' or '11')
    if filename.startswith('RightVideoSN'):
        identifier = filename[len('RightVideoSN'):]  # Get '001' or '011'
        identifier = identifier.lstrip('0').split('.')[0]  # Remove leading zeros

    return int(identifier)
def get_id(path):

    # Split the path by '/'
    parts = path.split('/')

    # Get the directory name that contains the image
    dirname = parts[-2]  # RightVideoSN011

    # Extract the relevant parts from the directory name
    if dirname.startswith('RightVideoSN'):
        identifier = dirname[len('RightVideoSN'):]  # Get SN011 -> 011
        identifier = identifier.lstrip('0')  # Remove leading zeros

    # Get the image number from the filename
    filename = parts[-1]  # 240.jpg
    image_number = os.path.splitext(filename)[0]  # 240

    return int(identifier),int(image_number)

In [6]:
folder_path = '/kaggle/input/labels-csv/labels_csv'
csv_path=[]
for root, dirs, files in os.walk(folder_path):
    for file_name in files:
        file_path = os.path.join(root, file_name)
        csv_path.append(file_path)
csv_path=sorted(csv_path)
csv={}
for file in csv_path:
    df=pd.read_csv(file)
    ide=get_csv_id(file)
    csv[ide]=df

In [7]:
col=csv[1].columns
col

Index(['au1', 'au12', 'au15', 'au17', 'au2', 'au20', 'au25', 'au26', 'au4',
       'au5', 'au6', 'au9'],
      dtype='object')

In [8]:
labels=[]
for file in tqdm(train_paths):
    ide,image=get_id(file)
    try:
        label=np.array(csv[ide].iloc[image])
        label[label > 0]=1
        labels.append(label)
    except IndexError as e:
        print(f"IndexError occurred: {ide}")
        print(f"Check ide={ide} and image={image} against csv indices.")
labels=np.array(labels)

100%|██████████| 53305/53305 [00:03<00:00, 14424.75it/s]


In [9]:
test_labels=[]
for file in tqdm(test_paths):
    ide,image=get_id(file)
    try:
        label=np.array(csv[ide].iloc[image])
        label[label > 0]=1
        test_labels.append(label)
    except IndexError as e:
        print(f"IndexError occurred: {ide}")
        print(f"Check ide={ide} and image={image} against csv indices.")
test_labels=np.array(test_labels)

100%|██████████| 12115/12115 [00:00<00:00, 15860.38it/s]


In [10]:
for i in range(12):
    print(Counter(labels[:,i]))

Counter({0: 50240, 1: 3065})
Counter({0: 40425, 1: 12880})
Counter({0: 50217, 1: 3088})
Counter({0: 47465, 1: 5840})
Counter({0: 50959, 1: 2346})
Counter({0: 51704, 1: 1601})
Counter({0: 35474, 1: 17831})
Counter({0: 45206, 1: 8099})
Counter({0: 43244, 1: 10061})
Counter({0: 52116, 1: 1189})
Counter({0: 45162, 1: 8143})
Counter({0: 50405, 1: 2900})


In [11]:
zero_indices = np.where(~labels.any(axis=1))[0]
nz=np.where(labels.any(axis=1))[0]
print("Indices where all elements in rows are zero:",len(zero_indices))
print("Indices where all elements in rows are all not  zero:",len(nz))

Indices where all elements in rows are zero: 20023
Indices where all elements in rows are all not  zero: 33282


In [12]:
random_numbers = np.random.choice(zero_indices, size=4000, replace=False)
mask=np.concatenate((random_numbers,nz))
mask.sort()

In [13]:
sam_labels=labels[mask]
sam_labels.shape

(37282, 12)

In [14]:
frames=[]
target_size=(224, 224)
for i in tqdm(range(len(train_paths))):
    if i in mask:
        img=cv2.imread(train_paths[i],0)
        resized_img = cv2.resize(img, target_size, interpolation=cv2.INTER_AREA)
        rgb_image = cv2.cvtColor(resized_img, cv2.COLOR_GRAY2RGB)
        frames.append(rgb_image)

100%|██████████| 53305/53305 [05:09<00:00, 172.13it/s]


In [15]:
test_frames=[]
target_size=(224, 224)
for i in tqdm(range(len(test_paths))):
    if True:
        img=cv2.imread(test_paths[i],0)
        resized_img = cv2.resize(img, target_size, interpolation=cv2.INTER_AREA)
        rgb_image = cv2.cvtColor(resized_img, cv2.COLOR_GRAY2RGB)
        test_frames.append(rgb_image)

100%|██████████| 12115/12115 [00:39<00:00, 303.61it/s]


In [None]:
from tensorflow.keras.metrics import AUC
pr_metric = AUC(curve='PR', num_thresholds=1000)

import tensorflow as tf
from sklearn.metrics import average_precision_score

def sk_pr_auc(y_true, y_pred):
    return tf.py_function(average_precision_score, (y_true, y_pred), tf.float64)

In [17]:
def binary_focal_loss(gamma=2.0, alpha=0.25):
    def focal_loss(y_true, y_pred):
        # Define epsilon to avoid log(0)
        epsilon = tf.keras.backend.epsilon()
        # Clip predictions to prevent log(0) and log(1 - 0)
        y_pred = tf.clip_by_value(y_pred, epsilon, 1.0 - epsilon)
        # Compute the focal loss
        fl = - alpha * (y_true * (1 - y_pred)**gamma * tf.math.log(y_pred)
                       + (1 - y_true) * (y_pred**gamma) * tf.math.log(1 - y_pred))
        return tf.reduce_mean(fl, axis=-1)
    return focal_loss

loss = binary_focal_loss(gamma=2.0, alpha=0.25)

In [21]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import Compose, Normalize, ToTensor

class CustomDataset(Dataset):
    def __init__(self, images, labels, transform=None):
        self.images = images
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        image = self.images[idx]
        label = self.labels[idx].astype(np.float32)
        if self.transform:
            image = self.transform(image)
        return image, label

# Assuming your images are in 'images' and labels are in 'labels'
images = frames
labels = sam_labels

transform = Compose([
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

dataset = CustomDataset(images, labels, transform=transform)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


In [22]:
from transformers import ViTForImageClassification
from transformers import ViTFeatureExtractor

feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224', num_labels=12,ignore_mismatched_sizes=True)

# Update the classifier layer to match the number of labels and use dropout
model.config.hidden_dropout_prob = 0.1

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([12]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([12, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torch.optim import Adam
from torch.nn import BCEWithLogitsLoss

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = Adam(model.parameters(), lr=1e-4)
criterion = BCEWithLogitsLoss()

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    for images, labels in dataloader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images).logits
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')

Epoch [1/10], Loss: 0.016177240759134293
Epoch [2/10], Loss: 0.002966067288070917


In [None]:
import numpy as np
import tensorflow as tf
from transformers import TFViTForImageClassification, ViTFeatureExtractor

# Assuming your images are in 'images' and labels are in 'labels'
images = frames
labels = sam_labels

# Normalize images
images = images / 255.0

# Convert labels to float32
labels = labels.astype(np.float32)

# Create a TensorFlow dataset
dataset = tf.data.Dataset.from_tensor_slices((images, labels))
dataset = dataset.shuffle(buffer_size=1024).batch(32).prefetch(tf.data.experimental.AUTOTUNE)

# Load the Vision Transformer model
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
model = TFViTForImageClassification.from_pretrained('google/vit-base-patch16-224', num_labels=12)

# Define the optimizer and loss function
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)

# Compile the model
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

# Train the model
model.fit(dataset, epochs=10)

# Evaluate the model
# Assuming you have a validation dataset 'val_dataset'
# val_dataset = ...
# model.evaluate(val_dataset)


In [19]:
from torch.utils.data import DataLoader

# Split dataset into training and validation sets

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)


In [20]:
from transformers import ViTForImageClassification, ViTFeatureExtractor

# Load pretrained model and feature extractor
model_name = "google/vit-base-patch16-224"
model = ViTForImageClassification.from_pretrained(model_name, num_labels=12,ignore_mismatched_sizes=True)
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)

config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([12]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([12, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



In [21]:
import torch.optim as optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-4)

def train(model, train_loader, optimizer, device):
    model.train()
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images).logits
        loss = F.binary_cross_entropy_with_logits(outputs, labels)  # Use BCEWithLogitsLoss for multi-label
        loss.backward()
        optimizer.step()

def validate(model, val_loader, device):
    model.eval()
    val_loss = 0
    correct = 0
    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images).logits
            val_loss += F.binary_cross_entropy_with_logits(outputs, labels, reduction='sum').item()
            pred = torch.round(torch.sigmoid(outputs))  # Use sigmoid and round for multi-label
            correct += (pred == labels).sum().item()
    val_loss /= len(val_loader.dataset)
    accuracy = correct / (len(val_loader.dataset) * len(labels_list[0]))  # Adjust for multi-label accuracy
    return val_loss, accuracy

# Training the model
num_epochs = 10
for epoch in range(num_epochs):
    train(model, train_loader, optimizer, device)
    val_loss, val_accuracy = validate(model, val_loader, device)
    print(f'Epoch {epoch+1}/{num_epochs}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacty of 15.89 GiB of which 27.12 MiB is free. Process 3269 has 15.86 GiB memory in use. Of the allocated memory 382.23 MiB is allocated by PyTorch, and 39.77 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF