In [41]:
train_root = '/raid/binod/prashant/CVS_train'

import cv2
from PIL import Image

def extract_frame_by_index(video_path, frame_index, output_image_path=""):
    # Open the video file
    cap = cv2.VideoCapture(video_path)
    
    # Check if the video opened successfully
    if not cap.isOpened():
        print(f"Error: Could not open video {video_path}")
        return
    
    # Set the frame position
    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
    
    # Read the frame
    ret, frame = cap.read()
    
    if ret:
        # # Save the frame as an image
        # cv2.imwrite(output_image_path, frame)
        # print(f"Frame {frame_index} extracted and saved to {output_image_path}")
        pass
    else:
        print(f"Error: Could not read frame {frame_index}")
    
    # Release the video capture object
    cap.release()
    img = cv2.cvtColor(cv2.resize(frame, (256, 256)), cv2.COLOR_BGR2RGB)
    return img

def extract_prev_frames_by_index(video_path, frame_index, output_image_path=""):
    # Open the video file
    imgs = []
    for _ in range(5):
        cap = cv2.VideoCapture(video_path)
        
        # Check if the video opened successfully
        if not cap.isOpened():
            print(f"Error: Could not open video {video_path}")
            return
        
        # Set the frame position
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
        
        # Read the frame
        ret, frame = cap.read()
        
        if ret:
            # # Save the frame as an image
            # cv2.imwrite(output_image_path, frame)
            # print(f"Frame {frame_index} extracted and saved to {output_image_path}")
            pass
        else:
            print(f"Error: Could not read frame {frame_index}")
        
        # Release the video capture object
        cap.release()
        img = cv2.cvtColor(cv2.resize(frame, (256, 256)), cv2.COLOR_BGR2RGB)
        imgs.append(img)
        frame_index -= 30
        if frame_index < 0:
            break
    return imgs




In [42]:
import numpy as np
def find_mode(numbers_array):
    values, counts = np.unique(numbers_array, return_counts=True)
    
    # Find the index of the maximum count
    max_count_index = np.argmax(counts)
    
    # The mode is the value at the index of the maximum count
    mode = values[max_count_index]
    
    return mode

In [77]:
import os
import pandas as pd
import pickle
from tqdm import tqdm
all_lists = []
for video_id_mp4 in tqdm(os.listdir(os.path.join(train_root, 'videos'))):
    video_id = video_id_mp4[:-4]
    label_csv_path = os.path.join(train_root, 'labels', video_id, 'frame.csv')
    df  = pd.read_csv(label_csv_path)
    for index, row in df.iterrows():
        frame_index = row['frame_id']
        c1 = find_mode(np.array([row['c1_rater1'], row['c1_rater2'], row['c1_rater3']]))
        c2 = find_mode(np.array([row['c2_rater1'], row['c2_rater2'], row['c2_rater3']]))
        c3 = find_mode(np.array([row['c3_rater1'], row['c3_rater2'], row['c3_rater3']]))
        video_path = os.path.join(train_root, 'videos', f'{video_id}.mp4')
        img_array = extract_frame_by_index(video_path, frame_index)
        # img_arrays = extract_prev_frames_by_index(video_path, frame_index)

        dictionary = {
                        'video_id': video_id,
                        'frame_id': frame_index,
                        # 'img_arrays': img_arrays,
                        'img_array': img_array,
                        'label': [c1, c2, c3]
                    }
        all_lists.append(dictionary)
        # print(f"{frame_index}: ({c1}, {c2}, {c3})")

with open('single_images.pkl', 'wb') as handle:
# with open('multiple_images_1fps.pkl', 'wb') as handle:
    pickle.dump(all_lists, handle, protocol=pickle.HIGHEST_PROTOCOL)

  0%|                                                                                                                                        | 0/200 [00:00<?, ?it/s]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [15:11<00:00,  4.56s/it]


# Dataloader

In [46]:
import torch
import numpy as numpy
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import v2
import pickle
from PIL import Image


class SingleImageDataset(Dataset):
    def __init__(self, pkl_file, transform=None):

        self.transform = transform
        if self.transform is None:
            transforms = v2.Compose([
                v2.Resize(224),
                v2.ToImage(),
                v2.ToDtype(torch.float32, scale=True),
                v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            ])
        with open(pkl_file, 'rb') as f:
            self.data = pickle.load(f)
        
    def __len__(self):
        return len(self.data)
        
    def __getitem__(self, idx):
        item = self.data[idx]
        image = Image.fromarray(item['img_array'])
        if self.transform:
            image = self.transform(image)
        label = torch.tensor(item['label'], dtype=torch.float32)
        return image, label


In [47]:
import torchvision
from torch.optim import AdamW
import torch.nn as nn
from torchvision.transforms import v2
import time

In [78]:
import torch
from sklearn.metrics import average_precision_score
import numpy as np
from tqdm import tqdm
def evaluate_map(model, dataloader, device):
    model.eval()  # Set the model to evaluation mode
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch_idx, (images, labels) in (enumerate(dataloader)):
            images = images.to(device)
            labels = labels.to(device)
            
            outputs = model(images)
            preds = torch.sigmoid(outputs)  # Sigmoid to get probabilities
            
            all_labels.append(labels.cpu().numpy())
            all_preds.append(preds.cpu().numpy())

    all_labels = np.concatenate(all_labels, axis=0)
    all_preds = np.concatenate(all_preds, axis=0)

    # Calculate average precision for each label
    ap_per_label = []
    for i in range(all_labels.shape[1]):
        ap = average_precision_score(all_labels[:, i], all_preds[:, i])
        ap_per_label.append(ap)
    map_score = np.mean(ap_per_label)
    return map_score, ap_per_label

In [86]:
transforms = v2.Compose([
    v2.RandomResizedCrop(size=(224, 224), antialias=True),
    v2.RandomHorizontalFlip(p=0.5),
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

all_data = SingleImageDataset('single_images.pkl', transform=transforms)
train_set, val_set = torch.utils.data.random_split(all_data, [0.8, 0.2])



train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
val_loader = DataLoader(val_set, batch_size=32, shuffle=False)
torch.set_float32_matmul_precision('high')
device = 'cuda:0'

model = torchvision.models.resnet50(weights=torchvision.models.ResNet50_Weights.IMAGENET1K_V2)
model.fc = nn.Sequential(nn.Linear(2048, 3, bias=True))
model = model.to(device)
# model = torch.compile(model)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
criterion = torch.nn.BCEWithLogitsLoss()
epochs = 50

for epoch in range(epochs):
    loss = 0
    start = time.time()

    for step, (datas, labels) in (enumerate(train_loader)):
        datas = datas.to(device)
        labels = labels.to(device)
        output = model(datas)

        loss = criterion(output, labels)
        loss.backward()
        norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        
        loss += loss.item()
        # print(f'Epoch: {epoch} | Step: {step} | Time: {round((end-start)*1000)} | Loss: {loss.item()}')
    end = time.time()
    print(f"Epoch {epoch} | validation mAP: {evaluate_map(model, val_loader, 'cuda:0')[0]} | avg_loss: {loss/(step+1)}")


23it [00:02, 10.60it/s]


Epoch 0 | validation mAP: 0.45641098051272583 | avg_loss: 0.00897589698433876


23it [00:02, 10.12it/s]


Epoch 1 | validation mAP: 0.3459917793736347 | avg_loss: 0.006703169085085392


23it [00:02, 10.37it/s]


Epoch 2 | validation mAP: 0.3780360073391806 | avg_loss: 0.016483638435602188


23it [00:02,  9.01it/s]


Epoch 3 | validation mAP: 0.45476367861280137 | avg_loss: 0.012422800064086914


23it [00:02,  9.81it/s]


Epoch 4 | validation mAP: 0.4867489806179117 | avg_loss: 0.009190602228045464


23it [00:02, 10.42it/s]


Epoch 5 | validation mAP: 0.5329949869678791 | avg_loss: 0.007803755346685648


23it [00:02, 10.43it/s]


Epoch 6 | validation mAP: 0.4035012057350606 | avg_loss: 0.010013770312070847


23it [00:02, 10.27it/s]


Epoch 7 | validation mAP: 0.5254460860775242 | avg_loss: 0.010917195118963718


23it [00:02,  9.32it/s]


Epoch 8 | validation mAP: 0.5819767961977232 | avg_loss: 0.006297580432146788


23it [00:02,  9.80it/s]


Epoch 9 | validation mAP: 0.5850289566394012 | avg_loss: 0.007941165938973427


23it [00:02, 10.58it/s]


Epoch 10 | validation mAP: 0.60046986185103 | avg_loss: 0.007438318338245153


23it [00:02, 10.02it/s]


Epoch 11 | validation mAP: 0.6502450895379425 | avg_loss: 0.007978645153343678


23it [00:02,  9.26it/s]


Epoch 12 | validation mAP: 0.6025497335940541 | avg_loss: 0.008593267761170864


23it [00:02, 10.68it/s]


Epoch 13 | validation mAP: 0.6060030880010564 | avg_loss: 0.007023943122476339


23it [00:02, 10.83it/s]


Epoch 14 | validation mAP: 0.6355775163761962 | avg_loss: 0.004376550205051899


23it [00:02, 10.88it/s]


Epoch 15 | validation mAP: 0.65794001273471 | avg_loss: 0.0057038734667003155


23it [00:02, 10.90it/s]


Epoch 16 | validation mAP: 0.6862390254617372 | avg_loss: 0.004662816878408194


23it [00:02,  9.93it/s]


Epoch 17 | validation mAP: 0.6602901740210531 | avg_loss: 0.005197088234126568


23it [00:02, 10.81it/s]


Epoch 18 | validation mAP: 0.6487355093424255 | avg_loss: 0.0040502650663256645


23it [00:02, 10.70it/s]


Epoch 19 | validation mAP: 0.7270726536675011 | avg_loss: 0.009333928115665913


23it [00:02, 10.66it/s]


Epoch 20 | validation mAP: 0.707483230776536 | avg_loss: 0.004998341668397188


23it [00:02, 10.97it/s]


Epoch 21 | validation mAP: 0.696615385563868 | avg_loss: 0.0056933630257844925


23it [00:02, 10.98it/s]


Epoch 22 | validation mAP: 0.679132669331966 | avg_loss: 0.0057898289524018764


23it [00:02, 10.96it/s]


Epoch 23 | validation mAP: 0.6806166073824148 | avg_loss: 0.0033337955828756094


23it [00:02, 10.97it/s]


Epoch 24 | validation mAP: 0.7244329611775443 | avg_loss: 0.00493922783061862


23it [00:02, 10.99it/s]


Epoch 25 | validation mAP: 0.7121334285298225 | avg_loss: 0.005750473588705063


23it [00:02, 10.97it/s]


Epoch 26 | validation mAP: 0.7261451911312014 | avg_loss: 0.007490485906600952


23it [00:02, 10.78it/s]


Epoch 27 | validation mAP: 0.7355117818918213 | avg_loss: 0.0076883211731910706


In [76]:
evaluate_map(model, val_loader, 'cuda:0')

113it [00:11,  9.84it/s]


(0.22480193480979463,
 [0.16909663521797264, 0.3269857268493618, 0.1783234423620495])