In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torchvision.io as io
import os

# Define your device (for GPU processing)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

transform = transforms.Compose([
    # transforms.ToPILImage(),
    transforms.Resize((128, 128)),  # Resize frames to 128x128
    transforms.ToTensor(),  # Convert frames to tensor format
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize the RGB channels
])

In [2]:
import torch.nn.functional as F

class VideoDataset(Dataset):
    def __init__(self, root_dir, transform=None, max_frames=400, myframes = 0):
        self.root_dir = root_dir
        self.transform = transform
        self.max_frames = max_frames  # Maximum frames to load
        self.myframes = myframes
        self.classes = ['non_violent', 'violent']
        self.data = []
        
        for label, class_name in enumerate(self.classes):
            class_dir = os.path.join(root_dir, class_name)
            for cam_folder in os.listdir(class_dir):
                cam_dir = os.path.join(class_dir, cam_folder)
                if os.path.isdir(cam_dir):  
                    for video_name in os.listdir(cam_dir):
                        if video_name.endswith('.mp4'):
                            video_path = os.path.join(cam_dir, video_name)
                            self.data.append((video_path, label)) 
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        video_path, label = self.data[idx]
        
        # Load the video and all its frames
        video, _, _ = io.read_video(video_path, pts_unit='sec')

        # Print original shape
        # print(f"Original video shape: {video.shape}")
        self.myframes = self.myframes + video.shape[0]
        # Number of frames in the video
        total_frames = video.shape[0]

        # Initialize the padded tensor
        if total_frames < self.max_frames:
            # Create a new tensor filled with zeros
            padded_video = torch.zeros((self.max_frames, video.shape[1], video.shape[2], video.shape[3]), dtype=video.dtype)
            # Copy the existing frames into the padded tensor
            padded_video[:total_frames] = video  # Fill the start with actual frames
            video = padded_video  # Use the padded video
        else:
            # If we have enough frames, just take the first max_frames
            video = video[:self.max_frames]

        # Print shape after padding/truncating
        # print(f"Processed video shape: {video.shape}")

        # Apply transformations to each frame
        pil_transform = transforms.ToPILImage()
        frames = []
        for frame in video:
            # Convert to [channels, height, width]
            frame = frame.permute(2, 0, 1)  # Swap dimensions

            pil_img = pil_transform(frame)  # Convert to PIL
            if self.transform:
                pil_img = self.transform(pil_img)  # Apply transformation
            frames.append(pil_img)
        
        video = torch.stack(frames)  # Stack the frames back into a tensor
        video = video.to(device)
        label = torch.tensor(label).to(device)
        print(self.myframes)
        return video, label
    def avg_frames(self):
        return self.myframes / len(self.data)

In [3]:
path ='/home/subru/projects/CNN/A-Dataset-for-Automatic-Violence-Detection-in-Videos-master/violence-detection-dataset'
train_dataset = VideoDataset(root_dir=path, transform=transform)

# Create data loaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=5, shuffle=True)

# Iterate through the data loader
for videos, labels in train_loader:
    print(videos.shape)  # Should print shape [batch_size, frames, channels, height, width]
    print(labels)        # Should print corresponding labels (0 for non-violent, 1 for violent)

113
249
428
502
636
torch.Size([5, 400, 3, 128, 128])
tensor([1, 1, 1, 0, 0], device='cuda:0')
781
931
1141
1277
1388
torch.Size([5, 400, 3, 128, 128])
tensor([1, 0, 1, 1, 1], device='cuda:0')
1578
1758
1938
2040
2143
torch.Size([5, 400, 3, 128, 128])
tensor([1, 1, 1, 0, 1], device='cuda:0')
2245
2396
2568
2842
3014
torch.Size([5, 400, 3, 128, 128])
tensor([0, 1, 1, 1, 0], device='cuda:0')
3155
3281
3437
3625
3876
torch.Size([5, 400, 3, 128, 128])
tensor([0, 1, 1, 0, 1], device='cuda:0')
4076
4160
4339
4569
4785
torch.Size([5, 400, 3, 128, 128])
tensor([1, 1, 1, 0, 1], device='cuda:0')
4950
5107
5402
5593
5731
torch.Size([5, 400, 3, 128, 128])
tensor([0, 1, 1, 0, 1], device='cuda:0')
5878
5991
6215
6458
6688
torch.Size([5, 400, 3, 128, 128])
tensor([0, 1, 0, 0, 0], device='cuda:0')
6815
7007
7177
7369
7480
torch.Size([5, 400, 3, 128, 128])
tensor([1, 1, 1, 0, 0], device='cuda:0')
7618
7751
7909
8014
8174
torch.Size([5, 400, 3, 128, 128])
tensor([1, 1, 1, 1, 1], device='cuda:0')
8332
84

##### Total Frames: 59139
##### Average frames per video: 169