In [3]:
import os, cv2, torch 
import pandas as pd
from sklearn.model_selection import train_test_split # to split the data for training, validation and testing
import numpy as np
from torch.utils.data import Dataset
import torchvision.transforms as T

In [22]:
import re

df = pd.read_csv("data.csv")

# Root folder where the sampled frames are
frames_root = r"C:\Users\yashr\Downloads\sampled_frames"

def convert_video_to_frame_path(video_path):
    # Extracting parent folder
    parent_folder = os.path.basename(os.path.dirname(video_path))
    # Extracting video file name without extension
    video_file = os.path.splitext(os.path.basename(video_path))[0]
    # Checking if video_file is of the format v<number> exactly
    if re.fullmatch(r'v\d+', video_file.lower()):
        # Use parent-video combination
        new_folder = f"{parent_folder}-{video_file}"
    else:
        new_folder = video_file
    # Full path to sampled frames
    new_path = os.path.join(frames_root, new_folder)
    return os.path.normpath(new_path)

df['path'] = df['path'].apply(convert_video_to_frame_path)

# Saving updated CSV
df.to_csv("updated_dataset.csv", index=False)

print(df[['video_id', 'path']].head())

                 video_id                                              path
0    tadp_accident/video1    C:\Users\yashr\Downloads\sampled_frames\video1
1   tadp_accident/video10   C:\Users\yashr\Downloads\sampled_frames\video10
2  tadp_accident/video100  C:\Users\yashr\Downloads\sampled_frames\video100
3  tadp_accident/video101  C:\Users\yashr\Downloads\sampled_frames\video101
4  tadp_accident/video102  C:\Users\yashr\Downloads\sampled_frames\video102


In [23]:
CSV = "updated_dataset.csv"
OUT_DIR = "splits"

if not os.path.exists(OUT_DIR):
    os.makedirs(OUT_DIR)

df = pd.read_csv(CSV)

# splits the dataset into training set (70%) and remaining goes into temp set 
train, temp = train_test_split(
    df, test_size=0.3, stratify=df["label"], random_state=27
)

# splits the temp set into validation and test set equally (15% each)
val, test = train_test_split(
    temp, train_size=0.5, stratify=temp["label"], random_state=27
)

train.to_csv(os.path.join(OUT_DIR, "train.csv"))
val.to_csv(os.path.join(OUT_DIR, "val.csv"))
test.to_csv(os.path.join(OUT_DIR, "test.csv"))

In [None]:
def calculate_mean_std(folder):
    """
    folder: base folder where sampled_frames/ is located
    this function is used to calculate the mean and std to normalize the image
    """
    n_pixels = 0
    mean_sum = np.zeros(3, dtype=np.float64)
    sq_sum = np.zeros(3, dtype=np.float64)
    video_folders = [os.path.join(folder,f) for f in os.listdir(folder)]
    for vf in video_folders:
        files = [os.path.join(vf, f) for f in os.listdir(vf)]
        for img_path in files:
            img = cv2.imread(img_path)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0 #Converts the image from BGR to RGB color format adn normalizes the value to be between 0 and 1
            img = img.reshape(-1, 3) # we convert it into a 2d array basically flattening the image

            mean_sum += img.sum(axis=0)
            sq_sum += (img ** 2).sum(axis=0)
            n_pixels += img.shape[0]
        print(mean_sum)
    mean = mean_sum / n_pixels
    std = np.sqrt(sq_sum / n_pixels - mean ** 2)
    return mean, std


mean, std = calculate_mean_std("sampled_frames")
print("mean:", mean, "std:", std)

[64415878.5625 64747539.0625 64213634.4375]
[1.19920095e+08 1.23459132e+08 1.24473541e+08]
[1.66625081e+08 1.71474641e+08 1.64899402e+08]
[2.17187582e+08 2.22022605e+08 2.13919474e+08]
[2.72964165e+08 2.77667982e+08 2.69406630e+08]
[3.27863932e+08 3.32875120e+08 3.25818135e+08]
[3.80567856e+08 3.87827299e+08 3.79308086e+08]
[4.41138290e+08 4.49252994e+08 4.39862346e+08]
[4.93270487e+08 5.02599764e+08 4.91752782e+08]
[5.65614796e+08 5.75864767e+08 5.63923734e+08]
[6.04631156e+08 6.15877681e+08 6.02454902e+08]
[6.54380513e+08 6.69798744e+08 6.49545861e+08]
[6.98758473e+08 7.19738008e+08 6.92428673e+08]
[7.53049480e+08 7.72401436e+08 7.39588431e+08]
[7.95786659e+08 8.14759167e+08 7.79660088e+08]
[8.51277296e+08 8.70419190e+08 8.32721677e+08]
[9.02136864e+08 9.22560154e+08 8.83854182e+08]
[9.63284545e+08 9.81090748e+08 9.42842141e+08]
[1.02591918e+09 1.04162412e+09 1.00401761e+09]
[1.07184890e+09 1.08487794e+09 1.04740219e+09]
[1.12701730e+09 1.14042268e+09 1.09930882e+09]
[1.16843320e+09 

In [7]:
import os
import cv2
import torch
import pandas as pd
from torch.utils.data import Dataset
import torchvision.transforms as T
import numpy as np

class VideoDataset(Dataset):
    def __init__(self, csv_file, root_dir="sampled_frames", seq_len=60,
                 video_col='video_id', path_col='path', label_col='label',
                 mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]):
        """
        csv_file: CSV with columns : video_id, path, label
        root_dir: folder where video frame folders are located
        seq_len: number of frames per clip (pad/subsample if needed)
        """
        self.df = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.seq_len = seq_len
        self.video_col = video_col
        self.path_col = path_col
        self.label_col = label_col

        self.transform = T.Compose([
            T.ToPILImage(),
            T.Resize((224, 224)),
            T.ToTensor(),
            T.Normalize(mean=mean, std=std)
        ])

        self.video_paths = self.df[self.path_col].tolist()
        self.labels = self.df[self.label_col].tolist()

    def __len__(self):
        return len(self.video_paths)

    def load_frames_from_video(self, video_folder):
        """Loads and returns all frames from one folder."""
        frames = []
        folder_path = os.path.join(self.root_dir, video_folder)
        folder_path = os.path.normpath(folder_path)  # normalize slashes to correct the path
        files = sorted(os.listdir(folder_path))      # ensure correct frame order by sorting it

        for f in files:
            img_path = os.path.join(folder_path, f)
            img = cv2.imread(img_path)
            if img is None:
                continue  # skip missing/broken images
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            frames.append(img)

        return frames

    def __getitem__(self, idx):
        video_folder = self.video_paths[idx]
        label = self.labels[idx]

        frames = self.load_frames_from_video(video_folder)

        # apply transform
        frames = [self.transform(frame) for frame in frames]
        frames = torch.stack(frames)  # shape: (number_of_frames, 3, H, W)

        return frames, torch.tensor(label, dtype=torch.long)

In [13]:
from torch.utils.data import DataLoader

dataset = VideoDataset(csv_file="data_final.csv", seq_len=60)
loader = DataLoader(dataset, batch_size=2, shuffle=True) # batch_size of 2 for testing

for frames, labels in loader:
    print(frames.shape, labels) # returns (batch_size, number_of_frames, color_channels. height, width)
    break

torch.Size([2, 60, 3, 224, 224]) tensor([1, 0])


In [10]:
import torch
import torch.nn as nn

class CNN_LSTM(nn.Module):
    def __init__(self, num_classes=2, hidden_dim=128, num_layers=1):
        super(CNN_LSTM, self).__init__()
        # 2D CNN to capture spacial data
        self.cnn = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1),  # input 3x224x224
            nn.ReLU(),
            nn.MaxPool2d(2),  # 16x112x112

            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),  # 32x56x56

            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),  # 64x28x28
        )

        self.feature_dim = 64 * 28 * 28  # each frame turned into a vector of 64*128*128 size
        # lstm with hidden_dim = 128, consisting of 60 feature vectors
        self.lstm = nn.LSTM(input_size=self.feature_dim,
                            hidden_size=hidden_dim,
                            num_layers=num_layers,
                            batch_first=True)
        # fully-connected layer for classification (it maps it to logits)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """
        x: [B, T, C, H, W]
        """
        B, T, C, H, W = x.size()

        # Merge batch and time for CNN
        x = x.view(B*T, C, H, W)  # [B*T, 3, H, W] combining batch and sequence dimension to treat each frame individually
        features = self.cnn(x)
        features = features.view(B, T, -1)  # flatten to [B, T, feature_dim] for lstm

        lstm_out, _ = self.lstm(features)  # [B, T, hidden_dim] # take the last hidden state only

        # Use last time step for classification
        last_out = lstm_out[:, -1, :]  # [B, hidden_dim]

        out = self.fc(last_out)  # [B, num_classes]
        return out


In [14]:
model = CNN_LSTM(num_classes=2)
x = torch.randn(2, 60, 3, 224, 224)  # batch_size=2, seq_len=60
out = model(x)
print(out.shape)  # (batch_size, number_of_classes) basically a list of logits

torch.Size([2, 2])


In [5]:
print(torch.cuda.is_available())
print(torch.version.cuda)

True
11.8


In [24]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader


# hyperparameters
batch_size = 2
seq_len = 60
num_epochs = 10
learning_rate = 1e-4
hidden_dim = 128
num_classes = 2

# using nvidia gpu for faster training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# creating the training and validation dataset
train_dataset = VideoDataset(csv_file="splits/train.csv", seq_len=seq_len)
val_dataset = VideoDataset(csv_file="splits/val.csv", seq_len=seq_len)
# creating the loaders for training and validation dataset
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# defining the model
model = CNN_LSTM(num_classes=num_classes, hidden_dim=hidden_dim).to(device)

# using adam optimizer for smoother and faster training
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# here we use cross-entropy loss as it is a binary classification
criterion = nn.CrossEntropyLoss()

# training
for epoch in range(1, num_epochs + 1):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for frames, labels in train_loader:
        frames = frames.to(device)
        labels = labels.to(device)
        #using the optimizer
        optimizer.zero_grad()
        outputs = model(frames)
        
        loss = criterion(outputs, labels)
        #backpropagation
        loss.backward()

        # gradient clipping to prevent gradient explosion
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)

        optimizer.step()

        running_loss += loss.item() * frames.size(0)
        # taking the label with greater logit
        _, predicted = torch.max(outputs, 1)
        # checking against the label
        correct += (predicted == labels).sum().item()
        total += labels.size(0)
    # averaging the loss and accuracy over the training loop
    train_loss = running_loss / total
    train_acc = correct / total

    # validation
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for frames, labels in val_loader:
            frames = frames.to(device)
            labels = labels.to(device)

            outputs = model(frames)
            loss = criterion(outputs, labels)

            val_loss += loss.item() * frames.size(0)
            _, predicted = torch.max(outputs, 1)
            val_correct += (predicted == labels).sum().item()
            val_total += labels.size(0)

    val_loss /= val_total
    val_acc = val_correct / val_total

    print(f"Epoch [{epoch}/{num_epochs}] "
          f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | "
          f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")


Using device: cuda
Epoch [1/10] Train Loss: 0.6390 | Train Acc: 0.6667 | Val Loss: 0.5801 | Val Acc: 0.7246
Epoch [2/10] Train Loss: 0.5741 | Train Acc: 0.7068 | Val Loss: 0.5477 | Val Acc: 0.7101
Epoch [3/10] Train Loss: 0.5204 | Train Acc: 0.7377 | Val Loss: 0.5384 | Val Acc: 0.7536
Epoch [4/10] Train Loss: 0.4865 | Train Acc: 0.7685 | Val Loss: 0.5522 | Val Acc: 0.7536
Epoch [5/10] Train Loss: 0.4242 | Train Acc: 0.8025 | Val Loss: 0.5998 | Val Acc: 0.7391
Epoch [6/10] Train Loss: 0.3662 | Train Acc: 0.8457 | Val Loss: 0.6235 | Val Acc: 0.7681
Epoch [7/10] Train Loss: 0.3754 | Train Acc: 0.8241 | Val Loss: 0.5224 | Val Acc: 0.7101
Epoch [8/10] Train Loss: 0.3033 | Train Acc: 0.8704 | Val Loss: 0.6540 | Val Acc: 0.7391
Epoch [9/10] Train Loss: 0.2371 | Train Acc: 0.8858 | Val Loss: 0.6603 | Val Acc: 0.7246
Epoch [10/10] Train Loss: 0.1958 | Train Acc: 0.9228 | Val Loss: 0.6459 | Val Acc: 0.7536


In [25]:
model = CNN_LSTM(num_classes=2)

torch.save(model.state_dict(), "cnn_lstm.pth")