In [None]:
from google.colab import drive
drive.mount('/content/drive')

!unzip -q /content/drive/MyDrive/archive.zip -d /content/



Mounted at /content/drive


In [2]:
import os
import cv2

input_root = '/content/Real Life Violence Dataset'
output_root = '/content/Images'

label_map = {
    'Violence': 'fighting',
    'NonViolence': 'non-fighting'
}

frame_rate = 1  # Extract 1 frame per second

for original_label, new_label in label_map.items():
    input_folder = os.path.join(input_root, original_label)
    output_folder = os.path.join(output_root, new_label)
    os.makedirs(output_folder, exist_ok=True)

    for video_name in os.listdir(input_folder):
        if not video_name.lower().endswith('.mp4'):
            continue

        video_path = os.path.join(input_folder, video_name)
        cap = cv2.VideoCapture(video_path)

        fps = int(cap.get(cv2.CAP_PROP_FPS))
        count = 0
        frame_id = 0

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if count % max(1, (fps // frame_rate)) == 0:
                frame_filename = f"{video_name[:-4]}_frame{frame_id}.jpg"
                frame_path = os.path.join(output_folder, frame_filename)
                cv2.imwrite(frame_path, frame)
                frame_id += 1
            count += 1
        cap.release()

print("Frames extracted.")


Frames extracted.


In [26]:
# ─── Split extracted frames into train/val/test ───────────────────────────────
import os
import random
import shutil
from tqdm import tqdm

# reproducible splits
random.seed(42)

input_dir   = '/content/Images'
output_base = '/content/dataset'
splits      = ['train', 'val', 'test']
ratios      = [0.7, 0.15, 0.15]

# create output folder structure
for split in splits:
    for label in os.listdir(input_dir):
        os.makedirs(os.path.join(output_base, split, label), exist_ok=True)

# perform the actual split
for label in os.listdir(input_dir):
    all_imgs = os.listdir(os.path.join(input_dir, label))
    random.shuffle(all_imgs)

    n_total = len(all_imgs)
    n_train = int(ratios[0] * n_total)
    n_val   = int(ratios[1] * n_total)

    split_imgs = {
        'train': all_imgs[:n_train],
        'val'  : all_imgs[n_train:n_train + n_val],
        'test' : all_imgs[n_train + n_val:]
    }

    for split, imgs in split_imgs.items():
        for img in tqdm(imgs, desc=f"Copying {label} → {split}"):
            src = os.path.join(input_dir, label, img)
            dst = os.path.join(output_base, split, label, img)
            shutil.copy2(src, dst)

print("✅ Frame dataset split into train/val/test at", output_base)


Copying fighting → train: 100%|██████████| 4082/4082 [00:04<00:00, 947.95it/s]
Copying fighting → val: 100%|██████████| 874/874 [00:00<00:00, 4343.57it/s]
Copying fighting → test: 100%|██████████| 876/876 [00:00<00:00, 4396.39it/s]
Copying non-fighting → train: 100%|██████████| 3489/3489 [00:00<00:00, 3768.38it/s]
Copying non-fighting → val: 100%|██████████| 747/747 [00:00<00:00, 6080.90it/s]
Copying non-fighting → test: 100%|██████████| 749/749 [00:00<00:00, 4825.60it/s]

✅ Frame dataset split into train/val/test at /content/dataset





In [28]:
import torch
import torch.nn as nn
from pathlib import Path
from torch.utils.data import Dataset, DataLoader


In [29]:

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)


cuda:0


In [30]:
from torchvision import transforms

In [31]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])


In [32]:
from PIL import Image
import os


In [33]:
class ImageDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.image_paths = []
        self.labels = []
        self.classes = sorted(os.listdir(root_dir))

        for idx, class_name in enumerate(self.classes):
            class_folder = os.path.join(root_dir, class_name)
            for img_file in os.listdir(class_folder):
                self.image_paths.append(os.path.join(class_folder, img_file))
                self.labels.append(idx)

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        label = self.labels[idx]
        image = Image.open(image_path).convert("RGB")
        if self.transform:
            image = self.transform(image)
        return image, label


In [34]:
train_dataset = ImageDataset("/content/dataset/train", transform=transform)
val_dataset   = ImageDataset("/content/dataset/val",   transform=transform)
test_dataset  = ImageDataset("/content/dataset/test",  transform=transform)


In [35]:
len(dataset)


10817

In [36]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader   = DataLoader(val_dataset,   batch_size=32, shuffle=False)
test_loader  = DataLoader(test_dataset,  batch_size=32, shuffle=False)


In [37]:
images, labels = next(iter(dataloader))


In [38]:
images.shape, labels.shape


(torch.Size([32, 3, 224, 224]), torch.Size([32]))

In [39]:
class MLP(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, dropout=0.1):
        super().__init__()
        hidden_features = hidden_features or in_features
        out_features = out_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.dropout(x)
        return x


In [40]:
class AttentionBlock(nn.Module):
    def __init__(self, dim, heads=8, dropout=0.1):
        super().__init__()
        self.heads = heads
        self.scale = dim ** -0.5

        self.qkv = nn.Linear(dim, dim * 3)
        self.attn_drop = nn.Dropout(dropout)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(dropout)

    def forward(self, x):
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.heads, C // self.heads)
        q, k, v = qkv.permute(2, 0, 3, 1, 4)

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


In [41]:
class TransformerBlock(nn.Module):
    def __init__(self, dim, heads, mlp_dim, dropout=0.1):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim)
        self.attn = AttentionBlock(dim, heads, dropout)
        self.norm2 = nn.LayerNorm(dim)
        self.mlp = MLP(dim, mlp_dim, dropout=dropout)

    def forward(self, x):
        x = x + self.attn(self.norm1(x))
        x = x + self.mlp(self.norm2(x))
        return x


In [42]:
!pip install torchinfo



In [43]:
# Instantiate ViT and move to the selected device
model = ViT(
    image_size=224,
    patch_size=16,
    num_classes=2,
    dim=512,
    depth=6,
    heads=8,
    mlp_dim=1024,
    dropout=0.1
).to(device)

# Display a model summary (torchinfo)
from torchinfo import summary

summary(
    model,
    input_size=(1, 3, 224, 224),
    device=device  # explicitly named to avoid empty() kwargs error
)


Layer (type:depth-idx)                   Output Shape              Param #
ViT                                      [1, 2]                    101,376
├─Linear: 1-1                            [1, 196, 512]             393,728
├─Dropout: 1-2                           [1, 197, 512]             --
├─Sequential: 1-3                        [1, 197, 512]             --
│    └─TransformerBlock: 2-1             [1, 197, 512]             --
│    │    └─LayerNorm: 3-1               [1, 197, 512]             1,024
│    │    └─AttentionBlock: 3-2          [1, 197, 512]             1,050,624
│    │    └─LayerNorm: 3-3               [1, 197, 512]             1,024
│    │    └─MLP: 3-4                     [1, 197, 512]             1,050,112
│    └─TransformerBlock: 2-2             [1, 197, 512]             --
│    │    └─LayerNorm: 3-5               [1, 197, 512]             1,024
│    │    └─AttentionBlock: 3-6          [1, 197, 512]             1,050,624
│    │    └─LayerNorm: 3-7               [1, 

In [44]:
model = ViT().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)


In [45]:
num_epochs = 10

for epoch in range(num_epochs):
    running_loss = 0.0
    correct = 0
    total = 0
    model.train()

    for images, labels in dataloader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss:.4f}, Accuracy: {100 * correct / total:.2f}%")


Epoch 1/10, Loss: 222.4980, Accuracy: 61.56%
Epoch 2/10, Loss: 177.3808, Accuracy: 73.00%
Epoch 3/10, Loss: 146.7617, Accuracy: 79.35%
Epoch 4/10, Loss: 117.3953, Accuracy: 84.33%
Epoch 5/10, Loss: 103.6667, Accuracy: 86.93%
Epoch 6/10, Loss: 112.1306, Accuracy: 84.96%
Epoch 7/10, Loss: 82.1567, Accuracy: 89.77%
Epoch 8/10, Loss: 73.4487, Accuracy: 90.77%
Epoch 9/10, Loss: 71.7145, Accuracy: 91.14%
Epoch 10/10, Loss: 68.1225, Accuracy: 91.80%


In [46]:
# Save model with state_dict and class mapping
torch.save({
    "state_dict": model.state_dict(),
    "idx_to_class": {0: "non-fighting", 1: "fighting"}
}, "vit_model4.pth")



In [54]:
# ─── Evaluate on Test Set ───────────────────────────────────────────────────────
model.eval()
correct = 0
total   = 0

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, preds = torch.max(outputs, 1)
        correct += (preds == labels).sum().item()
        total   += labels.size(0)

test_acc = 100. * correct / total
print(f"Test Accuracy: {test_acc:.2f}%")


Test Accuracy: 93.97%


Predicted class: 0, Actual class: 0


NameError: name 'test_loader' is not defined

In [50]:
!cp /content/dataset /content/drive/MyDrive/AI_Project_Dataset


cp: -r not specified; omitting directory '/content/dataset'


In [52]:
!cp -r /content/dataset /content/drive/MyDrive/AI_Project_Dataset


In [53]:
# ─── Evaluate on Test Set ───────────────────────────────────────────────────────
model.eval()
correct = 0
total   = 0

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, preds = torch.max(outputs, 1)
        correct += (preds == labels).sum().item()
        total   += labels.size(0)

test_acc = 100. * correct / total
print(f"Test Accuracy: {test_acc:.2f}%")


Test Accuracy: 93.97%
