In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd 
import os

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import autocast
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torchvision.transforms import RandAugment

import random
import PIL
import os
import albumentations as A
from PIL import Image
import timm

In [59]:
from torchvision import datasets

# Load dataset with transform
dataset = datasets.ImageFolder(root='/content/drive/MyDrive/frames', transform=transform)

# Split dataset into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Create DataLoaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=4)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=4, shuffle=False, num_workers=4)


In [None]:
class CNNViT(nn.Module):
    def __init__(self, num_classes=7):
        super(CNNViT, self).__init__()

        self.cnn = timm.create_model('resnet34', pretrained=True, features_only=True)

        self.fusion = nn.Sequential(
            nn.Conv2d(512, 256, 3, padding=1),
            nn.BatchNorm2d(256),
            nn.RELU(),
            nn.Dropout(0.5),
            nn.Conv2d(256, 128, 3, padding=1),
            nn.BatchNorm2d(256),
            nn.RELU(),
            nn.Dropout(0.5),
            nn.Conv2d(128, 3, 3, padding=1)
        )

        self.vit = timm.create_model('vit_base_patch16_224', pretrained=True)

        # Final Fusion Classifier
        self.classifier = nn.Sequential(
            nn.Linear(512 + 768, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512,128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        # CNN feature extraction
        cnn_features = self.cnn(x)[-1]  # Last layer output [B, 512, 7, 7]

        # Global average pooling (turn into [B, 512])
        cnn_global_features = F.adaptive_avg_pool2d(cnn_features, (1, 1)).view(x.size(0), -1)

        # CNN→RGB projection (for ViT input)
        projected = self.fusion(cnn_features)

        # Upsample to 224x224 for ViT input
        upsampled = F.interpolate(projected, size=(224, 224), mode='bilinear', align_corners=False)

        vit_features = self.vit.forward_features(upsampled)
        vit_cls_token = vit_features[:, 0]  # CLS token only [B, 768]

        combined_features = torch.cat([cnn_global_features, vit_cls_token], dim=1)
        logits = self.classifier(combined_features)
        return logits

In [None]:
class LabelSmoothingLoss(nn.Module):
    def __init__(self, classes, smoothing=0.1, dim=-1):
        super(LabelSmoothingLoss, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        pred = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            true_dist = torch.zeros_like(pred)
            true_dist.fill_(self.smoothing / (self.cls - 1))
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)   
        return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))

In [None]:
model = CNNViT(num_classes=2)

criterion = LabelSmoothingLoss(classes=2, smoothing=0.1)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)

model = nn.DataParallel(model) #imp
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

DataParallel(
  (module): CNNViT(
    (cnn): FeatureListNet(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act1): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): BasicBlock(
          (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (drop_block): Identity()
          (act1): ReLU(inplace=True)
          (aa): Identity()
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act2): ReLU(inplace=True)
        )
        (1): BasicBlock(
          (conv1)

In [77]:
def save_checkpoint(model, optimizer, epoch, path):
    os.makedirs(os.path.dirname(path), exist_ok=True)

    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }
    torch.save(checkpoint, path)
    print(f"Checkpoint saved to {path}")

In [78]:
class EarlyStopping:
    def __init__(self, patience=3, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func
    def __call__(self, val_loss, model, optimizer):

        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model, optimizer)
        elif score < self.best_score + self.delta:
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model, optimizer)
            self.counter = 0

    def save_checkpoint(self, val_loss, model, optimizer):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        if val_loss < self.val_loss_min:
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict()
                }, self.path)
            self.val_loss_min = val_loss

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
scaler = torch.cuda.amp.GradScaler()

num_epochs = 25
train_losses = []
train_accuracies = []
val_losses = []
val_accuracies = []
true_labels = []
predicted_labels = []
early_stopping = EarlyStopping(patience=3, verbose=True, path='/content/drive/MyDrive/checkpoints/checkpoint_early_stopping.pt')


for epoch in range(num_epochs):
    # === Training ===
    model.train()
    train_loss, correct, total = 0, 0, 0

    for images, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training"):
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        with autocast():
            output = model(images)
            loss = criterion(output, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        train_loss += loss.item() * images.size(0)
        _, predicted = output.max(1)
        total += labels.size(0)

        if 'labels_a' in locals():
            correct += (lam * predicted.eq(labels_a).sum().item() + (1 - lam) * predicted.eq(labels_b).sum().item())
        else:
            correct += predicted.eq(labels).sum().item()

    train_acc = 100. * correct / total
    train_loss /= total

    # === Validation ===
    model.eval()
    val_loss, correct, total = 0, 0, 0

    with torch.no_grad():
        for images, labels in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Validation"):
            images, labels = images.to(device), labels.to(device)
            with autocast():
                output = model(images)
                loss = criterion(output, labels)

            val_loss += loss.item() * images.size(0)
            _, predicted = output.max(1)
            true_labels.extend(labels.cpu().numpy())
            predicted_labels.extend(predicted.cpu().numpy())
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

    val_acc = 100. * correct / total
    val_loss /= total

    train_losses.append(train_loss)
    train_accuracies.append(train_acc)
    val_losses.append(val_loss)
    val_accuracies.append(val_acc)

    early_stopping(val_loss, model, optimizer)
    if early_stopping.early_stop:
        print("Early stopping")
        break

    if (epoch + 1) % 5 == 0:
        checkpoint_path = os.path.join('/content/drive/MyDrive/checkpoints', f'epoch_{epoch + 1}.pth')
        save_checkpoint(model, optimizer, epoch + 1, checkpoint_path)

    print(f"Epoch {epoch+1}/{num_epochs}: Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%")
    print(f"                    Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%\n")

  scaler = torch.cuda.amp.GradScaler()


Epoch 1/25 - Training:   0%|          | 0/162 [00:00<?, ?it/s]

  with autocast():


Epoch 1/25 - Validation:   0%|          | 0/41 [00:00<?, ?it/s]

  with autocast():


Validation loss decreased (inf --> 0.621506).  Saving model ...
Epoch 1/25: Train Loss: 0.6627, Train Acc: 62.41%
                    Val Loss: 0.6215, Val Acc: 69.84%



Epoch 2/25 - Training:   0%|          | 0/162 [00:00<?, ?it/s]

Epoch 2/25 - Validation:   0%|          | 0/41 [00:00<?, ?it/s]

Validation loss decreased (0.621506 --> 0.613814).  Saving model ...
Epoch 2/25: Train Loss: 0.6203, Train Acc: 69.74%
                    Val Loss: 0.6138, Val Acc: 68.83%



Epoch 3/25 - Training:   0%|          | 0/162 [00:00<?, ?it/s]

Epoch 3/25 - Validation:   0%|          | 0/41 [00:00<?, ?it/s]

Validation loss decreased (0.613814 --> 0.604803).  Saving model ...
Epoch 3/25: Train Loss: 0.6054, Train Acc: 71.81%
                    Val Loss: 0.6048, Val Acc: 70.46%



Epoch 4/25 - Training:   0%|          | 0/162 [00:00<?, ?it/s]

Epoch 4/25 - Validation:   0%|          | 0/41 [00:00<?, ?it/s]

Validation loss decreased (0.604803 --> 0.594189).  Saving model ...
Epoch 4/25: Train Loss: 0.5894, Train Acc: 73.98%
                    Val Loss: 0.5942, Val Acc: 72.16%



Epoch 5/25 - Training:   0%|          | 0/162 [00:00<?, ?it/s]

Epoch 5/25 - Validation:   0%|          | 0/41 [00:00<?, ?it/s]

Validation loss decreased (0.594189 --> 0.579582).  Saving model ...
Checkpoint saved to /content/drive/MyDrive/checkpoints/epoch_5.pth
Epoch 5/25: Train Loss: 0.5911, Train Acc: 73.15%
                    Val Loss: 0.5796, Val Acc: 73.32%



Epoch 6/25 - Training:   0%|          | 0/162 [00:00<?, ?it/s]

Epoch 6/25 - Validation:   0%|          | 0/41 [00:00<?, ?it/s]

EarlyStopping counter: 1 out of 3
Epoch 6/25: Train Loss: 0.5844, Train Acc: 73.77%
                    Val Loss: 0.5851, Val Acc: 72.16%



Epoch 7/25 - Training:   0%|          | 0/162 [00:00<?, ?it/s]

Epoch 7/25 - Validation:   0%|          | 0/41 [00:00<?, ?it/s]

EarlyStopping counter: 2 out of 3
Epoch 7/25: Train Loss: 0.5792, Train Acc: 74.66%
                    Val Loss: 0.5852, Val Acc: 72.93%



Epoch 8/25 - Training:   0%|          | 0/162 [00:00<?, ?it/s]