<a href="https://colab.research.google.com/github/sharon-kurant/VCE_Remission_Classification/blob/main/custom_train_VCE_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive; the dataset paths used below live under /content/drive.
from google.colab import drive

drive.mount(
    '/content/drive',
    force_remount=True,  # remount even if the drive is already mounted
)

Mounted at /content/drive


In [None]:
# Uncomment the next line if any of these packages are missing from the runtime.
# ! pip install -q accelerate transformers pytorchvideo evaluate

In [None]:
import os
import cv2
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import glob
import numpy as np
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, accuracy_score
from tqdm import tqdm

In [None]:
class VideoDataset(Dataset):
    """Dataset of fixed-length frame clips sampled from labelled ``.mp4`` videos.

    Videos are discovered with the glob ``{tagged_dir}/**/**/*.mp4``. The glob is
    not recursive, so ``**`` behaves like ``*`` and only files exactly two
    directory levels below ``tagged_dir`` are matched. The name of each video's
    parent directory is its class label.

    The list of videos is repeated ``multiplier`` times, so one pass over the
    dataset draws ``multiplier`` random clips from every video. Repeat ``r`` of
    video ``i`` is stored at index ``i + r * n_videos``.

    Args:
        video_dir: Directory joined in front of every discovered path with
            ``os.path.join``. When the discovered path is absolute (as with the
            default ``tagged_dir``) ``os.path.join`` returns it unchanged, so
            ``video_dir`` has no effect in that case.
        num_frames: Number of consecutive frames in each returned clip.
        tagged_dir: Root directory searched for labelled videos.
        transform: Optional callable applied to every RGB frame (a numpy array
            as produced by OpenCV). It must return a tensor so the frames can be
            stacked. When ``None`` the raw frame is converted with
            ``torch.from_numpy``.
        multiplier: Number of times each video appears in the dataset.

    Attributes:
        video_files: Sorted video paths, repeated ``multiplier`` times.
        labels: Class-label string for every entry of ``video_files``.
        label2id / id2label: Mappings between label strings and integer ids,
            with ids assigned in sorted label order.
    """

    def __init__(self, video_dir, num_frames=64, tagged_dir = '/content/drive/MyDrive/remission/data', transform=None, multiplier = 5):
        self.video_dir = video_dir
        self.num_frames = num_frames
        self.transform = transform
        self.multiplier = multiplier

        # glob returns files in arbitrary order; sorting keeps sample indices
        # (and any index-based train/validation split) reproducible.
        unique_videos = sorted(glob.glob(f'{tagged_dir}/**/**/*.mp4'))
        self.video_files = unique_videos * self.multiplier

        unique_labels = [self._label_of(path) for path in unique_videos]
        class_labels = sorted(set(unique_labels))
        self.label2id = {label: i for i, label in enumerate(class_labels)}
        self.id2label = {i: label for label, i in self.label2id.items()}

        # Repeat exactly like video_files so both lists have the same length
        # and stay index-aligned.
        self.labels = unique_labels * self.multiplier

    @staticmethod
    def _label_of(path):
        """Return the name of the directory that directly contains ``path``."""
        return os.path.basename(os.path.dirname(str(path)))

    def __len__(self):
        """Return the number of samples (videos times ``multiplier``)."""
        return len(self.video_files)

    def __getitem__(self, idx):
        """Return ``(frames, label_id)`` for sample ``idx``.

        A random window of ``num_frames`` consecutive frames is read from the
        video, converted from BGR to RGB, passed through ``transform`` and
        stacked along a new leading dimension.

        Raises:
            RuntimeError: If the video cannot be opened, has fewer than
                ``num_frames`` frames, or a frame cannot be decoded.
        """
        video_file = self.video_files[idx]
        # os.path.join returns video_file unchanged when it is an absolute path.
        video_path = os.path.join(self.video_dir, video_file)

        cap = cv2.VideoCapture(video_path)
        try:
            if not cap.isOpened():
                raise RuntimeError(f"Failed to open video {video_file}")

            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            if total_frames < self.num_frames:
                raise RuntimeError(
                    f"Video {video_file} has {total_frames} frames, "
                    f"fewer than the requested {self.num_frames}"
                )

            # Pick a random window of num_frames consecutive frames.
            start_frame = torch.randint(0, total_frames - self.num_frames + 1, (1,)).item()

            # Seek once to the start of the window; each read() then advances
            # to the next frame.
            cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

            frames = []
            for offset in range(self.num_frames):
                ret, frame = cap.read()
                if not ret:
                    raise RuntimeError(
                        f"Failed to read frame {start_frame + offset} from video {video_file}"
                    )

                # OpenCV decodes to BGR; convert to RGB.
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

                if self.transform is not None:
                    frame = self.transform(frame)
                else:
                    # torch.stack needs tensors, not numpy arrays.
                    frame = torch.from_numpy(frame)
                frames.append(frame)
        finally:
            # Release the capture even when opening or decoding failed.
            cap.release()

        frames = torch.stack(frames)

        return frames, self.label2id[self.labels[idx]]


In [None]:
# Per-frame preprocessing pipeline.
# NOTE(review): VideoDataset.__getitem__ calls this once per frame, so the
# random crop and flip are sampled independently for every frame of a clip.
transform = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.CenterCrop((510, 510)),
        # Alternative that was tried instead of the random crop:
        # transforms.Resize((224, 224)),
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Dataset / DataLoader settings shared by the cells below.
# NOTE(review): VideoDataset globs absolute paths from its `tagged_dir`
# default, and os.path.join drops `video_directory` for absolute paths.
video_directory = '/content/drive/MyDrive/remission/data_short_5fps'
batch_size = 2  # Adjust as needed
multiplier = 5  # clips drawn per video
# num_classes = 2  # Adjust based on your classification task

dataset = VideoDataset(
    video_directory,
    num_frames=64,
    transform=transform,
    multiplier=multiplier,
)

In [None]:
from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification
from transformers import AutoImageProcessor, TimesformerForVideoClassification
# Hugging Face checkpoint used as the starting point for fine-tuning.
model_ckpt = "fcakyon/timesformer-large-finetuned-ssv2"
image_processor = AutoImageProcessor.from_pretrained(model_ckpt)

# The classification head is sized from the dataset's label mappings.
# ignore_mismatched_sizes lets loading proceed although the checkpoint's
# classifier has a different shape (needed when fine-tuning an already
# fine-tuned checkpoint).
model_load_kwargs = dict(
    label2id=dataset.label2id,
    id2label=dataset.id2label,
    output_hidden_states=True,
    ignore_mismatched_sizes=True,
)
model = TimesformerForVideoClassification.from_pretrained(model_ckpt, **model_load_kwargs)

Some weights of TimesformerForVideoClassification were not initialized from the model checkpoint at fcakyon/timesformer-large-finetuned-ssv2 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([174, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([174]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Load the image processor and the model again from `model_ckpt`; the
# classifier head is newly initialised on every load because its shape
# differs from the checkpoint's.
image_processor = AutoImageProcessor.from_pretrained(model_ckpt)

model_load_kwargs = dict(
    label2id=dataset.label2id,
    id2label=dataset.id2label,
    output_hidden_states=True,
    # Needed when fine-tuning an already fine-tuned checkpoint.
    ignore_mismatched_sizes=True,
)
model = TimesformerForVideoClassification.from_pretrained(model_ckpt, **model_load_kwargs)

Some weights of TimesformerForVideoClassification were not initialized from the model checkpoint at fcakyon/timesformer-large-finetuned-ssv2 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([174, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([174]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# 5-fold cross-validation, splitting by video so that all `multiplier` clips
# of a video land on the same side of the split.
num_epochs = 1
learning_rate = 5e-4

criterion = nn.CrossEntropyLoss()

# Number of distinct videos; the dataset holds each one `multiplier` times.
dataset_videos_amount = len(dataset) // multiplier
# Set the number of folds
num_folds = 5

kf = KFold(n_splits=num_folds, shuffle=True, random_state=123)

# Snapshot of the weights the model has when this cell starts. Every fold is
# restored from it so no fold validates on videos the model has already been
# trained on. Reload the model before re-running this cell.
initial_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
model.cuda()

for fold, (train_indices, val_indices) in enumerate(kf.split(range(dataset_videos_amount))):
    print(f"Fold {fold + 1}")

    # Fresh weights and fresh optimizer state for every fold.
    model.load_state_dict(initial_state)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Repeat r of video i sits at dataset index i + r * dataset_videos_amount.
    # The inner loop runs over r, so the clips of one video are contiguous.
    train_examples = np.array(
        [video + r * dataset_videos_amount for video in train_indices for r in range(multiplier)]
    )
    val_examples = np.array(
        [video + r * dataset_videos_amount for video in val_indices for r in range(multiplier)]
    )
    train_set = torch.utils.data.Subset(dataset, train_examples)
    val_set = torch.utils.data.Subset(dataset, val_examples)

    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    # shuffle=False keeps the clips of each video contiguous for the reshape below.
    val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)

    for epoch in range(num_epochs):
        # ---- training ----
        model.train()
        running_loss = 0.0

        for frames, labels in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs} - Training'):
            frames, labels = frames.cuda(), labels.cuda()

            optimizer.zero_grad()
            outputs = model(frames)
            loss = criterion(outputs.logits, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch {epoch + 1}/{num_epochs}, Training loss: {running_loss / max(len(train_loader), 1):.4f}")

        # ---- validation ----
        model.eval()
        all_labels = []
        all_probs = []

        with torch.no_grad():
            for frames, labels in tqdm(val_loader, desc=f'Epoch {epoch + 1}/{num_epochs} - Validation'):
                frames, labels = frames.cuda(), labels.cuda()

                outputs = model(frames)

                # Probability assigned to class id 1.
                probs = torch.nn.functional.softmax(outputs.logits, dim=1)[:, 1].cpu().numpy()

                all_labels.extend(labels.cpu().numpy())
                all_probs.extend(probs)

        # Average the clip probabilities per video: every row of the reshaped
        # array holds the `multiplier` clips of one video.
        video_labels = all_labels[::multiplier]
        video_probs = np.array(all_probs).reshape(-1, multiplier).mean(axis=1)

        auc = roc_auc_score(video_labels, video_probs)
        accuracy = accuracy_score(video_labels, (video_probs > 0.5).astype(int))
        print(f"Epoch {epoch + 1}/{num_epochs}, Validation AUC: {auc:.4f}, Accuracy: {accuracy:.4f}")


Fold 1


Epoch 1/1 - Training: 100%|██████████| 120/120 [05:21<00:00,  2.68s/it]
Epoch 1/1 - Validation: 100%|██████████| 33/33 [00:53<00:00,  1.61s/it]


Epoch 1/1, Validation AUC: 0.4167, Accuracy: 0.6923
Fold 2


Epoch 1/1 - Training: 100%|██████████| 123/123 [05:23<00:00,  2.63s/it]
Epoch 1/1 - Validation: 100%|██████████| 30/30 [00:44<00:00,  1.50s/it]


Epoch 1/1, Validation AUC: 0.7407, Accuracy: 0.7500
Fold 3


Epoch 1/1 - Training: 100%|██████████| 123/123 [05:16<00:00,  2.58s/it]
Epoch 1/1 - Validation: 100%|██████████| 30/30 [00:49<00:00,  1.64s/it]


Epoch 1/1, Validation AUC: 0.9630, Accuracy: 0.7500
Fold 4


Epoch 1/1 - Training: 100%|██████████| 123/123 [05:09<00:00,  2.52s/it]
Epoch 1/1 - Validation: 100%|██████████| 30/30 [00:50<00:00,  1.67s/it]


Epoch 1/1, Validation AUC: 0.6250, Accuracy: 0.6667
Fold 5


Epoch 1/1 - Training: 100%|██████████| 123/123 [04:58<00:00,  2.43s/it]
Epoch 1/1 - Validation: 100%|██████████| 30/30 [00:48<00:00,  1.62s/it]

Epoch 1/1, Validation AUC: 0.6296, Accuracy: 0.7500





In [None]:
# Reload the image processor and the model from `model_ckpt`, replacing the
# `model` object used by the previous training run.
image_processor = AutoImageProcessor.from_pretrained(model_ckpt)

model_load_kwargs = dict(
    label2id=dataset.label2id,
    id2label=dataset.id2label,
    output_hidden_states=True,
    # Needed when fine-tuning an already fine-tuned checkpoint.
    ignore_mismatched_sizes=True,
)
model = TimesformerForVideoClassification.from_pretrained(model_ckpt, **model_load_kwargs)

In [None]:
# 5-fold cross-validation, splitting by video so that all `multiplier` clips
# of a video land on the same side of the split.
num_epochs = 2
# NOTE(review): the saved output of the cells after this one shows nearly
# identical probabilities for every validation video; this learning rate may
# be too high for fine-tuning the whole model - confirm.
learning_rate = 5e-4

criterion = nn.CrossEntropyLoss()

# Number of distinct videos; the dataset holds each one `multiplier` times.
dataset_videos_amount = len(dataset) // multiplier
# Set the number of folds
num_folds = 5

kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Snapshot of the weights the model has when this cell starts. Every fold is
# restored from it so no fold validates on videos the model has already been
# trained on. Reload the model before re-running this cell.
initial_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
model.cuda()

for fold, (train_indices, val_indices) in enumerate(kf.split(range(dataset_videos_amount))):
    print(f"Fold {fold + 1}")

    # Fresh weights and fresh optimizer state for every fold.
    model.load_state_dict(initial_state)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Repeat r of video i sits at dataset index i + r * dataset_videos_amount.
    # The inner loop runs over r, so the clips of one video are contiguous.
    train_examples = np.array(
        [video + r * dataset_videos_amount for video in train_indices for r in range(multiplier)]
    )
    val_examples = np.array(
        [video + r * dataset_videos_amount for video in val_indices for r in range(multiplier)]
    )
    train_set = torch.utils.data.Subset(dataset, train_examples)
    val_set = torch.utils.data.Subset(dataset, val_examples)

    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    # shuffle=False keeps the clips of each video contiguous for the reshape below.
    val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)

    for epoch in range(num_epochs):
        # ---- training ----
        model.train()
        running_loss = 0.0

        for frames, labels in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs} - Training'):
            frames, labels = frames.cuda(), labels.cuda()

            optimizer.zero_grad()
            outputs = model(frames)
            loss = criterion(outputs.logits, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch {epoch + 1}/{num_epochs}, Training loss: {running_loss / max(len(train_loader), 1):.4f}")

        # ---- validation ----
        model.eval()
        all_labels = []
        all_probs = []

        with torch.no_grad():
            for frames, labels in tqdm(val_loader, desc=f'Epoch {epoch + 1}/{num_epochs} - Validation'):
                frames, labels = frames.cuda(), labels.cuda()

                outputs = model(frames)

                # Probability assigned to class id 1.
                probs = torch.nn.functional.softmax(outputs.logits, dim=1)[:, 1].cpu().numpy()

                all_labels.extend(labels.cpu().numpy())
                all_probs.extend(probs)

        # Average the clip probabilities per video: every row of the reshaped
        # array holds the `multiplier` clips of one video.
        video_labels = all_labels[::multiplier]
        video_probs = np.array(all_probs).reshape(-1, multiplier).mean(axis=1)

        auc = roc_auc_score(video_labels, video_probs)
        accuracy = accuracy_score(video_labels, (video_probs > 0.5).astype(int))
        print(f"Epoch {epoch + 1}/{num_epochs}, Validation AUC: {auc:.4f}, Accuracy: {accuracy:.4f}")


In [None]:
# Video-level AUC for the most recent validation pass: the clip probabilities
# are averaged in groups of `multiplier`, one group per video.
roc_auc_score(
    y_true=all_labels[::multiplier],
    y_score=np.mean(np.array(all_probs).reshape(-1, multiplier), axis=1),
)

0.17500000000000002

In [None]:
# Video-level accuracy for the most recent validation pass, thresholding the
# averaged clip probability at 0.5.
accuracy_score(
    y_true=all_labels[::multiplier],
    y_pred=(np.mean(np.array(all_probs).reshape(-1, multiplier), axis=1) > 0.5).astype(int),
)

0.6153846153846154

In [None]:
# Mean class-1 probability per video (one row of `multiplier` clips each).
np.mean(np.array(all_probs).reshape(-1, multiplier), axis=1)

array([0.30135745, 0.30163318, 0.30173445, 0.301951  , 0.30118406,
       0.3018083 , 0.30135703, 0.3012586 , 0.3012393 , 0.30138716,
       0.3010363 , 0.30114752, 0.30138916], dtype=float32)

In [None]:
# Probe: reshape(-1, multiplier) only works when the number of elements is a
# multiple of `multiplier`. The error is caught and shown so this cell does
# not stop a Restart & Run All.
try:
    reshape_probe = np.ones(26).reshape(-1, multiplier)
except ValueError as err:
    reshape_probe = f"ValueError: {err}"
reshape_probe

ValueError: ignored