In [1]:
# Run identifier: reused below as the W&B project name and as the first
# argument of TrainingArguments.
model_dataset = "vivit-videomae-d1"
# Dataset root; the dataset cell reads its "train", "val" and "test" sub-directories.
dataset_root_path = "d1/Anomaly-detection-Dataset"
# Per-device batch size used for both training and evaluation.
batch_size = 1

In [2]:
# Collect every video file below `dataset_root_path` and derive the class labels
# from the directory layout <root>/<split>/<class>/<video file>.

import os

VIDEO_EXTENSIONS = (".mp4", ".avi")  # add other video extensions if needed

all_video_file_paths = [
    os.path.join(root, file)
    for root, _, files in os.walk(dataset_root_path)
    for file in files
    if file.endswith(VIDEO_EXTENSIONS)
]
if not all_video_file_paths:
    # Fail here rather than building a classifier with an empty label set.
    raise FileNotFoundError(f"No video files found under {dataset_root_path!r}")

# Resolve the label relative to the dataset root so the result does not depend on
# how many components `dataset_root_path` has or on the platform's path separator.
class_labels = sorted(
    {
        os.path.relpath(path, dataset_root_path).split(os.sep)[1]
        for path in all_video_file_paths
    }
)
label2id = {label: i for i, label in enumerate(class_labels)}
id2label = {i: label for label, i in label2id.items()}

print(f"Unique classes: {list(label2id.keys())}.")





Unique classes: ['Anomaly', 'Normal'].


In [3]:
from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification
from transformers import AutoImageProcessor, TimesformerForVideoClassification
from transformers import  VivitConfig,VivitForVideoClassification, VivitImageProcessor

import torch
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Processor of the same checkpoint as the model; the dataset cell reads its
# image_mean / image_std / size to configure the video transforms.
image_processor = VideoMAEImageProcessor.from_pretrained("MCG-NJU/videomae-base")

# Load the backbone with a classification head sized by the dataset's label maps.
model = VideoMAEForVideoClassification.from_pretrained(
    "MCG-NJU/videomae-base",
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,  # provide this in case you're planning to fine-tune an already fine-tuned checkpoint
)

# Alternative backbones (disabled for this run):
#image_processor = AutoImageProcessor.from_pretrained("facebook/timesformer-base-finetuned-k400")
#model = TimesformerForVideoClassification.from_pretrained("facebook/timesformer-base-finetuned-k400",
#                                                          label2id=label2id,
#                                                          id2label=id2label,
#                                                          ignore_mismatched_sizes=True
#                                                          )

#image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2-kinetics400")
#model = VivitForVideoClassification.from_pretrained(
#    "google/vivit-b-16x2-kinetics400",
#    label2id=label2id,
#    id2label=id2label,
#    ignore_mismatched_sizes=True)
# Move the weights to the selected device; as the last expression of the cell this
# also displays the module tree.
model.to(device)


Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


VideoMAEForVideoClassification(
  (videomae): VideoMAEModel(
    (embeddings): VideoMAEEmbeddings(
      (patch_embeddings): VideoMAEPatchEmbeddings(
        (projection): Conv3d(3, 768, kernel_size=(2, 16, 16), stride=(2, 16, 16))
      )
    )
    (encoder): VideoMAEEncoder(
      (layer): ModuleList(
        (0-11): 12 x VideoMAELayer(
          (attention): VideoMAESdpaAttention(
            (attention): VideoMAESdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=False)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=False)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): VideoMAESelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): VideoMAEIntermediate(
            (den

In [9]:

import pytorchvideo.data
import torchvision

from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    RemoveKey,
    ShortSideScale,
    UniformTemporalSubsample,
)

from torchvision.transforms import (
    Compose,
    Lambda,
    RandomCrop,
    RandomHorizontalFlip,
    Resize,
)
# Normalisation statistics and spatial size reported by the image processor.
mean, std = image_processor.image_mean, image_processor.image_std
processor_size = image_processor.size
if "shortest_edge" not in processor_size:
    height, width = processor_size["height"], processor_size["width"]
else:
    height = width = processor_size["shortest_edge"]
resize_to = (height, width)

# Temporal sampling: a clip covers `num_frames_to_sample` frames taken every
# `sample_rate`-th frame, converted to seconds with an assumed frame rate of `fps`.
num_frames_to_sample = model.config.num_frames
sample_rate, fps = 4, 30
clip_duration = num_frames_to_sample * sample_rate / fps

def _video_pipeline(spatial_steps):
    """Apply the shared subsample/scale/normalise steps plus `spatial_steps` to the "video" key."""
    return Compose(
        [
            ApplyTransformToKey(
                key="video",
                transform=Compose(
                    [
                        UniformTemporalSubsample(num_frames_to_sample),
                        Lambda(lambda x: x / 255.0),
                        Normalize(mean, std),
                        *spatial_steps,
                    ]
                ),
            ),
        ]
    )


def _load_split(split, sampler_kind, transform):
    """Build the Ucf101-style dataset for one sub-directory of `dataset_root_path`."""
    return pytorchvideo.data.Ucf101(
        data_path=os.path.join(dataset_root_path, split),
        clip_sampler=pytorchvideo.data.make_clip_sampler(sampler_kind, clip_duration),
        decode_audio=False,
        transform=transform,
    )


# Training clips: random clip sampling with scale / crop / flip augmentation.
train_transform = _video_pipeline(
    [
        RandomShortSideScale(min_size=256, max_size=320),
        RandomCrop(resize_to),
        RandomHorizontalFlip(p=0.5),
    ]
)
train_dataset = _load_split("train", "random", train_transform)

# Validation and test clips: uniform clip sampling and a plain resize; both
# splits share the same transform object.
val_transform = _video_pipeline([Resize(resize_to)])
val_dataset = _load_split("val", "uniform", val_transform)
test_dataset = _load_split("test", "uniform", val_transform)

print(train_dataset.num_videos, val_dataset.num_videos, test_dataset.num_videos)

import imageio
import numpy as np
from IPython.display import Image

def unnormalize_img(img):
    """Undo the mean/std normalisation and return a displayable uint8 image.

    Args:
        img: NumPy array that the module-level `std` and `mean` broadcast
            against (create_gif passes channels-last frames).

    Returns:
        Array of the same shape with dtype uint8 and values in [0, 255].
    """
    img = (img * std) + mean
    # Clip BEFORE the cast: converting out-of-range floats to uint8 wraps around,
    # so clipping after the cast (as was done previously) could not repair them.
    return (img * 255).clip(0, 255).astype("uint8")

def create_gif(video_tensor, filename="sample.gif"):
    """Write the frames of a video tensor to an animated GIF and return its path.

    `video_tensor` is iterated frame by frame; every frame is expected to be laid
    out as (num_channels, height, width) and is moved to channels-last before it
    is un-normalised.
    """
    channels_last_frames = [
        unnormalize_img(frame.permute(1, 2, 0).numpy()) for frame in video_tensor
    ]
    imageio.mimsave(filename, channels_last_frames, "GIF", duration=0.25)
    return filename

def display_gif(video_tensor, gif_name="sample.gif"):
    """Render a video tensor to a GIF file and wrap it for inline display.

    The first two axes of `video_tensor` are swapped before it is handed to
    `create_gif`.
    """
    frames_first = video_tensor.permute(1, 0, 2, 3)
    return Image(filename=create_gif(frames_first, gif_name))

# Fetch the first sample yielded by the training dataset ...
sample_video = next(iter(train_dataset))
# ... and keep its "video" entry for inspection.
video_tensor = sample_video["video"]


import wandb

# Authenticate with W&B without embedding the secret in the notebook: the key is
# read from the WANDB_API_KEY environment variable, with an interactive prompt as
# the fallback.
# NOTE(review): an API key used to be hard-coded here in plain text; it should be
# treated as leaked and revoked.
import getpass
import os

API_KEY = os.environ.get("WANDB_API_KEY") or getpass.getpass("W&B API key: ")

# Log in to W&B with the API key
wandb.login(key=API_KEY, relogin=True)

# Project details
PROJECT = model_dataset
MODEL_NAME = model_dataset
DATASET = "UCF Anomaly Multiclass Classification"

# Start the W&B run (no custom timeout is configured here).
wandb.init(
    project=PROJECT,
    tags=[MODEL_NAME, DATASET],
    notes="model training",
)




from transformers import TrainingArguments, Trainer

# Second-to-last dash-separated token of the run name; not used by the arguments below.
model_name = model_dataset.split("-")[-2]
new_model_name = model_dataset
num_epochs = 10

args = TrainingArguments(
    new_model_name,
    remove_unused_columns=False,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_ratio=0.1,
    logging_steps=10,
    # NOTE(review): eval_steps is presumably only consulted with a "steps"
    # evaluation strategy — confirm against the TrainingArguments docs.
    eval_steps=1000,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=True,
    # Training length in optimiser steps: `num_epochs` passes of
    # `num_videos // batch_size` steps each.
    max_steps=(train_dataset.num_videos // batch_size) * num_epochs,
    # Mixed precision needs a CUDA device; follow the same CPU fallback as `device`
    # instead of requesting fp16 unconditionally.
    fp16=torch.cuda.is_available(),
    report_to="wandb"
)

import evaluate

# Accuracy metric loaded once and reused by every evaluation pass.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max (axis 1) predictions against the label ids."""
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    gold_ids = eval_pred.label_ids
    return metric.compute(predictions=predicted_ids, references=gold_ids)


def collate_fn(examples):
    """Batch dataset samples into the `pixel_values` / `labels` dict the model consumes.

    The first two axes of every "video" tensor are swapped before stacking, so a
    (num_channels, num_frames, height, width) clip becomes frames-first.
    """
    clips, targets = [], []
    for item in examples:
        clips.append(item["video"].permute(1, 0, 2, 3))
        targets.append(item["label"])
    return {"pixel_values": torch.stack(clips), "labels": torch.tensor(targets)}


# Bundle the model, its arguments, the datasets and the batching / metric callbacks.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    tokenizer=image_processor,
)
import torch





1201 175 525


VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▂▃▄▄▄▆▇██
eval/loss,▆▇▇▅▆█▅▁▄▃
eval/runtime,█▁▁▁▁▁▁▁▁▁
eval/samples_per_second,▁█████████
eval/steps_per_second,▁█████████
train/epoch,▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇██
train/grad_norm,▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,▂▄▅███████▇▇▇▇▇▆▆▆▆▆▅▅▅▅▄▄▃▃▃▃▃▂▂▂▂▂▂▂▁▁
train/loss,▇▂▃▃▁█▃▁▅▅▁▁▄▅▆▄▄▃▁▆▃▄▁▃▂▁▁▁▁▄▁▁▁▁▄▂▁▃▆▁

0,1
eval/accuracy,0.78759
eval/loss,1.44135
eval/runtime,1671.3883
eval/samples_per_second,16.43
eval/steps_per_second,16.43
total_flos,1.0522529178565018e+19
train/epoch,9.1
train/global_step,12010.0
train/grad_norm,0.00314
train/learning_rate,0.0


  trainer = Trainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
max_steps is given, it will override any value given in num_train_epochs


In [10]:
# Run the fine-tuning loop and keep whatever trainer.train() returns.
train_results = trainer.train()
# Upload the result to the Hugging Face Hub.
trainer.push_to_hub()

Epoch,Training Loss,Validation Loss,Accuracy
0,0.4829,2.241519,0.621609
1,0.2241,1.786729,0.715524
2,0.4836,1.725545,0.714213
3,0.0001,1.779194,0.760424
4,0.001,1.810428,0.716289
5,0.0001,1.501779,0.75995
6,0.0008,1.8564,0.778777
7,0.6079,1.552917,0.76443
8,0.9314,1.719957,0.782018
9,0.392,1.71811,0.782892


CommitInfo(commit_url='https://huggingface.co/skywalker290/vivit-videomae-d1/commit/6de5445692c2610f389081a02d825a80faf3027a', commit_message='End of training', commit_description='', oid='6de5445692c2610f389081a02d825a80faf3027a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/skywalker290/vivit-videomae-d1', endpoint='https://huggingface.co', repo_type='model', repo_id='skywalker290/vivit-videomae-d1'), pr_revision=None, pr_num=None)

In [1]:
# Run identifier (W&B project / Trainer output name), dataset root and batch size.
model_dataset = "vivit-vivit-d2"
dataset_root_path = "d2/Anomaly-Multiclass-Dataset"
batch_size = 1

# Collect every video file below `dataset_root_path` and derive the class labels
# from the directory layout <root>/<split>/<class>/<video file>.

import os

VIDEO_EXTENSIONS = (".mp4", ".avi")  # add other video extensions if needed

all_video_file_paths = [
    os.path.join(root, file)
    for root, _, files in os.walk(dataset_root_path)
    for file in files
    if file.endswith(VIDEO_EXTENSIONS)
]
if not all_video_file_paths:
    # Fail here rather than building a classifier with an empty label set.
    raise FileNotFoundError(f"No video files found under {dataset_root_path!r}")

# Resolve the label relative to the dataset root so the result does not depend on
# how many components `dataset_root_path` has or on the platform's path separator.
class_labels = sorted(
    {
        os.path.relpath(path, dataset_root_path).split(os.sep)[1]
        for path in all_video_file_paths
    }
)
label2id = {label: i for i, label in enumerate(class_labels)}
id2label = {i: label for label, i in label2id.items()}

print(f"Unique classes: {list(label2id.keys())}.")





Unique classes: ['Abuse', 'Arrest', 'Arson', 'Assault', 'Burglary', 'Explosion', 'Fighting', 'RoadAccidents', 'Robbery', 'Shooting', 'Shoplifting', 'Stealing', 'Vandalism'].


In [None]:
%pip install torch

Collecting torch
  Downloading torch-2.4.1-cp38-cp38-manylinux1_x86_64.whl (797.1 MB)
[K     |████████████████████████████████| 797.1 MB 18 kB/s  eta 0:00:017    |████████▏                       | 204.0 MB 1.3 MB/s eta 0:07:34
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[K     |████████████████████████████████| 823 kB 1.0 MB/s eta 0:00:01
[?25hCollecting nvidia-nccl-cu12==2.20.5
  Downloading nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl (176.2 MB)
[K     |████████████████████▌           | 113.0 MB 1.2 MB/s eta 0:00:52

In [2]:
from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification
from transformers import AutoImageProcessor, TimesformerForVideoClassification
from transformers import  VivitConfig,VivitForVideoClassification, VivitImageProcessor

import torch
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Alternative backbones (disabled for this run):
#image_processor = VideoMAEImageProcessor.from_pretrained("MCG-NJU/videomae-base")

#model = VideoMAEForVideoClassification.from_pretrained(
#    "MCG-NJU/videomae-base",
#    label2id=label2id,
#    id2label=id2label,
#    ignore_mismatched_sizes=True,  # provide this in case you're planning to fine-tune an already fine-tuned checkpoint
#)

#image_processor = AutoImageProcessor.from_pretrained("facebook/timesformer-base-finetuned-k400")
#model = TimesformerForVideoClassification.from_pretrained("facebook/timesformer-base-finetuned-k400",
#                                                          label2id=label2id,
#                                                          id2label=id2label,
#                                                          ignore_mismatched_sizes=True
#                                                          )

# ViViT processor and model from the same checkpoint; the classification head is
# sized by the dataset's label maps.
# NOTE(review): the dataset cell derives its resize target from
# image_processor.size — verify that it matches the resolution this model expects.
image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2-kinetics400")
model = VivitForVideoClassification.from_pretrained(
    "google/vivit-b-16x2-kinetics400",
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True)
# Move the weights to the selected device.
model.to(device)


ModuleNotFoundError: No module named 'torch'

In [4]:
import torch
# Sanity-check the CUDA setup before training.
print(torch.cuda.is_available())  # Should return True
print(torch.cuda.device_count())  # Should return the number of GPUs available
print(torch.cuda.current_device())  # Should return the index of the currently selected device


True
2
1


In [None]:

import pytorchvideo.data
import torchvision

from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    RemoveKey,
    ShortSideScale,
    UniformTemporalSubsample,
)

from torchvision.transforms import (
    Compose,
    Lambda,
    RandomCrop,
    RandomHorizontalFlip,
    Resize,
)
# Normalisation statistics and spatial size reported by the image processor.
mean, std = image_processor.image_mean, image_processor.image_std
processor_size = image_processor.size
if "shortest_edge" not in processor_size:
    height, width = processor_size["height"], processor_size["width"]
else:
    height = width = processor_size["shortest_edge"]
resize_to = (height, width)

# Temporal sampling: a clip covers `num_frames_to_sample` frames taken every
# `sample_rate`-th frame, converted to seconds with an assumed frame rate of `fps`.
num_frames_to_sample = model.config.num_frames
sample_rate, fps = 4, 30
clip_duration = num_frames_to_sample * sample_rate / fps

def _video_pipeline(spatial_steps):
    """Apply the shared subsample/scale/normalise steps plus `spatial_steps` to the "video" key."""
    return Compose(
        [
            ApplyTransformToKey(
                key="video",
                transform=Compose(
                    [
                        UniformTemporalSubsample(num_frames_to_sample),
                        Lambda(lambda x: x / 255.0),
                        Normalize(mean, std),
                        *spatial_steps,
                    ]
                ),
            ),
        ]
    )


def _load_split(split, sampler_kind, transform):
    """Build the Ucf101-style dataset for one sub-directory of `dataset_root_path`."""
    return pytorchvideo.data.Ucf101(
        data_path=os.path.join(dataset_root_path, split),
        clip_sampler=pytorchvideo.data.make_clip_sampler(sampler_kind, clip_duration),
        decode_audio=False,
        transform=transform,
    )


# Training clips: random clip sampling with scale / crop / flip augmentation.
train_transform = _video_pipeline(
    [
        RandomShortSideScale(min_size=256, max_size=320),
        RandomCrop(resize_to),
        RandomHorizontalFlip(p=0.5),
    ]
)
train_dataset = _load_split("train", "random", train_transform)

# Validation and test clips: uniform clip sampling and a plain resize; both
# splits share the same transform object.
val_transform = _video_pipeline([Resize(resize_to)])
val_dataset = _load_split("val", "uniform", val_transform)
test_dataset = _load_split("test", "uniform", val_transform)

print(train_dataset.num_videos, val_dataset.num_videos, test_dataset.num_videos)

import imageio
import numpy as np
from IPython.display import Image

def unnormalize_img(img):
    """Undo the mean/std normalisation and return a displayable uint8 image.

    Args:
        img: NumPy array that the module-level `std` and `mean` broadcast
            against (create_gif passes channels-last frames).

    Returns:
        Array of the same shape with dtype uint8 and values in [0, 255].
    """
    img = (img * std) + mean
    # Clip BEFORE the cast: converting out-of-range floats to uint8 wraps around,
    # so clipping after the cast (as was done previously) could not repair them.
    return (img * 255).clip(0, 255).astype("uint8")

def create_gif(video_tensor, filename="sample.gif"):
    """Write the frames of a video tensor to an animated GIF and return its path.

    `video_tensor` is iterated frame by frame; every frame is expected to be laid
    out as (num_channels, height, width) and is moved to channels-last before it
    is un-normalised.
    """
    channels_last_frames = [
        unnormalize_img(frame.permute(1, 2, 0).numpy()) for frame in video_tensor
    ]
    imageio.mimsave(filename, channels_last_frames, "GIF", duration=0.25)
    return filename

def display_gif(video_tensor, gif_name="sample.gif"):
    """Render a video tensor to a GIF file and wrap it for inline display.

    The first two axes of `video_tensor` are swapped before it is handed to
    `create_gif`.
    """
    frames_first = video_tensor.permute(1, 0, 2, 3)
    return Image(filename=create_gif(frames_first, gif_name))

# Fetch the first sample yielded by the training dataset ...
sample_video = next(iter(train_dataset))
# ... and keep its "video" entry for inspection.
video_tensor = sample_video["video"]


import wandb

# Authenticate with W&B without embedding the secret in the notebook: the key is
# read from the WANDB_API_KEY environment variable, with an interactive prompt as
# the fallback.
# NOTE(review): an API key used to be hard-coded here in plain text; it should be
# treated as leaked and revoked.
import getpass
import os

API_KEY = os.environ.get("WANDB_API_KEY") or getpass.getpass("W&B API key: ")

# Log in to W&B with the API key
wandb.login(key=API_KEY, relogin=True)

# Project details
PROJECT = model_dataset
MODEL_NAME = model_dataset
DATASET = "UCF Anomaly Multiclass Classification"

# Start the W&B run (no custom timeout is configured here).
wandb.init(
    project=PROJECT,
    tags=[MODEL_NAME, DATASET],
    notes="model training",
)




from transformers import TrainingArguments, Trainer

# Second-to-last dash-separated token of the run name; not used by the arguments below.
model_name = model_dataset.split("-")[-2]
new_model_name = model_dataset
num_epochs = 10

args = TrainingArguments(
    new_model_name,
    remove_unused_columns=False,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_ratio=0.1,
    logging_steps=10,
    # NOTE(review): eval_steps is presumably only consulted with a "steps"
    # evaluation strategy — confirm against the TrainingArguments docs.
    eval_steps=1000,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=True,
    # Training length in optimiser steps: `num_epochs` passes of
    # `num_videos // batch_size` steps each.
    max_steps=(train_dataset.num_videos // batch_size) * num_epochs,
    # Mixed precision needs a CUDA device; follow the same CPU fallback as `device`
    # instead of requesting fp16 unconditionally.
    fp16=torch.cuda.is_available(),
    report_to="wandb"
)

import evaluate

# Accuracy metric loaded once and reused by every evaluation pass.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max (axis 1) predictions against the label ids."""
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    gold_ids = eval_pred.label_ids
    return metric.compute(predictions=predicted_ids, references=gold_ids)


def collate_fn(examples):
    """Batch dataset samples into the `pixel_values` / `labels` dict the model consumes.

    The first two axes of every "video" tensor are swapped before stacking, so a
    (num_channels, num_frames, height, width) clip becomes frames-first.
    """
    clips, targets = [], []
    for item in examples:
        clips.append(item["video"].permute(1, 0, 2, 3))
        targets.append(item["label"])
    return {"pixel_values": torch.stack(clips), "labels": torch.tensor(targets)}


# Bundle the model, its arguments, the datasets and the batching / metric callbacks.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    tokenizer=image_processor,
)
import torch



In [None]:
# Run the fine-tuning loop and keep whatever trainer.train() returns.
train_results = trainer.train()
# Upload the result to the Hugging Face Hub.
trainer.push_to_hub()

In [None]:
# Run identifier: reused below as the W&B project name and as the first
# argument of TrainingArguments.
model_dataset = "vivit-videomae-d2"
# Dataset root; the dataset cell reads its "train", "val" and "test" sub-directories.
dataset_root_path = "d2/Anomaly-Multiclass-Dataset"
# Per-device batch size used for both training and evaluation.
batch_size = 1

In [None]:
from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification
from transformers import AutoImageProcessor, TimesformerForVideoClassification
from transformers import  VivitConfig,VivitForVideoClassification, VivitImageProcessor

import torch
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The processor must come from the same checkpoint as the model: the dataset cell
# takes its normalisation statistics and input size from it. Pairing VideoMAE with
# the ViViT processor (as this cell did before) feeds the model inputs prepared
# for a different network.
image_processor = VideoMAEImageProcessor.from_pretrained("MCG-NJU/videomae-base")

# `label2id` / `id2label` are the maps built by the label-discovery cell for the
# d2 dataset; that cell has to be run first.
model = VideoMAEForVideoClassification.from_pretrained(
    "MCG-NJU/videomae-base",
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,  # provide this in case you're planning to fine-tune an already fine-tuned checkpoint
)

# Move the weights to the selected device.
model.to(device)


In [None]:

import pytorchvideo.data
import torchvision

from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    RemoveKey,
    ShortSideScale,
    UniformTemporalSubsample,
)

from torchvision.transforms import (
    Compose,
    Lambda,
    RandomCrop,
    RandomHorizontalFlip,
    Resize,
)
# Normalisation statistics and spatial size reported by the image processor.
mean, std = image_processor.image_mean, image_processor.image_std
processor_size = image_processor.size
if "shortest_edge" not in processor_size:
    height, width = processor_size["height"], processor_size["width"]
else:
    height = width = processor_size["shortest_edge"]
resize_to = (height, width)

# Temporal sampling: a clip covers `num_frames_to_sample` frames taken every
# `sample_rate`-th frame, converted to seconds with an assumed frame rate of `fps`.
num_frames_to_sample = model.config.num_frames
sample_rate, fps = 4, 30
clip_duration = num_frames_to_sample * sample_rate / fps

def _video_pipeline(spatial_steps):
    """Apply the shared subsample/scale/normalise steps plus `spatial_steps` to the "video" key."""
    return Compose(
        [
            ApplyTransformToKey(
                key="video",
                transform=Compose(
                    [
                        UniformTemporalSubsample(num_frames_to_sample),
                        Lambda(lambda x: x / 255.0),
                        Normalize(mean, std),
                        *spatial_steps,
                    ]
                ),
            ),
        ]
    )


def _load_split(split, sampler_kind, transform):
    """Build the Ucf101-style dataset for one sub-directory of `dataset_root_path`."""
    return pytorchvideo.data.Ucf101(
        data_path=os.path.join(dataset_root_path, split),
        clip_sampler=pytorchvideo.data.make_clip_sampler(sampler_kind, clip_duration),
        decode_audio=False,
        transform=transform,
    )


# Training clips: random clip sampling with scale / crop / flip augmentation.
train_transform = _video_pipeline(
    [
        RandomShortSideScale(min_size=256, max_size=320),
        RandomCrop(resize_to),
        RandomHorizontalFlip(p=0.5),
    ]
)
train_dataset = _load_split("train", "random", train_transform)

# Validation and test clips: uniform clip sampling and a plain resize; both
# splits share the same transform object.
val_transform = _video_pipeline([Resize(resize_to)])
val_dataset = _load_split("val", "uniform", val_transform)
test_dataset = _load_split("test", "uniform", val_transform)

print(train_dataset.num_videos, val_dataset.num_videos, test_dataset.num_videos)

import imageio
import numpy as np
from IPython.display import Image

def unnormalize_img(img):
    """Undo the mean/std normalisation and return a displayable uint8 image.

    Args:
        img: NumPy array that the module-level `std` and `mean` broadcast
            against (create_gif passes channels-last frames).

    Returns:
        Array of the same shape with dtype uint8 and values in [0, 255].
    """
    img = (img * std) + mean
    # Clip BEFORE the cast: converting out-of-range floats to uint8 wraps around,
    # so clipping after the cast (as was done previously) could not repair them.
    return (img * 255).clip(0, 255).astype("uint8")

def create_gif(video_tensor, filename="sample.gif"):
    """Write the frames of a video tensor to an animated GIF and return its path.

    `video_tensor` is iterated frame by frame; every frame is expected to be laid
    out as (num_channels, height, width) and is moved to channels-last before it
    is un-normalised.
    """
    channels_last_frames = [
        unnormalize_img(frame.permute(1, 2, 0).numpy()) for frame in video_tensor
    ]
    imageio.mimsave(filename, channels_last_frames, "GIF", duration=0.25)
    return filename

def display_gif(video_tensor, gif_name="sample.gif"):
    """Render a video tensor to a GIF file and wrap it for inline display.

    The first two axes of `video_tensor` are swapped before it is handed to
    `create_gif`.
    """
    frames_first = video_tensor.permute(1, 0, 2, 3)
    return Image(filename=create_gif(frames_first, gif_name))

# Fetch the first sample yielded by the training dataset ...
sample_video = next(iter(train_dataset))
# ... and keep its "video" entry for inspection.
video_tensor = sample_video["video"]


import wandb

# Authenticate with W&B without embedding the secret in the notebook: the key is
# read from the WANDB_API_KEY environment variable, with an interactive prompt as
# the fallback.
# NOTE(review): an API key used to be hard-coded here in plain text; it should be
# treated as leaked and revoked.
import getpass
import os

API_KEY = os.environ.get("WANDB_API_KEY") or getpass.getpass("W&B API key: ")

# Log in to W&B with the API key
wandb.login(key=API_KEY, relogin=True)

# Project details
PROJECT = model_dataset
MODEL_NAME = model_dataset
DATASET = "UCF Anomaly Multiclass Classification"

# Start the W&B run (no custom timeout is configured here).
wandb.init(
    project=PROJECT,
    tags=[MODEL_NAME, DATASET],
    notes="model training",
)




from transformers import TrainingArguments, Trainer

# Second-to-last dash-separated token of the run name; not used by the arguments below.
model_name = model_dataset.split("-")[-2]
new_model_name = model_dataset
num_epochs = 10

args = TrainingArguments(
    new_model_name,
    remove_unused_columns=False,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_ratio=0.1,
    logging_steps=10,
    # NOTE(review): eval_steps is presumably only consulted with a "steps"
    # evaluation strategy — confirm against the TrainingArguments docs.
    eval_steps=1000,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=True,
    # Training length in optimiser steps: `num_epochs` passes of
    # `num_videos // batch_size` steps each.
    max_steps=(train_dataset.num_videos // batch_size) * num_epochs,
    # Mixed precision needs a CUDA device; follow the same CPU fallback as `device`
    # instead of requesting fp16 unconditionally.
    fp16=torch.cuda.is_available(),
    report_to="wandb"
)

import evaluate

# Accuracy metric loaded once and reused by every evaluation pass.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max (axis 1) predictions against the label ids."""
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    gold_ids = eval_pred.label_ids
    return metric.compute(predictions=predicted_ids, references=gold_ids)


def collate_fn(examples):
    """Batch dataset samples into the `pixel_values` / `labels` dict the model consumes.

    The first two axes of every "video" tensor are swapped before stacking, so a
    (num_channels, num_frames, height, width) clip becomes frames-first.
    """
    clips, targets = [], []
    for item in examples:
        clips.append(item["video"].permute(1, 0, 2, 3))
        targets.append(item["label"])
    return {"pixel_values": torch.stack(clips), "labels": torch.tensor(targets)}


# Bundle the model, its arguments, the datasets and the batching / metric callbacks.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    tokenizer=image_processor,
)
import torch

# Run the fine-tuning loop and keep whatever trainer.train() returns.
train_results = trainer.train()
# Upload the result to the Hugging Face Hub.
trainer.push_to_hub()

In [None]:
# Run identifier: reused below as the W&B project name and as the first
# argument of TrainingArguments.
model_dataset = "vivit-timesformer-d2"
# Dataset root; the dataset cell reads its "train", "val" and "test" sub-directories.
dataset_root_path = "d2/Anomaly-Multiclass-Dataset"
# Per-device batch size used for both training and evaluation.
batch_size = 1


from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification
from transformers import AutoImageProcessor, TimesformerForVideoClassification
from transformers import  VivitConfig,VivitForVideoClassification, VivitImageProcessor

import torch
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The processor must come from the same checkpoint as the model: the dataset cell
# takes its normalisation statistics and input size from it. Pairing TimeSformer
# with the ViViT processor (as this cell did before) feeds the model inputs
# prepared for a different network.
image_processor = AutoImageProcessor.from_pretrained("facebook/timesformer-base-finetuned-k400")

# `label2id` / `id2label` are the maps built by the label-discovery cell for the
# d2 dataset; that cell has to be run first.
model = TimesformerForVideoClassification.from_pretrained(
    "facebook/timesformer-base-finetuned-k400",
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
)

# Move the weights to the selected device.
model.to(device)


In [None]:

import pytorchvideo.data
import torchvision

from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    RemoveKey,
    ShortSideScale,
    UniformTemporalSubsample,
)

from torchvision.transforms import (
    Compose,
    Lambda,
    RandomCrop,
    RandomHorizontalFlip,
    Resize,
)
# Normalisation statistics and spatial size reported by the image processor.
mean, std = image_processor.image_mean, image_processor.image_std
processor_size = image_processor.size
if "shortest_edge" not in processor_size:
    height, width = processor_size["height"], processor_size["width"]
else:
    height = width = processor_size["shortest_edge"]
resize_to = (height, width)

# Temporal sampling: a clip covers `num_frames_to_sample` frames taken every
# `sample_rate`-th frame, converted to seconds with an assumed frame rate of `fps`.
num_frames_to_sample = model.config.num_frames
sample_rate, fps = 4, 30
clip_duration = num_frames_to_sample * sample_rate / fps

def _video_pipeline(spatial_steps):
    """Apply the shared subsample/scale/normalise steps plus `spatial_steps` to the "video" key."""
    return Compose(
        [
            ApplyTransformToKey(
                key="video",
                transform=Compose(
                    [
                        UniformTemporalSubsample(num_frames_to_sample),
                        Lambda(lambda x: x / 255.0),
                        Normalize(mean, std),
                        *spatial_steps,
                    ]
                ),
            ),
        ]
    )


def _load_split(split, sampler_kind, transform):
    """Build the Ucf101-style dataset for one sub-directory of `dataset_root_path`."""
    return pytorchvideo.data.Ucf101(
        data_path=os.path.join(dataset_root_path, split),
        clip_sampler=pytorchvideo.data.make_clip_sampler(sampler_kind, clip_duration),
        decode_audio=False,
        transform=transform,
    )


# Training clips: random clip sampling with scale / crop / flip augmentation.
train_transform = _video_pipeline(
    [
        RandomShortSideScale(min_size=256, max_size=320),
        RandomCrop(resize_to),
        RandomHorizontalFlip(p=0.5),
    ]
)
train_dataset = _load_split("train", "random", train_transform)

# Validation and test clips: uniform clip sampling and a plain resize; both
# splits share the same transform object.
val_transform = _video_pipeline([Resize(resize_to)])
val_dataset = _load_split("val", "uniform", val_transform)
test_dataset = _load_split("test", "uniform", val_transform)

print(train_dataset.num_videos, val_dataset.num_videos, test_dataset.num_videos)

import imageio
import numpy as np
from IPython.display import Image

def unnormalize_img(img):
    """Undo the mean/std normalisation and return a displayable uint8 image.

    Args:
        img: NumPy array that the module-level `std` and `mean` broadcast
            against (create_gif passes channels-last frames).

    Returns:
        Array of the same shape with dtype uint8 and values in [0, 255].
    """
    img = (img * std) + mean
    # Clip BEFORE the cast: converting out-of-range floats to uint8 wraps around,
    # so clipping after the cast (as was done previously) could not repair them.
    return (img * 255).clip(0, 255).astype("uint8")

def create_gif(video_tensor, filename="sample.gif"):
    """Write the frames of a video tensor to an animated GIF and return its path.

    `video_tensor` is iterated frame by frame; every frame is expected to be laid
    out as (num_channels, height, width) and is moved to channels-last before it
    is un-normalised.
    """
    channels_last_frames = [
        unnormalize_img(frame.permute(1, 2, 0).numpy()) for frame in video_tensor
    ]
    imageio.mimsave(filename, channels_last_frames, "GIF", duration=0.25)
    return filename

def display_gif(video_tensor, gif_name="sample.gif"):
    """Render a video tensor to a GIF file and wrap it for inline display.

    The first two axes of `video_tensor` are swapped before it is handed to
    `create_gif`.
    """
    frames_first = video_tensor.permute(1, 0, 2, 3)
    return Image(filename=create_gif(frames_first, gif_name))

# Fetch the first sample yielded by the training dataset ...
sample_video = next(iter(train_dataset))
# ... and keep its "video" entry for inspection.
video_tensor = sample_video["video"]


import wandb

# Authenticate with W&B without embedding the secret in the notebook: the key is
# read from the WANDB_API_KEY environment variable, with an interactive prompt as
# the fallback.
# NOTE(review): an API key used to be hard-coded here in plain text; it should be
# treated as leaked and revoked.
import getpass
import os

API_KEY = os.environ.get("WANDB_API_KEY") or getpass.getpass("W&B API key: ")

# Log in to W&B with the API key
wandb.login(key=API_KEY, relogin=True)

# Project details
PROJECT = model_dataset
MODEL_NAME = model_dataset
DATASET = "UCF Anomaly Multiclass Classification"

# Start the W&B run (no custom timeout is configured here).
wandb.init(
    project=PROJECT,
    tags=[MODEL_NAME, DATASET],
    notes="model training",
)




from transformers import TrainingArguments, Trainer

# Second-to-last dash-separated token of the run name; not used by the arguments below.
model_name = model_dataset.split("-")[-2]
new_model_name = model_dataset
num_epochs = 10

args = TrainingArguments(
    new_model_name,
    remove_unused_columns=False,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_ratio=0.1,
    logging_steps=10,
    # NOTE(review): eval_steps is presumably only consulted with a "steps"
    # evaluation strategy — confirm against the TrainingArguments docs.
    eval_steps=1000,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=True,
    # Training length in optimiser steps: `num_epochs` passes of
    # `num_videos // batch_size` steps each.
    max_steps=(train_dataset.num_videos // batch_size) * num_epochs,
    # Mixed precision needs a CUDA device; follow the same CPU fallback as `device`
    # instead of requesting fp16 unconditionally.
    fp16=torch.cuda.is_available(),
    report_to="wandb"
)

import evaluate

# Accuracy metric loaded once and reused by every evaluation pass.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max (axis 1) predictions against the label ids."""
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    gold_ids = eval_pred.label_ids
    return metric.compute(predictions=predicted_ids, references=gold_ids)


def collate_fn(examples):
    """Batch dataset samples into the `pixel_values` / `labels` dict the model consumes.

    The first two axes of every "video" tensor are swapped before stacking, so a
    (num_channels, num_frames, height, width) clip becomes frames-first.
    """
    clips, targets = [], []
    for item in examples:
        clips.append(item["video"].permute(1, 0, 2, 3))
        targets.append(item["label"])
    return {"pixel_values": torch.stack(clips), "labels": torch.tensor(targets)}


# Bundle the model, its arguments, the datasets and the batching / metric callbacks.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    tokenizer=image_processor,
)
import torch
# Run the fine-tuning loop and keep whatever trainer.train() returns.
train_results = trainer.train()
# Upload the result to the Hugging Face Hub.
trainer.push_to_hub()
