In [17]:
# #Step 1: Install Required Libraries
# !pip install transformers evaluate opencv-python huggingface_hub -q

In [18]:
# import torch
# import torchvision
# print(torch.__version__, torchvision.__version__)
#print(accelerate.version)

In [19]:
#!pip install 'accelerate>=0.26.0'

In [20]:
#Step 2: Import Libraries and Set Up the Environment
import torch
import os
import cv2
import numpy as np
from pathlib import Path
from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

# Check CUDA availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {torch.cuda.get_device_name(0)}" if device.type == "cuda" else "Using CPU")


Using device: NVIDIA L4


In [21]:
#Step 3: Configure Model and Dataset
# Model and dataset configuration
model_ckpt = "MCG-NJU/videomae-large"
dataset_root_path = Path("train_70")  # Replace with your dataset path
resize_to = 224  # Resize frames to 224x224
num_frames = 16  # Number of frames per video
batch_size = 4  # Training batch size
num_epochs = 10  # Number of epochs


In [22]:
#Step 4: Define Video Preprocessing
def load_video_frames(video_path, num_frames=16):
    """Extract frames from video."""
    cap = cv2.VideoCapture(video_path)
    frames = []
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    for i in range(num_frames):
        cap.set(cv2.CAP_PROP_POS_FRAMES, i * frame_count // num_frames)
        ret, frame = cap.read()
        if ret:
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    cap.release()
    return frames


In [23]:
#Step 5: Create a Dataset Class
class VideoDataset(Dataset):
    def __init__(self, video_paths, labels, processor, num_frames=16):
        self.video_paths = video_paths
        self.labels = labels
        self.processor = processor
        self.num_frames = num_frames

    def __len__(self):
        return len(self.video_paths)

    def __getitem__(self, idx):
        video_path = self.video_paths[idx]
        label = self.labels[idx]
        video_frames = load_video_frames(video_path, num_frames=self.num_frames)
        inputs = self.processor(video_frames, return_tensors="pt").pixel_values
        return {"pixel_values": inputs.squeeze(0), "labels": torch.tensor(label, dtype=torch.long)}


In [24]:
#Step 6: Prepare the Dataset
# Load video file paths
all_video_file_paths = list(dataset_root_path.glob("*/*.mp4"))

# Assign binary labels: 0 (normal), 1 (anomaly)
binary_labels = [0 if "normal" in str(path.parent.name).lower() else 1 for path in all_video_file_paths]

# Split the dataset
train_paths, test_paths, train_labels, test_labels = train_test_split(
    all_video_file_paths, binary_labels, test_size=0.2, random_state=42
)

# Initialize datasets
processor = VideoMAEImageProcessor.from_pretrained(model_ckpt)
train_dataset = VideoDataset(train_paths, train_labels, processor)
test_dataset = VideoDataset(test_paths, test_labels, processor)


In [25]:
#Step 7: Define the Trainer
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    accuracy = (predictions == labels).mean()
    return {"accuracy": accuracy}

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    fp16=torch.cuda.is_available(),
)

model = VideoMAEForVideoClassification.from_pretrained(
    model_ckpt, num_labels=2, ignore_mismatched_sizes=True
).to(device)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)




Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
#Step 8: Train the model
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6253,0.662732,0.552124
2,0.7016,0.621963,0.733591
3,0.7972,0.626484,0.818533
4,0.3501,0.886502,0.841699
5,0.0004,1.146523,0.822394
6,0.0001,1.354682,0.814672
7,0.0,1.182351,0.837838
8,0.0,1.48397,0.822394
9,0.0,1.506934,0.822394
10,0.0,1.515279,0.822394


TrainOutput(global_step=2590, training_loss=0.1755545456667204, metrics={'train_runtime': 5461.0965, 'train_samples_per_second': 1.897, 'train_steps_per_second': 0.474, 'total_flos': 4.549103262647452e+19, 'train_loss': 0.1755545456667204, 'epoch': 10.0})

In [27]:
#Step 9: Evaluate the Model
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")


Evaluation Results: {'eval_loss': 0.6219627261161804, 'eval_accuracy': 0.7335907335907336, 'eval_runtime': 88.4879, 'eval_samples_per_second': 2.927, 'eval_steps_per_second': 0.735, 'epoch': 10.0}


In [28]:
#Step 10: Save the Fine-Tuned Model
model.save_pretrained("./fine_tuned_anomaly_detector_10")
processor.save_pretrained("./fine_tuned_anomaly_detector_10")


['./fine_tuned_anomaly_detector_10/preprocessor_config.json']

In [29]:
#Step 11: Push to Hugging Face Hub
from huggingface_hub import login
login(token="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")  # Replace with your Hugging Face token

model.push_to_hub("Sathwik-kom/anomaly-detector-videomae10")
processor.push_to_hub("Sathwik-kom/anomaly-detector-videomae10")


model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Sathwik-kom/anomaly-detector-videomae10/commit/a5d4e92c098b21252c13ad26c363d2a7803af790', commit_message='Upload processor', commit_description='', oid='a5d4e92c098b21252c13ad26c363d2a7803af790', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Sathwik-kom/anomaly-detector-videomae10', endpoint='https://huggingface.co', repo_type='model', repo_id='Sathwik-kom/anomaly-detector-videomae10'), pr_revision=None, pr_num=None)

In [30]:
def detect_anomaly(model, video_path, processor, num_frames=16):
    video_frames = load_video_frames(video_path, num_frames)
    inputs = processor(video_frames, return_tensors="pt", padding=True)
    
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    predictions = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
    timestamps = []
    
    # Iterate through predictions and mark anomalies (assume non-normal class is anomaly)
    for idx, prediction in enumerate(predictions):
        if prediction != 0:  # Assuming '0' is the 'normal' class
            timestamp = idx * (1/30)  # Assuming 30 fps
            timestamps.append((timestamp, video_frames[idx]))  # Store (timestamp, frame)
    
    return timestamps


In [31]:
import cv2

def detect_anomaly_with_bounding_box(model, video_path, processor, num_frames=16):
    timestamps = detect_anomaly(model, video_path, processor, num_frames)
    
    for timestamp, frame in timestamps:
        print(f"Anomaly detected at {timestamp:.2f} seconds")

        # Draw bounding box around the anomaly (example: full-frame bounding box)
        frame_with_box = cv2.rectangle(frame, (0, 0), (frame.shape[1], frame.shape[0]), (0, 255, 0), 2)
        
        # Display the frame
        cv2.imshow(f"Anomaly at {timestamp:.2f}s", frame_with_box)
        cv2.waitKey(0)
    
    cv2.destroyAllWindows()


In [18]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("video-classification", model="Sathwik-kom/anomaly-detector-videomae10")
pipe("video8.mp4")

Device set to use cpu


[{'score': 0.9536139369010925, 'label': 'LABEL_1'},
 {'score': 0.046386029571294785, 'label': 'LABEL_0'}]