In [None]:
from torch.utils.data import Dataset
from torchvision.io import read_video
from torchvision import transforms
import os
import torch.nn as nn
from torchvision.models import video as video_models
from torch.utils.data import DataLoader
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader

class VideoClipDataset(Dataset):
    """Dataset of fixed-length video clips stored as ``<root_dir>/<class>/<file>.mp4``.

    Each item is a ``(video, label)`` pair. ``video`` is a float tensor laid
    out as (C, T, H, W) with ``T == clip_len`` and frames resized to 112x112;
    ``label`` is the index of the clip's class within ``classes``.

    Args:
        root_dir: Directory containing one sub-directory per class.
        classes: Class names. Each name is joined onto ``root_dir`` to find
            the class directory, and its position is used as the label.
        clip_len: Number of frames every returned clip is trimmed or padded to.

    Raises:
        OSError: Propagated from ``os.listdir`` when a class directory
            cannot be listed (e.g. it does not exist).
    """

    def __init__(self, root_dir, classes, clip_len=60):
        super().__init__()
        self.root_dir = root_dir
        self.classes = classes
        self.clip_len = clip_len
        self.samples = []  # list of (video_path, label_idx)
        for label_idx, label in enumerate(classes):
            class_dir = os.path.join(root_dir, label)
            # os.listdir returns entries in arbitrary order; sorting makes the
            # index -> sample mapping reproducible across runs and machines.
            for fname in sorted(os.listdir(class_dir)):
                # Case-insensitive match so "clip.MP4" is not silently skipped.
                if fname.lower().endswith(".mp4"):
                    self.samples.append((os.path.join(class_dir, fname), label_idx))

        # Applied to a (T, C, H, W) float tensor in __getitem__.
        # NOTE(review): the mean/std look like the constants torchvision
        # documents for its video classification models -- confirm.
        self.transform = transforms.Compose([
            transforms.Resize((112, 112)),
            transforms.Normalize([0.43216, 0.394666, 0.37645],
                                 [0.22803, 0.22145, 0.216989])
        ])

    def __len__(self):
        """Return the number of clips found under ``root_dir``."""
        return len(self.samples)

    def _fit_to_clip_len(self, frames, source="<tensor>"):
        """Trim or pad ``frames`` along dim 0 to exactly ``self.clip_len`` entries.

        Longer inputs keep their first ``clip_len`` frames; shorter inputs are
        padded by repeating the last frame.

        Args:
            frames: Tensor whose first dimension indexes frames.
            source: Description of where ``frames`` came from, used only in
                the error message.

        Raises:
            ValueError: If ``frames`` has no frames, since there is no last
                frame to repeat.
        """
        n_frames = frames.shape[0]
        if n_frames == 0:
            raise ValueError(f"no frames decoded from {source}; cannot build a clip")
        if n_frames >= self.clip_len:
            return frames[:self.clip_len]
        pad = self.clip_len - n_frames
        # Repeat only along the frame dimension, whatever the tensor's rank.
        tail = frames[-1:].repeat(pad, *([1] * (frames.dim() - 1)))
        return torch.cat([frames, tail], dim=0)

    def __getitem__(self, idx):
        """Load clip ``idx`` and return ``(video, label)`` with video as (C, T, H, W).

        Raises:
            ValueError: If no frames could be decoded from the file.
        """
        video_path, label = self.samples[idx]
        video, _, _ = read_video(video_path, pts_unit='sec')  # (T, H, W, C)

        # Fix the clip length before the float conversion so that only
        # clip_len frames (not the whole video) are converted and scaled.
        video = self._fit_to_clip_len(video, source=video_path)

        video = video.permute(0, 3, 1, 2).float() / 255.0  # (T, C, H, W)

        if self.transform is not None:
            video = self.transform(video)

        # Reorder to channels-first-then-time for the 3D-conv model.
        video = video.permute(1, 0, 2, 3)  # (C, T, H, W)

        return video, label

def build_model(num_classes=6):
    """Build a randomly initialised R3D-18 video classifier.

    Args:
        num_classes: Number of output classes for the replaced final layer.

    Returns:
        The torchvision ``r3d_18`` model whose ``fc`` layer has been swapped
        for a fresh ``nn.Linear`` with ``num_classes`` outputs.
    """
    # No pretrained weights are loaded by default. Omitting the legacy
    # ``pretrained=False`` flag avoids the deprecation warning on newer
    # torchvision while still working on older releases.
    model = video_models.r3d_18()
    # Replace the classification head, keeping its input width.
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    return model

def train(model, dataloader, device, epochs=5):
    """Train ``model`` with Adam (lr=1e-4) and cross-entropy, printing per-epoch metrics.

    Args:
        model: Module called as ``model(x)``; its output is used as class
            logits (``argmax`` over dim 1 gives the prediction).
        dataloader: Iterable yielding ``(x, y)`` tensor batches, where ``y``
            holds the target class indices.
        device: Device the model and every batch are moved to.
        epochs: Number of passes over ``dataloader``.

    Raises:
        ValueError: If an epoch yields no samples, since no average loss or
            accuracy can be reported.
    """
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    model.train()
    for epoch in range(epochs):
        # Accumulate per sample, not per batch, so a smaller final batch
        # does not get the same weight as a full one in the epoch average.
        loss_sum, n_correct, n_seen = 0.0, 0, 0
        for x, y in tqdm(dataloader):
            x, y = x.to(device), y.to(device)

            optimizer.zero_grad()
            logits = model(x)
            loss = criterion(logits, y)
            loss.backward()
            optimizer.step()

            batch_size = y.size(0)
            # criterion returns the batch mean; scale back to a sum.
            loss_sum += loss.item() * batch_size
            n_correct += (logits.detach().argmax(dim=1) == y).sum().item()
            n_seen += batch_size

        if n_seen == 0:
            raise ValueError("dataloader yielded no samples; nothing to train on")
        print(f"Epoch {epoch+1}: Loss {loss_sum/n_seen:.4f}, Acc {n_correct/n_seen:.4f}")

# Class names double as the sub-directory names under the clip root;
# a name's position in this list is its integer label.
classes = [
    'normal',
    'trespass',
    'fight',
    'dump',
    'burglary',
    'vandalism',
]
clip_output_dir = r"D:\clips"

# Index every clip on disk and serve them in shuffled mini-batches of four.
dataset = VideoClipDataset(root_dir=clip_output_dir, classes=classes)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=4)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# One output per class, trained for ten epochs.
model = build_model(len(classes))
train(model=model, dataloader=dataloader, device=device, epochs=10)
