<a href="https://colab.research.google.com/github/qkrjuyeol/multi-cctv/blob/main/%EC%98%81%EC%83%81_%EC%9D%B4%EC%83%81%ED%96%89%EB%8F%99_%ED%83%90%EC%A7%80.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
import cv2
import xml.etree.ElementTree as ET
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
import torchvision.models.video as video_models
import numpy as np
from glob import glob
from tqdm import tqdm
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip

In [11]:
# ========== 설정 ==========
# Source video and its annotation XML (same base name), both expected under /content.
video_path = "/content/166-1_cam01_dump02_place03_day_spring.mp4"
xml_path = "/content/166-1_cam01_dump02_place03_day_spring.xml"
# Directory that receives the extracted clip files.
clip_output_dir = "/content/clips"
clip_duration = 2  # seconds
clip_fps = 15  # frame rate extract_clips_from_video uses when writing clip files
# Frames per clip: clip_duration * clip_fps = 30.
# NOTE(review): clips are built from consecutive source frames without subsampling,
# so a clip spans clip_duration seconds only if the source video is also 15 fps - confirm.
clip_length = clip_duration * clip_fps
# Create the clip directory up front; does nothing if it already exists.
os.makedirs(clip_output_dir, exist_ok=True)

In [12]:
# ========== XML 파싱 ==========
def parse_drop_ranges(xml_path):
    """Collect the frame ranges of every "drop" action annotated in an XML file.

    Every ``<object>`` directly under the document root is inspected. For each
    of its ``<action>`` children whose ``<actionname>`` text is ``"drop"``, the
    ``<start>``/``<end>`` values of every ``<frame>`` child are read.

    Args:
        xml_path: Path to the annotation XML file.

    Returns:
        list[tuple[int, int]]: ``(start, end)`` frame numbers in document order.
        Empty when the file contains no "drop" action.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        ValueError: If a "drop" frame lacks ``<start>``/``<end>`` or holds a
            value that is not an integer.
    """
    root = ET.parse(xml_path).getroot()
    drop_ranges = []
    for obj in root.findall("object"):
        # An object may carry several actions; look at all of them instead of
        # only the first one so later "drop" actions are not missed.
        for action in obj.findall("action"):
            # findtext() returns None for a missing tag, so an action without
            # a name is simply not a "drop" rather than an AttributeError.
            action_name = (action.findtext("actionname") or "").strip()
            if action_name != "drop":
                continue
            # One action may list several frame intervals.
            for frame in action.findall("frame"):
                start_text = frame.findtext("start")
                end_text = frame.findtext("end")
                if not start_text or not end_text:
                    raise ValueError(
                        f"'drop' action with incomplete <frame> in {xml_path}"
                    )
                drop_ranges.append((int(start_text), int(end_text)))
    return drop_ranges

In [13]:
# ========== 클립 추출 ==========
def extract_clips_from_video(video_path, drop_ranges, out_dir, fps=30, clip_len=clip_length):
    """Cut a video into consecutive fixed-length clips and save each as an .mp4.

    The video is split into non-overlapping windows of ``clip_len`` frames. A
    window starting at frame ``i`` is labelled ``"drop"`` only when it lies
    entirely inside one of ``drop_ranges`` (``start <= i`` and
    ``i + clip_len <= end``); otherwise it is labelled ``"normal"``. Frames are
    resized to 224x224 and written to ``out_dir/{label}_{clip_id}.mp4`` at the
    module-level ``clip_fps`` rate. Files already present in ``out_dir`` are
    not removed.

    Args:
        video_path: Path of the source video.
        drop_ranges: Re-iterable collection of ``(start, end)`` frame pairs.
        out_dir: Existing directory that receives the clip files.
        fps: Unused; kept so existing callers that pass it keep working.
        clip_len: Number of frames per clip.

    Raises:
        ValueError: If ``clip_len`` is not positive.
        IOError: If OpenCV cannot open ``video_path``.
    """
    if clip_len <= 0:
        raise ValueError(f"clip_len must be positive, got {clip_len}")

    width, height = 224, 224

    def save_clip(frames, out_path):
        # Write one clip; always release the writer so the file is finalised
        # even if resizing or writing a frame fails.
        out = cv2.VideoWriter(out_path, cv2.VideoWriter_fourcc(*'mp4v'), clip_fps, (width, height))
        try:
            for f in frames:
                out.write(cv2.resize(f, (width, height)))
        finally:
            out.release()

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        # Fail loudly: otherwise the frame count reads as 0 and no clip is
        # produced, which only surfaces later as an empty dataset.
        raise IOError(f"Could not open video: {video_path}")

    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        clip_id = 0
        # "+ 1" keeps the final window when total_frames is an exact multiple
        # of clip_len (the last valid start is total_frames - clip_len).
        for first_frame in range(0, total_frames - clip_len + 1, clip_len):
            inside_drop = any(
                first_frame >= start and first_frame + clip_len <= end
                for start, end in drop_ranges
            )
            label = "drop" if inside_drop else "normal"

            cap.set(cv2.CAP_PROP_POS_FRAMES, first_frame)
            frames = []
            for _ in range(clip_len):
                ret, frame = cap.read()
                if not ret:
                    break
                frames.append(frame)

            # The reported frame count can exceed what is decodable; only
            # complete clips are saved.
            if len(frames) == clip_len:
                save_path = os.path.join(out_dir, f"{label}_{clip_id}.mp4")
                save_clip(frames, save_path)
                clip_id += 1
    finally:
        cap.release()

In [14]:
# ========== PyTorch Dataset ==========
class VideoClipDataset(Dataset):
    """Dataset of fixed-length video clips stored as ``*.mp4`` files in one folder.

    A clip is labelled 1 when its file name contains ``"drop"`` and 0
    otherwise. Each item is ``(frames, label)`` where ``frames`` is a float32
    tensor laid out as (C, T, H, W) with RGB frames resized to 112x112 and
    scaled to [0, 1] (before the optional ``transform``), and ``label`` is a
    0-d integer tensor.

    Args:
        clip_dir: Directory that is searched (non-recursively) for ``*.mp4``.
        clip_len: Number of frames returned per clip.
        transform: Optional callable applied to the (C, T, H, W) tensor.
    """

    def __init__(self, clip_dir, clip_len=clip_length, transform=None):
        # Sorted so sample indices are stable between runs; glob() order is
        # whatever the file system returns.
        self.paths = sorted(glob(os.path.join(clip_dir, "*.mp4")))
        # Look at the file name only: testing the whole path would label every
        # clip as "drop" whenever a parent directory name contains that word.
        self.labels = [1 if "drop" in os.path.basename(p) else 0 for p in self.paths]
        self.transform = transform
        self.clip_len = clip_len

    def __len__(self):
        """Return the number of clip files found."""
        return len(self.paths)

    def __getitem__(self, idx):
        """Decode clip ``idx`` and return ``(frames, label)``.

        Raises:
            RuntimeError: If no frame can be decoded from the file.
        """
        path = self.paths[idx]
        label = self.labels[idx]

        frames = []
        cap = cv2.VideoCapture(path)
        try:
            while len(frames) < self.clip_len:
                ret, frame = cap.read()
                if not ret:
                    break
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(cv2.resize(frame, (112, 112)))
        finally:
            cap.release()

        if not frames:
            raise RuntimeError(f"No frames could be decoded from {path}")
        # Pad a short clip by repeating its last frame so every sample has the
        # same temporal length and the default collate function can batch them.
        frames.extend([frames[-1]] * (self.clip_len - len(frames)))

        clip = np.stack(frames)                # (T, H, W, C)
        clip = clip.transpose(3, 0, 1, 2)      # (C, T, H, W)
        clip = torch.tensor(clip, dtype=torch.float32) / 255.0
        if self.transform:
            clip = self.transform(clip)
        return clip, torch.tensor(label)

In [15]:
# ========== 모델 구성 (3D CNN - torchvision resnet18 기반) ==========
def build_model():
    """Build an untrained R3D-18 video network with a single-logit output head.

    Returns:
        torch.nn.Module: ``torchvision.models.video.r3d_18`` with randomly
        initialised weights, whose final ``fc`` layer is replaced by
        ``nn.Linear(in_features, 1)`` for binary classification.
    """
    # Called without arguments on purpose: random initialisation is the
    # default, and the ``pretrained`` keyword is deprecated in newer
    # torchvision releases (superseded by ``weights``), so omitting it works
    # on both old and new versions without a warning.
    model = video_models.r3d_18()
    model.fc = nn.Linear(model.fc.in_features, 1)
    return model

In [16]:
# ========== 학습 ==========
def train(model, dataloader, device, epochs=5):
    """Train a single-logit binary classifier with BCE-with-logits and Adam.

    The model is moved to ``device``, put in training mode and optimised with
    Adam (lr=1e-4). After every epoch the mean loss and accuracy over all
    samples seen in that epoch are printed.

    Args:
        model: Network producing one logit per sample.
        dataloader: Iterable yielding ``(x, y)`` batches, ``y`` holding 0/1 labels.
        device: Device on which to run training.
        epochs: Number of passes over ``dataloader``.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model = model.to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    model.train()
    for epoch in range(epochs):
        loss_sum, correct, seen = 0.0, 0, 0
        for x, y in tqdm(dataloader):
            x, y = x.to(device), y.float().to(device)
            # Flatten to one logit per sample. A bare squeeze() would turn a
            # batch of size 1 into a 0-d tensor, and the loss would then
            # reject it for not matching the 1-d target.
            logits = model(x).reshape(-1)
            loss = criterion(logits, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Accumulate per sample so a smaller final batch is not
            # over-weighted in the epoch averages.
            batch_size = y.numel()
            preds = (torch.sigmoid(logits.detach()) > 0.5).float()
            loss_sum += loss.item() * batch_size
            correct += (preds == y).sum().item()
            seen += batch_size

        if seen == 0:
            raise ValueError("dataloader produced no samples; nothing to train on")
        print(f"Epoch {epoch+1}: Loss {loss_sum/seen:.4f}, Acc {correct/seen:.4f}")

In [None]:
# ========== 실행 ==========
# 1) Read the annotated "drop" frame ranges and cut the video into labelled clip files.
drop_ranges = parse_drop_ranges(xml_path)
extract_clips_from_video(video_path, drop_ranges, clip_output_dir)

# 2) Wrap the clip files in a Dataset and serve them in shuffled batches of 4.
# NOTE(review): every clip is used for training; no validation/test split is held out.
dataset = VideoClipDataset(clip_output_dir)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# 3) Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = build_model()
train(model, dataloader, device, epochs=5)

100%|██████████| 75/75 [56:11<00:00, 44.96s/it]


Epoch 1: Loss 0.0752, Acc 0.9900


100%|██████████| 75/75 [56:00<00:00, 44.81s/it]


Epoch 2: Loss 0.0410, Acc 0.9933


100%|██████████| 75/75 [55:50<00:00, 44.67s/it]


Epoch 3: Loss 0.0557, Acc 0.9900


 72%|███████▏  | 54/75 [40:01<15:30, 44.31s/it]

In [None]:
# ========== 모델 저장 ==========
# Persist only the learned parameters (state_dict), not the whole model object;
# to reuse them, rebuild the network with build_model() and load this file into it.
torch.save(model.state_dict(), "/content/dump_detection_r3d18.pth")
# Prints a short "saved" confirmation message (in Korean).
print("저장 완료!")


In [None]:
import matplotlib.pyplot as plt

def test_model(model, test_dataset, device, num_samples=5):
    """Print predictions for the first few clips and show each clip's first frame.

    For every inspected clip the ground-truth label, the predicted label
    (sigmoid probability > 0.5) and the probability are printed, and the first
    frame is displayed with matplotlib.

    Side effects: ``model`` is switched to eval mode and moved to ``device``.

    Args:
        model: Network producing one logit for a (1, C, T, H, W) input.
        test_dataset: Indexable dataset returning ``(clip, label)`` with
            ``clip`` laid out as (C, T, H, W).
        device: Device on which inference runs.
        num_samples: Maximum number of clips to inspect, starting at index 0.
    """
    model.eval()
    model.to(device)

    # Never index past the end of a dataset that holds fewer clips than requested.
    count = min(num_samples, len(test_dataset))
    for i in range(count):
        clip, label = test_dataset[i]
        input_tensor = clip.unsqueeze(0).to(device)  # (1, C, T, H, W)

        with torch.no_grad():
            logits = model(input_tensor).squeeze()
            prob = torch.sigmoid(logits).item()
            pred_label = 1 if prob > 0.5 else 0

        true_label = int(label.item())
        print(f"[Clip {i}] GT: {true_label} | Pred: {pred_label} | Confidence: {prob:.4f}")

        # Visualise the first frame of the clip.
        frames = clip.permute(1, 2, 3, 0).cpu().numpy()  # (T, H, W, C)
        plt.imshow(frames[0])
        plt.title(f"GT: {'DROP' if true_label else 'NORMAL'} | Pred: {'DROP' if pred_label else 'NORMAL'}")
        plt.axis('off')
        plt.show()

# Run the check.
# NOTE(review): `dataset` is the same data the model was trained on, so these
# predictions are a sanity check and do not measure generalisation.
test_model(model, dataset, device)