In [1]:
import torchvision
from torchvision.io import read_video
import lightning
import torchmetrics
import timm
import os
import glob
import numpy as np
import av
import cv2
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import ast
import lightning as L
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import WeightedRandomSampler
from torchmetrics import MetricCollection
from torchmetrics.classification import (
    MulticlassAccuracy,
    MulticlassF1Score,
    MulticlassAUROC,
    BinaryAccuracy
)

from sklearn.model_selection import train_test_split

In [2]:
def set_seed(seed = 42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    L.seed_everything(seed)

In [3]:
set_seed(42)

Seed set to 42


In [4]:

train_dir = "data_train_short\data_train_short"
test_dir = "data_test_short"

video_train_paths = glob.glob(os.path.join(train_dir, "*", "*.mp4"))
video_test_paths = glob.glob(os.path.join(test_dir, "*", "*.mp4"))
print(video_train_paths)

['data_train_short\\data_train_short\\-220020068_456239859\\-220020068_456239859.mp4', 'data_train_short\\data_train_short\\-220020068_456241671\\-220020068_456241671.mp4', 'data_train_short\\data_train_short\\-220020068_456241672\\-220020068_456241672.mp4', 'data_train_short\\data_train_short\\-220020068_456241673\\-220020068_456241673.mp4', 'data_train_short\\data_train_short\\-220020068_456241682\\-220020068_456241682.mp4', 'data_train_short\\data_train_short\\-220020068_456241755\\-220020068_456241755.mp4', 'data_train_short\\data_train_short\\-220020068_456241756\\-220020068_456241756.mp4', 'data_train_short\\data_train_short\\-220020068_456241758\\-220020068_456241758.mp4', 'data_train_short\\data_train_short\\-220020068_456241844\\-220020068_456241844.mp4', 'data_train_short\\data_train_short\\-220020068_456241845\\-220020068_456241845.mp4', 'data_train_short\\data_train_short\\-220020068_456241846\\-220020068_456241846.mp4', 'data_train_short\\data_train_short\\-220020068_45624

In [5]:
def read_video_frame_by_frame(path):
    container = av.open(path)
    for frame in container.decode(video=0):
        yield frame.to_ndarray(format="rgb24")

In [6]:
def read_video_safe(path, target_size=(112, 112)):
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break 
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = Image.fromarray(frame).resize(target_size)
            frames.append(np.array(frame))

    except Exception as e:
        print(f"error {path}")
        if len(frames) > 0:
            return frames
        else:
            return None
    finally:
        cap.release()
    if len(frames)>0:
        return frames
    else:
        return None



In [7]:
valid_videos = []
for path in video_train_paths[0:1]:
    video = read_video_safe(path)
    if video is not None:
        valid_videos.append(video)
video_train = valid_videos

In [8]:
valid_videos = []
for path in video_test_paths[0:1]:
    video = read_video_safe(path)
    if video is not None:
        valid_videos.append(video)
video_test = valid_videos

In [9]:
def time_to_frames(str):
    h, m, s = map(int, str.split(':'))
    return (h*3600 + m*60 + s)*24

In [10]:
with open(r"C:\UCHYOBA\VK\labels_json\labels_json\train_labels.json", "r", encoding='utf-8') as f:
    content = f.read()
    data = ast.literal_eval(content)
    result = {key: (min(time_to_frames(val["start"]), time_to_frames(val["end"])),
                    max(time_to_frames(val["start"]), time_to_frames(val["end"])))
    for key, val in data.items()
}
sorted_data_train = {k: result[k] for k in sorted(result)}
print(sorted_data_train)

        

{'-220020068_456239859': (360, 672), '-220020068_456241671': (4560, 5448), '-220020068_456241672': (4680, 5544), '-220020068_456241673': (3288, 4128), '-220020068_456241682': (3144, 3720), '-220020068_456241755': (4512, 4608), '-220020068_456241756': (1584, 1680), '-220020068_456241758': (1488, 2832), '-220020068_456241844': (3936, 4032), '-220020068_456241845': (1608, 1680), '-220020068_456241846': (1800, 1896), '-220020068_456241847': (9744, 9840), '-220020068_456241849': (4392, 5736), '-220020068_456241850': (1464, 1536), '-220020068_456241851': (360, 1560), '-220020068_456248657': (3576, 3600), '-220020068_456249667': (144, 240), '-220020068_456249692': (144, 240), '-220020068_456249693': (144, 240), '-220020068_456249716': (144, 216), '-220020068_456249719': (144, 240), '-220020068_456249720': (144, 216), '-220020068_456249732': (144, 240), '-220020068_456249733': (144, 216), '-220020068_456249739': (144, 240), '-220020068_456252055': (264, 11040), '-220020068_456253855': (11808, 

In [11]:
with open(r"C:\UCHYOBA\VK\labels_json\labels_json\test_labels.json", "r", encoding='utf-8') as f:
    content = f.read()
    data = ast.literal_eval(content)
    result = {key: (min(time_to_frames(val["start"]), time_to_frames(val["end"])),
                    max(time_to_frames(val["start"]), time_to_frames(val["end"])))
    for key, val in data.items()
}

sorted_data_test = {k: result[k] for k in sorted(result)}
    

In [12]:
class IntroDataset(Dataset):
    def __init__(self, videos, labels_dict, filenames, clip_len=16, fps=24, transform=None):
        self.clip_len = clip_len
        self.fps = fps
        self.transform = transform

        self.samples = []

        for idx, video_frames in enumerate(videos):
            name = filenames[idx]
            if name not in labels_dict:
                continue
            label_range = labels_dict[name]
            frames = np.array(video_frames) 
            for start_idx in range(0, len(frames) - clip_len + 1):
                clip = frames[start_idx:start_idx + clip_len]
                overlap = sum([
                    1 for i in range(start_idx, start_idx + clip_len)
                    if label_range[0] <= i <= label_range[1]
                ])
                if overlap / clip_len >= 0.5:
                    label = 1.0
                else:
                    label = 0.0
                self.samples.append((clip, label))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        clip, label = self.samples[idx]
        clip = clip.transpose(3, 0, 1, 2)
        clip = torch.from_numpy(clip).float()
        if self.transform:
            clip = self.transform(clip)
        return clip, torch.tensor(label, dtype=torch.float32)


In [13]:
train_filenames = [os.path.splitext(os.path.basename(path))[0] for path in video_train_paths[0:2]]


train_dataset = IntroDataset(
    videos=video_train,
    labels_dict=sorted_data_train,
    filenames=train_filenames,
    clip_len=16,
    fps=24
)


In [14]:
test_filenames = [os.path.splitext(os.path.basename(path))[0] for path in video_test_paths[0:2]]

test_dataset = IntroDataset(
    videos=video_test,
    labels_dict=sorted_data_test,
    filenames=test_filenames,
    clip_len=16,
    fps=24
)

In [15]:

intro_frames_train = sum((sorted_data_train[name][1]-sorted_data_train[name][0]) for name in train_filenames)
print(intro_frames_train)
all_frames = sum(len(video) for video in video_train)
print(all_frames)
weights = intro_frames_train/all_frames, 1-intro_frames_train/all_frames 
print(weights)
sampler = WeightedRandomSampler(weights=weights, num_samples=len(train_dataset))

1200
11206
(0.10708548991611637, 0.8929145100838837)


In [16]:
train_set, val_set = train_test_split(
    train_dataset,
    test_size=0.2,
    random_state=42
)

In [17]:
train_loader = DataLoader(train_set, batch_size=4, sampler=sampler)
val_loader =  DataLoader(val_set, batch_size=4, sampler=sampler)
test_loader = DataLoader(test_dataset, batch_size=4)


In [18]:
model = torchvision.models.video.r3d_18(pretrained=True)
model.fc = nn.Linear(512, 1)



In [19]:
print(model)

VideoResNet(
  (stem): BasicStem(
    (0): Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
    (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
      )
      (conv2): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (relu): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1):

In [20]:
class LModel(L.LightningModule):
    def __init__(self, model, lr=0.001):
        super().__init__()
        self.save_hyperparameters(logger=False)

        # for optimizer and shaduler
        self.lr = lr

        # model
        self.model = model
        self.criterion = nn.CrossEntropyLoss()

        # metrics
        self.metrics = MetricCollection(
            [
                BinaryAccuracy()
            ]
        )
        self.train_metrics = self.metrics.clone(postfix="/train")
        self.val_metrics = self.metrics.clone(postfix="/val")

    def configure_optimizers(self):
        # set optimizer
        optimizer = torch.optim.Adam(
            self.model.parameters(),
            lr=self.lr,
        )
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5)
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "interval": "epoch",  # or 'step'
                "monitor": "loss/val",  # only for self.log
            },
        }

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self.model(x).squeeze(1)
        loss = self.criterion(logits, y)
        probs = torch.sigmoid(logits)
        self.train_metrics.update(probs, y.int())
        self.log("loss/train", loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        out = self.model(x)
        loss = self.criterion(out, y)
        self.log("loss/val", loss, prog_bar=True)
        self.val_metrics.update(out.softmax(-1), y)

    def on_train_epoch_end(self):
        self.log_dict(self.train_metrics.compute())
        self.train_metrics.reset()

        self.log_dict(self.val_metrics.compute())
        self.val_metrics.reset()

    def test_step(self, batch, batch_idx):
        x, y = batch
        out = self.model(x)
        self.metrics.update(out.softmax(-1), y)

    def on_test_epoch_end(self):
        self.log_dict(self.metrics.compute())
        self.metrics.reset()

In [21]:
max_epochs = 20
learning_rate = 0.001
pl_model = LModel(model=model, lr = learning_rate)
logger=L.pytorch.loggers.TensorBoardLogger(save_dir="./logs/ex3/")
trainer = L.Trainer(max_epochs=max_epochs, logger=logger)
trainer.fit(
    model = pl_model,
    train_dataloaders = train_loader,
    val_dataloaders = val_loader
)

C:\Users\Асус\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\lightning\pytorch\utilities\parsing.py:209: Attribute 'model' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['model'])`.
You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name          | Type             | Params | Mode 
-----------------------------------------------------------
0 | model         | VideoResNet      | 33.2 M | train
1 | criterion     | CrossEntropyLoss | 0      | train
2 | metrics       | MetricCollection | 0      | train
3 | train_metrics | MetricCollection | 0      | train
4 | val_metrics   | MetricCollection | 0      | train
------------

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

C:\Users\Асус\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=21` in the `DataLoader` to improve performance.


RuntimeError: expected scalar type Long but found Float