In [1]:
import random
import torchvision.models as models
import torch.nn as nn
import pandas as pd
import torch
from torch.utils.data import Dataset
import cv2
import numpy as np
from torch.utils.data import DataLoader
import torch
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter

import torch
import torch.nn as nn
import torchvision.models as models
import torchvision

device = 'cuda' if torch.cuda.is_available() else 'cpu'

2024-03-16 13:38:07.239461: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-16 13:38:07.286213: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import pandas as pd

# Загрузка CSV-файла
df = pd.read_csv('train.csv')

# Подсчет количества меток каждого класса
class_counts = df['label'].value_counts()

# Вычисление весов для каждого класса
total_samples = len(df)
class_weights = {}
for class_label, count in class_counts.items():
    weight = total_samples / (len(class_counts) * count)
    class_weights[class_label] = weight

print("Class Weights:", class_weights)

Class Weights: {'notviolence': 0.6802103250478011, 'violence': 1.8872679045092837}


In [3]:
#TODO Возможно нужно подбирать гиперпаораметры 
IMG_SIZE = 640
BATCH_SIZE = 8
EPOCHS = 100
SEQ_LENGTH = 10

In [4]:
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((640, 640)),
    transforms.ToTensor(),
])

In [5]:
import torch
import torchvision.transforms as transforms
import random

# Функция для аугментации видео
def augment_video(video_tensor):
    # Создаем случайные значения для всех трансформаций
    flip = random.random() > 0.5
    rotation_degree = random.uniform(-15, 15)
    grayscale = random.random() > 0.5

    # Определяем трансформации
    transform = transforms.Compose([
        transforms.RandomRotation(degrees=(rotation_degree, rotation_degree)),
        transforms.Grayscale(num_output_channels=3) if grayscale else transforms.Lambda(lambda x: x)
    ])

    # Применяем трансформации к каждому кадру видео
    augmented_frames = []
    for frame in video_tensor:
        augmented_frame = transform(frame)
        if flip:
            augmented_frame = transforms.functional.hflip(augmented_frame)
        augmented_frames.append(augmented_frame)

    # Преобразуем список кадров в тензор
    augmented_video = torch.stack(augmented_frames)

    return augmented_video

# Пример использования
# Предположим, что video_tensor - это ваш входной тензор размерности [len, 3, 640, 640]
# Создадим случайный тензор для демонстрации
video_tensor = torch.randn(5, 3, 640, 640)

# Вызываем функцию аугментации
augmented_video = augment_video(video_tensor)

# Результат - аугментированный тензор
print(augmented_video.size())

torch.Size([5, 3, 640, 640])


In [6]:
class CustomDataset(Dataset):
    def __init__(self, root_path="", df_path="train.csv", img_size=224, SEQ_LENGTH=10, transform=None):
        self.SEQ_LENGTH = SEQ_LENGTH
        self.root_path = root_path
        self.img_size = img_size
        df = pd.read_csv(df_path)
        self.video_paths = df['path'].tolist()
        self.labels = df['label'].tolist()
        self.transform = transform
        unique_labels = sorted(set(self.labels))
        self.label_to_idx = {label: idx for idx, label in enumerate(unique_labels)}
        self.idx_to_label = {idx: label for label, idx in self.label_to_idx.items()}

    def __len__(self):
        return len(self.video_paths)

    def __getitem__(self, idx):
        cap = cv2.VideoCapture(self.video_paths[idx])
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        frames = []
        if total_frames >= self.SEQ_LENGTH:
            frame_indices = np.linspace(0, total_frames - 1, self.SEQ_LENGTH, dtype=int)
        else:
            frame_indices = np.tile(np.arange(total_frames), self.SEQ_LENGTH // total_frames + 1)[:self.SEQ_LENGTH]

        for i in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, i)
            ret, frame = cap.read()
            if not ret:
                break

            #
            frame_sum = np.sum(frame, axis = 2)
            y_idx, x_idx = np.where(frame_sum>0)
            #print(y_idx.shape, x_idx.shape)
            if len(y_idx) == 0 and len(x_idx) == 0:
                print(f"({idx}")
                return self.__getitem__(idx-1)
            y_min, y_max = y_idx.min(), y_idx.max()
            x_min, x_max = x_idx.min(), x_idx.max()
            
            cropped_frame = frame[y_min:y_max, x_min:x_max]
            h, w, _ = cropped_frame.shape
            
            if h>w:
                pad = int((h-w)/2)
                padded_frame = np.pad(cropped_frame, ((0,0), (pad,pad), (0,0)))
            else:
                pad = int((w-h)/2)
                padded_frame = np.pad(cropped_frame, ((pad,pad), (0,0), (0,0)))
            
            #
            frame_tensor = self.transform(padded_frame).unsqueeze(0)
            frames.append(frame_tensor)

        while len(frames) < self.SEQ_LENGTH:
            frames.append(torch.zeros_like(frames[0]))

        frames_tensor = torch.cat(frames, dim=0)
        frames_tensor = augment_video(frames_tensor)
        return frames_tensor, self.label_to_idx[self.labels[idx]]

In [7]:
train_dataset = CustomDataset(df_path="train.csv", transform=transform)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=10)
test_dataset = CustomDataset(df_path="test.csv", transform=transform)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=10)

In [8]:
class Model(nn.Module):
  def __init__(self, num_classes):
    super().__init__()
    self.model = torchvision.models.video.swin3d_t()
    self.fc1 = nn.Linear(400, 100)
    self.fc2 = nn.Linear(100, num_classes)
    self.softmax = nn.Softmax(dim=-1)

  def forward(self, x):
    x = self.model(x)
    x = self.fc1(x)
    x = self.fc2(x)
    x = self.softmax(x)
    return x
# model = torchvision.models.video.swin3d_t().to(device)

model = Model(num_classes=2).to(device)

In [9]:
model = nn.DataParallel(model, device_ids = [ 0, 1, 2, 3]).cuda()
#model = nn.DataParallel(model).cuda()

#optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, eps=1e-08, weight_decay=0.0001)
class_weights_tensor = torch.tensor(list(class_weights.values())).cuda()

criterion = nn.CrossEntropyLoss().cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.001)

writer = SummaryWriter()

In [None]:
import logging
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.metrics import accuracy_score


open('sgd_training_with_seq_length10.log', 'w').close()
logging.basicConfig(filename='sgd_training_with_seq_length10.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
best_valid_loss = float('inf')

for epoch in range(EPOCHS):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for i, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.permute((0, 2, 1, 3, 4)) # torch.Size([8, 10, 3, 640, 640])
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        if i % 10 == 9:  
            print('[%d, %5d] loss: %.7f accuracy: %.3f' %
                  (epoch + 1, i + 1, running_loss / 100, correct / total))
            logging.info('[%d, %5d] loss: %.7f accuracy: %.3f' %
                  (epoch + 1, i + 1, running_loss / 100, correct / total))
            # TODO добавить данные в борд 
            writer.add_scalar('Training Loss', running_loss / 100, epoch * len(train_loader) + i)
            writer.add_scalar('Training Accuracy', correct / total, epoch * len(train_loader) + i)
            running_loss = 0.0
            correct = 0
            total = 0

    model.eval()


    with torch.no_grad():
        all_predictions = []
        all_labels = []
        for inputs, labels in test_loader:
            inputs = inputs.permute((0, 2, 1, 3, 4)) # torch.Size([8, 10, 3, 640, 640])
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            
        accuracy = accuracy_score(all_labels, all_predictions)
        f1 = f1_score(all_labels, all_predictions)
        confusion = confusion_matrix(all_labels, all_predictions)
        
        # Вывод F1-меры, аккуратности и матрицы ошибок
        logging.info('Accuracy: %.3f' % accuracy)
        logging.info('F1 Score: %.3f' % f1)
        logging.info('Confusion Matrix:\n %s' % confusion)
        print('Accuracy: %.3f' % accuracy)
        print('F1 Score: %.3f' % f1)
        print('Confusion Matrix:\n', confusion)
        
        if f1 < best_valid_loss:
            best_valid_loss = f1
            torch.save(model.state_dict(), 'best_model.pth')
            logging.info('Model saved with accuracy: %.3f' % (accuracy))
            print('Model saved with accuracy: %.3f' % (accuracy))

print('Finished Training')

writer.close()

[1,    10] loss: 0.0616286 accuracy: 0.675
[1,    20] loss: 0.0550762 accuracy: 0.762
[1,    30] loss: 0.0600762 accuracy: 0.713
[1,    40] loss: 0.0550762 accuracy: 0.762
[1,    50] loss: 0.0650762 accuracy: 0.662
[1,    60] loss: 0.0538262 accuracy: 0.775
[1,    70] loss: 0.0550762 accuracy: 0.762
[1,    80] loss: 0.0588262 accuracy: 0.725
[1,    90] loss: 0.0575762 accuracy: 0.738
[1,   100] loss: 0.0638228 accuracy: 0.675
[1,   110] loss: 0.0538389 accuracy: 0.775
[1,   120] loss: 0.0563262 accuracy: 0.750
[1,   130] loss: 0.0525763 accuracy: 0.787
[1,   140] loss: 0.0550772 accuracy: 0.762
[1,   150] loss: 0.0640020 accuracy: 0.675
[1,   160] loss: 0.0588262 accuracy: 0.725
[1,   170] loss: 0.0550762 accuracy: 0.762
(3
Accuracy: 0.772
F1 Score: 0.000
Confusion Matrix:
 [[275   0]
 [ 81   0]]
Model saved with accuracy: 0.772
[2,    10] loss: 0.0538262 accuracy: 0.775
[2,    20] loss: 0.0575762 accuracy: 0.738
[2,    30] loss: 0.0588262 accuracy: 0.725
[2,    40] loss: 0.0550764 acc