In [172]:
import os
import tqdm
import torch
import torchvision

import numpy as np

from enum import StrEnum

from models.s3d import S3D
from utils import LabelEncoderFactory, UniformTemporalSubsample

from video_dataset import VideoDataset
from video_dataset.dataset import VideoShapeComponents
from video_dataset.video import VideoFromVideoFramesDirectory
from video_dataset.preprocessor import extract_frames_from_videos
from video_dataset.annotations import AnnotationsFromSegmentLevelCsvFileAnnotations

from cached_dataset import DiskCachedDataset

from data_augmentation import AugmentDataset

In [90]:
# Root directory of the dataset. Set the RESEARCH_PROJECT_DATASET_PATH
# environment variable to point the notebook at another location; when it is
# unset, the original hard-coded path is used so existing runs keep working.
DATASET_PATH = os.environ.get(
    "RESEARCH_PROJECT_DATASET_PATH",
    "/Users/nadir/Documents/research-project-dataset",
)

# Names of the sub-directories joined onto DATASET_PATH elsewhere in the notebook.
VIDEOS_DIRECTORY_NAME = "videos"
ANNOTATIONS_DIRECTORY_NAME = "annotations"
VIDEOS_FRAMES_DIRECTORY_NAME = "videos_frames"

# Names of the files listing video ids for each subset.
# The training/testing files are joined onto DATASET_PATH when datasets are built;
# the annotated/unannotated files are presumably in the same place — TODO confirm.
TRAINING_IDS_FILE_NAME = "training_ids.txt"
TESTING_IDS_FILE_NAME = "testing_ids.txt"
ANNOTATED_IDS_FILE_NAME = "annotated_ids.txt"
UNANNOTATED_IDS_FILE_NAME = "unannotated_ids.txt"

In [91]:
# Shared label encoder obtained from the project factory; the rest of the
# notebook uses it to turn string class names into encoded labels.
label_encoder = LabelEncoderFactory.get()

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [92]:
# Extract the frames of every raw video into the frames directory. Per the
# cell output, videos whose frames already exist are logged and skipped.
extract_frames_from_videos(
    output_dir=os.path.join(DATASET_PATH, VIDEOS_FRAMES_DIRECTORY_NAME),
    videos_dir=os.path.join(DATASET_PATH, VIDEOS_DIRECTORY_NAME),
)

[INFO]: frames for "climb_1-climber_MoubeAdrian-bloc_1-angle_face" already exist. skipping extraction.
[INFO]: frames for "climb_1-climber_MoubeAdrian-bloc_1-angle_profile" already exist. skipping extraction.
[INFO]: frames for "climb_10-climber_DouglasSophia-bloc_1-angle_face" already exist. skipping extraction.
[INFO]: frames for "climb_10-climber_DouglasSophia-bloc_1-angle_profile" already exist. skipping extraction.
[INFO]: frames for "climb_11-climber_MoubeAdrian-bloc_2-angle_face" already exist. skipping extraction.
[INFO]: frames for "climb_11-climber_MoubeAdrian-bloc_2-angle_profile" already exist. skipping extraction.
[INFO]: frames for "climb_12-climber_MrideEsteban-bloc_2-angle_face" already exist. skipping extraction.
[INFO]: frames for "climb_12-climber_MrideEsteban-bloc_2-angle_profile" already exist. skipping extraction.
[INFO]: frames for "climb_13-climber_FonneLana-bloc_2-angle_face" already exist. skipping extraction.
[INFO]: frames for "climb_13-climber_FonneLana-blo

In [166]:
# Build S3D with a 400-class head so the tensor shapes match the checkpoint
# loaded below; the head is swapped for the project's classes afterwards.
model = S3D(num_class=400)

weights_file = 'weights/s3d_kinetics400.pt'

# Fail fast when the checkpoint is missing.
if not os.path.isfile(weights_file):
    raise ValueError('No weight file.')

print('[s3d]: loading weights.')

# NOTE(security): weights_only=False lets torch.load unpickle arbitrary Python
# objects, so only load checkpoints from a trusted source. If the file holds a
# plain state dict, weights_only=True should be sufficient — TODO confirm.
weight_dict = torch.load(weights_file, map_location=device, weights_only=False)
model_dict = model.state_dict()
for name, param in weight_dict.items():
    # Strip only a *leading* "module." prefix. The previous substring test
    # ('module' in name) would also drop the first path component of any key
    # that merely contained "module" somewhere else in its name.
    name = name.removeprefix('module.')
    if name not in model_dict:
        # Checkpoint entries without a counterpart in the model are ignored.
        continue
    if param.size() == model_dict[name].size():
        # In-place copy into the tensor returned by state_dict().
        model_dict[name].copy_(param)
    else:
        print(' size? ' + name, param.size(), model_dict[name].size())

print('[s3d]: loaded weights.')

print("--- --- ---")

modules = list(model.modules())

print(f"[len(modules)]: {len(modules)}")
print(f"[modules]: {modules[-1]}")

# Replace the classification head with a 1x1x1 convolution that outputs one
# channel per class known to the label encoder.
model.fc = torch.nn.Conv3d(1024, len(label_encoder.classes_), kernel_size=(1, 1, 1), stride=(1, 1, 1))

# Print the last module again to confirm the head was swapped.
modules = list(model.modules())
print(f"[modules]: {modules[-1]}")

[s3d]: loading weights.
[s3d]: loaded weights.
--- --- ---
[len(modules)]: 350
[modules]: Conv3d(1024, 400, kernel_size=(1, 1, 1), stride=(1, 1, 1))
[modules]: Conv3d(1024, 5, kernel_size=(1, 1, 1), stride=(1, 1, 1))


In [167]:
def __aggregate_labels(str_labels):
    """Return the most frequent encoded label among `str_labels`.

    The string labels are encoded with the module-level `label_encoder`.
    On a tie, the smallest encoded value wins: `np.unique` returns its values
    sorted and `argmax` picks the first maximum.
    """
    encoded = label_encoder.transform(str_labels)

    values, frequencies = np.unique(encoded, return_counts=True)

    return values[np.argmax(frequencies)]

In [168]:
class DatasetVariant(StrEnum):
    """Dataset splits; each member's value is the name of the ids file for that split."""
    # Split listed in the training ids file.
    TRAINING = TRAINING_IDS_FILE_NAME
    # NOTE: the validation split is read from the *testing* ids file.
    VALIDATION = TESTING_IDS_FILE_NAME

def __get_dataset(variant: DatasetVariant, load_videos=True):
    """Build the `VideoDataset` for the given split.

    Args:
        variant: split to load; its value is the ids file name, resolved
            against `DATASET_PATH`.
        load_videos: forwarded unchanged to `VideoDataset`.

    Returns:
        A `VideoDataset` configured with `segment_size=32`, `overlap=0`, a
        (channels, time, height, width) video shape, and labels aggregated by
        `__aggregate_labels`.
    """
    # Frame pipeline: to tensor -> keep 16 samples along dim 1 -> resize to
    # 224x224 -> divide by 255.0 (or by the tensor's max when that is larger).
    preprocess_frames = torchvision.transforms.Compose([
        torchvision.transforms.Lambda(torch.tensor),
        UniformTemporalSubsample(num_samples=16, temporal_dim=1),
        torchvision.transforms.Resize(size=(224, 224)),
        torchvision.transforms.Lambda(lambda frames: frames / max(255.0, frames.max())),
    ])

    layout = (
        VideoShapeComponents.CHANNELS,
        VideoShapeComponents.TIME,
        VideoShapeComponents.HEIGHT,
        VideoShapeComponents.WIDTH,
    )

    return VideoDataset(
        annotations_dir=os.path.join(DATASET_PATH, ANNOTATIONS_DIRECTORY_NAME),
        videos_dir=os.path.join(DATASET_PATH, VIDEOS_FRAMES_DIRECTORY_NAME),
        ids_file=os.path.join(DATASET_PATH, variant.value),
        segment_size=32,
        video_processor=VideoFromVideoFramesDirectory,
        annotations_processor=AnnotationsFromSegmentLevelCsvFileAnnotations,
        annotations_processor_kwargs={"fps": 25, "delimiter": ","},
        video_shape=layout,
        annotations_transform=__aggregate_labels,
        frames_transform=preprocess_frames,
        overlap=0,
        verbose=False,
        load_videos=load_videos,
    )

In [169]:
# Disk-cached YOLO features, read by with_ignore_pensionless_segments.
yolo_dataset = DiskCachedDataset(base_path=os.path.join(DATASET_PATH, "features/yolo-features"))

# One dataset per split built with load_videos=False; with_ignore_classes reads
# each sample's label from these (presumably to avoid loading frames — TODO confirm).
temp_dataset = {
    split: __get_dataset(split, load_videos=False)
    for split in (DatasetVariant.TRAINING, DatasetVariant.VALIDATION)
}

def with_ignore_classes(classes: list[str], dataset, variant: DatasetVariant):
    """Return a `Subset` of `dataset` without the samples labelled with any of `classes`.

    Args:
        classes: string class names to drop.
        dataset: dataset to filter; index i is assumed to refer to the same
            sample as `temp_dataset[variant][i]`.
        variant: split used to pick the label source in `temp_dataset`.

    Returns:
        A `torch.utils.data.Subset` holding the indices whose label (element 1
        of the `temp_dataset` sample) is not among the encoded `classes`.
    """
    # Encode the ignored class names once; this used to run for every sample.
    ignored_labels = label_encoder.transform(classes)
    labels_source = temp_dataset[variant]

    kept_indices = [
        index
        for index in range(len(dataset))
        if labels_source[index][1] not in ignored_labels
    ]

    return torch.utils.data.Subset(dataset, kept_indices)

def with_ignore_pensionless_segments(dataset):
    """Return a `Subset` of `dataset` keeping only segments with enough non-empty YOLO features.

    A segment at index i is kept when at least 4 entries of
    `yolo_dataset[i][0]`, summed over dim 1, are non-zero.

    NOTE(review): `yolo_dataset` is one module-level cache, indexed by position
    in `dataset` whatever split is passed in — confirm the cache is aligned
    with every dataset this is called on.
    """
    def _has_enough_detections(index):
        # presumably counts frames with at least one detection — TODO confirm feature layout
        features = yolo_dataset[index][0]
        return torch.count_nonzero(torch.sum(features, dim=1)) >= 4

    kept_indices = [index for index in range(len(dataset)) if _has_enough_detections(index)]

    return torch.utils.data.Subset(dataset, kept_indices)

In [170]:
def wrapper(dataset, variant):
    """Apply the dataset filters used for training and validation.

    `variant` is accepted but unused by the filter that is currently active.
    Alternative that additionally drops the "nothing" class:
        with_ignore_pensionless_segments(with_ignore_classes(["nothing"], dataset, variant))
    """
    return with_ignore_pensionless_segments(dataset)


training_dataset = wrapper(
    __get_dataset(DatasetVariant.TRAINING),
    DatasetVariant.TRAINING,
)
validation_dataset = wrapper(
    __get_dataset(DatasetVariant.VALIDATION),
    DatasetVariant.VALIDATION,
)

print(f"[len(training_dataset)]: {len(training_dataset)}")
print(f"[len(validation_dataset)]: {len(validation_dataset)}")

[len(training_dataset)]: 2584
[len(validation_dataset)]: 986


In [173]:
# Move the model to the target device *before* building the optimizer, as the
# torch.optim documentation recommends, so the optimizer is constructed over
# the parameters in their final location.
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.CrossEntropyLoss()

training_loader = torch.utils.data.DataLoader(training_dataset, batch_size=4, shuffle=True)
# Validation does not need shuffling; a fixed order keeps evaluation runs comparable.
validation_loader = torch.utils.data.DataLoader(validation_dataset, batch_size=4, shuffle=False)

def train():
    """Run one pass over `training_loader`, updating `model` in place.

    Uses the module-level `model`, `optimizer`, `criterion`, `device` and
    `training_loader`. Each batch is expected to be a 3-tuple whose first two
    items are the inputs and the labels; the third item is ignored. The mean
    loss of every 10 consecutive batches is reported. Returns None.
    """
    model.train()

    running_loss = 0.0

    # Wrap the loader itself rather than enumerate(loader): enumerate has no
    # length, so tqdm could not show a total or an ETA.
    with tqdm.tqdm(iterable=training_loader, desc="[training]", unit="batch") as progress_bar:
        for i, (inputs, labels, _) in enumerate(progress_bar):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            outputs = model(inputs)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            if i % 10 == 9:
                # tqdm.write emits the same text as print() without garbling
                # the progress bar that is being redrawn.
                tqdm.tqdm.write(f"[{i + 1}, {running_loss / 10}]")
                running_loss = 0.0
                
# Run a single pass over the training data (train() iterates training_loader once).
train()

[10, 1.4952720403671265]
[20, 1.4575333833694457]
[30, 1.2265396118164062]
[40, 1.27165447473526]
[50, 1.112617027759552]
[60, 1.2272875964641572]
[70, 1.2306511342525481]
[80, 1.238953161239624]
[90, 0.7867084860801696]
[100, 0.8353454291820526]
[110, 0.7859298378229141]
[120, 1.0596587002277373]
[130, 0.7499672621488571]
[140, 0.767366573214531]
[150, 0.8671666622161865]
[160, 1.0782278716564178]
[170, 1.0196568965911865]
[180, 0.7630046546459198]


KeyboardInterrupt: 