In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
import tqdm
import torch

import numpy as np
import matplotlib.pyplot as plt

<div class="alert alert-info">

#### **0 - Requirements**

**NOTE:** If you downloaded the dataset from the github repository, this part can be skipped.

In order to run this notebook a dataset is required. To make it simple you'll need to define the following variables in the `helpers.constants` file:
- `DATASET_PATH`: This is the base path were all the transformations and other thing happening on the dataset will be done.
- `VIDEOS_DIRECTORY_NAME`: This is the directory name inside the `DATASET_PATH` were you put your videos file (.mp4, or .mov, etc).
- `ANNOTATIONS_DIRECTORY_NAME`: This is the directory were you put the annotations file, must be csv files named the same way as the corresponding video file (except the extension).
- `ANNOTATED_IDS_FILE_NAME`: This is a text file containing the names of the annotated videos.
- `UNANNOTATED_IDS_FILE_NAME`: This is a text file containing the names of the unannotated videos.

And:
- `VIDEOS_FRAMES_DIRECTORY_NAME`: This is a name you need to specify and on which the extracted features of the videos will be stored.

</div>

<div class="alert alert-info">

#### **1- Data Preparation**

During this step we'll transform the videos from a video format into a frame by frame format, and thus we'll store each frame of each video in a .pnj file separately.

We do this for faster training as loading images is faster than videos. This step can be skipped (a small modification will be required if so).

</div>

In [5]:
from video_dataset.preprocessor import extract_frames_from_videos

from helpers.constants import \
    DATASET_PATH, \
    VIDEOS_DIRECTORY_NAME, \
    VIDEOS_FRAMES_DIRECTORY_NAME \
    
extract_frames_from_videos(
    videos_dir=os.path.join(DATASET_PATH, VIDEOS_DIRECTORY_NAME),
    output_dir=os.path.join(DATASET_PATH, VIDEOS_FRAMES_DIRECTORY_NAME),
    verbose=True
)

[INFO]: frames for "climb_1-climber_MoubeAdrian-bloc_1-angle_face" already exist. skipping extraction.
[INFO]: frames for "climb_1-climber_MoubeAdrian-bloc_1-angle_profile" already exist. skipping extraction.
[INFO]: frames for "climb_10-climber_DouglasSophia-bloc_1-angle_face" already exist. skipping extraction.
[INFO]: frames for "climb_10-climber_DouglasSophia-bloc_1-angle_profile" already exist. skipping extraction.
[INFO]: frames for "climb_11-climber_MoubeAdrian-bloc_2-angle_face" already exist. skipping extraction.
[INFO]: frames for "climb_11-climber_MoubeAdrian-bloc_2-angle_profile" already exist. skipping extraction.
[INFO]: frames for "climb_12-climber_MrideEsteban-bloc_2-angle_face" already exist. skipping extraction.
[INFO]: frames for "climb_12-climber_MrideEsteban-bloc_2-angle_profile" already exist. skipping extraction.
[INFO]: frames for "climb_13-climber_FonneLana-bloc_2-angle_face" already exist. skipping extraction.
[INFO]: frames for "climb_13-climber_FonneLana-blo

<div class="alert alert-info">

#### **2- Feature Extraction**

During this step we are going to import a set of predefined feature extractors which is located in the `helpers.constants` module. We'll then extract features from the annotated videos using this feature extractors.

</div>

In [6]:
from helpers.constants import \
    FEATURES_EXTRACTORS, \
    DATASET_PATH, \
    VIDEOS_FRAMES_DIRECTORY_NAME, \
    ANNOTATIONS_DIRECTORY_NAME, \
    ANNOTATED_IDS_FILE_NAME
from utils import LabelEncoderFactory
from cached_dataset import DiskCachedDataset

from video_dataset import VideoDataset, VideoShapeComponents
from video_dataset.video import VideoFromVideoFramesDirectory
from video_dataset.annotations import AnnotationsFromSegmentLevelCsvFileAnnotations

In [7]:
label_encoder = LabelEncoderFactory.get()

In [16]:
def returns_transform(sample):
    # sample keys: 'frames', 'annotations', 'video_index', 'video_id', 'starting_frame_number_in_video', 'segment_index'
    
    return sample["frames"], sample["annotations"], sample["video_id"], sample["segment_index"]

In [17]:
def __aggregate_labels(string_labels):
    """
    Given a list of string labels, returns the most frequent label and the number of unique labels.
    
    NOTE: The number of unique labels might be used later to determine if a video segment contain a transition in action or not.
    """
    labels = label_encoder.transform(string_labels)
    
    unique_elements, counts = np.unique(labels, return_counts=True)

    max_count_index = np.argmax(counts)

    most_frequent_element = unique_elements[max_count_index]
    
    return most_frequent_element, len(unique_elements)

In [18]:
datasets = []

for extractor in FEATURES_EXTRACTORS:
    segment_size = 32
    
    step = VideoDataset.compute_step(segment_size, extractor.get_required_number_of_frames())

    dataset = VideoDataset(
        annotations_dir=os.path.join(DATASET_PATH, ANNOTATIONS_DIRECTORY_NAME),
        videos_dir=os.path.join(DATASET_PATH, VIDEOS_FRAMES_DIRECTORY_NAME),
        ids_file=os.path.join(DATASET_PATH, ANNOTATED_IDS_FILE_NAME),
        segment_size=segment_size,
        step=step,
        video_processor=VideoFromVideoFramesDirectory,
        annotations_processor=AnnotationsFromSegmentLevelCsvFileAnnotations,
        annotations_processor_kwargs={"fps": 25, "delimiter": ","},
        video_shape=(VideoShapeComponents.CHANNELS, VideoShapeComponents.TIME, VideoShapeComponents.HEIGHT, VideoShapeComponents.WIDTH),
        frames_transform=extractor.transform_and_extract,
        annotations_transform=__aggregate_labels,
        verbose=False,
        return_transform=returns_transform
    )

    print(f"[extractor-{extractor.get_name()}]:")

    disk_cached_dataset = DiskCachedDataset.load_dataset_or_cache_it(
        dataset=dataset, 
        base_path=os.path.join(DATASET_PATH, "features", extractor.get_name()),
        verbose=True
    )
    
    datasets.append(disk_cached_dataset)

[extractor-yolo]:
[extractor-dino]:
[extractor-resnet-3d]:
[extractor-i3d]:
[extractor-clip]:
[extractor-x3d_xs]:
[extractor-x3d_s]:
[extractor-x3d_m]:
[extractor-x3d_l]:
[extractor-s3d-kinetics]:
[extractor-s3d-howto100m]:
[extractor-slowfast]:


<div class="alert alert-info">

#### **3- Data Filtering**

Now we are going to filter the dataset(s) and get rid of the segments were:
- The "nothing" class is present, meaning they are not annotated.
- No person is present in more than half the segment.

</div>

In [None]:
yolo_dataset_index = FEATURES_EXTRACTORS.index(next(filter(lambda x: x.get_name() == "yolo", FEATURES_EXTRACTORS)))

yolo_dataset = datasets[yolo_dataset_index]

# --- --- ---

def extract_segments_without_classless_indices(yolo_dataset, classes: list[str]):
    label_encoder = LabelEncoderFactory.get()
    
    return [i for i in range(len(yolo_dataset)) if yolo_dataset[i][1] not in label_encoder.transform(classes)]

def extract_segments_with_persons_indices(yolo_dataset):
    label_encoder = LabelEncoderFactory.get()
    
    # NOTE: we chose 4 as we take a sample of 32 frames, we then subsample 8 and extract the yolo features from them, thus 4 is the half of 8
    return [i for i in range(len(dataset)) if torch.count_nonzero(torch.sum(yolo_dataset[i][0], dim=1)) >= 4]

# NOTE: this is the indices to keep
filtered_indices = list(set(extract_segments_without_classless_indices(yolo_dataset, ["nothing"]) + extract_segments_with_persons_indices(yolo_dataset)))

# --- --- ---

filtered_datasets = [torch.utils.data.Subset(dataset, filtered_indices) for dataset in datasets]

In [45]:
print(f"[len(datasets[0])]: {len(datasets[0])}")
print(f"[len(filtered_datasets[0])]: {len(filtered_datasets[0])}")
print(f"[difference]: {len(datasets[0]) - len(filtered_datasets[0])}")
print(f"--- --- ---")
print(f"[percentage]: {(len(datasets[0]) - len(filtered_datasets[0])) / len(datasets[0]) * 100:.2f}%")

[len(datasets[0])]: 4098
[len(filtered_datasets[0])]: 3995
[difference]: 103
--- --- ---
[percentage]: 2.51%
