In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
import tqdm
import torch

import numpy as np
import matplotlib.pyplot as plt

<div class="alert alert-info">

#### **0 - Requirements**

**NOTE:** If you downloaded the dataset from the github repository, this part can be skipped.

In order to run this notebook a dataset is required. To make it simple you'll need to define the following variables in the `helpers.constants` file:
- `DATASET_PATH`: This is the base path were all the transformations and other thing happening on the dataset will be done.
- `VIDEOS_DIRECTORY_NAME`: This is the directory name inside the `DATASET_PATH` were you put your videos file (.mp4, or .mov, etc).
- `ANNOTATIONS_DIRECTORY_NAME`: This is the directory were you put the annotations file, must be csv files named the same way as the corresponding video file (except the extension).
- `ANNOTATED_IDS_FILE_NAME`: This is a text file containing the names of the annotated videos.
- `UNANNOTATED_IDS_FILE_NAME`: This is a text file containing the names of the unannotated videos.

And:
- `VIDEOS_FRAMES_DIRECTORY_NAME`: This is a name you need to specify and on which the extracted features of the videos will be stored.

</div>

<div class="alert alert-info">

#### **1- Data Preparation**

During this step we'll transform the videos from a video format into a frame by frame format, and thus we'll store each frame of each video in a .pnj file separately.

We do this for faster training as loading images is faster than videos. This step can be skipped (a small modification will be required if so).

</div>

In [5]:
from video_dataset.preprocessor import extract_frames_from_videos

from helpers.constants import \
    DATASET_PATH, \
    VIDEOS_DIRECTORY_NAME, \
    VIDEOS_FRAMES_DIRECTORY_NAME \
    
extract_frames_from_videos(
    videos_dir=os.path.join(DATASET_PATH, VIDEOS_DIRECTORY_NAME),
    output_dir=os.path.join(DATASET_PATH, VIDEOS_FRAMES_DIRECTORY_NAME),
    verbose=True
)

[INFO]: frames for "climb_1-climber_MoubeAdrian-bloc_1-angle_face" already exist. skipping extraction.
[INFO]: frames for "climb_1-climber_MoubeAdrian-bloc_1-angle_profile" already exist. skipping extraction.
[INFO]: frames for "climb_10-climber_DouglasSophia-bloc_1-angle_face" already exist. skipping extraction.
[INFO]: frames for "climb_10-climber_DouglasSophia-bloc_1-angle_profile" already exist. skipping extraction.
[INFO]: frames for "climb_11-climber_MoubeAdrian-bloc_2-angle_face" already exist. skipping extraction.
[INFO]: frames for "climb_11-climber_MoubeAdrian-bloc_2-angle_profile" already exist. skipping extraction.
[INFO]: frames for "climb_12-climber_MrideEsteban-bloc_2-angle_face" already exist. skipping extraction.
[INFO]: frames for "climb_12-climber_MrideEsteban-bloc_2-angle_profile" already exist. skipping extraction.
[INFO]: frames for "climb_13-climber_FonneLana-bloc_2-angle_face" already exist. skipping extraction.
[INFO]: frames for "climb_13-climber_FonneLana-blo

<div class="alert alert-info">

#### **2- Feature Extraction**

During this step we are going to import a set of predefined feature extractors which is located in the `helpers.constants` module. We'll then extract features from the annotated videos using this feature extractors.

</div>

In [6]:
from helpers.constants import \
    FEATURES_EXTRACTORS, \
    DATASET_PATH, \
    VIDEOS_FRAMES_DIRECTORY_NAME, \
    ANNOTATIONS_DIRECTORY_NAME, \
    ANNOTATED_IDS_FILE_NAME
from utils import LabelEncoderFactory
from cached_dataset import DiskCachedDataset

from video_dataset import VideoDataset, VideoShapeComponents
from video_dataset.video import VideoFromVideoFramesDirectory
from video_dataset.annotations import AnnotationsFromSegmentLevelCsvFileAnnotations

In [7]:
label_encoder = LabelEncoderFactory.get()

In [16]:
def returns_transform(sample):
    # sample keys: 'frames', 'annotations', 'video_index', 'video_id', 'starting_frame_number_in_video', 'segment_index'
    
    return sample["frames"], sample["annotations"], sample["video_id"], sample["segment_index"]

In [17]:
def __aggregate_labels(string_labels):
    """
    Given a list of string labels, returns the most frequent label and the number of unique labels.
    
    NOTE: The number of unique labels might be used later to determine if a video segment contain a transition in action or not.
    """
    labels = label_encoder.transform(string_labels)
    
    unique_elements, counts = np.unique(labels, return_counts=True)

    max_count_index = np.argmax(counts)

    most_frequent_element = unique_elements[max_count_index]
    
    return most_frequent_element, len(unique_elements)

In [18]:
datasets = []

for extractor in FEATURES_EXTRACTORS:
    segment_size = 32
    
    step = VideoDataset.compute_step(segment_size, extractor.get_required_number_of_frames())

    dataset = VideoDataset(
        annotations_dir=os.path.join(DATASET_PATH, ANNOTATIONS_DIRECTORY_NAME),
        videos_dir=os.path.join(DATASET_PATH, VIDEOS_FRAMES_DIRECTORY_NAME),
        ids_file=os.path.join(DATASET_PATH, ANNOTATED_IDS_FILE_NAME),
        segment_size=segment_size,
        step=step,
        video_processor=VideoFromVideoFramesDirectory,
        annotations_processor=AnnotationsFromSegmentLevelCsvFileAnnotations,
        annotations_processor_kwargs={"fps": 25, "delimiter": ","},
        video_shape=(VideoShapeComponents.CHANNELS, VideoShapeComponents.TIME, VideoShapeComponents.HEIGHT, VideoShapeComponents.WIDTH),
        frames_transform=extractor.transform_and_extract,
        annotations_transform=__aggregate_labels,
        verbose=False,
        return_transform=returns_transform
    )

    print(f"[extractor-{extractor.get_name()}]:")

    disk_cached_dataset = DiskCachedDataset.load_dataset_or_cache_it(
        dataset=dataset, 
        base_path=os.path.join(DATASET_PATH, "features", extractor.get_name()),
        verbose=True
    )
    
    datasets.append(disk_cached_dataset)

[extractor-yolo]:
[extractor-dino]:
[extractor-resnet-3d]:
[extractor-i3d]:
[extractor-clip]:
[extractor-x3d_xs]:
[extractor-x3d_s]:
[extractor-x3d_m]:
[extractor-x3d_l]:
[extractor-s3d-kinetics]:
[extractor-s3d-howto100m]:
[extractor-slowfast]:


<div class="alert alert-info">

#### **3- Data Filtering**

Now we are going to filter the dataset(s) and get rid of the segments were:
- The "nothing" class is present, meaning they are not annotated.
- No person is present in more than half the segment.

</div>

In [None]:
yolo_dataset_index = FEATURES_EXTRACTORS.index(next(filter(lambda x: x.get_name() == "yolo", FEATURES_EXTRACTORS)))

yolo_dataset = datasets[yolo_dataset_index]

# --- --- ---

def extract_segments_without_classless_indices(yolo_dataset, classes: list[str]):
    label_encoder = LabelEncoderFactory.get()
    
    return [i for i in range(len(yolo_dataset)) if yolo_dataset[i][1] not in label_encoder.transform(classes)]

def extract_segments_with_persons_indices(yolo_dataset):
    label_encoder = LabelEncoderFactory.get()
    
    # NOTE: we chose 4 as we take a sample of 32 frames, we then subsample 8 and extract the yolo features from them, thus 4 is the half of 8
    return [i for i in range(len(dataset)) if torch.count_nonzero(torch.sum(yolo_dataset[i][0], dim=1)) >= 4]

# NOTE: this is the indices to keep
filtered_indices = list(set(extract_segments_without_classless_indices(yolo_dataset, ["nothing"]) + extract_segments_with_persons_indices(yolo_dataset)))

# --- --- ---

filtered_datasets = [torch.utils.data.Subset(dataset, filtered_indices) for dataset in datasets]

In [45]:
print(f"[len(datasets[0])]: {len(datasets[0])}")
print(f"[len(filtered_datasets[0])]: {len(filtered_datasets[0])}")
print(f"[difference]: {len(datasets[0]) - len(filtered_datasets[0])}")
print(f"--- --- ---")
print(f"[percentage]: {(len(datasets[0]) - len(filtered_datasets[0])) / len(datasets[0]) * 100:.2f}%")

[len(datasets[0])]: 4098
[len(filtered_datasets[0])]: 3995
[difference]: 103
--- --- ---
[percentage]: 2.51%


In [47]:
class FullVideoFeaturesDataset():
    def __init__(self, dataset: VideoDataset, transform=None, verbose=True):
        self.dataset = dataset
        self.transform = transform
        self.verbose = verbose
        
        self.videos_segments_indices = self.__get_videos_segments_indices()
    
    def __get_videos_segments_indices(self):
        if hasattr(self.dataset, '_cached_videos_segments_indices'):
            return self.dataset._cached_videos_segments_indices

        videos_segments_indices = {}
        iterator = tqdm.tqdm(range(len(self.dataset))) if self.verbose else range(len(self.dataset))
        
        for sample_index in iterator:
            _, _, video_id, segment_index = self.dataset[sample_index]
            
            if video_id not in videos_segments_indices:
                videos_segments_indices[video_id] = [sample_index]
            else:
                videos_segments_indices[video_id].append(sample_index)
        
        for video_id in videos_segments_indices:
            videos_segments_indices[video_id] = sorted(
                videos_segments_indices[video_id], 
                key=lambda sample_index: self.dataset[sample_index][3]
            )
        
        self.dataset._cached_videos_segments_indices = videos_segments_indices
        return videos_segments_indices
    
    def __len__(self):
        return len(self.videos_segments_indices.keys())
    
    def __getitem__(self, video_index):
        video_id = list(self.videos_segments_indices.keys())[video_index]
        video_segments_indices = self.videos_segments_indices[video_id]

        features = []
        annotations = []

        for sample_index in video_segments_indices:
            frames, annotation, _, _ = self.dataset[sample_index]
            features.append(frames)
            annotations.append(annotation)
            
        if self.transform is not None:
            return self.transform(features, annotations, video_id)
        else:
            return features, annotations, video_id

In [48]:
yolo_features_dataset = DiskCachedDataset.load_dataset_or_cache_it(
    dataset=dataset,
    base_path=os.path.join(DATASET_PATH, "augmented-features", "yolo-features"),
    verbose=True
)

# --- --- ---

label_encoder = LabelEncoderFactory.get()

yolo_dataset_index = FEATURES_EXTRACTORS.index(next(filter(lambda x: x["name"] == "yolo", FEATURES_EXTRACTORS)))

yolo_dataset = DiskCachedDataset(
    base_path=os.path.join(DATASET_PATH, "augmented-features", "yolo-features"),
)

# --- --- ---

def extract_segments_without_classless_indices(yolo_dataset, classes: list[str]):
    label_encoder = LabelEncoderFactory.get()
    
    return [i for i in range(len(yolo_dataset)) if yolo_dataset[i][1] not in label_encoder.transform(classes)]

def extract_segments_with_persons_indices(yolo_dataset):
    label_encoder = LabelEncoderFactory.get()
    
    return [i for i in range(len(dataset)) if torch.count_nonzero(torch.sum(yolo_dataset[i][0], dim=1)) >= 4]

In [49]:
filtered_indices = list(set(extract_segments_without_classless_indices(yolo_dataset, ["nothing"]) + extract_segments_with_persons_indices(yolo_dataset)))

videos = FullVideoFeaturesDataset(
    dataset=torch.utils.data.Subset(disk_cached_dataset, filtered_indices),
    transform=lambda features, annotations, video_id: (torch.stack(features), torch.tensor(np.array(annotations)[:, 0]), video_id),
    verbose=True    
)

100%|██████████| 3995/3995 [00:00<00:00, 4990.08it/s]


In [82]:
print(f"[len(videos)]: {len(videos)}")

print(f"--- --- ---")

video_index = 0

features, annotations, video_id = videos[video_index]

print(f"[video_id]: {video_id}")
print(f"[features.shape]: {features.shape}")
print(f"[len(annotations)]: {annotations.shape}")

[len(videos)]: 22
--- --- ---
[video_id]: climb_9-climber_MasseQuentin-bloc_1-angle_face
[features.shape]: torch.Size([173, 2048])
[len(annotations)]: torch.Size([173])


In [83]:
class GloballyTemporalAwareModel(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1, dropout=0.0):
        """
        Parameters:
        -----------
        input_size: The feature size for each time step.
        hidden_size: The number of hidden units in the LSTM.
        output_size: The number of classes.
        num_layers: The number of LSTM layers. Default is 1.
        dropout: The dropout probability. Default is 0.0.
        """
        super(GloballyTemporalAwareModel, self).__init__()
        
        self.lstm = torch.nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = torch.nn.Linear(hidden_size, output_size)

    def forward(self, x):
        lstm_out, (final_hidden_states, final_cell_states) = self.lstm(x, None)
        
        output = self.fc(lstm_out)
        
        return output

In [84]:
model = GloballyTemporalAwareModel(128, 100, 5)

video = torch.zeros((1, 180, 128))

print(f"[video.shape]: {video.shape}")

output = model(video)

print(f"[model(video).shape]: {output.shape}")

[video.shape]: torch.Size([1, 180, 128])
[model(video).shape]: torch.Size([1, 180, 5])


In [85]:
# Define the split sizes
train_size = int(0.7 * len(videos))
val_size = len(videos) - train_size

# Split the dataset
train_videos, val_videos = torch.utils.data.random_split(videos, [train_size, val_size])

# Create data loaders
train_loader = torch.utils.data.DataLoader(train_videos, batch_size=1, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_videos, batch_size=1, shuffle=False)

In [86]:
def train_model(model, train_loader, optimizer, criterion, device):
    model.train()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    for features, annotations, _ in train_loader:
        features, annotations = features.to(device), annotations.to(device)

        optimizer.zero_grad()
        
        # Forward pass
        output = model(features)

        # Compute loss
        loss = criterion(output.view(-1, output.size(-1)), annotations.view(-1))
        loss.backward()
        
        optimizer.step()
        
        running_loss += loss.item()

        # Compute accuracy
        _, predicted = torch.max(output, -1)
        correct_predictions += (predicted.view(-1) == annotations.view(-1)).sum().item()
        total_predictions += annotations.numel()

    avg_loss = running_loss / len(train_loader)
    accuracy = correct_predictions / total_predictions * 100
    return avg_loss, accuracy

def validate_model(model, val_loader, criterion, device):
    model.eval()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for features, annotations, _ in val_loader:
            features, annotations = features.to(device), annotations.to(device)
            
            # Forward pass
            output = model(features)
            
            # Compute loss
            loss = criterion(output.view(-1, output.size(-1)), annotations.view(-1))
            running_loss += loss.item()

            # Compute accuracy
            _, predicted = torch.max(output, -1)
            correct_predictions += (predicted.view(-1) == annotations.view(-1)).sum().item()
            total_predictions += annotations.numel()

    avg_loss = running_loss / len(val_loader)
    accuracy = correct_predictions / total_predictions * 100
    return avg_loss, accuracy

In [87]:
len(label_encoder.classes_)

5

In [92]:
# Hyperparameters
input_size = 2048
hidden_size = 128
output_size = 5
learning_rate = 0.001
num_epochs = 32
num_layers = 4
dropout = 0.0

# Initialize model, optimizer, and loss function
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GloballyTemporalAwareModel(input_size, hidden_size, output_size, num_layers, dropout).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = torch.nn.CrossEntropyLoss()

# Training and validation loop with best model tracking
best_val_acc = 0.0
best_model_state = None

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    
    # Train the model
    train_loss, train_acc = train_model(model, train_loader, optimizer, criterion, device)
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.2f}%")

    # Validate the model
    val_loss, val_acc = validate_model(model, val_loader, criterion, device)
    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.2f}%")

    # Track the best model
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_model_state = model.state_dict()

print(f"\nBest Validation Accuracy: {best_val_acc:.2f}%")

# Optionally save the best model
torch.save(best_model_state, "best_model.pth")

Epoch 1/32
Train Loss: 1.4450, Train Accuracy: 32.46%
Validation Loss: 1.4717, Validation Accuracy: 34.10%
Epoch 2/32
Train Loss: 1.4060, Train Accuracy: 34.87%
Validation Loss: 1.4194, Validation Accuracy: 34.10%
Epoch 3/32
Train Loss: 1.3418, Train Accuracy: 51.56%
Validation Loss: 1.3300, Validation Accuracy: 51.96%
Epoch 4/32
Train Loss: 1.1888, Train Accuracy: 57.46%
Validation Loss: 1.2516, Validation Accuracy: 51.19%
Epoch 5/32
Train Loss: 1.1000, Train Accuracy: 60.13%
Validation Loss: 1.1343, Validation Accuracy: 55.97%
Epoch 6/32
Train Loss: 1.0252, Train Accuracy: 62.43%
Validation Loss: 1.2163, Validation Accuracy: 53.27%
Epoch 7/32
Train Loss: 1.0032, Train Accuracy: 62.31%
Validation Loss: 1.0949, Validation Accuracy: 58.28%
Epoch 8/32
Train Loss: 0.9996, Train Accuracy: 61.50%
Validation Loss: 1.0743, Validation Accuracy: 58.97%
Epoch 9/32
Train Loss: 0.9073, Train Accuracy: 65.10%
Validation Loss: 1.0754, Validation Accuracy: 59.82%
Epoch 10/32
Train Loss: 0.8663, Train