<div class="alert alert-info">

#### **MLP Filtering Experiment**

This notebook duplicates the MLP experiment and runs it once for every possible combination of filtering modes, in order to see which combination performs best.

</div>

In [30]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
from itertools import chain, combinations

def generate_power_set(input_set, with_empty_set: bool = False) -> list[tuple]:
    """Return every combination of the elements of `input_set`, ordered by size.

    Args:
        input_set: Any iterable of elements (list, tuple, set, generator, ...).
            It is consumed exactly once, so single-pass iterables are supported.
        with_empty_set: When true, the empty tuple is included as the first subset.

    Returns:
        A list of tuples, one per subset, from the smallest subsets to the largest.
    """
    # Materialise the input so that iterables without `len()` (e.g. generators) also work.
    elements = tuple(input_set)

    # Size-0 combinations are only generated when the empty set was requested.
    smallest_size = 0 if with_empty_set else 1

    return list(
        chain.from_iterable(
            combinations(elements, size) for size in range(smallest_size, len(elements) + 1)
        )
    )

In [32]:
from functools import reduce
from experiments.helpers.preliminary import preliminary, FilteringMode, FilteringOperator

# Every filtering configuration to evaluate: the zero-valued mode first, followed by
# each non-empty combination of the individual filters. Each combination is folded
# into a single value with the `|` operator.
filtering_modes = [
    reduce(lambda merged, flag: merged | flag, flags)
    for flags in (
        [FilteringMode(0)],
        *generate_power_set(
            [
                FilteringMode.NO_PERSONLESS,
                FilteringMode.NO_STOPWATCH_CLASS,
                FilteringMode.NO_NOTHING_CLASS,
                FilteringMode.NO_MULTI_CLASS,
            ]
        ),
    )
]

In [None]:
import tqdm

for FILTERING_MODE in tqdm.tqdm(filtering_modes, desc="[filtering-modes]"):
    # Build the datasets, their filtered counterparts and the extractors for this mode.
    datasets, filtered_datasets, extractors = preliminary(
        filtering_operator=FilteringOperator.OR,
        filtering_mode=FILTERING_MODE,
    )

    # Report which share of the first dataset was removed by the filtering.
    initial_size, filtered_size = len(datasets[0]), len(filtered_datasets[0])
    reduction_percentage = 100 * (initial_size - filtered_size) / initial_size

    print(f"[filtering]: {reduction_percentage:.2f}%")
    
    import torch

    class SimpleLinearClassifier(torch.nn.Module):
        """Linear classifier applied on flattened inputs.

        `input_size` is accepted but not used by the layers: the lazy linear
        layer receives only `output_size`.
        """

        def __init__(self, input_size, output_size):
            super().__init__()

            flatten = torch.nn.Flatten()
            projection = torch.nn.LazyLinear(out_features=output_size)

            self.network = torch.nn.Sequential(flatten, projection)

        def forward(self, x):
            """Run `x` through the flatten + linear stack and return the result."""
            outputs = self.network(x)
            return outputs
        
    class WrapperDataset(torch.utils.data.Dataset):
        """Dataset view that optionally passes every sample through `transform`."""

        def __init__(self, dataset, transform=None):
            self.dataset = dataset
            self.transform = transform

        def __len__(self):
            return len(self.dataset)

        def __getitem__(self, index):
            sample = self.dataset[index]
            # A falsy transform (e.g. None) means the sample is returned untouched.
            return self.transform(sample) if self.transform else sample
        
    def transform(sample):
        """Keep the features and the first annotation of a 4-element sample."""
        # The video id and segment index are unpacked (so the sample must have 4 items) but dropped.
        features, annotations, _video_id, _segment_index = sample
        first_annotation = annotations[0]

        return features, first_annotation
    
    # Cross-validation setup: splits are generated over NUMBER_ANNOTATED_VIDEOS video ids
    # and converted to sample indices for each dataset below.
    NUMBER_OF_FOLDS = 5
    NUMBER_ANNOTATED_VIDEOS = 22

    from utils import LabelEncoderFactory

    from experiments.helpers.trainer import Trainer
    from experiments.helpers.splits_generator import splits_generator
    from experiments.helpers.videos_to_indices import videos_to_indices

    label_encoder = LabelEncoderFactory.get()

    # One entry per fold; each entry maps an extractor name to the statistics returned by the trainer.
    folds_histories: list[dict] = []

    video_splits = splits_generator(dataset_length=NUMBER_ANNOTATED_VIDEOS, k=NUMBER_OF_FOLDS)

    for fold_index, (training_videos_ids, validation_videos_ids) in enumerate(video_splits):
        histories = {}

        for dataset, extractor in zip(filtered_datasets, extractors):
            # TODO: the issue is here, as the cached version is being used, some indices that have been filtered are trying to be reused
            training_samples_ids = videos_to_indices(dataset, training_videos_ids)
            validation_samples_ids = videos_to_indices(dataset, validation_videos_ids)

            training_subset = torch.utils.data.Subset(dataset, training_samples_ids)
            validation_subset = torch.utils.data.Subset(dataset, validation_samples_ids)

            training_dataset = WrapperDataset(training_subset, transform)
            validation_dataset = WrapperDataset(validation_subset, transform)

            # Only the training batches are shuffled.
            training_dataloader = torch.utils.data.DataLoader(training_dataset, batch_size=32, shuffle=True)
            validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=32, shuffle=False)

            # The input size is read from the features of the first training sample.
            input_size = training_dataset[0][0].shape[0]
            output_size = len(label_encoder.classes_)
            linear_classifier = SimpleLinearClassifier(input_size=input_size, output_size=output_size)

            trainer = Trainer(linear_classifier)

            run_title = f"[training-{extractor.get_name()}-{fold_index + 1}/{NUMBER_OF_FOLDS}]"
            statistics = trainer.train(training_dataloader, validation_dataloader, title=run_title)

            histories[extractor.get_name()] = statistics

        folds_histories.append(histories)
        
    # NOTE: For each model we are going to put a plot, a box plot or something to display the variance of the validation accuracy between each fold
    # Each dictionary maps an extractor name to a list holding one value per fold.
    models_training_accuracies = {}
    models_validation_accuracies = {}
    models_training_losses = {}
    models_validation_losses = {}

    # (destination dictionary, key read from the per-fold statistics) pairs.
    collected_metrics = (
        (models_validation_accuracies, "best_validation_accuracy"),
        (models_training_accuracies, "best_training_accuracy"),
        (models_validation_losses, "best_validation_loss"),
        (models_training_losses, "best_training_loss"),
    )

    for fold_history in folds_histories:
        for extractor_name, details in fold_history.items():
            for per_model_values, statistic_key in collected_metrics:
                # Create the list on first encounter of the extractor, then append this fold's value.
                per_model_values.setdefault(extractor_name, []).append(details[statistic_key])
                
                
    import numpy as np
    import matplotlib.pyplot as plt

    # --- --- ---

    # Box plots of the per-fold validation accuracy and loss of every model.
    # Materialised as a list because the components are joined twice (title and file name).
    filtering_components = list(FilteringMode.get_str_components(FILTERING_MODE))
    filtering_label = ", ".join(filtering_components)

    boxplot_figure = plt.figure(figsize=(12, 6))

    # The title uses NUMBER_OF_FOLDS instead of a hard-coded fold count, and a pre-joined
    # label so the f-string has no nested double quotes (only accepted from Python 3.12 on).
    boxplot_figure.suptitle(f"Model Performance Across {NUMBER_OF_FOLDS} Folds - {filtering_label}")

    plt.subplot(1, 2, 1)
    plt.boxplot(
        list(models_validation_accuracies.values()),
        tick_labels=list(models_validation_accuracies.keys()),
    )
    plt.ylabel("Accuracy")
    plt.xticks(rotation=90)
    # Reference line at 90% accuracy.
    plt.axhline(y=0.9, color='r', linestyle='--', label='90% Accuracy')
    plt.legend()
    plt.grid(True, linestyle="--", alpha=0.5)
    plt.ylim(0, 1)

    plt.subplot(1, 2, 2)
    plt.boxplot(
        list(models_validation_losses.values()),
        tick_labels=list(models_validation_losses.keys()),
    )
    plt.ylabel("Loss")
    plt.xticks(rotation=90)
    plt.grid(True, linestyle="--", alpha=0.5)
    plt.ylim(0, 1.5)

    plt.tight_layout()

    # NOTE(review): if get_str_components yields no component (possibly the case for
    # FilteringMode(0)), the file name is just ".boxplot.png" - confirm this is intended.
    name = ",".join(filtering_components).replace("_", "-").lower() + ".boxplot.png"

    boxplot_figure.savefig(name)

    # This code runs once per filtering mode: close the figure once it is saved so that
    # figures do not accumulate in memory across iterations.
    plt.close(boxplot_figure)

    # --- --- ---

    # Print mean ± standard deviation and (min, max) of the per-fold validation values:
    # first the accuracy of every model, then the loss of every model.
    summaries = (
        ("", models_validation_accuracies),
        (" Loss", models_validation_losses),
    )

    for label_suffix, per_model_values in summaries:
        for model_name, values in per_model_values.items():
            average, std = np.mean(values), np.std(values)
            lowest, highest = np.min(values), np.max(values)
            print(f"[{model_name}{label_suffix}]: {average:.2f} ± {std:.2f}; ({lowest:.2f}, {highest:.2f})")
        
    from matplotlib.lines import Line2D

    # Scatter plots of the average training metric against the average validation metric
    # of every model (accuracy on the left panel, loss on the right panel).
    models_names = models_validation_accuracies.keys()

    # One marker per model, taken in the order matplotlib lists them.
    markers = list(Line2D.markers.keys())
    markers = markers[:len(models_names)]

    # Materialised as a list because the components are joined twice (title and file name).
    filtering_components = list(FilteringMode.get_str_components(FILTERING_MODE))
    filtering_label = ", ".join(filtering_components)

    scatter_figure = plt.figure(figsize=(12, 6))

    # The title uses NUMBER_OF_FOLDS instead of a hard-coded fold count, and a pre-joined
    # label so the f-string has no nested double quotes (only accepted from Python 3.12 on).
    scatter_figure.suptitle(f"Model Performance Across {NUMBER_OF_FOLDS} Folds - {filtering_label}")

    # (subplot position, metric name, validation values, training values) of the two panels.
    panels = (
        (1, "Accuracy", models_validation_accuracies, models_training_accuracies),
        (2, "Loss", models_validation_losses, models_training_losses),
    )

    for position, metric_name, validation_values, training_values in panels:
        plt.subplot(1, 2, position)

        for model_name, marker in zip(models_names, markers):
            plt.scatter(
                np.mean(validation_values[model_name]),
                np.mean(training_values[model_name]),
                label=model_name,
                marker=marker,
                s=100,
            )

        plt.xlabel(f"Validation {metric_name}")
        plt.ylabel(f"Training {metric_name}")
        plt.legend()
        plt.grid(True, linestyle="--", alpha=0.5)

    plt.tight_layout()

    # NOTE(review): if get_str_components yields no component (possibly the case for
    # FilteringMode(0)), the file name is just ".scatterplot.png" - confirm this is intended.
    name = ",".join(filtering_components).replace("_", "-").lower() + ".scatterplot.png"

    scatter_figure.savefig(name)

    # This code runs once per filtering mode: close the figure once it is saved so that
    # figures do not accumulate in memory across iterations.
    plt.close(scatter_figure)

[filtering-modes]:   0%|          | 0/16 [00:00<?, ?it/s]Using cache found in /Users/nadir/.cache/torch/hub/facebookresearch_pytorchvideo_main


[missing-keys]: <All keys matched successfully>


Using cache found in /Users/nadir/.cache/torch/hub/facebookresearch_pytorchvideo_main
Using cache found in /Users/nadir/.cache/torch/hub/facebookresearch_pytorchvideo_main
Using cache found in /Users/nadir/.cache/torch/hub/facebookresearch_pytorchvideo_main
Using cache found in /Users/nadir/.cache/torch/hub/facebookresearch_pytorchvideo_main
Using cache found in /Users/nadir/.cache/torch/hub/facebookresearch_pytorchvideo_main


[INFO]: frames for "climb_1-climber_MoubeAdrian-bloc_1-angle_face" already exist. skipping extraction.
[INFO]: frames for "climb_1-climber_MoubeAdrian-bloc_1-angle_profile" already exist. skipping extraction.
[INFO]: frames for "climb_10-climber_DouglasSophia-bloc_1-angle_face" already exist. skipping extraction.
[INFO]: frames for "climb_10-climber_DouglasSophia-bloc_1-angle_profile" already exist. skipping extraction.
[INFO]: frames for "climb_11-climber_MoubeAdrian-bloc_2-angle_face" already exist. skipping extraction.
[INFO]: frames for "climb_11-climber_MoubeAdrian-bloc_2-angle_profile" already exist. skipping extraction.
[INFO]: frames for "climb_12-climber_MrideEsteban-bloc_2-angle_face" already exist. skipping extraction.
[INFO]: frames for "climb_12-climber_MrideEsteban-bloc_2-angle_profile" already exist. skipping extraction.
[INFO]: frames for "climb_13-climber_FonneLana-bloc_2-angle_face" already exist. skipping extraction.
[INFO]: frames for "climb_13-climber_FonneLana-blo

[dataset-mappings-construction]:: 100%|██████████| 4098/4098 [00:00<00:00, 5822.90it/s]
[dataset-mappings-construction]:: 100%|██████████| 4098/4098 [00:00<00:00, 15392.30it/s]
[training-yolo-1/5]: 100%|██████████| 32/32 [00:10<00:00,  3.13epoch/s, training-loss=1.06, training-accuracy=0.602, validation-loss=1.06, validation-accuracy=0.616, best-validation-accuracy=0.616, best-training-accuracy=0.602]
[dataset-mappings-construction]:: 100%|██████████| 4098/4098 [00:00<00:00, 4609.32it/s]
[dataset-mappings-construction]:: 100%|██████████| 4098/4098 [00:00<00:00, 13487.48it/s]
