<div class="alert alert-info">

## **Dataset Analysis**

This notebook analyzes and summarizes the different aspects of the dataset.

</div>

In [None]:
import os
import tqdm
import torch
import torchvision

import numpy as np
import matplotlib.pyplot as plt

from enum import IntEnum

from video_dataset import VideoDataset
from video_dataset.padder import LastValuePadder
from video_dataset.dataset import VideoShapeComponents
from video_dataset.video import VideoFromVideoFramesDirectory
from video_dataset.preprocessor import extract_frames_from_videos
from video_dataset.annotations import AnnotationsFromSegmentLevelCsvFileAnnotations

from tas_helpers.visualization import SegmentationVisualizer
from tas_helpers.scores import repetition_score, order_variation_score
from tas_helpers.metrics import mean_over_frames, f1_score, edit_distance

from cached_dataset.dataset import DiskCachedDataset

from utils import LabelEncoderFactory

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Root directory of the dataset. Set the DATASET_PATH environment variable to point
# the notebook at another location; when it is unset (or empty) the original local
# path is used, so existing runs behave exactly as before.
DATASET_PATH = os.environ.get("DATASET_PATH") or "/Users/nadir/Documents/research-project-dataset"

# Sub-directories of DATASET_PATH.
VIDEOS_DIRECTORY_NAME = "videos"
ANNOTATIONS_DIRECTORY_NAME = "annotations"
VIDEOS_FRAMES_DIRECTORY_NAME = "videos_frames"

# Files (relative to DATASET_PATH) listing the video ids of each split.
ALL_IDS_FILE_NAME = "all_ids.txt"
TESTING_IDS_FILE_NAME = "testing_ids.txt"
TRAINING_IDS_FILE_NAME = "training_ids.txt"
VALIDATION_IDS_FILE_NAME = "validation_ids.txt"

# Files (relative to DATASET_PATH) listing which video ids have annotations.
ANNOTATED_IDS_FILE_NAME = "annotated_ids.txt"
UNANNOTATED_IDS_FILE_NAME = "unannotated_ids.txt"

In [3]:
# Label encoder obtained from the project's LabelEncoderFactory; the cells below use
# its `transform` method and its `classes_` attribute.
label_encoder = LabelEncoderFactory.get()

In [4]:
def __aggregate_labels(str_labels):
    """
    Collapse a collection of string labels into its most frequent encoded label.

    The labels are first encoded with the notebook-level `label_encoder`; the
    encoded value that occurs most often is returned. When several values share
    the highest count, the smallest encoded value is returned, because
    `np.unique` yields sorted values and `argmax` selects the first maximum.
    """
    encoded_labels = label_encoder.transform(str_labels)

    distinct_values, occurrences = np.unique(encoded_labels, return_counts=True)

    return distinct_values[occurrences.argmax()]

In [12]:
# Dataset over every video id listed in ALL_IDS_FILE_NAME, configured with
# fixed-size segments (segment_size=32) and one aggregated label per segment.
all_videos_dataset = VideoDataset(
    # --- Where the data lives (all paths are relative to DATASET_PATH) ---
    ids_file=os.path.join(DATASET_PATH, ALL_IDS_FILE_NAME),
    videos_dir=os.path.join(DATASET_PATH, VIDEOS_FRAMES_DIRECTORY_NAME),
    annotations_dir=os.path.join(DATASET_PATH, ANNOTATIONS_DIRECTORY_NAME),
    # --- How videos and annotations are read ---
    video_processor=VideoFromVideoFramesDirectory,
    annotations_processor=AnnotationsFromSegmentLevelCsvFileAnnotations,
    annotations_processor_kwargs={"fps": 25, "delimiter": ","},
    # Presumably applied to each segment's annotations - confirm against VideoDataset.
    annotations_transform=__aggregate_labels,
    # --- Segmentation (no padder is passed; the LastValuePadder option stays disabled) ---
    segment_size=32,
    step=1,
    overlap=0,
    # --- Layout of the returned video data ---
    video_shape=(
        VideoShapeComponents.CHANNELS,
        VideoShapeComponents.TIME,
        VideoShapeComponents.HEIGHT,
        VideoShapeComponents.WIDTH,
    ),
    # --- Miscellaneous ---
    allow_undefined_annotations=True,
    verbose=False,
)

In [15]:
# Dataset restricted to the video ids listed in ANNOTATED_IDS_FILE_NAME. It uses
# VideoDataset.FULL_VIDEO_SEGMENT as the segment size, passes no annotations
# transform, and sets load_videos=False.
annotated_videos_dataset = VideoDataset(
    # --- Where the data lives (all paths are relative to DATASET_PATH) ---
    ids_file=os.path.join(DATASET_PATH, ANNOTATED_IDS_FILE_NAME),
    videos_dir=os.path.join(DATASET_PATH, VIDEOS_FRAMES_DIRECTORY_NAME),
    annotations_dir=os.path.join(DATASET_PATH, ANNOTATIONS_DIRECTORY_NAME),
    # --- How videos and annotations are read ---
    video_processor=VideoFromVideoFramesDirectory,
    annotations_processor=AnnotationsFromSegmentLevelCsvFileAnnotations,
    annotations_processor_kwargs={"fps": 25, "delimiter": ","},
    # --- Segmentation (no padder and no annotations_transform are passed) ---
    segment_size=VideoDataset.FULL_VIDEO_SEGMENT,
    step=1,
    overlap=0,
    # --- Layout of the returned video data ---
    video_shape=(
        VideoShapeComponents.CHANNELS,
        VideoShapeComponents.TIME,
        VideoShapeComponents.HEIGHT,
        VideoShapeComponents.WIDTH,
    ),
    # --- Miscellaneous ---
    allow_undefined_annotations=True,
    load_videos=False,
)

In [17]:
# Report how many items the annotated dataset exposes
# (presumably one per annotated video, given FULL_VIDEO_SEGMENT - confirm).
print(f"[dataset-size]: {len(annotated_videos_dataset)}")

[dataset-size]: 22


<div class="alert alert-info">

#### **Repetition Score:**

The repetition score ranges between 0 and 1. A higher value, as in our case, indicates a higher degree of repetition within the sequences. In other words, actions tend to repeat within a sequence / activity.

</div>

In [26]:
# Score each annotated video on its own; the first element of every dataset item is unused.
repetition_scores = [
    repetition_score(video_annotations)
    for _video, video_annotations in annotated_videos_dataset
]

# Summarize the per-video scores (np.std defaults to the population standard deviation).
average_repetition_score, std_repetition_score = (
    np.mean(repetition_scores),
    np.std(repetition_scores),
)

print(f"[repetition-score]: {average_repetition_score} ± {std_repetition_score}")

[repetition-score]: 0.8131712451488049 ± 0.03650003778074824


<div class="alert alert-info">

#### **Order Variation Score:**

The order variation score is computed once over the annotation sequences of all annotated videos taken together, rather than per video. **TODO:** document the range and interpretation of this score; the description previously shown here was a copy of the repetition-score paragraph.

</div>

In [28]:
# Gather the annotations of every annotated video; the first element of each item is unused.
videos_annotations = [
    video_annotations
    for _video, video_annotations in annotated_videos_dataset
]

# The score is computed from the whole collection in a single call.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so the
# call did not finish in the recorded run.
order_variation_scores = order_variation_score(videos_annotations)

KeyboardInterrupt: 

In [None]:
# Show the order variation result.
# NOTE(review): the cell that assigns `order_variation_scores` was interrupted in the
# saved run, so this name is only defined once that cell has run to completion.
print(order_variation_scores)

In [None]:
# Inspect the classes known to the label encoder
# (presumably indexed by their encoded value - confirm against LabelEncoderFactory).
label_encoder.classes_

<div class="alert alert-info">

#### **Actions Durations:**



</div>

<div class="alert alert-info">

#### **Segmentations Visualizations Examples:**

</div>