In [1]:
from pathlib import Path
from typing import List, Tuple

import pandas as pd
import numpy as np

In [2]:
%load_ext autoreload
%autoreload 2

# Sequence generation

Generate the same amount of TP and FP:

* **TP**: The alarms in the alarms.csv
* **FP**: Combinations of:
    * _distractions_
    * only background

Try to distribute the load of sequence across videos/stages.

In [3]:
# Student number
np.random.seed(16896375)

In [4]:
PROJECT_DIR = Path("..")
DATA_DIR = PROJECT_DIR / "data"

ILIDS_META_DIR = DATA_DIR / "ilids-metadata"

In [5]:
videos_csv = ILIDS_META_DIR / "videos.csv"

videos_df = pd.read_csv(videos_csv, index_col="format.filename")
videos_df["format.duration"] = pd.to_timedelta(
    videos_df["format.duration"], unit="second"
)

In [6]:
alarms_csv = ILIDS_META_DIR / "alarms.csv"

alarms_df = pd.read_csv(alarms_csv, index_col="filename")

alarms_df["AlarmDuration"] = pd.to_timedelta(alarms_df["AlarmDuration"])
alarms_df["StartTime"] = pd.to_timedelta(alarms_df["StartTime"])
alarms_df["EndTime"] = alarms_df["StartTime"] + alarms_df["AlarmDuration"]

alarms_duration_min = alarms_df["AlarmDuration"].dt.seconds.min()
alarms_duration_max = alarms_df["AlarmDuration"].dt.seconds.max()

In [7]:
hand_distractions_csv = (
    DATA_DIR / "handcrafted-metadata" / "szte_distractions.extended.corrected.csv"
)

distractions_df = pd.read_csv(hand_distractions_csv, index_col="filename")
distractions_df["start time"] = pd.to_timedelta(distractions_df["start time"])
distractions_df["end time"] = pd.to_timedelta(distractions_df["end time"])
distractions_df["duration"] = pd.to_timedelta(distractions_df["duration"])

In [8]:
len(alarms_df), len(distractions_df)

(432, 35)

In [9]:
alarms_df = alarms_df.rename(columns={"AlarmDuration": "Duration"})
TP = alarms_df[
    [
        "StartTime",
        "EndTime",
        "Duration",
        "Distance",
        "SubjectApproachType",
        "SubjectDescription",
        "SubjectOrientation",
    ]
]

In [10]:
TP.index.value_counts()

SZTR/SZTEA204a.mov    31
SZTR/SZTEA104a.mov    31
SZTE/SZTEA104a.mov    31
SZTE/SZTEA204a.mov    31
SZTE/SZTEA203a.mov    17
SZTR/SZTEA103a.mov    17
SZTR/SZTEA102b.mov    17
SZTE/SZTEA202b.mov    17
SZTR/SZTEA202b.mov    17
SZTR/SZTEA203a.mov    17
SZTE/SZTEA103a.mov    17
SZTE/SZTEA102b.mov    17
SZTE/SZTEA201b.mov    15
SZTR/SZTEA201b.mov    15
SZTE/SZTEA101b.mov    15
SZTR/SZTEA101b.mov    15
SZTE/SZTEA202a.mov    13
SZTR/SZTEA102a.mov    13
SZTE/SZTEA102a.mov    13
SZTR/SZTEA202a.mov    13
SZTE/SZTEA101a.mov    10
SZTR/SZTEA201a.mov    10
SZTR/SZTEA105a.mov    10
SZTE/SZTEA201a.mov    10
SZTE/SZTEA105a.mov    10
SZTR/SZTEA101a.mov    10
Name: filename, dtype: int64

In [11]:
distractions_df = distractions_df.rename(
    columns={
        "distraction": "Distraction",
        "start time": "StartTime",
        "end time": "EndTime",
        "duration": "Duration",
    }
)
FP = distractions_df[["StartTime", "EndTime", "Duration", "Distraction"]]

In [12]:
SEQUENCES = pd.concat([TP, FP])
SEQUENCES.head()

Unnamed: 0_level_0,StartTime,EndTime,Duration,Distance,SubjectApproachType,SubjectDescription,SubjectOrientation,Distraction
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
SZTE/SZTEA101a.mov,0 days 00:05:37,0 days 00:06:37,0 days 00:01:00,30.0,Crouch Walk,One Person,Perpendicular,
SZTE/SZTEA101a.mov,0 days 00:08:58,0 days 00:10:06,0 days 00:01:08,15.0,Crawl,One Person,Perpendicular,
SZTE/SZTEA101a.mov,0 days 00:12:12,0 days 00:13:12,0 days 00:01:00,10.0,Run,One Person,Perpendicular,
SZTE/SZTEA101a.mov,0 days 00:15:14,0 days 00:16:52,0 days 00:01:38,15.0,Creep walk,One Person,Perpendicular,
SZTE/SZTEA101a.mov,0 days 00:18:41,0 days 00:19:33,0 days 00:00:52,30.0,Run,One Person,Perpendicular,


In [13]:
SEQUENCES.tail()

Unnamed: 0_level_0,StartTime,EndTime,Duration,Distance,SubjectApproachType,SubjectDescription,SubjectOrientation,Distraction
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
SZTE/SZTEN202a.mov,0 days 00:06:02,0 days 00:06:02,0 days 00:02:00,,,,,Birds
SZTE/SZTEN202b.mov,0 days 00:20:49,0 days 00:20:49,0 days 00:01:28,,,,,Birds
SZTE/SZTEN202c.mov,0 days 00:09:24,0 days 00:09:24,0 days 00:01:02,,,,,Birds
SZTE/SZTEN202c.mov,0 days 00:15:18,0 days 00:15:39,0 days 00:00:21,,,,,Insect on camera
SZTE/SZTEN202d.mov,0 days 00:19:50,0 days 00:20:18,0 days 00:00:28,,,,,Foxes


In [14]:
def check_in_sequences(df: pd.DataFrame) -> Tuple[List[bool], List[bool], List[bool]]:
    # For matching filename and
    # startTime <= start <= endTime or
    # startTime <= end <= endTime

    # df = pd.DataFrame({'StartTime': pd.to_timedelta([0, 0, 6*60], unit="second"), 'EndTime': pd.to_timedelta([9*60, 12, 8*60], unit="second"), 'filename': ["SZTE/SZTEA101a.mov",
    # "SZTE/SZTEA101a.mov",
    # "SZTE/SZTEA101a.mov",]}).set_index("filename")
    matching_filename = [(df.index[i] == SEQUENCES.index).any() for i in range(len(df))]
    matching_filename_df = SEQUENCES[
        [(df.index == SEQUENCES.index[i]).any() for i in range(len(SEQUENCES))]
    ]

    if not any(matching_filename_df):
        return matching_filename_df

    matching_start_df = [
        (
            matching_filename_df["StartTime"]
            <= df.iloc[i, df.columns.get_loc("StartTime")]
        ).any()
        and (
            df.iloc[i, df.columns.get_loc("StartTime")]
            <= matching_filename_df["EndTime"]
        ).any()
        for i in range(len(df))
    ]

    matching_end_df = [
        (
            matching_filename_df["StartTime"]
            <= df.iloc[i, df.columns.get_loc("EndTime")]
        ).any()
        and (
            df.iloc[i, df.columns.get_loc("EndTime")] <= matching_filename_df["EndTime"]
        ).any()
        for i in range(len(df))
    ]

    return matching_filename & (
        np.array(matching_start_df) | np.array(matching_end_df)
    )  # , matching_filename_df[matching_start_df], matching_filename_df[matching_end_df]

In [15]:
missing_fp = 2 * len(alarms_df) - len(SEQUENCES)

video_files_idx = np.random.randint(0, len(videos_df), missing_fp)

video_files = videos_df.index[video_files_idx]
video_durations = videos_df.iloc[video_files_idx]["format.duration"]
fp_durations = np.random.randint(alarms_duration_min, alarms_duration_max, missing_fp)
fp_durations_delta = pd.to_timedelta(fp_durations, unit="second")

video_remaining_proportion = video_durations - fp_durations_delta
# TODO in case of remaining not enough, drop it

fp_start = (video_remaining_proportion * 0.95) * np.random.random(missing_fp)

fp_df = pd.DataFrame(
    {
        "StartTime": pd.to_timedelta(fp_start, unit="second").dt.floor("S"),
        "Duration": fp_durations_delta,
    },
    index=video_files,
)
fp_df["EndTime"] = fp_df["StartTime"] + fp_df["Duration"]
fp_df.head()

Unnamed: 0_level_0,StartTime,Duration,EndTime
format.filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SZTR/SZTRA101a21.mov,0 days 00:00:01,0 days 00:01:43,0 days 00:01:44
SZTE/SZTEN202b.mov,0 days 00:27:33,0 days 00:00:56,0 days 00:28:29
SZTR/SZTRA103b15.mov,0 days 00:00:08,0 days 00:01:30,0 days 00:01:38
SZTR/SZTRN101a.mov,0 days 00:11:10,0 days 00:01:21,0 days 00:12:31
SZTR/SZTRA203b11.mov,0 days 00:01:12,0 days 00:01:55,0 days 00:03:07


In [16]:
check_in_sequences(fp_df)

array([False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False,  True, False, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False, False, False, False,  True, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,