In [1]:
from pathlib import Path
from time import strftime, gmtime

import pandas as pd
import numpy as np
from tqdm import tqdm

# Sequence generation

Generate the same amount of TP and FP:

* **TP**: The alarms in the alarms.csv
* **FP**: Combinations of:
    * _distractions_
    * only background

In [2]:
# Student number
np.random.seed(16896375)

In [3]:
PROJECT_DIR = Path("..")
DATA_DIR = PROJECT_DIR / "data"

ILIDS_META_DIR = DATA_DIR / "ilids-metadata"

In [4]:
clips_csv = ILIDS_META_DIR / "clips.csv"

clips_df = pd.read_csv(clips_csv, index_col="filename")
clips_df = clips_df.drop(columns=["AlarmEvents", "Duration"])

In [5]:
videos_csv = ILIDS_META_DIR / "videos.csv"

videos_df = pd.read_csv(videos_csv, index_col="format.filename")
videos_df["format.duration"] = pd.to_timedelta(
    videos_df["format.duration"], unit="second"
)

In [6]:
alarms_csv = ILIDS_META_DIR / "alarms.csv"

alarms_df = pd.read_csv(alarms_csv, index_col="filename")

alarms_df["AlarmDuration"] = pd.to_timedelta(alarms_df["AlarmDuration"])
alarms_df["StartTime"] = pd.to_timedelta(alarms_df["StartTime"])
alarms_df["EndTime"] = alarms_df["StartTime"] + alarms_df["AlarmDuration"]

alarms_duration_min = alarms_df["AlarmDuration"].dt.seconds.min()
alarms_duration_max = alarms_df["AlarmDuration"].dt.seconds.max()

In [7]:
hand_distractions_csv = (
    DATA_DIR / "handcrafted-metadata" / "szte_distractions.extended.corrected.csv"
)

distractions_df = pd.read_csv(hand_distractions_csv, index_col="filename")
distractions_df["start time"] = pd.to_timedelta(distractions_df["start time"])
distractions_df["end time"] = pd.to_timedelta(distractions_df["end time"])
distractions_df["duration"] = pd.to_timedelta(distractions_df["duration"])

In [8]:
len(alarms_df), len(distractions_df)

(432, 35)

In [9]:
alarms_df = alarms_df.rename(columns={"AlarmDuration": "Duration"})
TP = alarms_df[
    [
        "StartTime",
        "EndTime",
        "Duration",
        "Distance",
        "SubjectApproachType",
        "SubjectDescription",
        "SubjectOrientation",
    ]
]
TP["Classification"] = "TP"

In [10]:
distractions_df = distractions_df.rename(
    columns={
        "distraction": "Distraction",
        "start time": "StartTime",
        "end time": "EndTime",
        "duration": "Duration",
    }
)
distractions_df = distractions_df[["StartTime", "EndTime", "Duration", "Distraction"]]
distractions_df["Classification"] = "FP"

In [11]:
SEQUENCES = pd.concat([TP, distractions_df])
SEQUENCES.head()

Unnamed: 0_level_0,StartTime,EndTime,Duration,Distance,SubjectApproachType,SubjectDescription,SubjectOrientation,Classification,Distraction
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
SZTE/video/SZTEA101a.mov,0 days 00:05:37,0 days 00:06:37,0 days 00:01:00,30.0,Crouch Walk,One Person,Perpendicular,TP,
SZTE/video/SZTEA101a.mov,0 days 00:08:58,0 days 00:10:06,0 days 00:01:08,15.0,Crawl,One Person,Perpendicular,TP,
SZTE/video/SZTEA101a.mov,0 days 00:12:12,0 days 00:13:12,0 days 00:01:00,10.0,Run,One Person,Perpendicular,TP,
SZTE/video/SZTEA101a.mov,0 days 00:15:14,0 days 00:16:52,0 days 00:01:38,15.0,Creep walk,One Person,Perpendicular,TP,
SZTE/video/SZTEA101a.mov,0 days 00:18:41,0 days 00:19:33,0 days 00:00:52,30.0,Run,One Person,Perpendicular,TP,


In [12]:
SEQUENCES.tail()

Unnamed: 0_level_0,StartTime,EndTime,Duration,Distance,SubjectApproachType,SubjectDescription,SubjectOrientation,Classification,Distraction
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
SZTE/video/SZTEN202a.mov,0 days 00:04:01,0 days 00:06:01,0 days 00:02:00,,,,,FP,Birds
SZTE/video/SZTEN202b.mov,0 days 00:16:43,0 days 00:18:11,0 days 00:01:28,,,,,FP,Birds
SZTE/video/SZTEN202c.mov,0 days 00:05:29,0 days 00:06:31,0 days 00:01:02,,,,,FP,Birds
SZTE/video/SZTEN202c.mov,0 days 00:15:18,0 days 00:15:39,0 days 00:00:21,,,,,FP,Insect on camera
SZTE/video/SZTEN202d.mov,0 days 00:19:50,0 days 00:20:18,0 days 00:00:28,,,,,FP,Foxes


In [13]:
def apply_interval(df: pd.DataFrame) -> pd.arrays.IntervalArray:
    return pd.arrays.IntervalArray.from_arrays(
        df["StartTime"], df["EndTime"], closed="both"
    )

In [14]:
SEQUENCES["Interval"] = apply_interval(SEQUENCES)
SEQUENCES.head()

Unnamed: 0_level_0,StartTime,EndTime,Duration,Distance,SubjectApproachType,SubjectDescription,SubjectOrientation,Classification,Distraction,Interval
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
SZTE/video/SZTEA101a.mov,0 days 00:05:37,0 days 00:06:37,0 days 00:01:00,30.0,Crouch Walk,One Person,Perpendicular,TP,,"[0 days 00:05:37, 0 days 00:06:37]"
SZTE/video/SZTEA101a.mov,0 days 00:08:58,0 days 00:10:06,0 days 00:01:08,15.0,Crawl,One Person,Perpendicular,TP,,"[0 days 00:08:58, 0 days 00:10:06]"
SZTE/video/SZTEA101a.mov,0 days 00:12:12,0 days 00:13:12,0 days 00:01:00,10.0,Run,One Person,Perpendicular,TP,,"[0 days 00:12:12, 0 days 00:13:12]"
SZTE/video/SZTEA101a.mov,0 days 00:15:14,0 days 00:16:52,0 days 00:01:38,15.0,Creep walk,One Person,Perpendicular,TP,,"[0 days 00:15:14, 0 days 00:16:52]"
SZTE/video/SZTEA101a.mov,0 days 00:18:41,0 days 00:19:33,0 days 00:00:52,30.0,Run,One Person,Perpendicular,TP,,"[0 days 00:18:41, 0 days 00:19:33]"


In [15]:
def check_in_sequences(df: pd.DataFrame, reference: pd.DataFrame) -> np.ndarray:
    # For matching filename and
    #   startTime <= start <= endTime or
    #   startTime <= end <= endTime

    # Example df to test
    # df = pd.DataFrame(
    #     {
    #         "StartTime": pd.to_timedelta([0, 0, 6 * 60], unit="second"),
    #         "EndTime": pd.to_timedelta([9 * 60, 12, 8 * 60], unit="second"),
    #         "filename": [
    #             "SZTE/SZTEA101a.mov",
    #             "SZTE/SZTEA101a.mov",
    #             "SZTE/SZTEA101a.mov",
    #         ],
    #     }
    # ).set_index("filename")

    matching_filename = df.index.intersection(reference.index)

    if len(matching_filename) == 0:
        return np.array([False] * len(df))

    def check_df_row(row, reference: pd.DataFrame):
        sub_reference = reference.loc[reference.index.intersection([row.name])]

        if len(sub_reference) == 0:
            return False

        return (
            sub_reference["Interval"]
            .array.overlaps(
                pd.Interval(row["StartTime"], row["EndTime"], closed="both")
            )
            .any()
        )

    return df.apply(check_df_row, axis=1, args=(reference,))

In [16]:
def generate_new_false_positive_intervals(N: int) -> pd.DataFrame:
    video_files_idx = np.random.randint(0, len(videos_df), N)

    video_files = videos_df.index[video_files_idx]
    video_durations = videos_df.iloc[video_files_idx]["format.duration"]
    fp_durations = np.random.randint(alarms_duration_min, alarms_duration_max, N)
    fp_durations_delta = pd.to_timedelta(fp_durations, unit="second")

    video_remaining_proportion = video_durations - fp_durations_delta

    fp_start = (video_remaining_proportion * 0.95) * np.random.random(N)

    fp_df = pd.DataFrame(
        {
            "StartTime": pd.to_timedelta(fp_start, unit="second").dt.floor("S"),
            "Duration": fp_durations_delta,
        },
        index=video_files,
    )
    fp_df["EndTime"] = fp_df["StartTime"] + fp_df["Duration"]
    fp_df["Classification"] = "FP"

    return fp_df

In [17]:
def drop_invalid_intervals(df: pd.DataFrame, inplace=True) -> pd.DataFrame:
    # In some cases, the generated duration is longer than the selected video's duration.
    # In this case, the 'StartTime' will be negative.
    # Drop them.
    dropping_rows = df[df["StartTime"] < pd.Timedelta(0, "second")].index

    df = df.drop(dropping_rows, inplace=inplace)

    return df

In [18]:
def drop_intersect_interval(
    df: pd.DataFrame, reference: pd.DataFrame, inplace=True
) -> pd.DataFrame:
    # In case in the sequence to extract from the videos already has an intersecting
    # interval, drop them.
    dropping_rows = df[check_in_sequences(df, reference=reference)].index

    df = df.drop(dropping_rows, inplace=inplace)

    return df

In [19]:
TARGET_SEQUENCES = 2 * len(alarms_df)
missing_fp = TARGET_SEQUENCES - len(SEQUENCES)

progress = tqdm(
    total=TARGET_SEQUENCES,
    desc="Generating unique non overlapping sequences",
    initial=len(SEQUENCES),
)
while missing_fp > 0:
    fp_df = generate_new_false_positive_intervals(missing_fp)
    drop_invalid_intervals(fp_df)
    drop_intersect_interval(fp_df, SEQUENCES)

    fp_df["Interval"] = apply_interval(fp_df)

    SEQUENCES = pd.concat([SEQUENCES, fp_df])

    missing_fp = TARGET_SEQUENCES - len(SEQUENCES)

    progress.update(n=len(fp_df))

Generating unique non overlapping sequences: 100%|██████████| 864/864 [00:28<00:00,  5.26it/s]

In [20]:
TARGET_SEQUENCES = 2 * len(alarms_df)
missing_fp = TARGET_SEQUENCES - len(SEQUENCES)

progress = tqdm(
    total=TARGET_SEQUENCES,
    desc="Generating unique non overlapping sequences",
    initial=len(SEQUENCES),
)
while missing_fp > 0:
    fp_df = generate_new_false_positive_intervals(missing_fp)
    drop_invalid_intervals(fp_df)
    drop_intersect_interval(fp_df, SEQUENCES)

    fp_df["Interval"] = apply_interval(fp_df)

    SEQUENCES = pd.concat([SEQUENCES, fp_df])

    missing_fp = TARGET_SEQUENCES - len(SEQUENCES)

    progress.update(n=len(fp_df))

Generating unique non overlapping sequences: 100%|██████████| 864/864 [00:28<00:00, 13.91it/s]


In [21]:
SEQUENCES.tail()

Unnamed: 0,StartTime,EndTime,Duration,Distance,SubjectApproachType,SubjectDescription,SubjectOrientation,Classification,Distraction,Interval
SZTR/video/SZTRA101a04.mov,0 days 00:02:05,0 days 00:02:25,0 days 00:00:20,,,,,FP,,"[0 days 00:02:05, 0 days 00:02:25]"
SZTE/video/SZTEN103b.mov,0 days 00:07:41,0 days 00:08:55,0 days 00:01:14,,,,,FP,,"[0 days 00:07:41, 0 days 00:08:55]"
SZTE/video/SZTEN201e.mov,0 days 00:10:39,0 days 00:12:56,0 days 00:02:17,,,,,FP,,"[0 days 00:10:39, 0 days 00:12:56]"
SZTR/video/SZTRA203b13.mov,0 days 00:01:28,0 days 00:02:05,0 days 00:00:37,,,,,FP,,"[0 days 00:01:28, 0 days 00:02:05]"
SZTE/video/SZTEN102b.mov,0 days 00:19:26,0 days 00:20:41,0 days 00:01:15,,,,,FP,,"[0 days 00:19:26, 0 days 00:20:41]"


In [22]:
SEQUENCES = SEQUENCES.join(clips_df)
SEQUENCES.index.rename("filename", inplace=True)

In [23]:
# Create unique indexes/identifier for later easier extraction of sequences
filename_series = SEQUENCES.index.to_series()
SEQUENCES["filename"] = filename_series

new_index_series = (
    filename_series.apply(lambda f: Path(f).stem)
    + "_"
    + SEQUENCES["StartTime"].dt.seconds.apply(
        lambda secs: strftime("%H_%M_%S", gmtime(secs))
    )
    + filename_series.apply(lambda f: Path(f).suffix)
)

new_index_series.rename("id_sequence", inplace=True)

new_index_series.head()

filename
SZTE/video/SZTEA101a.mov    SZTEA101a_00_05_37.mov
SZTE/video/SZTEA101a.mov    SZTEA101a_00_08_58.mov
SZTE/video/SZTEA101a.mov    SZTEA101a_00_12_12.mov
SZTE/video/SZTEA101a.mov    SZTEA101a_00_15_14.mov
SZTE/video/SZTEA101a.mov    SZTEA101a_00_18_41.mov
Name: id_sequence, dtype: object

In [24]:
SEQUENCES = SEQUENCES.set_index(new_index_series).sort_index()

# Make sure to place filename colum first, for readability
SEQUENCES = SEQUENCES[pd.Index(["filename"]).append(SEQUENCES.columns.drop("filename"))]

SEQUENCES.head()

Unnamed: 0_level_0,filename,StartTime,EndTime,Duration,Distance,SubjectApproachType,SubjectDescription,SubjectOrientation,Classification,Distraction,Interval,Stage,Weather.Clouds,Weather.Fog,Weather.Rain,Weather.Snow,Weather.TimeOfDay
id_sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
SZTEA101a_00_00_42.mov,SZTE/video/SZTEA101a.mov,0 days 00:00:42,0 days 00:01:49,0 days 00:01:07,,,,,FP,,"[0 days 00:00:42, 0 days 00:01:49]",1,,False,False,False,Dawn
SZTEA101a_00_04_47.mov,SZTE/video/SZTEA101a.mov,0 days 00:04:47,0 days 00:05:26,0 days 00:00:39,,,,,FP,,"[0 days 00:04:47, 0 days 00:05:26]",1,,False,False,False,Dawn
SZTEA101a_00_05_37.mov,SZTE/video/SZTEA101a.mov,0 days 00:05:37,0 days 00:06:37,0 days 00:01:00,30.0,Crouch Walk,One Person,Perpendicular,TP,,"[0 days 00:05:37, 0 days 00:06:37]",1,,False,False,False,Dawn
SZTEA101a_00_08_58.mov,SZTE/video/SZTEA101a.mov,0 days 00:08:58,0 days 00:10:06,0 days 00:01:08,15.0,Crawl,One Person,Perpendicular,TP,,"[0 days 00:08:58, 0 days 00:10:06]",1,,False,False,False,Dawn
SZTEA101a_00_12_12.mov,SZTE/video/SZTEA101a.mov,0 days 00:12:12,0 days 00:13:12,0 days 00:01:00,10.0,Run,One Person,Perpendicular,TP,,"[0 days 00:12:12, 0 days 00:13:12]",1,,False,False,False,Dawn


In [25]:
# Change to way time related column will be serialized in the new csv
SEQUENCES["StartTime"] = SEQUENCES["StartTime"].dt.seconds.apply(
    lambda secs: strftime("%H:%M:%S", gmtime(secs))
)
SEQUENCES["EndTime"] = SEQUENCES["EndTime"].dt.seconds.apply(
    lambda secs: strftime("%H:%M:%S", gmtime(secs))
)
SEQUENCES["Duration"] = SEQUENCES["Duration"].dt.seconds.apply(
    lambda secs: strftime("%H:%M:%S", gmtime(secs))
)
SEQUENCES.drop(columns="Interval", inplace=True)

SEQUENCES.to_csv(hand_distractions_csv.parent / "tp_fp_sequences.csv")