# Concatenate RGB and flow features into one feature vector

In [34]:
import numpy as np
import os

In [None]:
# Input feature folders (one .npy file per video id) and the output folder
rgb_folder = 'features/rgb_anet_resnet200'
flow_folder = 'features/flow_anet_resnet200'
new_features_folder = 'features/features_actionformer'
video_list_file = 'resources/video_list.txt'

# Load the ids of the videos to process, one entry per line of the list file
with open(video_list_file, 'r') as list_file:
    video_list = list_file.read().splitlines()

In [35]:
def save_one_feat_vec(flow: np.ndarray, rgb: np.ndarray, filename: str):
    """Concatenate flow and RGB features and save them as a ``.npy`` file.

    The two arrays are joined along axis 1 (flow columns first, then RGB
    columns), so they must agree on every other dimension. If the target
    file already exists nothing is written.

    Args:
        flow (np.ndarray): Flow feature array
        rgb (np.ndarray): RGB feature array
        filename (str): Destination path. ``np.save`` appends ``.npy`` when
            the name does not already end with it; the existence check is
            done against that final name.
    """

    filename = os.fspath(filename)

    # np.save adds the extension on its own, so test the path it will really write
    target = filename if filename.endswith('.npy') else filename + '.npy'
    if os.path.exists(target):
        return

    # dirname is '' for a bare filename and os.makedirs('') would raise
    parent = os.path.dirname(target)
    if parent:
        os.makedirs(parent, exist_ok=True)

    np.save(target, np.concatenate((flow, rgb), axis=1))

In [36]:
def concatenete_video_feat(video_id: str, flow_folder: str, rgb_folder: str, new_features_folder: str):
    """ Concatenates the flow and rgb features for a video and saves the result in a new file.

    The inputs are read from ``<flow_folder>/<video_id>.npy`` and
    ``<rgb_folder>/<video_id>.npy``; the output goes to
    ``<new_features_folder>/<video_id>.npy``.

    Args:
        video_id (str): video id of the video to process
        flow_folder (str): Folder containing the flow features for all videos
        rgb_folder (str): Folder containing the rgb features for all videos
        new_features_folder (str): Folder where the new features will be saved

    Raises:
        FileNotFoundError: If the flow or the rgb feature file does not exist.
    """

    flow_filename = os.path.join(flow_folder, video_id + '.npy')
    rgb_filename = os.path.join(rgb_folder, video_id + '.npy')
    one_feat_vec_filename = os.path.join(new_features_folder, video_id + '.npy')

    missing = [path for path in (flow_filename, rgb_filename) if not os.path.exists(path)]
    if missing:
        # Raise a regular exception instead of calling exit(): SystemExit is not
        # reported back as a task error by a multiprocessing.Pool worker, while
        # an ordinary exception is propagated to the caller of map/starmap.
        raise FileNotFoundError(
            f'Missing flow or rgb file for video {video_id}: {", ".join(missing)}'
        )

    flow = np.load(flow_filename)
    rgb = np.load(rgb_filename)

    save_one_feat_vec(flow, rgb, one_feat_vec_filename)

### Load features and create feature vectors

In [27]:
import multiprocessing

# One worker per CPU core; every task concatenates the features of one video
num_processes = multiprocessing.cpu_count()
tasks = [(video_id, flow_folder, rgb_folder, new_features_folder) for video_id in video_list]
with multiprocessing.Pool(num_processes) as pool:
    pool.starmap(concatenete_video_feat, tasks)

# Create json dataset file

In [2]:
import json
import numpy as np
import pandas as pd
import os
import re

In [3]:
# Video ids assigned to each split (ids are kept as strings, as in the video list)
training_set = [
    "44", "47", "49", "55", "58", "60", "81", "85", "88", "91",
    "95", "104", "107", "141", "143", "148", "149", "153", "156",
    "46", "53", "56", "63", "66", "72", "92", "102", "105", "117",
    "129", "132", "144", "145", "146", "154", "157", "160",
]

validation_set = ["69", "74", "111", "116", "83", "86"]

test_set = ["65", "68", "126", "128", "131", "137", "76", "79", "89", "135"]

# Object labels that are kept when filtering the annotations
OBJECT_NAMES = [
    "Cavi_alimentatore",
    "Puntale_oscilloscopio",
    "Clip_di_massa",
    "Puntale_saldatore",
    "Avvitatore",
    "Batteria_avvitatore",
    "Connettore_Batteria_Avvitatore",
    "Cacciavite",
    "Pinza",
    "Scheda_Alto_Voltaggio",
    "Scheda_Basso_Voltaggio",
    "Schermo_Scheda_Basso_Voltaggio",
    "Registro",
]

# Class id -> action name for the segmentation ground truth (0 is the negative class)
TR_ACTION_NAME_MAPPING = {
    0: 'negative',
    1: 'hand_take',
    2: 'hand_release',
}

# Action names without the negative class; the list position is the class id
ENIGMA_CLASS_NAMES = ['hand_take', 'hand_release']

In [5]:
# Inputs and output of the json dataset generation
FPS = 30
video_list_file = 'resources/video_list.txt'
hand_tr_segmantation_gt_folder = 'resources/hand_tr_segmantation_gt'
enigma_json_filename = "enigma_dataset_temp_start.json"

# Load the ids of the videos to process, one entry per line of the list file
with open(video_list_file, 'r') as list_file:
    video_list = list_file.read().splitlines()

## Using boundaries from frame ranges sx, dx

In [6]:
def get_subset(video_id: str) -> str:
    """ Look up the split a video belongs to

    Args:
        video_id (str): video id of the video to process

    Returns:
        str: 'training', 'validation' or 'testing'; None when the id is in
            none of the three split lists
    """

    # Checked in this order, so the first list containing the id wins
    splits = (
        ('training', training_set),
        ('validation', validation_set),
        ('testing', test_set),
    )
    for subset_name, members in splits:
        if video_id in members:
            return subset_name

    return None

def get_video_duration(df_annotations: pd.DataFrame, video_fps: float) -> float:
    """ Compute the duration of a video, rounded to 2 decimal places

    Args:
        df_annotations (pd.DataFrame): annotations of the video; the 'end_index'
            of its last row is taken as the index of the last frame
        video_fps (float): FPS of the video

    Returns:
        float: duration of the video, rounded to 2 decimal places
    """

    last_segment = df_annotations.iloc[-1]
    # frame indices start at 0, so the frame count is the last index plus one
    frame_count = last_segment['end_index'] + 1
    seconds = frame_count / video_fps
    return round(seconds, 2)

def get_annotation(segment_series: pd.Series, video_fps : float) -> dict:
    """ Convert one ground-truth segment into an ActionFormer annotation entry

    Args:
        segment_series (pd.Series): one row of the segmentation ground truth, e.g.
            segment_series = {
                "start_index": 1200,
                "end_index": 1800,
                "class": 1
            }
        video_fps (float): FPS of the video

    Returns:
        dict: annotation of the segment; for the row above at 30 FPS, with
            class 1 mapped to 'hand_take' in TR_ACTION_NAME_MAPPING:
            annotation = {
                "label": "hand_take",
                "segment": [ 40.0, 60.0 ],
                "segment(frames)": [ 1200, 1800 ],
                "label_id": 0
            }
    """

    # Frame indices start from 0 here, which should be fine for ActionFormer
    # (https://github.com/happyharrycn/actionformer_release/issues/4#issuecomment-1050008045)
    first_frame = segment_series['start_index']
    last_frame = segment_series['end_index']

    # Class ids in the ground truth include the negative class at 0
    class_id = segment_series['class']

    # Boundaries in seconds, with one decimal place
    seconds = [
        round(first_frame / video_fps, 1),
        round(last_frame / video_fps, 1),
    ]

    return {
        "label": TR_ACTION_NAME_MAPPING[class_id],
        "segment": seconds,
        "segment(frames)": [first_frame, last_frame],
        # negative rows are not exported, so the ids are shifted to start at 0
        "label_id": class_id - 1
    }

In [7]:
FPS = 30

enigma_json = {"version": "ENIGMA", "database": {}}

for video_id in video_list:
    subset = get_subset(video_id)
    assert subset is not None, f'Video {video_id} not found in any set'

    # Per-video segmentation ground truth
    hand_tr_segmantation_gt_filename = os.path.join(hand_tr_segmantation_gt_folder, video_id + '.csv')
    hand_tr_segmantation_gt = pd.read_csv(hand_tr_segmantation_gt_filename)

    # The duration is taken from the last row, so compute it before rows are filtered out
    video_duration = get_video_duration(hand_tr_segmantation_gt, FPS)

    # Keep only the action segments (class 0 is the negative class)
    is_action = hand_tr_segmantation_gt['class'] != 0
    hand_tr_segmantation_gt = hand_tr_segmantation_gt[is_action]

    enigma_json["database"][str(video_id)] = {
        "subset": subset,
        "duration": video_duration,
        "fps": FPS,
        "annotations": [get_annotation(row, FPS) for _, row in hand_tr_segmantation_gt.iterrows()]
    }

In [8]:
class NpEncoder(json.JSONEncoder):
    """JSON encoder that also accepts NumPy integers, floats and arrays."""

    def default(self, obj):
        """Convert NumPy values to plain Python; defer everything else to the base class."""
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        # The base implementation raises TypeError for unsupported objects
        return super().default(obj)


# Write the dataset description; NpEncoder takes care of NumPy values in the structure
with open(enigma_json_filename, 'w') as out_file:
    json.dump(enigma_json, out_file, cls=NpEncoder)

## Using small fixed boundaries for contact timestamp

In [74]:
# Class id -> action name for the contact annotations (no negative class, ids start at 0)
TR_ACTION_NAME_MAPPING = dict(enumerate(('hand_take', 'hand_release')))

In [75]:
def get_subset(video_id: str) -> str:
    """ Tell which split (training, validation or testing) a video is part of

    Args:
        video_id (str): video id of the video to process

    Returns:
        str: 'training', 'validation' or 'testing'; None when the id is not
            listed in any split
    """

    if video_id in training_set:
        return 'training'
    if video_id in validation_set:
        return 'validation'
    if video_id in test_set:
        return 'testing'

    # Unknown id: callers are expected to check for None
    return None

def get_video_duration(hand_tr_segmantation_gt_filename: str, video_fps: float) -> float:
    """ Compute the duration of a video from its segmentation ground-truth CSV

    Args:
        hand_tr_segmantation_gt_filename (str): path of the CSV file; the
            'end_index' of its last row is taken as the index of the last frame
        video_fps (float): FPS of the video

    Returns:
        float: duration of the video, rounded to 2 decimal places
    """
    segments = pd.read_csv(hand_tr_segmantation_gt_filename)

    last_segment = segments.iloc[-1]
    # frame indices start at 0, so the frame count is the last index plus one
    frame_count = last_segment['end_index'] + 1
    return round(frame_count / video_fps, 2)

def get_annotation(segment_series: pd.Series, video_fps : float, video_len : float) -> dict:
    """ Get the annotation of a contact in the format required by the ActionFormer model

    Args:
        segment_series (pd.Series): Series describing a contact. The fields
            "t-start" (segment start), "end_index" (segment end) and "class"
            are read, e.g.
            segment_series = {
                "t-start": 47.886,
                "end_index": 48.5,
                "class": 1
            }
            NOTE(review): both boundaries are used as times in seconds; confirm
            the unit of "end_index". Rows built by get_contact_timestamp carry
            only "t-start" and "class", so the caller has to provide "end_index".
        video_fps (float): FPS of the video
        video_len (float): Length of the video. Currently not used.

    Returns:
        dict: annotation of a segment in the format required by the ActionFormer
            model; for the row above at 30 FPS, with class 1 mapped to
            'hand_release' in TR_ACTION_NAME_MAPPING:
            annotation = {
                "label": "hand_release",
                "segment": [ 47.886, 48.5 ],
                "segment(frames)": [ 1436, 1455 ],
                "label_id": 1
            }

    Raises:
        KeyError: If a required field is missing from the row, or the class
            is not a key of TR_ACTION_NAME_MAPPING.
    """

    # The segment goes from the contact timestamp to the end of the action
    start_segment = segment_series['t-start']
    end_segment = segment_series['end_index']

    # Classes start from 0 here (there is no negative class to skip)
    label_id = int(segment_series['class'])

    def to_frame_index(seconds):
        # Multiply before truncating: truncating the time first would snap every
        # boundary to a whole second. The small offset keeps a time that lies
        # exactly on a frame boundary from dropping to the previous frame
        # because of floating point error. Frames are counted from 0.
        return int(seconds * video_fps + 1e-6)

    annotation_data = {
        "label": TR_ACTION_NAME_MAPPING[label_id],
        "segment": [
            start_segment,
            end_segment
        ],
        "segment(frames)": [
            to_frame_index(start_segment),
            to_frame_index(end_segment)
        ],
        "label_id": label_id
    }

    return annotation_data

def get_contact_timestamp(filename_csv):
    """ Extract the hand take / hand release contacts from an annotation CSV

    Rows whose "tipo_azione" is "Hand_Take (mano-oggetto)" or
    "Hand_Release (mano-oggetto)" are selected and sorted by "timestamp".
    Only rows whose object label (with the numeric id prefix removed) is
    listed in OBJECT_NAMES are kept.

    Args:
        filename_csv (str): path of the annotation CSV; the columns
            "tipo_azione", "timestamp" and "id_label" are read

    Returns:
        pd.DataFrame: one row per contact with the columns
            "t-start" (the original "timestamp" value) and
            "class" (position of the action in ENIGMA_CLASS_NAMES)
    """

    data = pd.read_csv(filename_csv)

    def select_actions(csv_action_name, action_type):
        # .loc + assign returns a new frame, so no column is set on a slice of `data`
        rows = data.loc[data["tipo_azione"] == csv_action_name, ['timestamp', 'id_label']]
        return rows.assign(action_type=action_type)

    actions = pd.concat([
        select_actions("Hand_Take (mano-oggetto)", "hand_take"),
        select_actions("Hand_Release (mano-oggetto)", "hand_release"),
    ])
    actions = actions.sort_values(by=['timestamp'])
    actions = actions.reset_index(drop=True)

    # remove the id from the label e.g. (5.Clip_di_massa) -> Clip_di_massa
    # The dot is escaped so that a label without an id prefix is left untouched
    # instead of losing its first character.
    actions['id_label'] = [re.sub(r"^[0-9]*\.", '', x) for x in actions['id_label']]
    actions['class'] = actions['action_type'].apply(lambda x: ENIGMA_CLASS_NAMES.index(x))

    actions = actions[actions['id_label'].isin(OBJECT_NAMES)]

    # Non in-place drop/rename: the filtered frame above may be a view of the sorted one
    actions = actions.drop(columns=['id_label', 'action_type'])
    actions = actions.rename(columns={'timestamp': 't-start'})

    return actions

In [76]:
FPS = 30
anno_path = 'resources/enigma_csv'
hand_tr_segmantation_gt_folder = 'resources/hand_tr_segmantation_gt'

enigma_json = {"version": "ENIGMA", "database": {}}

for video_id in video_list:
    subset = get_subset(video_id)
    assert subset is not None, f'Video {video_id} not found in any set'

    # Contacts come from the per-video annotation CSV
    contact_timestamp_gt = get_contact_timestamp(os.path.join(anno_path, video_id + '.csv'))

    # The segmentation ground truth is only used for the video duration here
    hand_tr_segmantation_gt_filename = os.path.join(hand_tr_segmantation_gt_folder, video_id + '.csv')
    video_duration = get_video_duration(hand_tr_segmantation_gt_filename, FPS)

    enigma_json["database"][str(video_id)] = {
        "subset": subset,
        "duration": video_duration,
        "fps": FPS,
        "annotations": [
            get_annotation(row, FPS, video_duration)
            for _, row in contact_timestamp_gt.iterrows()
        ]
    }

In [77]:
class NpEncoder(json.JSONEncoder):
    """JSON encoder extended with conversions for NumPy scalars and arrays."""

    def default(self, obj):
        """Return a JSON-serializable equivalent of a NumPy value."""
        conversions = (
            (np.integer, int),
            (np.floating, float),
            (np.ndarray, lambda array: array.tolist()),
        )
        for numpy_type, convert in conversions:
            if isinstance(obj, numpy_type):
                return convert(obj)
        # Anything else is left to the base class, which raises TypeError
        return super().default(obj)


# Write the dataset description; NpEncoder takes care of NumPy values in the structure
with open(enigma_json_filename, 'w') as out_file:
    json.dump(enigma_json, out_file, cls=NpEncoder)

# Temp

In [4]:
# Quick look at the ground truth of a single video
video_id = '44'

hand_tr_segmantation_gt_filename = os.path.join(hand_tr_segmantation_gt_folder, video_id + '.csv')
hand_tr_segmantation_gt = pd.read_csv(hand_tr_segmantation_gt_filename)

# Last rows of the segmentation
print(hand_tr_segmantation_gt.tail())

# Frame count: 'end_index' of the last row plus one (frame indices start at 0)
last_segment = hand_tr_segmantation_gt.iloc[-1]
tot_frames = last_segment['end_index'] + 1
print(tot_frames)

    start_index  end_index  class
56        34084      34099      2
57        34100      34167      1
58        34168      34927      0
59        34928      34980      1
60        34981      38283      0
38284
