# Concatenate RGB and flow features into one feature vector

In [34]:
import numpy as np
import os

In [None]:
# Folders holding the per-video flow / RGB feature files, and the file listing the video ids.
flow_folder = 'features/flow_anet_resnet200'
rgb_folder = 'features/rgb_anet_resnet200'
video_list_file = 'resources/video_list.txt'
# Folder that receives the concatenated per-video feature files.
new_features_folder = 'features/features_actionformer'

# Load the video ids, one entry per line of the list file.
with open(video_list_file, mode='r') as list_file:
    video_list = list_file.read().splitlines()

In [35]:
def save_one_feat_vec(flow: np.ndarray, rgb: np.ndarray, filename: str):
    """ Concatenate the flow and RGB features and save them to a file.

    Nothing is written when ``filename`` already exists. The parent folder
    of ``filename`` is created when it is missing.

    Args:
        flow (np.ndarray): Flow features. They are joined with ``rgb`` along
            axis 1, so both arrays must agree on every other dimension.
        rgb (np.ndarray): RGB features
        filename (str): Destination path. ``np.save`` appends ``.npy`` when
            the name does not already end with it.
    """

    if os.path.exists(filename):
        return

    # A bare file name has an empty dirname and os.makedirs('') raises,
    # so the parent folder is only created when there is one.
    parent_folder = os.path.dirname(filename)
    if parent_folder:
        os.makedirs(parent_folder, exist_ok=True)

    feat = np.concatenate((flow, rgb), axis=1)
    np.save(filename, feat)

In [36]:
def concatenete_video_feat(video_id: str, flow_folder: str, rgb_folder: str, new_features_folder: str):
    """ Concatenates the flow and rgb features for a video and saves the result in a new file.

    The inputs are ``<flow_folder>/<video_id>.npy`` and
    ``<rgb_folder>/<video_id>.npy``; the output is
    ``<new_features_folder>/<video_id>.npy``. When the output file already
    exists the inputs are not loaded and nothing is written.

    Args:
        video_id (str): video id of the video to process
        flow_folder (str): Folder containing the flow features for all videos
        rgb_folder (str): Folder containing the rgb features for all videos
        new_features_folder (str): Folder where the new features will be saved

    Raises:
        FileNotFoundError: If the flow or the rgb feature file is missing.
    """

    flow_filename = os.path.join(flow_folder, video_id + '.npy')
    rgb_filename = os.path.join(rgb_folder, video_id + '.npy')
    one_feat_vec_filename = os.path.join(new_features_folder, video_id + '.npy')

    missing = [path for path in (flow_filename, rgb_filename) if not os.path.exists(path)]
    if missing:
        # Raise instead of calling exit(): exit() raises SystemExit, which
        # inside a multiprocessing.Pool worker terminates the worker process
        # instead of reporting an error back to the caller of the pool.
        raise FileNotFoundError(
            f'Missing flow or rgb file for video {video_id}: ' + ', '.join(missing)
        )

    # save_one_feat_vec would skip an existing output anyway; checking here
    # avoids loading both input arrays for nothing.
    if os.path.exists(one_feat_vec_filename):
        return

    flow = np.load(flow_filename)
    rgb = np.load(rgb_filename)

    save_one_feat_vec(flow, rgb, one_feat_vec_filename)

### Load features and create feature vectors

In [27]:
import multiprocessing

# One argument tuple per video; the three folders are the same for every task.
starmap_args = [
    (video_id, flow_folder, rgb_folder, new_features_folder)
    for video_id in video_list
]

# Use as many worker processes as CPUs reported by multiprocessing.cpu_count().
num_processes = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=num_processes) as pool:
    pool.starmap(concatenete_video_feat, starmap_args)

# Create json dataset file

In [12]:
import json
import numpy as np
import pandas as pd
import os
import re
import pickle

In [13]:
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load annotation files that come from a trusted source.
annotations_filename = "resources/interaction_frames.pkl"

with open(annotations_filename, mode='rb') as pkl_handle:
    picke_file = pickle.load(pkl_handle)

In [14]:
# Pull out the entries of the loaded pickle that the following cells use.
df_interaction_frames, interaction_types, object_classes, videos_info = (
    picke_file[entry]
    for entry in ("df_annotations", "interaction_types", "object_classes", "videos")
)

##### create object classes dict

In [15]:
# Lookup table: integer class id -> class name.
object_classes_dict = dict(
    (int(class_key), class_info['class_name'])
    for class_key, class_info in object_classes.items()
)

##### Remove annotations for the test set

In [16]:
# df_interaction_frames = df_interaction_frames[df_interaction_frames['split']!="Test"]

##### expand the annotations into single interactions

In [17]:
# Flatten the frame-level annotations into one row per interaction.
# The rows are collected in a list and turned into a DataFrame once at the
# end: growing a DataFrame with pd.concat inside the loop copies all the rows
# gathered so far on every iteration.
interaction_columns = ['video_id', 'split', 'frame', 'timestamp',
                       'object_name', 'interaction_id', 'interaction_label']
interaction_records = []

for key, row in df_interaction_frames.iterrows():
    for interaction in row['interaction']:

        # drop the "<video_id>_" part of the frame id
        frame_idx = row['frame_id'].replace(f"{row['video_id']}_", '')
        # class id of the object involved in this interaction
        object_id = row['objects'][interaction['id_obj']]['class_name']

        interaction_records.append({
            'video_id': row['video_id'],
            'split': row['split'],
            'frame': frame_idx,
            'timestamp': row['timestamp'],
            'object_name': object_classes_dict[object_id],
            'interaction_id': interaction['interaction_category'],
            'interaction_label': interaction_types[interaction['interaction_category']],
        })

# Passing the columns keeps them present even when no interaction was found.
interactions = pd.DataFrame(interaction_records, columns=interaction_columns)

##### create splits for Take/Release, First Contact/Decontact, Take/Release/First Contact/Decontact

In [18]:
# Boolean masks over the interactions, computed with a vectorized membership
# test on the label column instead of a Python-level apply over every row.
interaction_labels = interactions['interaction_label']
idx_ht_hr = interaction_labels.isin(["take", "release"])
idx_fc_hd = interaction_labels.isin(["first_contact", "decontact"])

ht_hr = interactions[idx_ht_hr]          # take / release only
fc_hd = interactions[idx_fc_hd]          # first contact / decontact only
ht_hr_fc_hd = interactions.copy()        # every interaction

In [19]:
# Report the number of interactions in each split; the labels mirror the variable names.
print(f"ht_hr: {len(ht_hr)}, fc_hd: {len(fc_hd)}, ht_hr_fc_hd: {len(ht_hr_fc_hd)}")

ht_hr: 2872, fc_hr: 8300, ht_hr_fc_hd: 11172


##### create frame_ranges for take/release

In [20]:
# Frame ranges table, indexed by its "action_name" column.
frame_ranges = pd.read_csv(
    filepath_or_buffer='resources/frame_range_en.csv',
    index_col="action_name",
)

In [21]:
# Masks over the rows of frame_ranges whose index label contains "take" / "release".
idx_take = pd.Series(
    ['take' in action_name for action_name in frame_ranges.index],
    index=frame_ranges.index,
    dtype=bool,
)
idx_release = pd.Series(
    ['release' in action_name for action_name in frame_ranges.index],
    index=frame_ranges.index,
    dtype=bool,
)

# When the frame range for an interaction class is not specified, the mean of
# the available frame ranges is used instead.
take_ranges = frame_ranges[idx_take]
release_ranges = frame_ranges[idx_release]
mean_frame_range_dx_take = take_ranges['frame_range_dx'].mean()
mean_frame_range_dx_release = release_ranges['frame_range_dx'].mean()
mean_frame_range_sx_take = take_ranges['frame_range_sx'].mean()
mean_frame_range_sx_release = release_ranges['frame_range_sx'].mean()

## Using boundaries from frame ranges sx, dx

In [22]:
def create_enigma_actionformer_dataset(df_interactions, frame_ranges, mean_frame_range_dx_take, mean_frame_range_dx_release,
                                       mean_frame_range_sx_take=None, mean_frame_range_sx_release=None, videos_info=None):
    """ Build the annotation dictionary (ActionFormer json layout) for a set of interactions.

    Every interaction becomes a segment around its annotated frame:
    ``padding_sx`` frames are subtracted from it and ``padding_dx`` frames are
    added to it. The paddings are chosen as follows:

    * ``first_contact`` / ``decontact``: fixed 15 frames on both sides;
    * ``<label>-<object_name>`` found in ``frame_ranges.index``: the
      ``frame_range_sx`` / ``frame_range_dx`` values of that row;
    * otherwise: the class name is printed and the mean paddings are used
      (the release means for ``release``, the take means for anything else).

    NOTE(review): segments are not clamped to ``[0, duration]`` and the mean
    paddings are generally not whole numbers, so ``"segment(frames)"`` can
    contain negative or fractional values - confirm this is acceptable for
    the consumer of the json.

    Args:
        df_interactions (pd.DataFrame): One row per interaction with the columns
            'video_id', 'interaction_label', 'object_name', 'timestamp',
            'frame' and 'interaction_id'.
        frame_ranges (pd.DataFrame): Indexed by "<label>-<object_name>", with
            the columns 'frame_range_sx' and 'frame_range_dx'.
        mean_frame_range_dx_take: Fall-back dx padding for non-release interactions.
        mean_frame_range_dx_release: Fall-back dx padding for release interactions.
        mean_frame_range_sx_take: Fall-back sx padding for non-release
            interactions. Defaults to the module-level variable of the same name.
        mean_frame_range_sx_release: Fall-back sx padding for release
            interactions. Defaults to the module-level variable of the same name.
        videos_info (dict): Maps ``str(video_id)`` to a dict with the keys
            'split', 'duration_seconds' and 'fps'. Defaults to the module-level
            ``videos_info``.

    Returns:
        dict: ``{"version": "ENIGMA", "database": {video_id: {...}}}`` where each
        video entry holds 'subset', 'duration', 'fps' and 'annotations'.

    Raises:
        NameError: If a value that was not passed is not defined at module level either.
    """

    # Backward compatibility: these three values used to be read straight from
    # the notebook globals. The parameters share the globals' names, so the
    # fall-back has to go through globals() explicitly.
    module_globals = globals()
    if videos_info is None:
        if "videos_info" not in module_globals:
            raise NameError("name 'videos_info' is not defined")
        videos_info = module_globals["videos_info"]
    if mean_frame_range_sx_take is None:
        mean_frame_range_sx_take = module_globals.get("mean_frame_range_sx_take")
    if mean_frame_range_sx_release is None:
        mean_frame_range_sx_release = module_globals.get("mean_frame_range_sx_release")

    enigma_json = dict()
    enigma_json["version"] = "ENIGMA"
    enigma_json["database"] = dict()

    gb_videos = df_interactions.groupby('video_id')

    for video_id, vid_interactions in gb_videos:

        video_info = videos_info[str(video_id)]
        vid_subset = video_info["split"]
        vid_duration = round(video_info["duration_seconds"], 2)
        vid_fps = video_info["fps"]
        video_annotations = []

        for _, interaction in vid_interactions.iterrows():

            # calculate padding to add to the interaction
            label = interaction['interaction_label']
            class_interaction = label + '-' + interaction['object_name']

            if label in ('first_contact', 'decontact'):
                padding_dx = 15
                padding_sx = 15
            elif class_interaction in frame_ranges.index:
                class_range = frame_ranges.loc[class_interaction]
                padding_dx = class_range["frame_range_dx"]
                padding_sx = class_range["frame_range_sx"]
            else:
                # no dedicated frame range for this class: report it and use the means
                print(class_interaction)
                if label == 'release':
                    padding_dx = mean_frame_range_dx_release
                    padding_sx = mean_frame_range_sx_release
                    missing_name = 'mean_frame_range_sx_release'
                else:
                    padding_dx = mean_frame_range_dx_take
                    padding_sx = mean_frame_range_sx_take
                    missing_name = 'mean_frame_range_sx_take'
                if padding_sx is None:
                    raise NameError(f"name '{missing_name}' is not defined")

            annotation_data = {
                "label": label,
                "segment": [
                    interaction['timestamp'] - padding_sx/vid_fps,
                    interaction['timestamp'] + padding_dx/vid_fps
                ],
                "segment(frames)": [
                    int(interaction['frame']) - padding_sx,
                    int(interaction['frame']) + padding_dx
                ],
                "label_id": interaction['interaction_id']
            }

            video_annotations.append(annotation_data)

        video_data_dict = {
            "subset": vid_subset,
            "duration": vid_duration,
            "fps": vid_fps,
            "annotations": video_annotations
        }

        enigma_json["database"][str(video_id)] = video_data_dict

    return enigma_json

def save_actionformer_dataset_json(json_dataset, json_filename):
    """ Write the dataset dictionary to a json file.

    NumPy scalars and arrays found in ``json_dataset`` are converted to the
    matching built-in types while encoding. The parent folder of
    ``json_filename`` is created when it does not exist yet.

    Args:
        json_dataset (dict): Dataset dictionary to serialize
        json_filename (str): Path of the json file to write (overwritten if present)
    """
    class NpEncoder(json.JSONEncoder):
        """ JSON encoder that understands NumPy booleans, integers, floats and arrays. """
        def default(self, obj):
            if isinstance(obj, np.bool_):
                return bool(obj)
            if isinstance(obj, np.integer):
                return int(obj)
            if isinstance(obj, np.floating):
                return float(obj)
            if isinstance(obj, np.ndarray):
                return obj.tolist()
            return super(NpEncoder, self).default(obj)

    # open() does not create missing folders; a bare file name has no parent to create
    parent_folder = os.path.dirname(json_filename)
    if parent_folder:
        os.makedirs(parent_folder, exist_ok=True)

    with open(json_filename, 'w') as json_file:
        json.dump(json_dataset, json_file, cls=NpEncoder)

In [23]:
# Output json paths, one per split: take/release, first contact/decontact, and all four together.
ht_hr_json_filename = 'annotations/enigma_ht_hr.json'
fc_hd_json_filename = 'annotations/enigma_fc_hd.json'
ht_hr_fc_hd_filename = 'annotations/enigma_ht_hr_fc_hd.json'


class NpEncoder(json.JSONEncoder):
    """ JSON encoder that converts NumPy integers, floats and arrays to built-in types. """

    def default(self, obj):
        # NumPy scalars become the matching Python scalar
        for numpy_type, to_builtin in ((np.integer, int), (np.floating, float)):
            if isinstance(obj, numpy_type):
                return to_builtin(obj)
        # arrays become (nested) lists
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # anything else is left to the base class, which raises TypeError
        return super().default(obj)


# The three datasets only differ in the interactions they are built from.
common_args = dict(
    frame_ranges=frame_ranges,
    mean_frame_range_dx_take=mean_frame_range_dx_take,
    mean_frame_range_dx_release=mean_frame_range_dx_release,
)

ht_hr_json = create_enigma_actionformer_dataset(df_interactions=ht_hr, **common_args)
fc_hd_json = create_enigma_actionformer_dataset(df_interactions=fc_hd, **common_args)
ht_hr_fc_hd_json = create_enigma_actionformer_dataset(df_interactions=ht_hr_fc_hd, **common_args)

# Write each dataset to its own json file, in the same order they were built.
for dataset_json, dataset_path in (
    (ht_hr_json, ht_hr_json_filename),
    (fc_hd_json, fc_hd_json_filename),
    (ht_hr_fc_hd_json, ht_hr_fc_hd_filename),
):
    save_actionformer_dataset_json(dataset_json, dataset_path)

take-power_supply
release-power_supply
take-oscilloscope
take-welder_base
release-welder_base
release-welder_station
release-oscilloscope
release-oscilloscope
take-oscilloscope
release-oscilloscope
release-oscilloscope
release-welder_base
take-welder_base
release-welder_base
take-power_supply
release-power_supply
take-oscilloscope
take-welder_base
release-welder_base
release-welder_station
release-oscilloscope
release-oscilloscope
take-oscilloscope
release-oscilloscope
release-oscilloscope
release-welder_base
take-welder_base
release-welder_base


## Using small fixed boundaries for contact timestamp

In [74]:
# Class id -> action label used in the generated annotations.
TR_ACTION_NAME_MAPPING = {
    0: 'hand_take',
    1: 'hand_release',
}

In [75]:
def get_subset(video_id: str) -> str:
    """ Return the name of the subset that contains a video.

    Membership is checked against the module-level ``training_set``,
    ``validation_set`` and ``test_set`` collections, in that order.

    Args:
        video_id (str): video id of the video to look up

    Returns:
        str: 'training', 'validation' or 'testing'; ``None`` when the video
        is in none of the three sets
    """

    if video_id in training_set:
        return 'training'
    if video_id in validation_set:
        return 'validation'
    return 'testing' if video_id in test_set else None

def get_video_duration(hand_tr_segmantation_gt_filename: str, video_fps: float) -> float:
    """ Compute the duration of a video from its segmentation ground-truth csv.

    The number of frames is taken as the 'end_index' of the last row plus one.

    Args:
        hand_tr_segmantation_gt_filename (str): csv file with an 'end_index' column
        video_fps (float): FPS of the video

    Returns:
        float: duration in seconds, rounded to two decimals
    """
    segments = pd.read_csv(hand_tr_segmantation_gt_filename)
    last_end_index = segments['end_index'].iloc[-1]

    frame_count = last_end_index + 1
    duration = frame_count / video_fps
    return round(duration, 2)

def get_annotation(segment_series: pd.Series, video_fps: float, video_len: float,
                   half_window: float = 0.5, label_names: dict = None) -> dict:
    """ Get the annotation of a segment in the format required by the ActionFormer model

    The segment spans ``half_window`` seconds before and after the contact
    timestamp and is clipped to ``[0, video_len]``.

    Args:
        segment_series (pd.Series): Series containing the annotation of a contact
            segment_series = {
                "t-start": 47.886,
                "class": 1
            }
        video_fps (float): FPS of the video
        video_len (float): Length of the video in seconds
        half_window (float): Seconds kept on each side of the contact
            timestamp. The default of 0.5 gives a 1 second segment.
        label_names (dict): Mapping from class id to label. Defaults to the
            module-level ``TR_ACTION_NAME_MAPPING``.

    Returns:
        dict: annotation of a segment in the format required by the ActionFormer model
            annotation = {
                "label": "hand_release",
                "segment": [ 47.386, 48.386 ],
                "segment(frames)": [ 1421, 1451 ],
                "label_id": 1
            }
    """

    if label_names is None:
        label_names = TR_ACTION_NAME_MAPPING

    # IN OUR CASE WE START FRAMES FROM 0
    # IT SHOULD BE OK (https://github.com/happyharrycn/actionformer_release/issues/4#issuecomment-1050008045)
    contact_timestamp = segment_series['t-start']

    # WE CREATE A SEGMENT OF 2 * half_window SECONDS AROUND THE CONTACT
    start_segment = max(0, contact_timestamp - half_window)
    end_segment = min(video_len, contact_timestamp + half_window)

    # CLASSES START FROM 0 (NEGATIVE CLASSES ARE NOT CONSIDERED)
    label_id = int(segment_series['class'])

    # Frame containing each boundary. The multiplication by the fps has to
    # happen BEFORE truncating: truncating the seconds first would snap every
    # boundary to a whole second. The epsilon absorbs floating-point error for
    # timestamps that fall exactly on a frame boundary.
    start_index = int(start_segment * video_fps + 1e-9)
    end_index = int(end_segment * video_fps + 1e-9)

    annotation_data = {
        "label": label_names[label_id],
        "segment": [
            start_segment,
            end_segment
        ],
        "segment(frames)": [
            start_index,
            end_index
        ],
        "label_id": label_id
    }

    return annotation_data

def get_contact_timestamp(filename_csv, class_names=None, object_names=None):
    """ Read the hand take / hand release contact timestamps of a video.

    Rows of the csv whose 'tipo_azione' is "Hand_Take (mano-oggetto)" or
    "Hand_Release (mano-oggetto)" are kept, sorted by timestamp, and reduced
    to the rows whose object label (without its numeric id) is a known object.

    Args:
        filename_csv (str): csv file with the columns 'timestamp', 'id_label'
            and 'tipo_azione'
        class_names (list): Action names; the position of "hand_take" /
            "hand_release" in it becomes the class id. Defaults to the
            module-level ``ENIGMA_CLASS_NAMES``.
        object_names (list): Object labels to keep. Defaults to the
            module-level ``OBJECT_NAMES``.

    Returns:
        pd.DataFrame: columns 't-start' (timestamp) and 'class' (class id)
    """

    if class_names is None:
        class_names = ENIGMA_CLASS_NAMES
    if object_names is None:
        object_names = OBJECT_NAMES

    data = pd.read_csv(filename_csv)

    # value of 'tipo_azione' in the csv -> action name used for the class lookup
    action_types = {
        "Hand_Take (mano-oggetto)": "hand_take",
        "Hand_Release (mano-oggetto)": "hand_release",
    }

    per_action = []
    for csv_action, action_type in action_types.items():
        # .copy() so that the new column is added to an independent frame
        # and not to a slice of `data`
        rows = data.loc[data["tipo_azione"] == csv_action, ['timestamp', 'id_label']].copy()
        rows['action_type'] = action_type
        per_action.append(rows)

    actions = pd.concat(per_action)
    actions = actions.sort_values(by=['timestamp'])
    actions = actions.reset_index(drop=True)

    # remove the id from the label e.g. (5.Clip_di_massa) -> Clip_di_massa
    # The dot is escaped and at least one digit is required, so a label
    # without a numeric id is left untouched instead of losing its first character.
    actions['id_label'] = [re.sub(r"^[0-9]+\.", '', x) for x in actions['id_label']]
    actions['class'] = actions['action_type'].apply(lambda x: class_names.index(x))

    actions = actions[actions['id_label'].isin(object_names)]

    # non-inplace drop / rename: `actions` is a filtered selection at this point
    actions = actions.drop(columns=['id_label', 'action_type'])
    actions = actions.rename(columns={'timestamp': 't-start'})

    return actions

In [76]:
anno_path = 'resources/enigma_csv'
hand_tr_segmantation_gt_folder = 'resources/hand_tr_segmantation_gt'
FPS = 30

# Skeleton of the dataset; one entry per video is added to "database" below.
enigma_json = {"version": "ENIGMA", "database": {}}

for video_id in video_list:
    subset = get_subset(video_id)
    assert subset is not None, f'Video {video_id} not found in any set'

    # contact timestamps annotated for this video
    contact_timestamp_gt = get_contact_timestamp(os.path.join(anno_path, video_id + '.csv'))

    # the video duration is derived from the segmentation ground truth file
    hand_tr_segmantation_gt_filename = os.path.join(hand_tr_segmantation_gt_folder, video_id + '.csv')
    video_duration = get_video_duration(hand_tr_segmantation_gt_filename, FPS)

    # one annotation per contact timestamp
    video_annotations = []
    for _, row in contact_timestamp_gt.iterrows():
        video_annotations.append(get_annotation(row, FPS, video_duration))

    video_data_dict = dict(
        subset=subset,
        duration=video_duration,
        fps=FPS,
        annotations=video_annotations,
    )

    enigma_json["database"][str(video_id)] = video_data_dict

In [77]:
class NpEncoder(json.JSONEncoder):
    """ JSON encoder that turns NumPy integers, floats and arrays into built-in types. """

    def default(self, obj):
        if isinstance(obj, np.ndarray):
            # arrays are written as (nested) lists
            converted = obj.tolist()
        elif isinstance(obj, np.integer):
            converted = int(obj)
        elif isinstance(obj, np.floating):
            converted = float(obj)
        else:
            # unsupported objects are handled by the base class (TypeError)
            converted = super().default(obj)
        return converted


# Write the dataset to disk; NpEncoder converts the NumPy values it contains.
with open(enigma_json_filename, mode='w') as out_file:
    json.dump(enigma_json, out_file, cls=NpEncoder)

# Temp

In [4]:
# Scratch check: look at the end of the segmentation ground truth of one video.
video_id = '44'

hand_tr_segmantation_gt_filename = os.path.join(hand_tr_segmantation_gt_folder, f'{video_id}.csv')
hand_tr_segmantation_gt = pd.read_csv(hand_tr_segmantation_gt_filename)

print(hand_tr_segmantation_gt.tail())

# number of frames = end_index of the last row + 1
last_segment = hand_tr_segmantation_gt.iloc[-1]
tot_frames = last_segment['end_index'] + 1
print(tot_frames)

    start_index  end_index  class
56        34084      34099      2
57        34100      34167      1
58        34168      34927      0
59        34928      34980      1
60        34981      38283      0
38284
