In [1]:
import h5py
import os
import re

import numpy as np

  from ._conv import register_converters as _register_converters


In [2]:
# Root directory of the dataset; the relative paths below are appended to it.
HOME_PATH = '/home/shubhams/Hercules/kidstube-data/'
# Per-video annotation file, relative to HOME_PATH; '{0}' is filled with the video id.
ANNOTATION_PATH_SUFFIX = 'annotations/{0}.txt'
# HDF5 file (relative to HOME_PATH) used as the default target of save_checkpoint.
DATASET_PATH = 'processed/annotated_data.hdf5'

# Maps the textual label found in an annotation line to its integer class code.
ANNOTATION_LABELS = dict(none=0, violent=1, sexual=2, both=3)

In [3]:
def save_checkpoint(frame_data, annotations, video_ids, path=HOME_PATH+DATASET_PATH):
    """Append a batch of frames, labels and video ids to the HDF5 file at ``path``.

    The file is opened in append mode. The 'frames', 'annotations' and 'vids'
    datasets are created (empty, gzip-compressed, resizable along axis 0) if
    they do not exist yet; each one is then grown by the number of incoming
    rows and the new rows are written at its end.

    Args:
        frame_data: Sequence convertible with ``np.array``; appended to
            'frames', which is created with a per-row shape of (6, 512).
        annotations: Sequence convertible with ``np.array``; appended to
            'annotations'.
        video_ids: Sequence convertible with ``np.array``; appended to 'vids',
            which is created as a variable-length string dataset.
        path: Location of the HDF5 file.

    Side effects:
        Writes to the file at ``path`` and prints the resulting shape of
        'frames'.
    """
    # Convert before touching the file so a bad input cannot leave it half-updated.
    frame_data = np.array(frame_data)
    annotations = np.array(annotations)
    video_ids = np.array(video_ids)

    with h5py.File(path, 'a', libver='latest') as f:
        created_any = False

        # Create each dataset independently: a file holding only some of the
        # three must not trigger re-creation of the ones that already exist.
        if 'frames' in f:
            frame_dset = f['frames']
        else:
            frame_dset = f.create_dataset('frames', shape=(0, 6, 512), maxshape=(None, 6, 512), compression='gzip')
            created_any = True

        if 'annotations' in f:
            annotation_dset = f['annotations']
        else:
            annotation_dset = f.create_dataset('annotations', shape=(0, ), maxshape=(None,), compression='gzip')
            created_any = True

        if 'vids' in f:
            vids_dset = f['vids']
        else:
            vids_dset = f.create_dataset('vids', shape=(0, ), maxshape=(None, ), compression='gzip', dtype=h5py.special_dtype(vlen=str))
            created_any = True

        if created_any:
            # As before, SWMR mode is switched on only after dataset creation.
            f.swmr_mode = True

        for dset, rows in ((frame_dset, frame_data),
                           (annotation_dset, annotations),
                           (vids_dset, video_ids)):
            n_new = rows.shape[0]
            if n_new == 0:
                # Nothing to append. (A negative-zero slice, dset[-0:], would
                # address the whole dataset instead of an empty tail.)
                continue
            start = dset.shape[0]
            dset.resize(start + n_new, axis=0)
            # Explicit start offset instead of a negative slice.
            dset[start:] = rows

        print(frame_dset.shape)

In [5]:
def read_hdf5(name, path=HOME_PATH + 'processed/aggregate_1_sec/frames_features.hdf5'):
    """Open the HDF5 file at ``path`` read-only and return the object stored under ``name``.

    The file is not closed by this function; every call opens the file anew.
    NOTE(review): the returned object is presumably only usable while that
    file stays open - confirm before adding a close here.
    """
    return h5py.File(path, 'r')[name]

In [6]:
# Handles to the per-row video ids and the per-row frame features
# (both looked up in the default features file of read_hdf5).
vids, frames = read_hdf5('vids'), read_hdf5('frames')

In [8]:
# Collapse consecutive rows of `vids` that share the same id into one record
# per run: {'start_index': first row of the run, 'vid': id, 'length': row count}.
vid_details = []

n_rows = vids.shape[0]
if n_rows > 0:  # an empty dataset yields an empty vid_details instead of an IndexError
    prev = vids[0]
    vid_dict = {'start_index': 0}
    ctr = 0  # rows seen in the current run, not counting the row at index i

    for i in range(1, n_rows):
        vid = vids[i]
        ctr += 1
        if prev != vid:
            # Row i starts a new run: close the previous one.
            vid_dict['vid'] = prev
            vid_dict['length'] = ctr
            vid_details.append(vid_dict)

            vid_dict = {'start_index': i}
            ctr = 0
            prev = vid

    # Close the final run. The loop only emits a record when the id changes,
    # so without this step the last video would be dropped. `ctr` has not yet
    # counted the row that opened this run, hence the + 1.
    vid_dict['vid'] = prev
    vid_dict['length'] = ctr + 1
    vid_details.append(vid_dict)

In [11]:
# Show the per-video records (start_index / vid / length) built in the previous cell.
vid_details

[{'start_index': 0, 'vid': '5938eff100e5263c1d7eee07', 'length': 1473},
 {'start_index': 1473, 'vid': '5938eff100e5263c1d7eee08', 'length': 1502},
 {'start_index': 2975, 'vid': '5938eff100e5263c1d7eee09', 'length': 1502},
 {'start_index': 4477, 'vid': '5938eff100e5263c1d7eee0a', 'length': 1502},
 {'start_index': 5979, 'vid': '5938eff100e5263c1d7eee0b', 'length': 1472},
 {'start_index': 7451, 'vid': '5938eff100e5263c1d7eee0c', 'length': 1502},
 {'start_index': 8953, 'vid': '5938eff100e5263c1d7eee0d', 'length': 1502},
 {'start_index': 10455, 'vid': '5938eff100e5263c1d7eee0e', 'length': 1502},
 {'start_index': 11957, 'vid': '5938eff200e5263c1d7eee0f', 'length': 1502},
 {'start_index': 13459, 'vid': '5938eff200e5263c1d7eee10', 'length': 1501},
 {'start_index': 14960, 'vid': '5938eff200e5263c1d7eee11', 'length': 1502},
 {'start_index': 16462, 'vid': '5938eff200e5263c1d7eee12', 'length': 1502},
 {'start_index': 17964, 'vid': '5938eff200e5263c1d7eee13', 'length': 1502},
 {'start_index': 19466

In [None]:
# For every video, compare the number of feature rows with the number of
# lines in its annotation file and record whether the two disagree.
compare_list = []
for detail in vid_details:
    vid = detail['vid']
    file_path = HOME_PATH + ANNOTATION_PATH_SUFFIX.format(vid)

    try:
        with open(file_path) as annotation_file:
            annotation_lines = [line.strip() for line in annotation_file.readlines()]
    except FileNotFoundError:
        # Videos without an annotation file are reported and left out of compare_list.
        print('File not found for Video ID: ', vid)
        continue

    compare_dict = {
        'vid': vid,
        'file_length': detail['length'],
        'annotation_length': len(annotation_lines),
    }
    compare_dict['conflict'] = compare_dict['file_length'] != compare_dict['annotation_length']
    compare_list.append(compare_dict)

In [None]:
# Print every comparison record flagged as conflicting, then the number of them.
# (The equality test against True is kept so the filter matches records exactly as before.)
conflicting = [entry for entry in compare_list if entry['conflict'] == True]
for entry in conflicting:
    print(entry)
ctr = len(conflicting)
print(ctr)

In [None]:
# Pair each annotation line of a video with the feature row at the same offset
# inside that video's run of rows, keeping only lines whose label is known.
# NOTE(review): the per-video lists are rebuilt on every iteration and are not
# passed to save_checkpoint in this cell - confirm whether a save step is missing.

# The label is the trailing run of characters that are neither ':' nor a space.
label_pattern = re.compile('[^: ]+$')

for detail in vid_details:
    vid = detail['vid']
    bad_annotation_ctr = 0
    annotated_features = []
    annotations = []
    annotation_vids = []

    file_path = HOME_PATH + ANNOTATION_PATH_SUFFIX.format(vid)
    try:
        with open(file_path) as f:
            content = [x.strip() for x in f]
    except FileNotFoundError:
        # Same handling as the length-comparison cell: report and move on.
        print('File not found for Video ID: ', vid)
        continue

    for frame_index, annotation in enumerate(content):
        m = label_pattern.search(annotation)
        # A line with no match (e.g. blank, or ending in ':') has no label at
        # all; count it as a bad annotation instead of failing on m.group().
        label = ANNOTATION_LABELS.get(m.group(0)) if m is not None else None
        if label is None:
            bad_annotation_ctr += 1
            continue

        # Fetch the feature row before appending anything so the three lists
        # stay aligned even if the lookup raises.
        feature = frames[frame_index + detail['start_index']]
        annotations.append(label)
        annotated_features.append(feature)
        annotation_vids.append(vid)

    print(vid, detail['length'], len(annotated_features), bad_annotation_ctr)
    # The annotation file must not describe more rows than the video owns.
    assert len(annotated_features) + bad_annotation_ctr <= detail['length'], \
        'more annotation lines than feature rows for this video'