In [None]:
import caffe
import lmdb
from PIL import Image
import cv2
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import os, time
import datetime
import random

# Directory listings for the raw clips and their annotation files
# (paths are relative to the notebook's working directory).
video_files, annotation_files = map(os.listdir, ("videoclips", "annotations"))

# example.avi is left out of the clip list; list.remove raises ValueError
# if the file is not present in videoclips/.
video_files.remove("example.avi")

def log(string):
    """
    Print a message prefixed with the current local time.

    Args:
        string    : str
            message to print
    """
    # strftime gives a fixed "YYYY-MM-DD HH:MM:SS" stamp. Slicing str(now) instead
    # leaves a trailing "." and cuts into the time whenever the microsecond field
    # is 0 (str() omits it in that case).
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print("%s: %s" % (timestamp, string))

In [None]:
# Fraction of each annotation file that is kept for this (small) run.
proportion = 0.3


def _sample_annotations(path, fraction):
    """
    Read an annotation file and draw a random subset of its lines.

    Args:
        path      : str
            annotation file to read
        fraction  : float
            share of the lines to keep

    Returns:
        (int, numpy.ndarray): number of lines in the file and the sampled lines
    """
    with open(path) as fp:
        # NOTE(review): the last two characters of every line are dropped
        # (presumably a "\r\n" line ending) -- confirm against the files.
        lines = [line[:-2] for line in fp.readlines()]
    # replace=False: np.random.choice samples WITH replacement by default, which
    # would put the same clip into the subset several times.
    # NOTE(review): no random seed is set, so the subset differs between runs.
    sample = np.random.choice(lines, int(len(lines) * fraction), replace=False)
    return len(lines), sample


train_clean_count, train_clean = _sample_annotations("annotations/train_clean.txt", proportion)
print(train_clean_count)

_, test_clean = _sample_annotations("annotations/test_clean.txt", proportion)

_, train_auto = _sample_annotations("annotations/train_auto.txt", proportion)


In [None]:
# Number of train_clean annotation lines kept after subsampling.
len(train_clean)

In [None]:
# Peek at the first sampled line: take the text after the first "<" and split it on ">".
train_clean[0].split("<")[1].split(">")

In [None]:
# Kernels

# 3x3 Sobel filter whose weights change sign from the left to the right column.
sobel_x = np.array([[-1, 0, 1],
                    [-2, 0, 2],
                    [-1, 0, 1]])

# 3x3 Sobel filter whose weights change sign from the top to the bottom row.
sobel_y = np.array([[1, 2, 1],
                    [0, 0, 0],
                    [-1, -2, -1]])

# Named filter banks; each entry is the list of filters applied together.
kernels = dict(sobel=[sobel_x, sobel_y])

def convolution(X, kernel):
    """
    Slide a bank of filters over an image and sum their responses.

    Each output pixel is the sum, over every filter in `kernel`, of the
    elementwise product between the filter and the image window anchored at
    that pixel. The filters are not flipped and there is no padding, so the
    output is smaller than the input.

    Args:
        X         : numpy.ndarray
            input image, either 2-d (rows, cols) or 3-d (rows, cols, channels).
            A 3-d image is reduced by summing its channels first, which equals
            filtering every channel separately and summing the responses.
        kernel    : list
            list of filters. All filters should have the same dimension

    Returns:
        numpy.ndarray: float64 array of shape
        (rows - filter_rows + 1, cols - filter_cols + 1)

    Raises:
        ValueError: if X is neither 2-d nor 3-d
    """
    image = np.asarray(X, dtype=np.float64)
    if image.ndim == 3:
        # Without this reduction a (rows, cols, channels) window would broadcast
        # the 2-d filter over the (cols, channels) axes instead of (rows, cols).
        image = image.sum(axis=2)
    elif image.ndim != 2:
        raise ValueError("X must be a 2-d or 3-d array, got %i dimensions" % image.ndim)

    kernel_rows, kernel_cols = kernel[0].shape

    # Number of rows and columns
    target_x = image.shape[0] - kernel_rows + 1
    target_y = image.shape[1] - kernel_cols + 1

    target = np.zeros((target_x, target_y))

    # The responses are summed, so applying the filters one by one equals
    # applying their elementwise sum once.
    combined = np.sum([np.asarray(f, dtype=np.float64) for f in kernel], axis=0)

    # Accumulate one shifted copy of the image per filter tap instead of
    # visiting every output pixel in Python.
    for row_offset in range(kernel_rows):
        for col_offset in range(kernel_cols):
            weight = combined[row_offset, col_offset]
            if weight != 0:
                target += weight * image[row_offset:row_offset + target_x,
                                         col_offset:col_offset + target_y]

    return target

def capture_and_release(fname, resize=(200, 200), sobel=False, show=False):
    """
    Decode every frame of a video file into a list of arrays scaled by 1/255.

    Args:
        fname     : str
            path of the video file
        resize    : tuple or None
            size handed to PIL's Image.resize (width, height); None keeps the
            decoded size
        sobel     : bool
            when True each frame is passed through convolution() with the
            "sobel" filters and cast to float32
        show      : bool
            when True print the frame shape and count and display one frame

    Returns:
        list: one numpy.ndarray per decoded frame (empty if nothing was decoded)
    """
    cap = cv2.VideoCapture(fname)
    frames = []
    try:
        # No HighGUI window is opened by this function, so the loop does not
        # poll the keyboard; it simply runs until the decoder reports no frame.
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            if resize is not None:
                frame = np.array(Image.fromarray(np.array(frame).astype(np.uint8)).resize(resize))
            if sobel:
                frame = convolution(frame, kernels["sobel"])
                frame = frame.astype(np.float32)
            frame = frame / 255.
            frames.append(frame)
    finally:
        # Release the capture even if decoding or filtering raised.
        cap.release()
        cv2.destroyAllWindows()

    if show:
        print("Number of frames:  %i" % len(frames))
        if frames:
            # Show the second frame when there is one, otherwise the only frame.
            preview = frames[min(1, len(frames) - 1)]
            print("Shape:  %s" % (preview.shape,))
            if sobel:
                plt.imshow(preview, cmap="Greys_r")
            elif preview.ndim == 3 and preview.shape[2] == 3:
                # OpenCV decodes to BGR channel order while matplotlib expects
                # RGB; flip the channels for display only (returned frames are
                # left untouched).
                plt.imshow(preview[:, :, ::-1])
            else:
                plt.imshow(preview)
            plt.show()

    return frames

In [None]:
# Decode videoclips/example.avi with the default 200x200 resize and display a
# preview frame (show=True).
fname = "videoclips/example.avi"
frames = capture_and_release(fname, show=True)

In [None]:
# Two-level lookup table:
# video file name => frame number => list of label ids (a frame can carry several)
video_to_frame_to_label = dict((fname, {}) for fname in video_files)
stores = [train_clean, test_clean, train_auto]

# Label name (as written between "<" and ">" in the annotations) => integer id.
label_bank = {"None": 0, "SitUp": 1, "GetOutCar": 2, "StandUp": 3, "AnswerPhone": 4,
             "Kiss": 5, "HugPerson": 6, "HandShake": 7, "SitDown": 8}

for store in stores:
    for line in store:
        # The clip name sits between the first pair of double quotes.
        video_title = line.split("\"")[1]
        # The frame span sits between the first "(" and ")" as "begin-end".
        span = line.split("(")[1].split(")")[0]
        begin, end = tuple(int(bound) for bound in span.split("-"))
        # Every "<Name>" group on the line contributes one label id.
        labels = [label_bank[group.split(">")[0]] for group in line.split("<")[1:]]
        # NOTE(review): the range covers begin-1 .. end inclusive -- confirm this
        # matches the frame numbering used by the annotation files.
        for i in range(begin - 1, end + 1):
            per_frame = video_to_frame_to_label[video_title]
            per_frame[i] = per_frame[i] + labels if i in per_frame else labels

In [None]:
# Show up to ten of the video names that have an entry in the label table.
video_to_frame_to_label.keys()[:10]

In [None]:
# Additional helper functions


def get_frames_and_labels(f, sobel=False):
    """
    Decode a clip and pair every frame with its label(s).

    Args:
        f         : str
            path of the clip, e.g. videoclips/FILENAME.avi; only the file name
            part is used to look up the annotations
        sobel     : bool
            forwarded to capture_and_release

    Returns:
        (list, list): frames and labels of equal length. A frame with several
        labels appears once per label; a frame without annotation gets label 0.
    """
    frames = capture_and_release(f, sobel=sobel)
    # basename also copes with paths that have more than one directory level.
    video_title = os.path.basename(f)
    # A clip that has no entry in the table is treated as fully un-annotated
    # instead of raising KeyError.
    frame_to_labels = video_to_frame_to_label.get(video_title, {})

    return_frames = []
    return_labels = []
    for idx, frame in enumerate(frames):
        if frame is None:
            continue
        # [0] is the fallback for non-annotated frames.
        for label in frame_to_labels.get(idx, [0]):
            return_frames.append(frame)  # If a frame has two labels, we add two frames
            return_labels.append(label)
    return return_frames, return_labels

# Write to the lmdb
def write_to_lmdb(lmdb_name, frames, labels=None, write_label=True):
    """
    Write frames (and optionally labels) into an LMDB as Caffe Datum records.

    Records are keyed by the zero-padded position of the frame ("00000000",
    "00000001", ...), so writing to an existing database overwrites the
    records with the same positions.

    Args:
        lmdb_name   : str
            path of the LMDB directory
        frames      : sequence of numpy.ndarray
            3-d frames indexed as (height, width, channels)
        labels      : sequence or None
            one integer label per frame; required when write_label is True
        write_label : bool
            whether to store labels[i] in each record

    Raises:
        ValueError: if write_label is True and there are fewer labels than frames
    """
    label_count = 0 if labels is None else len(labels)
    if write_label and label_count < len(frames):
        raise ValueError("write_label=True needs one label per frame (%i frames, %i labels)"
                         % (len(frames), label_count))

    # Ten times the raw payload, as a plain int for lmdb.
    map_size = int(np.array(frames).nbytes * 10)
    env = lmdb.open(lmdb_name, map_size=map_size)
    try:
        with env.begin(write=True) as txn:
            for i in range(len(frames)):
                datum = caffe.proto.caffe_pb2.Datum()
                datum.channels = frames[i].shape[2]
                datum.width = frames[i].shape[1]
                datum.height = frames[i].shape[0]
                # NOTE(review): the bytes are written in the frame's in-memory
                # (height, width, channel) order and native dtype; Caffe's stock
                # Data layer reads `data` as channel-major uint8 -- confirm the
                # consumer of this database expects this layout.
                datum.data = np.array(frames[i]).tobytes()
                if write_label:
                    datum.label = int(labels[i])
                str_id = "{:08}".format(i)

                txn.put(str_id.encode("ascii"), datum.SerializeToString())
    finally:
        # Close the environment explicitly rather than relying on garbage collection.
        env.close()

# Pool of record ids handed out by append_to_lmdb, in shuffled order.
# The shuffle uses its own fixed-seed generator so the permutation is the same
# in every session: appending to an existing database after a kernel restart
# must continue the same key sequence, otherwise new records can land on keys
# that are already taken and overwrite them.
key_store = list(range(200000))
random.Random(0).shuffle(key_store)
            
# Append to an existing LMDB
def append_to_lmdb(lmdb_name, frames, labels=None, trim=None, write_label=True, first_time=False):
    """
    Append frames (and optionally labels) to an LMDB as Caffe Datum records.

    Record keys are taken from the module-level key_store, starting right after
    the number of entries already in the database (or at the beginning when
    first_time is True).

    Args:
        lmdb_name   : str
            path of the LMDB directory
        frames      : sequence of numpy.ndarray
            3-d frames indexed as (height, width, channels)
        labels      : sequence or None
            one integer label per frame; required when write_label is True
        trim        : float or None
            fraction of the frames dropped from EACH end before writing
            (should be < 0.5); the labels are trimmed the same way
        write_label : bool
            whether to store the label in each record
        first_time  : bool
            when True the current entry count is ignored and keys are handed
            out from the start of key_store

    Raises:
        ValueError: if write_label is True and there are fewer labels than frames
        IndexError: if key_store has too few ids left for this batch
    """
    log("Created")
    log("%i frames" % len(frames))
    if len(frames) == 0:
        # Nothing to size the map with and nothing to write.
        log("No frames to write")
        return

    # Sized from the untrimmed batch; lmdb needs a plain int.
    map_size = int(sum(np.asarray(frame).nbytes for frame in frames) * 1e4)
    env = lmdb.open(lmdb_name, map_size=map_size)
    try:
        # Get the max_key
        max_key = 0
        if not first_time:
            max_key = env.stat()["entries"]
            log("%i entries so far" % max_key)

        if trim is not None:
            # Trim should < 0.5
            trim_count = int(len(frames) * trim)
            # A count of 0 must be skipped: frames[0:-0] would be empty.
            if trim_count > 0:
                frames = frames[trim_count:-trim_count]
                # Keep labels aligned with the frames that survive the trim.
                if labels is not None:
                    labels = labels[trim_count:-trim_count]
            log("Trimmed to %i frames" % len(frames))

        label_count = 0 if labels is None else len(labels)
        if write_label and label_count < len(frames):
            raise ValueError("write_label=True needs one label per frame (%i frames, %i labels)"
                             % (len(frames), label_count))

        # The last id used below is key_store[max_key + len(frames)].
        if max_key + 1 + len(frames) > len(key_store):
            raise IndexError("key_store has %i ids; cannot place %i frames after %i entries"
                             % (len(key_store), len(frames), max_key))

        with env.begin(write=True) as txn:
            for i in range(len(frames)):
                datum = caffe.proto.caffe_pb2.Datum()
                datum.channels = frames[i].shape[2]
                datum.width = frames[i].shape[1]
                datum.height = frames[i].shape[0]
                # NOTE(review): the bytes are written in the frame's in-memory
                # (height, width, channel) order and native dtype; Caffe's stock
                # Data layer reads `data` as channel-major uint8 -- confirm the
                # consumer of this database expects this layout.
                datum.data = np.array(frames[i]).tobytes()
                if write_label:
                    datum.label = int(labels[i])
                str_id = '{:08}'.format(key_store[max_key + 1 + i])
                txn.put(str_id.encode("ascii"), datum.SerializeToString())
    finally:
        # The same path is reopened for every clip, so close it explicitly.
        env.close()

In [None]:
def create_lmdb(fnames, db, trim=None, sobel=False):
    """
    Decode every annotated clip and append its frames and labels to an LMDB.

    Args:
        fnames    : sequence of str
            annotation lines; the clip name is the text between the first pair
            of double quotes
        db        : str
            path of the LMDB directory
        trim      : float or None
            forwarded to append_to_lmdb
        sobel     : bool
            forwarded to get_frames_and_labels
    """
    total = len(fnames)
    wrote_any = False
    for idx, f in enumerate(fnames):
        video = f.split("\"")[1]
        log("%i, %s" % (idx, video))
        frames, labels = get_frames_and_labels("videoclips/" + video, sobel=sobel)
        # The last frame is dropped below, so at least two frames are needed
        # for anything to be written (and frames[0] needs at least one).
        if len(frames) < 2:
            log("Skipping %s: only %i frame(s) decoded" % (video, len(frames)))
            continue
        if len(frames[0].shape) == 2:
            # Convert to 3-d data
            frames = np.array([frame[:, :, np.newaxis] for frame in frames])
        # first_time is set for the first clip that is actually written.
        # NOTE(review): the last frame/label of every clip is dropped -- reason
        # not visible here.
        append_to_lmdb(db, frames[:-1], labels[:-1], trim=trim, first_time=not wrote_any)
        wrote_any = True
        # idx is 0-based; report the number of clips completed.
        log("Finished %i/%i" % (idx + 1, total))
        # NOTE(review): one-second pause between clips kept as is -- purpose
        # not visible here.
        time.sleep(1)
    log("Done")

In [None]:
# Build train_clean_small_lmdb from the sampled train_clean annotations;
# trim=0.4 is forwarded to append_to_lmdb, which drops that fraction of each
# clip's frames from both ends.
create_lmdb(train_clean, "train_clean_small_lmdb", trim=0.4, sobel=False)

In [None]:
# Build test_clean_small_lmdb from the sampled test_clean annotations;
# trim=0.45 is forwarded to append_to_lmdb, which drops that fraction of each
# clip's frames from both ends.
create_lmdb(test_clean, "test_clean_small_lmdb", trim=0.45, sobel=False)


In [None]:
# Collect the frames of every sampled training clip into one in-memory list.
total = []
for f in train_clean:
    video = f.split("\"")[1]
    frames, labels = get_frames_and_labels("videoclips/" + video, sobel=False)
    if len(frames) == 0:
        # Nothing decoded for this clip; frames[0] below would raise IndexError.
        continue
    if len(frames[0].shape) == 2:
        # Convert to 3-d data
        frames = np.array([frame[:, :, np.newaxis] for frame in frames])
    # extend() appends frame by frame whether `frames` is a list or an ndarray,
    # and avoids re-copying the whole list on every clip as `total + frames` does.
    total.extend(frames)