In [1]:
import numpy as np

import caffe

from lib import run_net
from lib import score_util

from datasets.youtube import youtube
from datasets.pascal_voc import pascal

Configure Caffe and load net

In [2]:
# Select GPU device 0 first, then switch Caffe into GPU mode.
caffe.set_device(0)
caffe.set_mode_gpu()

# Network definition and model file, loaded in TEST phase.
net_prototxt = '../nets/stage-voc-fcn8s.prototxt'
net_caffemodel = '../nets/voc-fcn8s-heavy.caffemodel'
net = caffe.Net(net_prototxt, net_caffemodel, caffe.TEST)

Dataset details

In [3]:
# Dataset wrappers. YT supplies the frames and labels evaluated below;
# PV.classes is used below to translate net outputs into YT class indices.
YT = youtube('/x/youtube/')
PV = pascal('/x/PASCAL/VOC2011')

# Number of YouTube classes; sizes the (n_cl, n_cl) histograms below.
n_cl = len(YT.classes)
# Iterated below as (class_, vid, shot) tuples.
inputs = YT.load_dataset()

Set base clock/subsampling rate

In [4]:
# Base clock rate in frames. Used below as the lag to the previously
# processed frame (f - CR) and in the skip guard (f < 2*CR + 1).
CR = 10 # subsample amount -- we used only every 10 frames for paper

# Oracle per frame

In [5]:
# Oracle: run the full net on every evaluated label frame and accumulate
# one histogram over all of them.
hist_perframe = np.zeros((n_cl, n_cl))
for class_, vid, shot in inputs:
    for f in YT.list_label_frames(class_, vid, shot):
        # Skip label frames with index below 2*CR + 1 (two clock periods)
        # to align with pipeline.
        if f < 2*CR + 1:
            continue

        # Full forward pass on the current frame.
        im = YT.load_frame(class_, vid, shot, f)
        out = run_net.segrun(net, YT.preprocess(im))

        # Translate PASCAL class indices into YouTube class indices; anything
        # that is not a YouTube class keeps the initial value 0.
        out_yt = np.zeros(out.shape, dtype=np.int32)
        for c in YT.classes:
            out_yt[out == PV.classes.index(c)] = YT.classes.index(c)

        label = YT.make_label(YT.load_label(class_, vid, shot, f), class_)
        hist_perframe += score_util.fast_hist(label.flatten(), out_yt.flatten(), n_cl)

acc, cl_acc, mean_iu, fw_iu = score_util.get_scores(hist_perframe)
# Single-argument parenthesised prints produce the same text on Python 2.
print('Oracle: Per frame')
print('acc\t\t cl acc\t\t mIU\t\t fwIU')
print('{:f}\t {:f}\t {:f}\t {:f}\t'.format(100*acc, 100*cl_acc, 100*mean_iu, 100*fw_iu))

Oracle: Per frame
acc		 cl acc		 mIU		 fwIU
95.255650	 82.298180	 69.999789	 91.424557	


# Clockwork baseline every other frame

In [6]:
# Baseline: predict from the frame CR steps earlier and score that
# prediction against the label of the current frame.
hist_baseline = np.zeros((n_cl, n_cl))
for class_, vid, shot in inputs:
    for f in YT.list_label_frames(class_, vid, shot):
        # Skip label frames with index below 2*CR + 1 (two clock periods)
        # to align with pipeline.
        if f < 2*CR + 1:
            continue

        # The net only ever sees the lagged frame (f - CR).
        im = YT.load_frame(class_, vid, shot, f - CR)
        out = run_net.segrun(net, YT.preprocess(im))

        # Translate PASCAL class indices into YouTube class indices; anything
        # that is not a YouTube class keeps the initial value 0.
        out_yt = np.zeros(out.shape, dtype=np.int32)
        for c in YT.classes:
            out_yt[out == PV.classes.index(c)] = YT.classes.index(c)

        # The label is taken from the current frame f, not the lagged one.
        label = YT.make_label(YT.load_label(class_, vid, shot, f), class_)
        hist_baseline += score_util.fast_hist(label.flatten(), out_yt.flatten(), n_cl)

# Score the lagged histogram merged with the per-frame (on-frame) histogram
# from the oracle cell, to avoid bias in every other evaluation.
acc, cl_acc, mean_iu, fw_iu = score_util.get_scores(hist_baseline + hist_perframe)
# Single-argument parenthesised prints produce the same text on Python 2.
print('Baseline (Every other on subsample by {})'.format(CR))
print('acc\t\t cl acc\t\t mIU\t\t fwIU')
print('{:f}\t {:f}\t {:f}\t {:f}\t'.format(100*acc, 100*cl_acc, 100*mean_iu, 100*fw_iu))

Baseline (Every other on subsample by 10)
acc		 cl acc		 mIU		 fwIU
94.244592	 78.600476	 65.643063	 89.739591	


# Alternating Clockwork

In [7]:
# Alternating clockwork: assume the previous frame received a full update,
# then push the current frame through the clockwork forward pass.
# (Scores are merged with the per-frame result from the oracle cell below.)
hist_alternate = np.zeros((n_cl, n_cl))

for class_, vid, shot in inputs:
    for f in YT.list_label_frames(class_, vid, shot):
        # Skip label frames with index below 2*CR + 1 (two clock periods)
        # to align with pipeline.
        if f < 2*CR + 1:
            continue

        # Full run on the frame CR steps back; its output is discarded, the
        # call is made for the state it leaves in `net`.
        im = YT.load_frame(class_, vid, shot, f - CR)
        _ = run_net.segrun(net, YT.preprocess(im))

        # Clockwork forward on the current frame.
        # NOTE(review): meaning of the trailing False is defined in
        # lib.run_net -- confirm there.
        im = YT.load_frame(class_, vid, shot, f)
        out = run_net.adaptive_clock_forward(net, YT.preprocess(im), False)

        # Translate PASCAL class indices into YouTube class indices; anything
        # that is not a YouTube class keeps the initial value 0.
        out_yt = np.zeros(out.shape, dtype=np.int32)
        for c in YT.classes:
            out_yt[out == PV.classes.index(c)] = YT.classes.index(c)

        label = YT.make_label(YT.load_label(class_, vid, shot, f), class_)
        hist_alternate += score_util.fast_hist(label.flatten(), out_yt.flatten(), n_cl)

# Score the clockwork histogram merged with the per-frame (on-frame)
# histogram, to avoid bias in every other evaluation.
acc, cl_acc, mean_iu, fw_iu = score_util.get_scores(hist_alternate + hist_perframe)
# Single-argument parenthesised prints produce the same text on Python 2.
print('Alternating Clockwork (Every other on subsample by {})'.format(CR))
print('acc\t\t cl acc\t\t mIU\t\t fwIU')
print('{:f}\t {:f}\t {:f}\t {:f}\t'.format(100*acc, 100*cl_acc, 100*mean_iu, 100*fw_iu))

Alternating Clockwork (Every other on subsample by 10)
acc		 cl acc		 mIU		 fwIU
94.590488	 79.387382	 66.959342	 90.291947	


# Adaptive Clockwork

In [8]:
def sm_diff(prev_fts, fts):
    """Return the fraction of positions whose argmax along axis 0 changed.

    Both inputs are reduced with an argmax over axis 0; the result is the
    mean of the element-wise "differs" mask between the two argmax maps
    (0.0 when they agree everywhere, 1.0 when they disagree everywhere).
    Neither input is modified.
    """
    changed = np.argmax(prev_fts, axis=0) != np.argmax(fts, axis=0)
    return np.mean(changed)

def adaptive_clockwork_youtube(thresh):
    """Evaluate adaptive clockwork over the YouTube label frames and print scores.

    For each shot, the frame CR steps before the first evaluated label frame
    is pushed through the full net once. Every evaluated frame is then run up
    to 'score_pool4'; the layers 'conv5_1'..'upscore2' are re-run only when
    sm_diff between the current 'score_pool4' scores and those of the last
    updated frame is at least `thresh`. The layers from 'score_pool4c' onward
    are always run to produce the final prediction.

    Args:
        thresh: minimum sm_diff value (fraction of changed argmax positions)
            that triggers an update of the later layers.

    Returns:
        The tuple (acc, cl_acc, mean_iu, fw_iu) from score_util.get_scores
        on the histogram accumulated over all evaluated frames.
    """
    hist = np.zeros((n_cl, n_cl))
    num_frames = 0
    num_update_frames = 0
    for (class_, vid, shot) in inputs:
        is_first = True
        for f in YT.list_label_frames(class_, vid, shot):
            # skip label frames with index below 2*CR + 1 (two clock periods)
            # to align with pipeline
            if f < 2*CR+1:
                continue

            num_frames += 1 # index the total number of frames
            if is_first: # push the CR-frame lag through the net
                im = YT.load_frame(class_, vid, shot, (f-CR))
                run_net.segrun(net, YT.preprocess(im))
                prev_fts = net.blobs['score_pool4'].data[0].copy()
                is_first = False

            # Run to pool4 on current frame
            im = YT.load_frame(class_, vid, shot, f)
            run_net.feed_net(net, YT.preprocess(im))
            net.forward(start='conv1_1', end='score_pool4')
            curr_fts = net.blobs['score_pool4'].data[0].copy()

            # Decide whether or not to update to fc7; the difference is
            # computed once per frame.
            if sm_diff(prev_fts, curr_fts) >= thresh: # push through rest of net
                net.forward(start='conv5_1', end='upscore2')
                prev_fts = net.blobs['score_pool4'].data[0].copy()
                num_update_frames += 1

            # Compute full merge score
            net.forward(start='score_pool4c')
            out = net.blobs['score'].data[0].argmax(axis=0).astype(np.uint8)
            # Translate PASCAL class indices into YouTube class indices;
            # anything that is not a YouTube class keeps the initial value 0.
            out_yt = np.zeros(out.shape, dtype=np.int32)
            for c in YT.classes:
                out_yt[out == PV.classes.index(c)] = YT.classes.index(c)

            label = YT.load_label(class_, vid, shot, f)
            label = YT.make_label(label, class_)
            hist += score_util.fast_hist(label.flatten(), out_yt.flatten(), n_cl)

    acc, cl_acc, mean_iu, fw_iu = score_util.get_scores(hist)
    # Report 0.0% instead of dividing by zero when no frame was evaluated.
    pct_updated = 100.0*num_update_frames/num_frames if num_frames else 0.0
    # One pre-formatted string per print: same text as the original
    # multi-argument print, and valid on both Python 2 and Python 3.
    print('Adaptive Clockwork: Threshold {}  Updated {:d}/{:d} frames ({:2.1f}%)'.format(thresh, num_update_frames, num_frames, pct_updated))
    print('acc\t\t cl acc\t\t mIU\t\t fwIU')
    print('{:f}\t {:f}\t {:f}\t {:f}\t'.format(100*acc, 100*cl_acc, 100*mean_iu, 100*fw_iu))
    return acc, cl_acc, mean_iu, fw_iu

# Sweep the update threshold; each call prints its own summary.
# (In the recorded output below, 0.2545 corresponds to 50.0% updated frames.)
thresholds = (0.10, 0.25, 0.35, 0.2545)
for thresh in thresholds:
    adaptive_clockwork_youtube(thresh)

Adaptive Clockwork: Threshold 0.1  Updated 1727/1851 frames (93.3%)
acc		 cl acc		 mIU		 fwIU
95.251757	 82.298574	 69.992369	 91.418386	
Adaptive Clockwork: Threshold 0.25  Updated 962/1851 frames (52.0%)
acc		 cl acc		 mIU		 fwIU
94.971220	 80.688052	 68.342614	 90.943456	
Adaptive Clockwork: Threshold 0.35  Updated 405/1851 frames (21.9%)
acc		 cl acc		 mIU		 fwIU
93.446679	 72.902067	 58.985571	 88.465425	
Adaptive Clockwork: Threshold 0.2545  Updated 926/1851 frames (50.0%)
acc		 cl acc		 mIU		 fwIU
94.974965	 81.086213	 68.496321	 90.954592	
