# Demo: Excitation Backprop for RNNs

This example shows how to use Excitation Backprop to visualize the top-down attention of a CNN+LSTM model.

In [1]:
import numpy as np
import argparse
import sys
import os
import time
import pdb
import glob
import h5py
import pylab, operator, csv
import urllib
from skimage import transform, filters
import moviepy.editor as mpy
import matplotlib.pyplot as plt
import pylab
%matplotlib inline
import io
import base64
from IPython.display import HTML
import util_LSTM
import util_preprocess_data

# Root of the Caffe checkout; this file is expected to be in {caffe_root}/examples
caffe_root = '../'
# Put the checkout's Python bindings first on the module search path so the
# `import caffe` that follows resolves against them.
sys.path.insert(0, os.path.join(caffe_root, 'python'))

import caffe

We first load the model files and set the names of the top layers.

In [2]:
# Network definition, trained weights and class names all live in one folder.
model_dir = caffe_root + '/models/VGG16_LSTM'
model_file = model_dir + '/deploy.prototxt'  # 128 frames per clip
model_weights = model_dir + '/VGG16LSTM_UCF101plusBU101.caffemodel'

# Build the network on the GPU in test (inference) mode.
caffe.set_mode_gpu()
net = caffe.Net(model_file, model_weights, caffe.TEST)

# tags / tag2ID: the tag names and a tag-name -> ID lookup
# (tag2ID is indexed by tag name in the visualization cell).
tags, tag2ID = util_LSTM.loadTags(model_dir + '/catName.txt')

# Names of the two top-most layers and of their output blobs; in this model
# each layer and its blob share the same name.
topLayerName = topBlobName = 'fc8-final'
secondTopLayerName = secondTopBlobName = 'fc7'

We select a video that consists of two concatenated UCF101 actions. The selected video is then pre-processed.

In [5]:
# Which demo clip to run
demo = 1

# Folder with the input frames of 2 concatenated actions (128 frames)
vid_fpath = '{}/excitationBP-RNNs/data/demo{}'.format(caffe_root, demo)
# Sort the matches so the frames are processed in file-name order
frames = sorted(glob.glob(vid_fpath + '/*jpg'))

# Pre-process the frames; `data` and `clip` are later copied into the
# network's 'data' and 'clip_markers' blobs.
data, clip = util_preprocess_data.get_batches(net, frames)

A slow playback of the original video to be fed into the model.

In [6]:
# Create the original video from the input frames (12 fps -> slow playback)
original_video_path = 'original_video.mp4'
output = mpy.ImageSequenceClip(frames, fps=12)
output.write_videofile(original_video_path)

# Play the original video: read the file back, base64-encode it and embed it
# in an HTML5 <video> tag. The file is opened read-only inside a `with` block
# so the handle is closed as soon as the bytes have been read.
with io.open(original_video_path, 'rb') as video_file:
    vid = video_file.read()
encoded = base64.b64encode(vid)
HTML(data='''<video alt="test" controls>
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii')))

[MoviePy] >>>> Building video original_video.mp4
[MoviePy] Writing video original_video.mp4


100%|██████████| 129/129 [00:00<00:00, 222.40it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: original_video.mp4 



We do a forward pass and show the top predictions.

In [7]:
frames_per_clip = data.shape[0]  # number of frames per clip

# Copy the pre-processed frames and clip markers into the network inputs
net.blobs['data'].data[...] = data
net.blobs['clip_markers'].data[...] = clip

# Forward pass
out = net.forward()

# The 'probs' blob is indexed as [frame, clip, class]; read the sizes from the
# blob instead of hard-coding the number of classes.
probs = net.blobs['probs'].data
clips_per_batch = probs.shape[1]  # number of clips per batch
num_classes = probs.shape[2]

# In case we decide to have more than one clip per batch, flatten the output
# probabilities to one row per (clip, frame) pair, clip-major. The buffer is
# sized from the blob so it also holds the rows of additional clips, and it is
# kept in float64 so the averaged scores match a float64 accumulation buffer.
# NOTE(review): frames_per_clip is taken from data.shape[0]; confirm it is still
# the per-clip frame count if get_batches ever returns several clips.
out_probs_reshape = (
    probs[:frames_per_clip]
    .transpose(1, 0, 2)
    .reshape(-1, num_classes)
    .astype(np.float64)
)

# Average the per-frame probabilities, then rank the tags by score
scores = np.mean(out_probs_reshape, axis=0)
tagScore = util_LSTM.getTagScore(scores, tags, tag2ID)
tagScore.sort(key=operator.itemgetter(1), reverse=True)
print(tagScore[:15])

[('CliffDiving', 0.64351659791881299), ('HorseRiding', 0.32360550800018306), ('Surfing', 0.025320480303634839), ('Skiing', 0.0012462827599636144), ('HandstandWalking', 0.0010282857829396073), ('SkyDiving', 0.00058607083189423281), ('SkateBoarding', 0.00036403186998666101), ('BalanceBeam', 0.00025730674053594545), ('Diving', 0.00022755445591675981), ('FrontCrawl', 0.00018622570007040501), ('WritingOnBoard', 0.00017438140970340381), ('Hammering', 0.00016326771160324839), ('RopeClimbing', 0.00014706189920182144), ('WallPushups', 0.00013876597542676421), ('PlayingDaf', 0.00011865326514678183)]


Now we visualize one of the two actions present in the video clip, using our contrastive Excitation Backprop for RNNs (cEB-R).

In [8]:
# Compute the cEB-R saliency maps for one chosen action tag: the result
# `attMap` is the diff of the `outputBlobName` blob summed over axis 1.

# switch to the excitation backprop mode
caffe.set_mode_eb_gpu()

# specify the tagName of the desired action
# Demo1: tagName = 'CliffDiving', tagName = 'HorseRiding'
# Demo2: tagName = 'HandstandWalking', tagName = 'IceDancing'
# Demo3: tagName = 'BaseballPitch', tagName = 'Billiards'
tagName = 'HorseRiding' 

# specify the output layer name (the layer at which the backward pass stops)
# NOTE(review): outputLayerName is assigned but not used in this cell; the
# backward call in Module 3 passes outputBlobName as its `end` argument. Both
# hold the same string here, so the result is the same.
outputLayerName = 'conv5_1'
outputBlobName = 'conv5_1'

# Flag for the indicator vector: True seeds only the last timestep,
# False seeds every timestep (see Module 1)
propLastFrameOnly= True
# ID of the selected tag; used below as the index into the last axis of the top blob's diff
tagID = tag2ID[tagName]

# --------------------------------- cEB-R ---------------------------------
net.blobs[topBlobName].diff[...] = 0 # Initializing topBlobName 

# Module 1: RNN BACKWARD
# ----------------------
# The top blob's diff is indexed as [frame, clip, tag]; set the entry of the
# selected tag to 1 to seed the backward pass.
if propLastFrameOnly: # propagate 1 only from the last timestep
  for c in range(clips_per_batch):
    net.blobs[topBlobName].diff[frames_per_clip-1,c,tagID] = 1 
else: # propagate 1 from every timestep
  net.blobs[topBlobName].diff[:,:,tagID] = 1

# First pass: backward from the top layer down to the second-top layer with
# the original weights; keep a copy because the next pass overwrites the diff.
out = net.backward(start = topLayerName, end =  secondTopLayerName)
attMap1 = net.blobs[secondTopBlobName].diff.copy()

# Second pass: same backward step with the sign of the top layer's weights
# flipped in place; the weights are flipped back right after so the network
# is left unchanged.
# NOTE(review): net.params is indexed with topBlobName here; it holds the same
# string as topLayerName in this notebook.
net.params[topBlobName][0].data[...] *= -1 # invert top layer weights
out = net.backward(start = topLayerName, end = secondTopLayerName)
attMap2 = net.blobs[secondTopBlobName].diff.copy()
net.params[topBlobName][0].data[...] *= -1 # invert back top layer weights

# Module 2: NORMALIZATION
# -----------------------
# Python's builtin sum() reduces an array along its first axis, so two nested
# calls remove the two leading axes.
# NOTE(review): norm1/norm2 are scalars only if the second-top diff is 2-D --
# TODO confirm the blob's shape. A zero norm is not guarded against.
norm1=sum(sum(attMap1[...]))
norm2=sum(sum(attMap2[...]))
for ff in range(frames_per_clip):
  if propLastFrameOnly:
    # scale each frame's map by the overall norm
    attMap1[ff]/=norm1
    attMap2[ff]/=norm2
  else:
    # replace each frame's map by its own sum times the number of frames,
    # then scale by the overall norm
    attMap1[ff]=sum(attMap1[ff])*frames_per_clip
    attMap1[ff]/=norm1
    attMap2[ff]=sum(attMap2[ff])*frames_per_clip
    attMap2[ff]/=norm2
  # compute the contrastive signal and write it back as the second-top blob's
  # diff, where it seeds the CNN backward pass below
  net.blobs[secondTopBlobName].diff[ff]=attMap1[ff]-attMap2[ff]

# Module 3: CNN BACKWARD
# ----------------------
# Continue the backward pass from the second-top layer down to the output
# layer, then sum the resulting diff over axis 1 to get one map per entry
# along axis 0.
out = net.backward(start = secondTopLayerName, end = outputBlobName)
attMap = ((net.blobs[outputBlobName].diff).sum(1))
# ------------------------------------------------------------------------


A slow playback of the video created with the overlaid saliency maps.

In [None]:
# Save frames with overlaid saliency maps
overlap_frames = []
for j in range(attMap.shape[0]):  # loops over saliency maps of frames
    current_frame = frames[j]
    # Clamp negative values of the map to zero (in place)
    attMap[j] = np.maximum(attMap[j], 0)
    img = caffe.io.load_image(current_frame)
    frameMap = util_LSTM.showAttMap(img, [attMap[j]], tagName, overlap=True, blur=False)
    # Name the output after the input frame; basename() handles whichever
    # path separator glob returned on this platform.
    frameMapName = 'cEB-R_' + os.path.basename(current_frame)

    # Draw the overlay on a borderless axes that fills the whole figure
    fig = plt.figure()
    ax = plt.Axes(fig, [0., 0., 1., 1.])
    ax.set_axis_off()
    fig.add_axes(ax)
    ax.imshow(frameMap)
    fig.savefig(frameMapName, pad_inches=0)
    # Release the figure once it is on disk: one figure is created per frame,
    # and leaving them all open accumulates memory and makes the inline
    # backend render every one of them below the cell.
    plt.close(fig)
    overlap_frames.append(frameMapName)

# Create the final video (12 fps -> slow playback)
final_video_path = "cEB-R_" + tagName + ".mp4"
overlap_vid = mpy.ImageSequenceClip(overlap_frames, fps=12)
overlap_vid.write_videofile(final_video_path)

# Play the final video: read the file back, base64-encode it and embed it in
# an HTML5 <video> tag. The file is opened read-only inside a `with` block so
# the handle is closed as soon as the bytes have been read.
with io.open(final_video_path, 'rb') as video_file:
    video = video_file.read()
encoded = base64.b64encode(video)
HTML(data='''<video alt="test" controls>
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii')))