- Read a video
- Sample frames from it (15 frames) 1- 13 - 1 (before fg, fg, after fg)
- Extract fc7 activation layer from VGG net for every frame
- Average activation values for all the frames of a video
- use them as features in a classification model

In [1]:
import cv2
import numpy as np
import pandas as pd
import os
from IPython.display import Image
from IPython.display import display
from matplotlib import pyplot as plt

#Video reading my cv2 stopped working since ffmpeg has some issue on my system. So using skvideo to read.
from skvideo.io import VideoCapture
import bisect
from time import time

In [2]:
from keras.utils.generic_utils import Progbar
#Deep learning packages
import theano
from keras.models import Sequential
from keras.optimizers import SGD
from keras.activations import relu
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.layers.core import Dense, Flatten, Dropout

Using Theano backend.


In [24]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.cross_validation import train_test_split, KFold
from sklearn.metrics import f1_score
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

In [3]:
'''Given a file name and number - samples frames according to foreground object distribution in a video'''
def frame_sampling(fpath,n):
    #Get first and last frames of the video
    count_fg_f = []
    count_fg_l = []
    vid = VideoCapture(fpath)

    success, frame_first = vid.read()
    while success:
        success, frame = vid.read()
        if success:
            frame_last = frame

    frame_first = cv2.cvtColor(frame_first, cv2.COLOR_BGR2GRAY)
    frame_last = cv2.cvtColor(frame_last, cv2.COLOR_BGR2GRAY)
    #diff_l_f = cv2.absdiff(frame_last, frame_first)

    #Find out the frames where foreground object starts, ends
    #reset the indexer to the first frame
    vid = VideoCapture(fpath)
    success, frame = vid.read()
    while success:
        success, frame = vid.read()
        if success:
            frame_ng = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            diff_n_f = cv2.absdiff(frame_ng, frame_first)
            diff_n_l = cv2.absdiff(frame_ng, frame_last)
            _,diff_n_f = cv2.threshold(diff_n_f,80,255,cv2.THRESH_BINARY)
            _,diff_n_l = cv2.threshold(diff_n_l,80,255,cv2.THRESH_BINARY)
            #erode
            kernel = np.ones((5,5), np.uint8)
            diff_n_f = cv2.erode(diff_n_f, kernel, iterations=1)
            diff_n_l = cv2.erode(diff_n_l, kernel, iterations=1)
            # Accummulate fg counts with first frame and last frame
            count_fg_f.append(np.count_nonzero(diff_n_f))
            count_fg_l.append(np.count_nonzero(diff_n_l))
    fg_start = bisect.bisect(count_fg_f, 500) #Frame where fg starts
    fg_end = len(count_fg_l)-bisect.bisect(count_fg_l[::-1],500)-1 #Frame fg ends
    end = len(count_fg_l)-1

    # get n samples based on foreground object distribution
    fg_diff = fg_end - fg_start
    if fg_diff > 50:
        sample = range(fg_start, fg_end,(fg_end-fg_start)/(n-1))
        #print len(sample)
        sample.insert(0,0)
        if len(sample) >= n-1:
            sample = sample[:n-1]
        else: 
            while len(sample) != n-1:
                sample.append(fg_end)

        sample.append(end)
    else:
        sample = range(0,end, end/n)
        sample = sample[:n]
    print "%s length is %d" %(fpath, len(sample))

    return sample

In [4]:
def get_sampled_frames(fpath,sample):
    vid = VideoCapture(fpath)
    frames=[]
    
    success, fr = vid.read()
    #get zero frame
    frx=cv2.resize(fr, (224,224))
    frx = frx.astype(np.float32)
    print frx.dtype
    #read in RGB by skvideo, keeping the same format. cv2 would have read it in BGR format though!
    frx[:,:,0] -= 123.68
    frx[:,:,1] -= 116.779
    frx[:,:,2] -= 103.909

    frames.append(frx.astype(np.float32))
    
    #Get all the frames with sample index in the video
    fr_index = 0
    sam_index = 1
    while success:
        success, fr = vid.read()
        # Get it - If the index stroed in samples == index of the current frame
        if success and sam_index < len(sample):
            fr_index += 1
            if fr_index == sample[sam_index]:
                sam_index += 1
                frx=cv2.resize(fr, (224,224))
                frx = frx.astype(np.float32)
                frx[:,:,0] -= 123.68
                frx[:,:,1] -= 116.779
                frx[:,:,2] -= 103.909
                frames.append(frx.astype(np.float32))
    return frames

In [5]:
'''Very Deep Convolutional Networks for Large-Scale Image Recognition
K. Simonyan, A. Zisserman
arXiv:1409.1556'''
#VGG16 model - ILSVRC - 2014 competition
#Mean = [103.909, 116.779, 123.68]
#BGR format
#

def VGG_16(weights_path = None):
    model = Sequential()
    model.add(ZeroPadding2D((1,1), input_shape = (3,224,224)))
    model.add(Convolution2D(64,3,3, activation="relu"))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(64,3,3, activation="relu"))
    model.add(MaxPooling2D((2,2), strides = (2,2)))
    
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(128, 3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(128, 3, 3, activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))

    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(256, 3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(256, 3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(256, 3, 3, activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))

    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(512, 3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(512, 3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(512, 3, 3, activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))

    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(512, 3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(512, 3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(512, 3, 3, activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))

    model.add(Flatten())
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1000, activation='softmax'))

    if weights_path:
        model.load_weights(weights_path)

    return model

In [6]:
def get_network():
    model = VGG_16('/Users/homw/Documents/petp/Yelp/vgg16_weights.h5')
    sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(optimizer=sgd, loss='categorical_crossentropy')

    #Now get the layers of interest from VGG net
    #We are interested in using the layers of net before the dense layer
    my_out = model.layers[32].get_output(train = False)
    input_layer = model.get_input(train = False)

    my_net = theano.function([input_layer], my_out)
       
    return my_net

In [7]:
net = get_network()

In [9]:
#Give the path to folder with all the videos
path = '/Users/homw/Documents/MSDS16/IndStudy/videos/'

In [10]:
start = time()
features = []
mean_feature = []
num_frames = 15
for f in os.listdir(path)[1:]:
#vid_file = '3132 - Fri Jan 15 10-24-13 2016.mp4'
    fpath = os.path.join(path, f)
    sample = frame_sampling(fpath,num_frames)
    frames = get_sampled_frames(fpath,sample)
    
    #Convert the sample frames of a video into a 4D array
    frames = np.array(frames)
    frames = frames.reshape(len(frames),3,224,224)
    frames = frames.astype(np.float32)
    
    print "frames of %s is ready for vggnet with size %d" %(f,frames.shape[0])
    #Get 4095 length feature vector for every frame - size num_frames X 4095
    vid_feature = net(frames)
    #Get the mean feature out of all the sampled frames and store it
    mean_feature.append(np.mean(vid_feature, axis=0))
    #store frame level features
    features.append(vid_feature)
print "time taken is %d secs" %((time()-start))

/Users/homw/Documents/MSDS16/IndStudy/videos/3132 - Fri Jan 15 10-24-13 2016.mp4 length is 15
float32
frames of 3132 - Fri Jan 15 10-24-13 2016.mp4 is ready for vggnet with size 15
/Users/homw/Documents/MSDS16/IndStudy/videos/3132 - Fri Jan 15 14-44-18 2016.mp4 length is 15
float32
frames of 3132 - Fri Jan 15 14-44-18 2016.mp4 is ready for vggnet with size 15
/Users/homw/Documents/MSDS16/IndStudy/videos/3132 - Fri Jan 15 15-47-38 2016.mp4 length is 15
float32
frames of 3132 - Fri Jan 15 15-47-38 2016.mp4 is ready for vggnet with size 15
/Users/homw/Documents/MSDS16/IndStudy/videos/3132 - Fri Jan 15 19-15-07 2016.mp4 length is 15
float32
frames of 3132 - Fri Jan 15 19-15-07 2016.mp4 is ready for vggnet with size 15
/Users/homw/Documents/MSDS16/IndStudy/videos/3132 - Fri Jan 15 19-15-44 2016.mp4 length is 15
float32
frames of 3132 - Fri Jan 15 19-15-44 2016.mp4 is ready for vggnet with size 15
/Users/homw/Documents/MSDS16/IndStudy/videos/3132 - Sat Jan 16 09-59-18 2016.mp4 length is 15
f

In [14]:
X = np.array(mean_feature)
np.save("mean_feature.npy",X)
data = pd.read_csv("video_labels.csv")

In [85]:
np.save("all_fatures.npy", np.array(features))

In [26]:
target = data.ix[:,"actual label"]
mlb = MultiLabelBinarizer()
target = mlb.fit_transform(target)

In [102]:
data.head()

Unnamed: 0.1,Unnamed: 0,Day,Day_of_week,File_Name,Month,RoomNo,Time_hh,Time_mm,Time_ss,Year,Act_MedSink,Act_ItemsPlaced,Act_HandWash,Act_Others,actual label
0,0,15,Fri,3132 - Fri Jan 15 10-24-13 2016,Jan,3132,10,24,13,2016,0,1,1,0,R
1,1,15,Fri,3132 - Fri Jan 15 14-44-18 2016,Jan,3132,14,44,18,2016,0,1,0,0,I
2,2,15,Fri,3132 - Fri Jan 15 15-47-38 2016,Jan,3132,15,47,38,2016,0,1,1,0,H
3,3,15,Fri,3132 - Fri Jan 15 19-15-07 2016,Jan,3132,19,15,7,2016,0,0,0,1,P
4,4,15,Fri,3132 - Fri Jan 15 19-15-44 2016,Jan,3132,19,15,44,2016,0,0,0,1,P


In [105]:
#Train a binary classfier for Handwash
#In the same way other activities "M", "I", "R", "P" can be tried.
target = data.ix[:,"actual label"]
handwash = [1 if i == "H" else 0 for i in target]
print handwash
rf = RandomForestClassifier(n_estimators=100, max_depth=1000, verbose=0)
rf.fit(X[:21,:], handwash[:21])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0]


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=1000, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [106]:
# predicted Probabilities
rf.predict_proba(X), handwash

(array([[ 0.9 ,  0.1 ],
        [ 0.97,  0.03],
        [ 0.98,  0.02],
        [ 1.  ,  0.  ],
        [ 1.  ,  0.  ],
        [ 0.92,  0.08],
        [ 0.94,  0.06],
        [ 0.94,  0.06],
        [ 0.98,  0.02],
        [ 0.94,  0.06],
        [ 0.87,  0.13],
        [ 0.85,  0.15],
        [ 0.27,  0.73],
        [ 0.91,  0.09],
        [ 0.17,  0.83],
        [ 0.11,  0.89],
        [ 0.82,  0.18],
        [ 0.12,  0.88],
        [ 0.23,  0.77],
        [ 0.97,  0.03],
        [ 0.97,  0.03]]),
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0])

In [196]:
#Not used in the code!!
#Getting weights for each layer. But it is very easy to use theano function to get the 
#activations of the layer of interest
weight_dic = {}
n=0
for layer in model.layers:
    w = layer.get_weights()
    weight_dic[n] = w
    n+=1

In [197]:
len(weight_dic)

37