# Code for spatio-temporal Maps using ebrnn
## Author: Shakthi Duraimurugan

In [1]:
import os
import sys
from os import path
import time
from time import sleep

import tensorflow as tf
import numpy as np
# import cv2
# import matplotlib.pyplot as plt
import pylab as plt
from sklearn.utils import shuffle

from tqdm import tqdm, trange, tqdm_notebook, tnrange
import glob
import time
import pandas as pd
import h5py
import pickle as pkl
import subprocess as sp
import pdb

In [2]:
# GPU / logging configuration for TensorFlow, applied in one step:
#   CUDA_DEVICE_ORDER     - enumerate GPUs by PCI bus id
#   CUDA_VISIBLE_DEVICES  - expose only device 0 to this process
#   TF_CPP_MIN_LOG_LEVEL  - set TensorFlow's native log threshold to 3
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
    "TF_CPP_MIN_LOG_LEVEL": "3",
})

In [3]:
# Filesystem locations of the TF-Slim checkout and (presumably) of the
# pretrained model checkpoints -- machine-specific absolute paths.
slim_dir = "/mnt/workspace/models/research/slim/"
checkpoints_dir = "/mnt/workspace/models/checkpoints/"
# Put the slim checkout first on the import path so that its top-level
# `nets` and `preprocessing` packages resolve in the two imports below.
# This must run before those imports.
sys.path.insert(0, slim_dir)
from nets import vgg
from preprocessing import vgg_preprocessing

## Define EBRNN Class

In [5]:
class ebrnn(object):
    """VGG-16 frame-feature extractor plus an LSTM classification head.

    Two independent TensorFlow graphs are built at construction time:

    * a slim VGG-16 graph that takes a batch of raw 240x320 RGB frames,
      resizes and mean-subtracts them, and exposes the ``conv5_3``, ``fc7``
      and softmax tensors;
    * an LSTM graph that consumes one sequence of 4096-d frame features and
      produces per-class logits from the last time step.

    Each graph gets its own session (``vgg_sess`` / ``lstm_sess``).
    """

    # Geometry of the raw frames decoded by ffmpeg and fed to the VGG graph.
    FRAME_HEIGHT = 240
    FRAME_WIDTH = 320
    FRAME_CHANNELS = 3
    # Width of the per-frame feature vector expected by the LSTM input.
    FEATURE_DIM = 4096

    def __init__(self, image_size, batch_size, image_mean, n_hidden=256):
        """Set dataset paths and build the VGG-16 and LSTM graphs.

        Args:
            image_size: Side length (pixels) frames are resized to before
                entering VGG-16.
            batch_size: Number of frames per VGG forward pass.
            image_mean: Sequence of three per-channel values subtracted from
                the resized frames.
            n_hidden: Number of LSTM units.
                NOTE(review): the previous code read an undefined name for
                this; 256 is a placeholder default -- confirm the intended
                size.
        """
        # Set Paths
        self.dataset_dir = "/mnt/workspace/datasets/UCF-101/"
        self.label_dir = "/mnt/workspace/datasets/ucf101/ucf24/labels/"
        self.checkpoints_dir = "/mnt/workspace/models/checkpoints/"
        self.extracted_features_dir = "/mnt/workspace/ebrnn-tf/fc7_features/"

        # Class names are the entry names found directly under label_dir.
        self.labels = [x.replace(self.label_dir, "") for x in sorted(glob.glob(self.label_dir + "*"))]

        self.image_size = image_size
        self.batch_size = batch_size
        self.image_mean = image_mean
        self.n_hidden = n_hidden

        self._build_vgg_graph()
        self._build_lstm_graph()

    def _build_vgg_graph(self):
        """Create the VGG-16 graph, its input placeholder and its session."""
        slim = tf.contrib.slim
        self.vgg_graph = tf.Graph()
        with self.vgg_graph.as_default():
            self.input_batch = tf.placeholder(
                dtype=tf.uint8,
                shape=(self.batch_size, self.FRAME_HEIGHT, self.FRAME_WIDTH, self.FRAME_CHANNELS))
            resized_images = tf.image.resize_images(self.input_batch, [self.image_size, self.image_size])

            # Per-channel mean subtraction.
            channels = tf.split(axis=3, num_or_size_splits=3, value=resized_images)
            for i in range(3):
                channels[i] -= self.image_mean[i]
            normalized_images = tf.concat(axis=3, values=channels)

            with slim.arg_scope(vgg.vgg_arg_scope()):
                outputs, end_points = vgg.vgg_16(normalized_images, num_classes=1000, is_training=False)

            self.final_conv = end_points['vgg_16/conv5/conv5_3']
            self.fc7 = end_points['vgg_16/fc7']
            # The misspelled attribute is kept for existing callers; the
            # correctly spelled alias points at the same tensor.
            self.probablities = tf.nn.softmax(outputs)
            self.probabilities = self.probablities

            # NOTE(review): no weights are restored or initialised here, and
            # self.checkpoints_dir is otherwise unused -- confirm where the
            # VGG-16 checkpoint is loaded into this session.
            self.vgg_sess = tf.Session(graph=self.vgg_graph)
            # get_imagenet_features used to look the session up as `sess`;
            # keep that name available as an alias.
            self.sess = self.vgg_sess

    def _build_lstm_graph(self):
        """Create the LSTM graph, its placeholders, logits and session."""
        n_classes = len(self.labels)
        self.lstm_graph = tf.Graph()
        with self.lstm_graph.as_default():
            # One video per step; the time dimension is left open so videos
            # of any length can be fed.
            self.lstm_input = tf.placeholder(tf.float32, (1, None, self.FEATURE_DIM))
            self.lstm_targets = tf.placeholder(tf.uint8, shape=None)
            # Defined before dynamic_rnn, which consumes it.
            self.sequence_length = tf.placeholder(tf.int32, shape=None)
            self.lstm_targets_one_hot = tf.one_hot(self.lstm_targets, n_classes)

            lstm_cell = tf.contrib.rnn.BasicLSTMCell(self.n_hidden, activation=tf.nn.relu)
            seq_out, _ = tf.nn.dynamic_rnn(
                cell=lstm_cell,
                sequence_length=self.sequence_length,
                inputs=self.lstm_input,
                dtype=tf.float32)

            w = tf.get_variable(name="lstm_weights", shape=[self.n_hidden, n_classes],
                                initializer=tf.contrib.layers.xavier_initializer(), trainable=True)
            b = tf.get_variable(name="lstm_bias", shape=[n_classes],
                                initializer=tf.contrib.layers.xavier_initializer(), trainable=True)

            final_out = seq_out[:, -1, :]  # last time step's output
            # NOTE(review): the previous code additionally called an undefined
            # `lstm_model(...)` helper; the inline head above is used as the
            # logits instead -- confirm this matches the intended model.
            self.lstm_logits = tf.matmul(final_out, w) + b

            self.lstm_sess = tf.Session(graph=self.lstm_graph)

    def read_video(self, video_path):
        """Decode a video into raw RGB frames using ffmpeg.

        Frames are read from ffmpeg's stdout as fixed-size rgb24 buffers of
        FRAME_HEIGHT x FRAME_WIDTH x FRAME_CHANNELS bytes; reading stops at
        the first short read.

        Args:
            video_path: Path of the video file handed to ``ffmpeg -i``.

        Returns:
            uint8 array of shape (n_frames, 240, 320, 3). The same array is
            stored on ``self.video_frames``.
        """
        height, width, depth = self.FRAME_HEIGHT, self.FRAME_WIDTH, self.FRAME_CHANNELS
        frame_bytes = height * width * depth
        command = ['ffmpeg', '-i', video_path, '-f', 'image2pipe', '-pix_fmt', 'rgb24', '-vcodec', 'rawvideo', '-']

        frames = []
        with open(os.devnull, 'w') as devnull:
            pipe = sp.Popen(command, stdout=sp.PIPE, stderr=devnull, bufsize=10**8)
            try:
                while True:
                    raw_image = pipe.stdout.read(frame_bytes)
                    if len(raw_image) != frame_bytes:
                        break
                    # Reinterpret the raw bytes as one H x W x C frame.
                    frames.append(np.frombuffer(raw_image, dtype=np.uint8).reshape((height, width, depth)))
            finally:
                # Always release the pipe and reap the child process.
                pipe.stdout.close()
                pipe.wait()

        if frames:
            self.video_frames = np.asarray(frames, dtype=np.uint8)
        else:
            # Keep a well-formed (0, H, W, C) array when nothing was decoded.
            self.video_frames = np.zeros((0, height, width, depth), dtype=np.uint8)
        return self.video_frames

    def get_imagenet_features(self, label, video_path):
        """Extract fc7 features for every frame of a video and save them.

        The frames are pushed through the VGG graph in batches of
        ``self.batch_size`` (the final batch is zero-padded and the padded
        rows are discarded). The result is written with ``np.save`` to
        ``<extracted_features_dir>/<label>/<video name before first dot>.npy``.

        Args:
            label: Sub-directory name under ``extracted_features_dir``.
            video_path: Path of the video to process.

        Returns:
            float32 array with one fc7 output per decoded frame (also saved
            to disk).
        """
        # Read frames from the video
        self.read_video(video_path)
        frames = self.video_frames
        n = frames.shape[0]

        label_dir = self.extracted_features_dir + label
        if not os.path.isdir(label_dir):
            # makedirs also creates a missing parent features directory.
            os.makedirs(label_dir)

        filename = label_dir + "/" + video_path.split('/')[-1].split('.')[0]

        # Iterate through batches and extract features
        full_video_features = []
        batch_shape = (self.batch_size, self.FRAME_HEIGHT, self.FRAME_WIDTH, self.FRAME_CHANNELS)
        for start in range(0, n, self.batch_size):
            chunk = frames[start:start + self.batch_size]
            # The placeholder has a fixed batch dimension, so pad with zeros.
            current_batch = np.zeros(batch_shape, dtype=np.uint8)
            current_batch[:len(chunk)] = chunk

            fc7 = self.vgg_sess.run(self.fc7, feed_dict={self.input_batch: current_batch})
            # Drop the outputs that correspond to padding rows.
            full_video_features.extend(fc7[:len(chunk)])

        features = np.asarray(full_video_features, np.float32)
        np.save(filename, features)
        return features