# Video Extraction
---

In [1]:
from moviepy.editor import VideoFileClip
import numpy as np
import tensorflow as tf

## Kinects-i3D ##
import sys
sys.path.insert(1, '../kinetics-i3d/') #insert the kinects-i3d project to the path
import i3d

### Global Variables

In [3]:
_NUM_CLASSES = 400
_IMAGE_SIZE = [224,224]
_FRAMES = 64

### Creating frames

In [8]:
def get_frames_array(video_path):
    clip = VideoFileClip(video_path, target_resolution=(_IMAGE_SIZE[0],_IMAGE_SIZE[1]))
    frames = np.array([x for x in clip.iter_frames()])
    return pad_frames(frames)
    
def pad_frames(frames):
    frames_qtt = frames.shape[0]
    if(frames_qtt < _FRAMES): #padding the frame

        pad_left_count = int((_FRAMES - frames_qtt) / 2)
        pad_right_count = _FRAMES - frames_qtt - pad_left_count

        pad_left = np.zeros((pad_left_count, frames.shape[1],  frames.shape[2],  frames.shape[3]))
        pad_right = np.zeros((pad_right_count, frames.shape[1],  frames.shape[2],  frames.shape[3]))

        rgb_array = np.concatenate((pad_left, frames, pad_right))

#         print('Array padded')

    else: 
        ##TODO: reduce the array -- CHECK IT!
        rgb_array = np.resize(frames.mean(axis=0).astype(int),
                              (_FRAMES, frames.shape[1],  frames.shape[2],  frames.shape[3]))
#         print('Array resized')

    return rgb_array

### Creating an array of utterances frames

In [9]:
def get_utterances_array(utterances_path, start=1):
    result_array = np.empty((0, _FRAMES, _IMAGE_SIZE[0],_IMAGE_SIZE[1], 3))
    count = start 
    while(True):
        try:
            f = get_frames_array(utterances_path + str(count) + ".mp4")
            result_array = np.append(result_array, [f], axis=0)
            count +=1
    #         if(count == 10):
    #             break

        except Exception as e:
            print("{0} utterances processed".format(count-start))
    #         print(e)
            break
    return result_array

In [10]:
path = "../MOSI_Dataset/Segmented/_dI--eQ6qVU_"

rgb_array = get_utterances_array(path)

28 utterances processed


In [11]:
rgb_array.shape

(28, 64, 224, 224, 3)

In [68]:
#Expand the array in one more dimension
# rgb_array = np.expand_dims(rgb_array, axis=0)
# rgb_array.shape

(1, 64, 224, 224, 3)

### Predict

In [12]:
def model_visual_features(rgb_array):    
    i3d_model = i3d.InceptionI3d(num_classes=_NUM_CLASSES, final_endpoint='Predictions')

    inp = tf.placeholder(tf.float32, [None, _FRAMES, _IMAGE_SIZE[0], _IMAGE_SIZE[1], 3])

    predictions, end_points = i3d_model(inp, is_training=True, dropout_keep_prob=0.5)

    init_op = tf.global_variables_initializer()

    # sample_input = np.zeros((5, 64, _IMAGE_SIZE[0], _IMAGE_SIZE[1], 3))
    sample_input = rgb_array

    with tf.Session() as sess:
        sess.run(init_op)
        out_predictions, out_logits = sess.run([predictions, end_points['Logits']], {inp: sample_input})
    
    return out_logits

In [None]:
out_logits = model_visual_features(rgb_array)

In [8]:
out_logits.shape

(28, 400)

### Save video features in a pickle

In [11]:
path = "../MOSI_Dataset/ProcessedData/"
filename = '_dI--eQ6qVU'
np.save(path + filename, out_logits)

In [15]:
np.load(path + filename + '.npy')

array([[-0.0776252 ,  0.52176446, -0.57059944, ...,  0.57569677,
        -0.08314126,  0.2847538 ],
       [-0.20371394,  0.45799956, -0.433181  , ...,  0.5871268 ,
         0.1312544 ,  0.4346002 ],
       [-0.18181585,  0.435361  , -0.48127335, ...,  0.5568002 ,
        -0.13335773,  0.32280993],
       ...,
       [-0.20457196,  0.5942516 , -0.5102663 , ...,  0.6374935 ,
        -0.03212039,  0.12488917],
       [-0.10982757,  0.61451787, -0.58562803, ...,  0.6420568 ,
        -0.40021592,  0.46122852],
       [-0.14870346,  0.662376  , -0.16642144, ...,  0.5224863 ,
        -0.14700334,  0.2605955 ]], dtype=float32)