**MODEL: Convolution Autoencoder -> LSTM**

**Import Statements and Global Variables**

In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import cv2
import numpy as np
import os
from keras import backend as K
from keras.layers import Conv2D, Dropout, LSTM, BatchNormalization, Input,Activation, MaxPooling2D, Flatten, Dense,TimeDistributed, UpSampling2D
from keras.models import Model, load_model
from keras import metrics 
import random
import pickle

# Directory paths
VIDEOS_DIR = './Videos/'
IMAGES_DIR = './Images/'

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


**Labels for classes**

In [2]:
# List of all classes
classes = ['Kicking', 'Riding-Horse', 'Running', 'SkateBoarding', 
           'Swing-Bench', 'Lifting', 'Swing-Side', 'Walking', 'Golf-Swing']

# Create dictionary to label each class
class_to_index = {}
for i in range(len(classes)):
    class_to_index[classes[i]] = i

**Standard Function Definitions**

In [20]:
def permute(X,Y):
    train_size = X.shape[0]
    permutation_train = np.random.permutation(train_size)
    X = X[permutation_train]
    Y = Y[permutation_train]
    return X,Y

def load_image(path,image_size):
    image = cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB)
    image = cv2.resize(image, image_size)
    return image

def convert_to_one_hot(Y, C):
    Y = np.eye(C)[Y.reshape(-1)]
    return Y

def pad(X_cnn,max_len):
    features_len = np.shape(X_cnn)[1]
    length = np.shape(X_cnn)[0]
    X_cnn = list(X_cnn)
    pad_arr = [0 for i in range(features_len)]
    for i in range(max_len-length):
        X_cnn.append(pad_arr)
    return np.array(X_cnn)

def evaluate(X_test,Y_test,model):
    count = 0
    for i in range(len(X_test)):
        pred = model.predict(X_test[i])
        max_pred = [np.argmax(i) for i in pred]
        counts = np.bincount(max_pred)
        class_pred = np.argmax(counts)
        #class_pred = max_pred
        actual = np.argmax(Y_test[i])
        #print("Max Preds time", max_pred)
        #print("Pred",classes[class_pred],"Actual",classes[actual])
        #print()
        if class_pred == actual:
            count += 1
    return float(count)/float(len(Y_test)) * 100.0

**Reading Data**

In [4]:
# List of directories in Videos folder
videos = []
for x in classes:
    videos.append(list(os.listdir(VIDEOS_DIR+x+'/')))
videos

def build_dataset(image_size, images_per_video = 10):
    # Initialise image vectors
    X_train_images = []
    Y_train_images = []
    
    # For each class in classes, do:
    vid_num, l_vid_num = 0, []
    for i in range(len(classes)):
        # Name of class
        cls = classes[i]
        
        print ("Processing ", cls, " videos...")
        # For each dir in corresponding videos folder, do:
        for j in range(len(videos[i])):
            # Get dir no, eg. '012', '016', etc.
            vid = videos[i][j]
            print ("Going through ", vid, "...")
            
            # Append to new paths
            video_r = VIDEOS_DIR+cls+'/'+ vid +'/'
            image_r = IMAGES_DIR+cls+'/'+ vid +'/'
            
            # List all images in the given image directory
            filelist = list(os.listdir(image_r))
            #print(sorted(filelist))
            
            # Initialise X_train
            X_train_images_video = []
            
            # For image in imagelist, do:
            for fichier in filelist:
                # Check if file is an image
                if fichier.endswith(".png"):
                    # Load and resize image
                    image = load_image(image_r+fichier,image_size)
                    # Append to list
                    X_train_images_video.append(image)
                    
            # Randomly shuffle list
            permutation = np.random.permutation(len(X_train_images_video))
            X_train_images_video = np.array(X_train_images_video)[permutation]
            
            # If no. of images in video < images_per_video, take that
            no_of_images = min(images_per_video, len(X_train_images_video))
            
            # Append to X_train images
            X_train_images += list(X_train_images_video[:no_of_images])
            
            # Append to Y_train_images
            Y_train_images += [i] * no_of_images
            
            # To keep track of video number
            l_vid_num += [vid_num] * no_of_images
            vid_num += 1
            
    print ("Done.")
    return np.array(X_train_images), np.array(Y_train_images), l_vid_num, vid_num

**Encoder and LSTM Models**

In [5]:
def build_encoder(input_dim):
    input_img = Input(shape=input_dim)

    x = Conv2D(16, (3, 3), activation='relu', padding='same')(input_img)
    x = MaxPooling2D((2, 2), padding='same')(x)
    x = Conv2D(8, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D((2, 2), padding='same')(x)
    x = Conv2D(8, (3, 3), activation='relu', padding='same')(x)
    encoded = MaxPooling2D((2, 2), padding='same')(x)

    x = Conv2D(8, (3, 3), activation='relu', padding='same')(encoded)
    x = UpSampling2D((2, 2))(x)
    x = Conv2D(8, (3, 3), activation='relu', padding='same')(x)
    x = UpSampling2D((2, 2))(x)
    x = Conv2D(16, (3, 3), activation='relu')(x)
    x = UpSampling2D((2, 2))(x)
    decoded = Conv2D(3, (3, 3), activation='sigmoid', padding='same')(x)

    encoder = Model(input_img, decoded)
    encoder.compile(optimizer='adadelta', loss='binary_crossentropy')

    return encoder

In [6]:
def LSTM_model(encoding_dim):
    X_input = Input(shape=encoding_dim)
    X = LSTM(32, return_sequences=True)(X_input)
    X = Dropout(0.3)(X)
    X = LSTM(32, return_sequences=False)(X)
    X = Dropout(0.3)(X)
    X = Dense(9,activation='softmax')(X)
    
    lstm = Model(X_input, X)
    lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    
    return lstm

**Read Data and Build Models**

In [7]:
data = build_dataset((172, 172))

Processing  Kicking  videos...
Going through  012 ...
Going through  016 ...
Going through  011 ...
Going through  017 ...
Going through  013 ...
Going through  003 ...
Going through  009 ...
Going through  015 ...
Going through  006 ...
Going through  007 ...
Going through  004 ...
Going through  002 ...
Going through  014 ...
Going through  001 ...
Going through  010 ...
Going through  005 ...
Processing  Riding-Horse  videos...
Going through  003 ...
Going through  009 ...
Going through  006 ...
Going through  007 ...
Going through  004 ...
Going through  002 ...
Going through  008 ...
Going through  001 ...
Going through  010 ...
Going through  005 ...
Processing  Running  videos...
Going through  009 ...
Going through  006 ...
Going through  007 ...
Going through  004 ...
Going through  002 ...
Going through  008 ...
Going through  001 ...
Going through  010 ...
Going through  005 ...
Processing  SkateBoarding  videos...
Going through  003 ...
Going through  009 ...
Going through 

In [8]:
x_img, y_img, lvid, no_of_videos = data

In [9]:
encoder = build_encoder((3, 172, 172))
encoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 3, 172, 172)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 16, 172, 172)      448       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 16, 86, 86)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 8, 86, 86)         1160      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 8, 43, 43)         0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 8, 43, 43)         584       
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 8, 22, 22)         0         
__________

In [11]:
lstm = LSTM_model((10, 3872))
lstm.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 10, 3872)          0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 10, 32)            499840    
_________________________________________________________________
dropout_3 (Dropout)          (None, 10, 32)            0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dropout_4 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 9)                 297       
Total params: 508,457
Trainable params: 508,457
Non-trainable params: 0
_________________________________________________________________


**Train Models**

In [12]:
np.shape(x_img)

(1097, 172, 172, 3)

In [13]:
x_img = x_img.astype('float32') / 255.
x_img = np.reshape(x_img, (len(x_img), 3, 172, 172))

In [14]:
encoder.fit(x_img, x_img,
                epochs=1,
                batch_size=128,
                shuffle=True)

Epoch 1/1


<keras.callbacks.History at 0x7fdc64f6b358>

In [15]:
get_6th_layer_output = K.function([encoder.layers[0].input],
                                  [encoder.layers[6].output])

layer_output = get_6th_layer_output([x_img])[0]

In [16]:
np.shape(layer_output)

(1097, 8, 22, 22)

In [22]:
lstm_in = [None] * no_of_videos
for i in range(len(lstm_in)):
    lstm_in[i] = []

for i in range(len(layer_output)):
    img = np.reshape(layer_output[i], -1)
    lstm_in[lvid[i]].append(img)

print (np.shape(lstm_in))
    
lens = [len(a) for a in lstm_in]
max_len = max(lens)
print (max_len)

for i in range(len(lstm_in)):
    lstm_in[i] = pad(lstm_in[i], max_len)
    
for a in lstm_in:
    print (np.shape(a))
    
lstm_in = np.array(lstm_in)

(111,)
10
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)
(10, 3872)


In [23]:
np.shape(lstm_in)

(111, 10, 3872)

In [24]:
lstm_y = convert_to_one_hot(y_img, 9)
np.shape(lstm_y)

(1097, 9)

In [42]:
lstm_y = [ np.where(r==1)[0][0] for r in lstm_y]

In [43]:
lstm_y

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [46]:
y = np.array([None] * no_of_videos)
    
for i in range(len(lstm_y)):
    y[lvid[i]] = lstm_y[i]
    
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5,
       5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       8], dtype=object)

In [54]:
for i in range(len(y)):
    y[i] = int(y[i])

y = np.array(y, dtype=np.int8)
    
y = convert_to_one_hot(y, 9)
y

array([[1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0.

In [55]:
lstm.fit(lstm_in, y, epochs=1, batch_size=128, shuffle=True)

Epoch 1/1


<keras.callbacks.History at 0x7fdc64451978>

**Save Model**

In [56]:
# Save model here