In [1]:
### Batch Predictions starts

import tensorflow as tf
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
import tqdm
from sklearn.preprocessing import LabelBinarizer

# STEP 0: load the trained LSTM model from its saved .h5 file.
# NOTE: the load below is unconditional -- it runs every time this cell is executed.
BASE_DATA_PATH = 'C:\\Users\\STSC\\Desktop\\CV-Indoor\\'
saved_model_dir = os.path.join(BASE_DATA_PATH, 'my_model_.h5')
model = tf.keras.models.load_model(saved_model_dir)



In [2]:
# Check the loaded model's architecture: prints each layer with its output shape and parameter count
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking (Masking)            multiple                  0         
_________________________________________________________________
lstm (LSTM)                  multiple                  3672064   
_________________________________________________________________
dense (Dense)                multiple                  131328    
_________________________________________________________________
dropout (Dropout)            multiple                  0         
_________________________________________________________________
dense_1 (Dense)              multiple                  514       
Total params: 3,803,906
Trainable params: 3,803,906
Non-trainable params: 0
_________________________________________________________________


In [3]:
# Maximum number of frames sampled from each video clip by the frame-grabbing functions below
SEQUENCE_LENGTH = 40

In [4]:
def single_video_frame_generator():
    """Yield up to SEQUENCE_LENGTH preprocessed frames from a single video clip.

    The function takes no arguments so that it can be passed directly to
    ``tf.data.Dataset.from_generator``. The clip to read comes from the
    module-level name ``video_path``, which must be bound before the
    generator is iterated.

    Frames are sampled evenly: every ``num_frames // SEQUENCE_LENGTH``-th
    frame (at least every frame) is taken until SEQUENCE_LENGTH frames have
    been produced or the video runs out.

    Yields:
        (img, video_path): ``img`` is the frame converted from BGR to RGB,
        resized to 224x224 and passed through the MobileNetV2
        ``preprocess_input``; ``video_path`` is the path that was read.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        sample_every_frame = max(1, num_frames // SEQUENCE_LENGTH)
        remaining = SEQUENCE_LENGTH
        current_frame = 0

        while remaining > 0:
            success, frame = cap.read()
            if not success:
                break

            if current_frame % sample_every_frame == 0:
                # OpenCV reads in BGR, TensorFlow expects RGB, so invert the channel order
                frame = frame[:, :, ::-1]
                img = tf.image.resize(frame, (224, 224))
                img = tf.keras.applications.mobilenet_v2.preprocess_input(img)
                remaining -= 1
                yield img, video_path

            current_frame += 1
    finally:
        # Runs on exhaustion and on early generator close, so the video
        # handle is always freed.
        cap.release()


# Wrap single_video_frame_generator in a tf.data.Dataset.
# Each element is a (224x224x3 float32 frame, scalar string video path) pair.
# `from_generator` might throw a warning, expected to disappear in upcoming versions.
dataset = tf.data.Dataset.from_generator(
    single_video_frame_generator,
    output_types=(tf.float32, tf.string),
    output_shapes=((224, 224, 3), ()),
)

print(dataset)


<DatasetV1Adapter shapes: ((224, 224, 3), ()), types: (tf.float32, tf.string)>


In [5]:
# Build the per-frame feature extractor: MobileNetV2 with ImageNet weights and
# without its classification head (include_top=False).
mobilenet_v2 = tf.keras.applications.mobilenet_v2.MobileNetV2(input_shape=(224,224,3), include_top=False, weights='imagenet')
x = mobilenet_v2.output

# Global average pooling collapses the spatial dimensions of the final feature
# map, leaving one feature vector per image, as we don't need spatial information.
# NOTE(review): for MobileNetV2 this vector should have 1280 entries (not the
# 2048 of InceptionV3); 1280 is consistent with the LSTM parameter count
# reported by model.summary() -- confirm.
pooling_output = tf.keras.layers.GlobalAveragePooling2D()(x)
feature_extraction_model = tf.keras.Model(mobilenet_v2.input,pooling_output)


In [6]:
#STEP 3:  Now process the images in our dataset with the MobileNet feature extraction model created previously
#
#IMPORTANT:   the conversion from tensor object to numpy array ONLY WORKS IN TF 2.* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!


# step 3.1 create function to go through the specified video and return an array of images of length up to SEQUENCE_LENGTH
#          each image is resized and preprocessed for input into a FeatureExtractor MobileNet
#          returns array of tensors (1 tensor per image)
def grabImagesFromVideo_PreProcess_for_MobileNet_FeatureExtractor(video_path):
    """Sample up to SEQUENCE_LENGTH frames from a video and preprocess them.

    Frames are taken at an even stride of ``num_frames // SEQUENCE_LENGTH``
    (at least 1), where ``num_frames`` is the frame count reported by OpenCV.

    Args:
        video_path: path of the video file to read.

    Returns:
        A list of at most SEQUENCE_LENGTH tensors, one per sampled frame.
        Each frame is converted from BGR to RGB, resized to 224x224 and
        passed through the MobileNetV2 ``preprocess_input``. The list is
        empty when no frame could be read.
    """
    frames = []
    cap = cv2.VideoCapture(video_path)
    try:
        num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        sample_every_frame = max(1, num_frames // SEQUENCE_LENGTH)

        # cycle through the frames in the video
        for current_frame_index in range(num_frames):
            success, frame = cap.read()
            if not success:
                break

            # keep only every k-th (sample_every_frame) frame
            if current_frame_index % sample_every_frame != 0:
                continue

            # OpenCV reads in BGR, TensorFlow expects RGB, so invert the channel order
            frame = frame[:, :, ::-1]
            img = tf.image.resize(frame, (224, 224))
            img = tf.keras.applications.mobilenet_v2.preprocess_input(img)
            frames.append(img)

            # stop once SEQUENCE_LENGTH frames have been sampled
            if len(frames) == SEQUENCE_LENGTH:
                break
    finally:
        # Always free the video handle, even if decoding or resizing raises.
        cap.release()
    return frames
            
# step 3.1 create a second function to go through the specified video and return an array of features of length up to SEQUENCE_LENGTH
#          each image is resized and preprocessed for input into the MobileNetV2 FeatureExtractor
#          then run through the FeatureExtractor --output will be a 1x1280 feature vector for each image
#          append to the set of features and return
#          features is an array of up to SEQUENCE_LENGTH (40) numpy arrays (each 1x1280)

def grabImagesFromVideo_Process_with_MobileNet_FeatureExtractor(video_path, feature_extraction_model):
    """Extract one feature vector per sampled frame of a video.

    Frame sampling and preprocessing are delegated to
    ``grabImagesFromVideo_PreProcess_for_MobileNet_FeatureExtractor`` so the
    two functions cannot drift apart. Each preprocessed frame is then run
    through ``feature_extraction_model`` on its own.

    Args:
        video_path: path of the video file to read.
        feature_extraction_model: callable model mapping a batch of
            preprocessed 224x224 RGB frames to feature maps/vectors.

    Returns:
        A list with one numpy array per sampled frame (at most
        SEQUENCE_LENGTH entries). Each array has shape ``(1, n_features)``:
        the leading 1 is the batch dimension added before calling the model.
        The list is empty when no frame could be read.
    """
    features = []
    for img in grabImagesFromVideo_PreProcess_for_MobileNet_FeatureExtractor(video_path):
        # The model expects a batch dimension: (224, 224, 3) -> (1, 224, 224, 3)
        tensor_input = tf.expand_dims(img, axis=0)

        # process with the feature extraction model (MobileNet based)
        current_features = feature_extraction_model(tensor_input)

        # flatten everything after the batch dimension -> (1, n_features)
        current_features = tf.reshape(current_features, (current_features.shape[0], -1))

        # tensor -> numpy conversion; only works in TF 2.* (eager execution)
        features.append(current_features.numpy())

    return features


In [7]:
# Load the list of video files to classify (one path per line).
gen_list_path = 'C:/Users/STSC/Desktop/CV-Indoor/Batch_Predictions/filenames.txt'
with open(gen_list_path) as f:
    # Skip blank lines so an empty string is never passed on as a video path.
    file_list = [row.strip() for row in f if row.strip()]

In [8]:
# Perform prediction on every listed video with the LSTM model.
# Printed prefix for each class index; any other index is reported as 'Others- '.
class_prefixes = {0: 'Stairs- ', 1: 'Doors- '}

for video_path in file_list:
    print(video_path)

    # Sample frames from the video and turn each one into a feature vector.
    features = grabImagesFromVideo_Process_with_MobileNet_FeatureExtractor(video_path, feature_extraction_model)
    if not features:
        # Unreadable or empty video: nothing to save or predict on.
        print("no frames could be read from " + str(video_path) + " - skipping")
        continue

    # Save the array of feature vectors next to the video, with extension .npy.
    # splitext swaps only the real extension, so the result can never equal
    # the video path itself (which a failed '.mp4' string replace would).
    output_path = os.path.splitext(video_path)[0] + '.npy'
    np.save(output_path, features)

    # STEP 4: convert the list of feature vectors to a tensor for the LSTM model.
    # NOTE(review): each entry of `features` has shape (1, n_features), so this
    # tensor is (n_frames, 1, n_features) -- the model sees n_frames sequences of
    # length 1 and returns one prediction per frame. If the model was trained on
    # whole clips, the input should probably be (1, n_frames, n_features); confirm
    # against the training notebook before changing.
    tensor_input = tf.convert_to_tensor(features, dtype=tf.float32)

    prediction = model.predict(tensor_input, batch_size=1)
    print('Prediction:::')
    print(prediction)
    print("length of predictions- " + str(len(prediction)))

    for i, p in enumerate(prediction):
        class_index = int(np.argmax(p))
        print("for prediction " + str(i))
        print("index of max predicition class ")
        print(class_prefixes.get(class_index, 'Others- '), class_index)

C:\Users\STSC\Desktop\CV-Indoor\Batch_Predictions\Doors\VID_20191201_011325053.mp4
Prediction:::
[[1.16993277e-07 9.99999881e-01]
 [1.49868626e-07 9.99999881e-01]
 [7.47531175e-08 9.99999881e-01]
 [1.10521363e-07 9.99999881e-01]
 [1.02813090e-07 9.99999881e-01]
 [3.58313166e-08 1.00000000e+00]
 [4.81829368e-08 1.00000000e+00]
 [1.03031972e-07 9.99999881e-01]
 [1.11623628e-07 9.99999881e-01]
 [1.09102984e-07 9.99999881e-01]
 [4.14795238e-08 1.00000000e+00]
 [1.22546578e-06 9.99998808e-01]
 [8.95481733e-08 9.99999881e-01]
 [5.18460979e-07 9.99999523e-01]
 [6.33051940e-08 9.99999881e-01]
 [2.62459707e-08 1.00000000e+00]
 [2.71744653e-08 1.00000000e+00]
 [4.61963232e-08 1.00000000e+00]
 [1.39818596e-07 9.99999881e-01]
 [5.63179128e-07 9.99999404e-01]
 [6.29071508e-07 9.99999404e-01]
 [1.70301192e-07 9.99999881e-01]
 [4.18658139e-08 1.00000000e+00]
 [3.73655702e-08 1.00000000e+00]
 [7.08787198e-08 9.99999881e-01]
 [1.79503047e-07 9.99999762e-01]
 [2.49647769e-07 9.99999762e-01]
 [3.48951716

 [9.9999976e-01 1.8155940e-07]]
length of predictions- 40
for prediction 0
index of max predicition class 
Stairs-  0
for prediction 1
index of max predicition class 
Stairs-  0
for prediction 2
index of max predicition class 
Stairs-  0
for prediction 3
index of max predicition class 
Stairs-  0
for prediction 4
index of max predicition class 
Stairs-  0
for prediction 5
index of max predicition class 
Stairs-  0
for prediction 6
index of max predicition class 
Stairs-  0
for prediction 7
index of max predicition class 
Stairs-  0
for prediction 8
index of max predicition class 
Stairs-  0
for prediction 9
index of max predicition class 
Stairs-  0
for prediction 10
index of max predicition class 
Stairs-  0
for prediction 11
index of max predicition class 
Stairs-  0
for prediction 12
index of max predicition class 
Stairs-  0
for prediction 13
index of max predicition class 
Stairs-  0
for prediction 14
index of max predicition class 
Stairs-  0
for prediction 15
index of max predic