In [1]:
import sys
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU') 
tf.config.experimental.set_memory_growth(physical_devices[0], True)
print("Num GPUs Available: ", tf.config.list_physical_devices())
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from skimage.metrics import structural_similarity
import matplotlib.pyplot as plt

import re
import numpy as np
import os
import time
import json
from glob import glob
from PIL import Image, ImageOps
import pickle
import cv2

Num GPUs Available:  [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:XLA_CPU:0', device_type='XLA_CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:XLA_GPU:0', device_type='XLA_GPU')]


In [2]:
def load_image(image_path):
    img = tf.io.read_file(image_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, (299, 299))
    img = tf.keras.applications.inception_v3.preprocess_input(img)
    return img, image_path

In [3]:
image_features_extract_model = tf.saved_model.load('features_extract_model')

In [4]:
BATCH_SIZE = 64
BUFFER_SIZE = 1000
embedding_dim = 256
units = 512
#vocab_size = top_k + 1
vocab_size = 5000 + 1

#num_steps = len(img_name_train) // BATCH_SIZE
# Shape of the vector extracted from InceptionV3 is (64, 2048)
features_shape = 2048
attention_features_shape = 64
max_length = 52

In [5]:
class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features1, hidden1):
        # features(CNN_encoder output) shape == (batch_size, 64, embedding_dim)

        # hidden shape == (batch_size, hidden_size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden_size)
        hidden_with_time_axis = tf.expand_dims(hidden1, 1)

        # score shape == (batch_size, 64, hidden_size)
        score = tf.nn.tanh(self.W1(features1) + self.W2(hidden_with_time_axis))

        # attention_weights shape == (batch_size, 64, 1)
        # you get 1 at the last axis because you are applying score to self.V
        attention_weights = tf.nn.softmax(self.V(score), axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * features1
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [6]:
class CNN_Encoder(tf.keras.Model):
    # This encoder passes the extracted features through a Fully connected layer
    def __init__(self, embedding_dim):
        super(CNN_Encoder, self).__init__()
        # shape after fc == (batch_size, 64, embedding_dim)
        self.fc = tf.keras.layers.Dense(embedding_dim)
        
    def call(self, x):
        inp = x
        y = self.fc(inp)
        z = tf.nn.relu(y)
        return z

In [7]:
class RNN_Decoder(tf.keras.Model):
    def __init__(self, embedding_dim, units, vocab_size):
        super(RNN_Decoder, self).__init__()
        self.units = units

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc1 = tf.keras.layers.Dense(self.units)
        self.fc2 = tf.keras.layers.Dense(vocab_size)

        self.attention = BahdanauAttention(self.units)

    #@tf.function(input_signature = [tf.TensorSpec(shape=[64, 1], dtype=tf.int32), tf.TensorSpec(shape=[64, 64, 256], dtype=tf.float32),tf.TensorSpec(shape=[64, 512], dtype=tf.float32)])
    @tf.function
    def __call__(self, x, features1, hidden):
        # defining attention as a separate model
        hidden1 = hidden
        context_vector, attention_weights = self.attention(features1, hidden)
    
        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)
    
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
    
        # passing the concatenated vector to the GRU
        output, state = self.gru(x)
    
        # shape == (batch_size, max_length, hidden_size)
        x = self.fc1(output)
    
        # x shape == (batch_size * max_length, hidden_size)
        x = tf.reshape(x, (-1, x.shape[2]))
    
        # output shape == (batch_size * max_length, vocab)
        x = self.fc2(x)
    
        return x, state, attention_weights

    def reset_state(self, batch_size):
        return tf.zeros((batch_size, self.units))

In [8]:
encoder = CNN_Encoder(embedding_dim)
decoder = RNN_Decoder(embedding_dim, units, vocab_size)

optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [9]:
checkpoint_path = "./checkpoints/train"
ckpt = tf.train.Checkpoint(encoder=encoder,
                           decoder=decoder,
                           optimizer = optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

ckpt.restore(ckpt_manager.latest_checkpoint)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f438cd94d68>

In [10]:
print(encoder.trainable_weights)

[]


# Caption

In [11]:
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
    
tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'

# Inference

In [12]:
def evaluate(image):
    attention_plot = np.zeros((max_length, attention_features_shape))

    hidden = decoder.reset_state(batch_size=1)

    temp_input = tf.expand_dims(load_image(image)[0], 0)
    img_tensor_val = image_features_extract_model(temp_input)
    img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0], -1, img_tensor_val.shape[3]))
    
    features = encoder(img_tensor_val)

    dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)
    result = []
    
    for i in range(max_length):
        predictions, hidden, attention_weights = decoder(dec_input, features, hidden)

        attention_plot[i] = tf.reshape(attention_weights, (-1, )).numpy()

        predicted_id = tf.random.categorical(predictions, 1)[0][0].numpy()
        result.append(tokenizer.index_word[predicted_id])

        if tokenizer.index_word[predicted_id] == '<end>':
            return result, attention_plot

        dec_input = tf.expand_dims([predicted_id], 0)

    attention_plot = attention_plot[:len(result), :]
    return result, attention_plot

In [13]:
def plot_attention(image, result, attention_plot):
    temp_image = np.array(Image.open(image))

    fig = plt.figure(figsize=(10, 10))

    len_result = len(result)
    for l in range(len_result):
        temp_att = np.resize(attention_plot[l], (8, 8))
        ax = fig.add_subplot(len_result//2, len_result//2, l+1)
        ax.set_title(result[l])
        img = ax.imshow(temp_image)
        ax.imshow(temp_att, cmap='gray', alpha=0.6, extent=img.get_extent())

    plt.tight_layout()
    plt.show()

# Function for cosine similarity to calculate similarity b/w sentences

In [14]:
def similarity(X, Y):
    # tokenization
    X_list = word_tokenize(X) 
    Y_list = word_tokenize(Y)

    # sw contains the list of stopwords
    sw = stopwords.words('english')
    sw.append("<end>")
    
    l1 =[];l2 =[]

    # remove stop words from the string
    X_set = {w for w in X_list if not w in sw} 
    Y_set = {w for w in Y_list if not w in sw}

    # form a set containing keywords of both strings 
    rvector = X_set.union(Y_set) 
    for w in rvector:
        if w in X_set: l1.append(1) # create a vector
        else: l1.append(0)
        if w in Y_set: l2.append(1)
        else: l2.append(0)
    c = 0

    # cosine formula 
    for i in range(len(rvector)):
            c+= l1[i]*l2[i]
    cosine = c / float((sum(l1)*sum(l2))**0.5)
    return cosine

# Picture Similarity

In [15]:
def mse(imageA, imageB):
    err = np.sum((imageA.astype("float") - imageB.astype("float")) ** 2)
    err /= float(imageA.shape[0] * imageA.shape[1])
    # return the MSE, the lower the error, the more "similar"
    # the two images are
    return err

def compare_images(imageA, imageB):
    imageA = cv2.cvtColor(imageA, cv2.COLOR_BGR2GRAY)
    imageB = cv2.cvtColor(imageB, cv2.COLOR_BGR2GRAY)

    #m = mse(imageA, imageB)
    s = structural_similarity(imageA, imageB)
    return s

# Summary

In [16]:
filenames = glob("../keyFrames/summ/*.png")
filenames.sort()

caption = []
for img in filenames:
    result, attention_plot = evaluate(img)
    result = " ".join(result)
    if len(caption) == 0:
        caption.append(result)
        old_img = img
        continue
    if compare_images(cv2.imread(old_img), cv2.imread(img)) > 0.9:
        old_img = img
        continue
    if similarity(result, caption[-1]) < 0.8:
        caption.append(result)
    old_img = img
    
ind = 1
remove_words = ["<end>"]
caption_final = []
for i in caption:
    for word in remove_words:
        i = i.replace(word, '')
        caption_final.append(i)
        
    print("{}) {}".format(ind, i))
    ind += 1

1) a mountain view over the desert is in it's lake waters 
2) a poster of a large amount of sheep on the land window 
3) all different clocks hanging from a fish 
4) a group of people waiting on a large city with a clock that looks at the top of a larger boat built in to a building and a brick building 
5) an outdoor figure of flowers in the grass 
6) the crowd of people walk along a city skyline near the docks 
7) a number of the character on the train on the train engine pulled by a bridge and another on shore 
8) an airplane with long brown an audience stands beside the sky 
9) at the contents outside each under a town 
10) a group of kids in a political event with <unk> a woman looking at a bus at building that's raised trains 
11) some people in front of a building 
12) a fire hydrant spraying water in the rain 
13) a crowd of people in a crowd holding lab furniture and some trees 
14) there are several red and white flowers with pink flowers 
15) a beautiful picture of cows walki

# Timestamp

In [24]:
def getTime(frame, videoFile):
    cap = cv2.VideoCapture(videoFile)
    fps = (cap.get(5))
    return (int) (frame / fps)

In [28]:
filenames = glob("../keyFrames/time/*.png")
filenames.sort()

caption = []
for img in filenames:
    result, attention_plot = evaluate(img)
    result = " ".join(result)
    if len(caption) == 0:
        caption.append(result)
        old_img = img
        continue
    if compare_images(cv2.imread(old_img), cv2.imread(img)) > 0.9:
        old_img = img
        continue
    if similarity(result, caption[-1]) < 0.8:
        caption.append(result)
    old_img = img
    
remove_words = ["<end>"]
event_final = []

print("Seconds\t\tEvent")
for img, event in zip(filenames, caption):
    frame = img[33:-4]
    sec = getTime(int(frame), "../videos/NYTravel.mp4")
    for word in remove_words:
        event = event.replace(word, '')
        event_final.append(event)
        
    print("{}\t\t{}".format(sec, event))

Seconds		Event
17		people sitting under a mountain range near a waterfall and near by water 
20		a view of some colorful beach in front of it 
33		an older room with various types of a show 
35		there is a clock tower in the sky with clock in a body of water on the hill 
32		this woman sitting at parking sign at a red sign 
196		a man on a sidewalk holding two covered in thick 
198		a group of people on horses are fighting 
200		a man is walking past cones near a fence near building with an umbrella 
