In [1]:
import tensorflow as tf
from tensorflow.keras import Model, layers
import numpy as np 
import matplotlib.pyplot as plt
import json
import os 
from PIL import Image

In [2]:
AUTO = tf.data.experimental.AUTOTUNE # used in tf.data.Dataset API

TRAINING_FILENAMES = '../datasets/gazetrack_tfrec/train.tfrec' 
VALID_FILENAMES = '../datasets/gazetrack_tfrec/val.tfrec'
TEST_FILENAMES = '../datasets/gazetrack_tfrec/test.tfrec' 
BATCH_SIZE = 256

SEED = tf.Variable(256)

In [3]:
def parse_tfrecord_fn(example):
    feature_description = {
        "image": tf.io.FixedLenFeature([], tf.string),
        "path": tf.io.FixedLenFeature([], tf.string),
        "device": tf.io.FixedLenFeature([], tf.string),
        "screen_h": tf.io.FixedLenFeature([], tf.int64),
        "screen_w": tf.io.FixedLenFeature([], tf.int64),
        "face_valid": tf.io.FixedLenFeature([], tf.int64),
        "face_x": tf.io.FixedLenFeature([], tf.int64),
        "face_y": tf.io.FixedLenFeature([], tf.int64),
        "face_w": tf.io.FixedLenFeature([], tf.int64),
        "face_h": tf.io.FixedLenFeature([], tf.int64),
        "leye_x": tf.io.FixedLenFeature([], tf.int64),
        "leye_y": tf.io.FixedLenFeature([], tf.int64),
        "leye_w": tf.io.FixedLenFeature([], tf.int64),
        "leye_h": tf.io.FixedLenFeature([], tf.int64),
        "reye_x": tf.io.FixedLenFeature([], tf.int64),
        "reye_y": tf.io.FixedLenFeature([], tf.int64),
        "reye_w": tf.io.FixedLenFeature([], tf.int64),
        "reye_h": tf.io.FixedLenFeature([], tf.int64),
        "dot_xcam": tf.io.FixedLenFeature([], tf.float32),
        "dot_y_cam": tf.io.FixedLenFeature([], tf.float32),
        "dot_x_pix": tf.io.FixedLenFeature([], tf.float32),
        "dot_y_pix": tf.io.FixedLenFeature([], tf.float32),
        "reye_x1": tf.io.FixedLenFeature([], tf.int64),
        "reye_y1": tf.io.FixedLenFeature([], tf.int64),
        "reye_x2": tf.io.FixedLenFeature([], tf.int64),
        "reye_y2": tf.io.FixedLenFeature([], tf.int64),
        "leye_x1": tf.io.FixedLenFeature([], tf.int64),
        "leye_y1": tf.io.FixedLenFeature([], tf.int64),
        "leye_x2": tf.io.FixedLenFeature([], tf.int64),
        "leye_y2": tf.io.FixedLenFeature([], tf.int64),
    }
    example = tf.io.parse_single_example(example, feature_description)
    example["image"] = tf.io.decode_jpeg(example["image"], channels=3)
    return example

## Defining Transforms

In [4]:
def augmentation(image, training = True):
    if training:
        aug = tf.keras.Sequential([
                layers.Resizing(128+10, 128+10),
                layers.RandomCrop(128, 128, 256),
                layers.Rescaling(1./255),
                layers.Normalization(mean = (0.3741, 0.4076, 0.5425), variance = (0.0004, 0.0004, 0.0004))
                ])
        
    else:
        aug = tf.keras.Sequential([
                layers.Resizing(128+10, 128+10),
                layers.Rescaling(1./255),
                layers.Normalization(mean = (0.3741, 0.4076, 0.5425), variance = (0.0004, 0.0004, 0.0004))
                ])
    
    image = aug(image)
    
    return image

## Preprocessing on TFrec extracted data

In [5]:
def prepare_sample(features):
    image = features['image']
    w = tf.shape(image)[0]
    h = tf.shape(image)[1]
    
    w = tf.cast(w, tf.int64)
    h = tf.cast(h, tf.int64)
    
    screen_w, screen_h = features['screen_w'], features['screen_h']
    
    kps = [features['leye_x1']/w, features['leye_y1']/h, features['leye_x2']/w, features['leye_y2']/h,
           features['reye_x1']/w, features['reye_y1']/h, features['reye_x2']/w, features['reye_y2']/h]
    # kps has type float64
    
    lx, ly, lw, lh = features['leye_x'], features['leye_y'], features['leye_w'], features['leye_h']
    rx, ry, rw, rh = features['reye_x'], features['reye_y'], features['reye_w'], features['reye_h']
    
    lx = tf.cast(lx, tf.int32)
    ly = tf.cast(ly, tf.int32)
    lw = tf.cast(lw, tf.int32)
    lh = tf.cast(lh, tf.int32)
    
    rx = tf.cast(rx, tf.int32)
    ry = tf.cast(ry, tf.int32)
    rw = tf.cast(rw, tf.int32)
    rh = tf.cast(rh, tf.int32)
    
    
    
#     batch_im = tf.expand_dims(image, 0)
#     crop = tf.image.crop_and_resize(batch_im, boxes=[[0.0, 0.0, 0.5, 0.5]], crop_size=[128, 128], box_indices=[0])

    
    #tf crop                    oh-vert   ow-hori   th    tw
    
    # l_eye = tf.image.crop_to_bounding_box(image, tf.math.maximum(0, ly), tf.math.maximum(0, lx), ly+lh, lx+lw)
    # r_eye = tf.image.crop_to_bounding_box(image, tf.math.maximum(0, ry), tf.math.maximum(0, rx), ry+rh, rx+rw)
    
    lx1 = tf.math.maximum(0, lx)
    ly1 = tf.math.maximum(0, ly)   
    lx2 = tf.math.maximum(0, lx1+lw)
    ly2 = tf.math.maximum(0, ly1+lh)
    lhh = ly2-ly1
    lww = lx2-lx1
    
    rx1 = tf.math.maximum(0, rx)
    ry1 = tf.math.maximum(0, ry)   
    rx2 = tf.math.maximum(0, rx1+rw)
    ry2 = tf.math.maximum(0, ry1+rh)
    rhh = ry2-ry1
    rww = rx2-rx1
    
    l_eye = tf.image.crop_to_bounding_box(image, ly1, lx1, lhh, lww)
    r_eye = tf.image.crop_to_bounding_box(image, ry1, rx1, rhh, rww)
    
    
    # l_eye = tf.image.crop_to_bounding_box(image, tf.math.maximum(0, ly), tf.math.maximum(0, lx), tf.math.maximum(0, lh), tf.math.maximum(0, lw))
    # r_eye = tf.image.crop_to_bounding_box(image, tf.math.maximum(0, ry), tf.math.maximum(0, rx), tf.math.maximum(0, rh), tf.math.maximum(0, rw))
    
    #pil.crop
                             #left        #upper     #right         #lower
    # l_eye = image.crop((max(0, lx), max(0, ly), max(0, lx+lw), max(0, ly+lh)))
    # r_eye = image.crop((max(0, rx), max(0, ry), max(0, rx+rw), max(0, ry+rh)))
    
    
    
    
    l_eye = tf.image.flip_left_right(l_eye)
    
    out = [features['dot_xcam'], features['dot_y_cam']]
    # out has type float32
    
    l_eye = augmentation(l_eye)
    r_eye = augmentation(r_eye)
    
    # return l_eye, r_eye, kps, out, screen_w, screen_h
    
    x = {'l_eye': l_eye, 'r_eye':r_eye, 'kps':kps} 
    y = out
    
    return x, y

## Extracting the TFRecs and preprocessing+transforms+batching

In [6]:
def get_batched_dataset(filenames, batch_size):
    option_no_order = tf.data.Options()
    option_no_order.deterministic = False  # disable order, increase speed
    
    dataset = (
        tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)
        .with_options(option_no_order)
        .map(parse_tfrecord_fn, num_parallel_calls=AUTO)
        .map(prepare_sample, num_parallel_calls=AUTO)
        .shuffle(batch_size*10)
        .batch(batch_size)
        .prefetch(buffer_size=AUTO)
    )
    
    dataset_len = sum(1 for _ in tf.data.TFRecordDataset(filenames))
    print(f"No. of samples: {dataset_len}")
    
    return dataset

In [7]:
# train_dataset = get_batched_dataset(TRAINING_FILENAMES, BATCH_SIZE)
valid_dataset = get_batched_dataset(VALID_FILENAMES, BATCH_SIZE)
# test_dataset = get_batched_dataset(TEST_FILENAMES, BATCH_SIZE)

No. of samples: 43458


## dataset shapes

In [8]:
sample = next(iter(valid_dataset))
len(sample)

2

In [9]:
x = sample[0]
type(x), len(x)

(dict, 3)

In [10]:
l_eye = x['l_eye']
r_eye = x['r_eye']
lms = x['kps']

print(l_eye.shape, r_eye.shape, lms.shape)

(256, 128, 128, 3) (256, 128, 128, 3) (256, 8)


In [11]:
y = sample[1]
y.shape

TensorShape([256, 2])

## New model

In [12]:
class eye_model(layers.Layer):
    def __init__(self, name='Eye-model'):
        super(eye_model, self).__init__()

        self.conv1 = layers.Conv2D(32, kernel_size=7, strides=2, padding='valid') 
        self.conv2 = layers.Conv2D(64, kernel_size=5, strides=2, padding='valid')
        self.conv3 = layers.Conv2D(128, kernel_size=3, strides=1, padding='valid')
        self.bn1 = layers.BatchNormalization(axis = -1, momentum=0.9)
        self.bn2 = layers.BatchNormalization(axis = -1, momentum=0.9)
        self.bn3 = layers.BatchNormalization(axis = -1, momentum=0.9)
        self.leakyrelu = layers.LeakyReLU(alpha=0.01)
        self.avgpool = layers.AveragePooling2D(pool_size=2)
        self.dropout = layers.Dropout(rate=0.02)

    def call(self, input_image):
        print('ip image -', input_image.shape)
        x = self.conv1(input_image)
        print("conv1-", x.shape)
        x = self.bn1(x)
        print("bn1-", x.shape)
        x = self.leakyrelu(x)
        x = self.avgpool(x)
        x = self.dropout(x) #

        x = self.conv2(x)
        print("conv2-", x.shape)
        x = self.bn2(x)
        print("bn2-", x.shape)
        x = self.leakyrelu(x)
        x = self.avgpool(x)
        x = self.dropout(x)

        x = self.conv3(x)
        print("conv3-", x.shape)
        x = self.bn3(x)
        print("bn3-", x.shape)
        x = self.leakyrelu(x)
        x = self.avgpool(x)
        # print('check3')
        x = self.dropout(x)

        return x

class landmark_model(layers.Layer):
    def __init__(self, name='Landmark-model'):
        super(landmark_model, self).__init__()

        self.dense1 = layers.Dense(128)
        self.dense2 = layers.Dense(16)
        self.dense3 = layers.Dense(16)
        self.bn1 = layers.BatchNormalization(axis = -1,momentum=0.9)
        self.bn2 = layers.BatchNormalization(axis = -1, momentum=0.9)
        self.bn3 = layers.BatchNormalization(axis = -1, momentum=0.9)
        self.relu = layers.ReLU()

    def call(self, input_kps):
        # print(input_kps.shape)
        x = self.dense1(input_kps)
        x = self.bn1(x)
        x = self.relu(x)

        x = self.dense2(x)
        x = self.bn2(x)
        x = self.relu(x)

        x = self.dense3(x)
        x = self.bn3(x)
        x = self.relu(x)   

        return x

class gazetrack_model(Model):
    def __init__(self, name='Gazetrack-model'):
        super(gazetrack_model, self).__init__()

        self.eye_model = eye_model()
        self.landmark_model = landmark_model()

        self.dense1 = layers.Dense(8)
        self.dense2 = layers.Dense(4)
        self.dense3 = layers.Dense(2)

        self.bn1 = layers.BatchNormalization(axis = -1, momentum=0.9)
        self.bn2 = layers.BatchNormalization(axis = -1, momentum=0.9)
        self.dropout = layers.Dropout(rate=0.12)
        self.relu = layers.ReLU()


    def call(self, l_r_lms):
        leftEye = l_r_lms['l_eye']
        rightEye = l_r_lms['r_eye']
        lms = l_r_lms['kps']
        # leftEye, rightEye, lms = l_r_lms
        # l_eye_feat = tf.reshape(self.eye_model(leftEye), (leftEye.shape[0], 512))
        # r_eye_feat = tf.reshape(self.eye_model(rightEye), (rightEye.shape[0], 512))
        
        l_eye_feat = self.eye_model(leftEye)
        r_eye_feat = self.eye_model(rightEye)
        
        l_eye_feat = layers.Flatten()(l_eye_feat)
        r_eye_feat = layers.Flatten()(r_eye_feat)

        
        lm_feat = self.landmark_model(lms)
        
        # l_eye_feat = tf.cast(l_eye_feat, tf.float32)
        # r_eye_feat = tf.cast(r_eye_feat, tf.float32)
        
        combined_feat = tf.concat((l_eye_feat, r_eye_feat, lm_feat),1)

        x = self.dense1(combined_feat)
        x = self.bn1(x)
        # print('check1')
        x = self.dropout(x)
        # print('check2')
        x = self.relu(x)

        x = self.dense2(x)
        x = self.bn2(x)
        x = self.relu(x)

        x = self.dense3(x)

        return x

In [13]:
# class SubModel(Model):
#     def call(self, inputs):
#         print(inputs)
        
# model_ = SubModel()
# inputs = tf.keras.input(shape=(100,))
# outputs = model_(inputs)
# model = tf.keras.Model(inputs=inputs, outputs=outputs)

        
 ## problems with keras subclassing
# https://stackoverflow.com/questions/58153888/how-to-set-the-input-of-a-keras-subclass-model-in-tensorflow

In [14]:
# def func_api():
#     x = Input(shape=(128, 128, 3))
#     y = layers.Conv2D(28, 3, strides=1)(x)
#     return Model(inputs=[x], outputs=[y])

# if __name__ == '__main__':
#     func = func_api()
#     func.summary()

#     sub = subclass()
#     sub.build(input_shape=(None, 24, 24, 3))
#     sub.summary()


## Training Parameters

In [15]:
lr = 0.016
loss = tf.keras.losses.MeanSquaredError()
metrics = [tf.keras.metrics.mean_squared_error]
optimizer = tf.keras.optimizers.Adam(learning_rate=lr,beta_1=0.9, beta_2=0.999, epsilon=1e-07)
scheduler = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', verbose=1, mode='min')
batch_size = 256

In [16]:
model = gazetrack_model()

In [17]:
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

## Training

In [18]:
model.fit(
    x=valid_dataset,   
    batch_size=batch_size,
    epochs=1,  
    verbose='auto',   #auto=1, 1=progress bar, 2=one line per epoch( maybe use 2 if running job)
    callbacks=[scheduler],
    validation_data=valid_dataset,
    shuffle=True,    #probably will not work as our dataset is a tf.data object
    initial_epoch=0,     #epoch at which to resume training
    workers=1,
    use_multiprocessing=False
)

ip image - (None, 128, 128, 3)
conv1- (None, 61, 61, 32)
bn1- (None, 61, 61, 32)
conv2- (None, 13, 13, 64)
bn2- (None, 13, 13, 64)
conv3- (None, 4, 4, 128)
bn3- (None, 4, 4, 128)
ip image - (None, 128, 128, 3)
conv1- (None, 61, 61, 32)
bn1- (None, 61, 61, 32)
conv2- (None, 13, 13, 64)
bn2- (None, 13, 13, 64)
conv3- (None, 4, 4, 128)
bn3- (None, 4, 4, 128)
ip image - (None, 128, 128, 3)
conv1- (None, 61, 61, 32)
bn1- (None, 61, 61, 32)
conv2- (None, 13, 13, 64)
bn2- (None, 13, 13, 64)
conv3- (None, 4, 4, 128)
bn3- (None, 4, 4, 128)
ip image - (None, 128, 128, 3)
conv1- (None, 61, 61, 32)
bn1- (None, 61, 61, 32)
conv2- (None, 13, 13, 64)
bn2- (None, 13, 13, 64)
conv3- (None, 4, 4, 128)
bn3- (None, 4, 4, 128)
     18/Unknown - 106s 5s/step - loss: 22.8953 - mean_squared_error: 22.8953

InvalidArgumentError: Graph execution error:

assertion failed: [width must be >= target + offset.]
	 [[{{node crop_to_bounding_box/Assert_4/Assert}}]]
	 [[IteratorGetNext]] [Op:__inference_train_function_47945]

In [None]:
evaluate(
    x=train_dataset,
    batch_size=batch_size,
    verbose='auto',
    steps=1,
    callbacks=None,
    max_queue_size=10,
    workers=1,
    use_multiprocessing=False,
    return_dict=False
)