In [0]:
# The dataset.tar file should be extracted in the same directory as the notebook.
# Build the three file lists consumed by the dataset pipeline below.
from dataset_generator import get_data_file_lists, get_dataset, normalize_n_structure_data

# NOTE(review): the meaning of the positional arguments (the `4` and the two
# name lists) is defined by dataset_generator, which is not visible here -
# presumably a split parameter and two object-name selections; TODO confirm.
trainf, testf, ktestf = get_data_file_lists(
    'dataset',
    4,
    ["tomato", "toothpaste"],
    ["cap_3", "food_box_3", "soda_can_2"],
)

In [0]:
# Normalizing images: every file list goes through the same
# get_dataset -> normalize_n_structure_data pipeline.
train_ds, test_ds, ktest_ds = (
    get_dataset(file_list).map(normalize_n_structure_data)
    for file_list in (trainf, testf, ktestf)
)

In [0]:
# Importing necessary modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.applications import VGG16
from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, GlobalAveragePooling2D, \
Lambda, multiply, concatenate, InputLayer, UpSampling2D, Layer, Concatenate, ZeroPadding2D, Conv2DTranspose
import keras.backend as K
from keras import Input
from keras.initializers import Constant, RandomNormal
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from keras.models import Model, Sequential
import cv2

In [0]:
def preprocess_fn(a, b, c, d, e, f):
    """
    Restructure one dataset element into the 6-tuple consumed by the model.

    The element arrives as six positional components. `b` is dropped; `c` is
    turned into two masks (its complement and itself). The output order matches
    the iterator unpacking used later in the notebook:
    (simg, smskn, smskp, qimg, qdepth, qmsk).

    Take
        a: passed through unchanged (first output)
        b: unused
        c: mask; `1 - c` and `c` are zero-padded by 31 rows/columns at the
           bottom/right and reduced to their first channel
        d: padded/resized to 224x224
        e: passed through unchanged
        f: padded/resized to 224x224
    Give
        tuple (a, padded 1-c, padded c, resized d, e, resized f)
    """
    # Pad only at the end of the two leading axes; the last axis is untouched.
    mask_padding = ((0, 31), (0, 31), (0, 0))
    complement_mask = tf.pad(1 - c, mask_padding)[:, :, 0:1]
    mask = tf.pad(c, mask_padding)[:, :, 0:1]
    return (
        a,
        complement_mask,
        mask,
        tf.image.resize_image_with_pad(d, 224, 224),
        e,
        tf.image.resize_image_with_pad(f, 224, 224),
    )


# Training data is served in batches of 4 and repeated indefinitely;
# both evaluation sets are served one element at a time.
train_ds = train_ds.map(preprocess_fn).batch(4).repeat()
test_ds = test_ds.map(preprocess_fn).batch(1)
ktest_ds = ktest_ds.map(preprocess_fn).batch(1)

In [0]:
# One reinitializable iterator shared by all three datasets: the graph is built
# once on its outputs, and running one of the initializers below selects which
# dataset feeds it.
data_iter = tf.data.Iterator.from_structure(
    output_types=test_ds.output_types,
    output_shapes=test_ds.output_shapes,
)
(simg_batch, smskn_batch, smskp_batch,
 qimg_batch, qdepth_batch, qmsk_batch) = data_iter.get_next()

init_training, init_testing, init_ktesting = (
    data_iter.make_initializer(dataset)
    for dataset in (train_ds, test_ds, ktest_ds)
)

In [0]:
# Number of classes.
# Each image in our dataset contains objects of a single class, so num_classes is 1.
# Code for multi-class segmentation exists for future improvements, but is deliberately not included here.
num_classes = 1  #@param{type:"slider", min:1, max:5}

# Feature dimension: 512 for the light version of the head, 4096 when using the fully convolutionalized VGG16 network.
feat_dim = 512  #@param{type:"slider", min:256, max:4096}

In [0]:
# The feature extractor: VGG16 without its classifier top, pretrained on ILSVRC
# (ImageNet weights), built for 224x224 RGB inputs.
vgg = VGG16(
    include_top=False,
    weights='imagenet',
    input_shape=(224, 224, 3),
)

In [0]:
# Print the layer-by-layer summary of the VGG16 base created earlier in the notebook
vgg.summary()

In [0]:
# Note: Use this function when fully convolutionalized version of VGG16 is to be used as feature extractor. 
# When using this, set feat_dim to 4096

# def get_encoder(model):
#     """
#     https://stackoverflow.com/questions/41161021/how-to-convert-a-dense-layer-to-an-equivalent-convolutional-layer-in-keras
#     Recast `modules` into fully convolutional form.
#     The conversion transfers weights and infers kernel sizes from the
#     `input_size` and modules' action on it.
#     n.b. This only handles the conversion of linear/fully-connected modules,
#     although other module types could require conversion for correctness.
#     """
#     model_layers = []
#     for layer in model.layers:

#         if "Flatten" in str(layer):
#             flattened_ipt = True
#             f_dim = layer.input_shape
#             continue

#         elif "Dense" in str(layer):
#             input_shape = layer.input_shape
#             output_dim =  layer.get_weights()[1].shape[0]
#             W,b = layer.get_weights()

#             if flattened_ipt:
#                 shape = (f_dim[1],f_dim[2],f_dim[3],output_dim)
#                 new_W = W.reshape(shape)
#                 new_layer = Conv2D(output_dim,
#                                           (f_dim[1],f_dim[2]),
#                                           strides=(1,1),
#                                           activation='relu',
#                                           padding='valid',
#                                           weights=[new_W,b])
#                 flattened_ipt = False

#             else:
#                 shape = (1,1,input_shape[1],output_dim)
#                 new_W = W.reshape(shape)
#                 new_layer = Conv2D(output_dim,
#                                           (1,1),
#                                           strides=(1,1),
#                                           activation='relu',
#                                           padding='valid',
#                                           weights=[new_W,b])


#         else:
#             new_layer = layer

#         model_layers.append(new_layer)
    
#     return model_layers[1:]

In [0]:
def bilinear_kernel(size, normalize=False):
    """
    Build a square 2D bilinear interpolation kernel.

    The kernel is the outer product of two identical 1D triangular profiles,
    peaking at the kernel centre. Use normalize=False for upsampling and
    normalize=True for downsampling.
    Take
        size: kernel side length (the kernel is size x size)
        normalize: if True, rescale the kernel so its coefficients sum to 1
    Give
        kernel: float64 np.array of shape (size, size) with the coefficients
    """
    half_width = (size + 1) // 2
    # Odd kernels peak on a pixel, even kernels peak between two pixels.
    peak = half_width - 1 if size % 2 == 1 else half_width - 0.5

    rows, cols = np.ogrid[:size, :size]
    row_profile = 1 - abs(rows - peak) / half_width
    col_profile = 1 - abs(cols - peak) / half_width
    weights = row_profile * col_profile

    if normalize:
        weights /= weights.sum()
    return weights.astype('float64')

In [0]:
class Interpolator(Layer):
    """
    Transposed Convolution layer with fixed bilinear kernel. Used for upsampling.

    Every channel is upsampled independently: the bilinear kernel sits on the
    channel diagonal, so channel i of the output depends only on channel i of
    the input. (For filters == 1 this is identical to a plain bilinear kernel.)

    Take
        filters: number of input (= output) channels
        rate: upsampling factor, used as the stride in both spatial directions
        odd: if True the kernel side is 2*rate - 1, otherwise 2*rate
        normalize: forwarded to bilinear_kernel (whether the kernel sums to 1)
    """

    def __init__(self, filters, rate, odd = False, normalize = False, **kwargs):
        self.filter = filters
        self.strides = (rate, rate)
        self.kernel_size = rate*2
        if odd:
            self.kernel_size -= 1
        bk = bilinear_kernel(self.kernel_size, normalize)
        # Shape (k, k, filters, filters) with the bilinear kernel only on the
        # channel diagonal. Replicating it over every in/out channel pair would
        # sum all input channels into every output channel when filters > 1.
        self.ini = (bk[:, :, None, None] * np.eye(self.filter)).astype('float64')
        super(Interpolator, self).__init__(**kwargs)

    def build(self, input_shape):
        # Fixed (non-trainable) weight initialised with the bilinear kernel.
        self.kernel = self.add_weight(name='kernel', 
                                      shape=(self.kernel_size, self.kernel_size, self.filter, self.filter),
                                      initializer=Constant(self.ini),
                                      trainable=False)
        super(Interpolator, self).build(input_shape)  # Be sure to call this at the end

    def _upsampled_dim(self, dim, stride):
        """Static output length of one spatial axis, or None if it is unknown."""
        if dim is None:
            return None
        return (dim - 1) * stride + self.kernel_size

    def compute_output_shape(self, input_shape):
        # The layer changes the spatial size, so the default (identity) shape
        # inference would report the wrong static shape.
        return (input_shape[0],
                self._upsampled_dim(input_shape[1], self.strides[0]),
                self._upsampled_dim(input_shape[2], self.strides[1]),
                self.filter)

    def call(self, x):
        # The output size is computed from the dynamic input shape so inputs of
        # any spatial size are supported: out = (in - 1) * stride + kernel_size.
        in_shape = tf.shape(x)
        out_h = (in_shape[1] - 1) * self.strides[0] + self.kernel_size
        out_w = (in_shape[2] - 1) * self.strides[1] + self.kernel_size
        return K.conv2d_transpose(
                x,
                self.kernel,
                output_shape = (in_shape[0], out_h, out_w, self.filter),
                strides=self.strides,
                data_format='channels_last')

In [0]:
class Downsampler(Layer):
    """
    Convolution layer with fixed bilinear kernel. Used for downsampling.

    Every channel is downsampled independently: the bilinear kernel sits on the
    channel diagonal, so channel i of the output depends only on channel i of
    the input. (For filters == 1 this is identical to a plain bilinear kernel.)

    Take
        filters: number of input (= output) channels
        rate: downsampling factor, used as the stride in both spatial directions
        odd: if True the kernel side is 2*rate - 1, otherwise 2*rate
        normalize: forwarded to bilinear_kernel (whether the kernel sums to 1)
    """
    
    def __init__(self, filters, rate, odd, normalize = True, **kwargs):
        self.filter = filters
        self.strides = (rate, rate)
        self.kernel_size = rate * 2
        if odd:
            self.kernel_size -= 1
        bk = bilinear_kernel(self.kernel_size, normalize)
        # Shape (k, k, filters, filters) with the bilinear kernel only on the
        # channel diagonal. Replicating it over every in/out channel pair would
        # sum all input channels into every output channel when filters > 1.
        self.ini = (bk[:, :, None, None] * np.eye(self.filter)).astype('float64')
        super(Downsampler, self).__init__(**kwargs)

    def build(self, input_shape):
        # Fixed (non-trainable) weight initialised with the bilinear kernel.
        self.kernel = self.add_weight(name='kernel', 
                                      shape=(self.kernel_size, self.kernel_size, self.filter, self.filter),
                                      initializer=Constant(self.ini),
                                      trainable=False)
        super(Downsampler, self).build(input_shape)  # Be sure to call this at the end

    def _downsampled_dim(self, dim, stride):
        """Static output length of one spatial axis, or None if it is unknown."""
        if dim is None:
            return None
        return (dim - self.kernel_size) // stride + 1

    def compute_output_shape(self, input_shape):
        # The layer changes the spatial size, so the default (identity) shape
        # inference would report the wrong static shape.
        return (input_shape[0],
                self._downsampled_dim(input_shape[1], self.strides[0]),
                self._downsampled_dim(input_shape[2], self.strides[1]),
                self.filter)

    def call(self, x):
        # Strided convolution with the backend's default padding.
        return K.conv2d(
                x,
                self.kernel,
                strides=self.strides,
                data_format='channels_last')

In [0]:
# When using fully convolutionalized version of VGG16, use the convolutional base and FC6 layer with added Dropout layer at the end.
# Encoder Network(Light Version) - VGG Convolution base + FC6
# Adding a pad layer to get output of desired shape (6x6xC)
pad_layer = ZeroPadding2D(padding=(81,81))
# Build a new list instead of inserting into vgg.layers, so the VGG model's own
# layer list is not mutated as a side effect.
layers = [pad_layer] + list(vgg.layers)
encoder_input = x = Input(shape=(224,224,3))
# Re-apply the padding layer followed by every VGG layer, in order, to the new input.
for layer in layers:
    x = layer(x)
# fc6 width follows feat_dim so that it always matches the head, whose input
# is declared with feat_dim*2 channels (query features + guide).
x = Conv2D(feat_dim, (7,7), padding = 'valid', name = 'fc6', kernel_initializer = RandomNormal(mean=0, stddev=0.001), bias_initializer='zeros', activation='relu')(x)
encoder_output = Dropout(0.5)(x)
encoder = Model(encoder_input, encoder_output)

# Clearing up memory
# NOTE: after this the cell cannot be re-run without re-creating `vgg` first.
del layers
del vgg

# Feature Extractor Summary
encoder.summary()

In [0]:
# Classification head
# Note: The small fully convolutional network used to produce segmentation mask from fused query features and guide.
# It consists of a 1x1 convolution with ReLU, dropout, and a zero-initialised
# 1x1 scoring convolution with one output channel per class.
head_width = feat_dim*2
head_input = Input(shape=(None, None, head_width), name='Head_Input')

x = Conv2D(
    head_width,
    (1, 1),
    activation='relu',
    name='Head_Conv1',
    kernel_initializer=RandomNormal(mean=0, stddev=0.001),
)(head_input)
x = Dropout(0.5, name='Head_drop1')(x)
score = Conv2D(
    num_classes,
    (1, 1),
    name='Head_conv2',
    kernel_initializer='zeros',
    bias_initializer='zeros',
)(x)

head = Model(head_input, score)
head.summary()

## IMPORTANT!!
Annotation shape is Batch x Classes x H x W in channels-first format.

This notebook uses channels-last tensors, so annotations here are Batch x H x W x Classes.

In [0]:
def mask_feat(x, mask, scale = True):
    """
    Helper function used to apply downsampled support mask to support features.

    Take
        x: feature tensor
        mask: mask tensor; all of its dimensions except the first and the last
              must equal those of x
        scale: if True multiply x by mask element-wise, otherwise only run the
               shape check
    Give
        x multiplied by mask when scale is True, otherwise x unchanged
    Raise
        ValueError: if the dimensions between the first and last axis differ
    """
    x_size = x.shape
    mask_size = mask.shape
    dims_agree = x_size[1:-1] == mask_size[1:-1]
    if not dims_agree:
        raise ValueError("Shape mismatch. Feature is {}, but mask is {}".format(x_size, mask_size))
    if not scale:
        return x
    return tf.multiply(x, mask)

In [0]:
# For single class annotation
def get_combined_feat(qfeats, sfeatures, pos_anno, neg_anno, nc, update = False):    
    """
    Helper function used to fuse guide and query features.

    The guide is the support feature map weighted by the positive annotation
    and summed over the spatial axes; it is tiled over the query feature map
    and concatenated to it along the channel axis.

    Take
        qfeats: query features; its two middle (spatial) dimensions must be
                statically known because they are used as tiling multiples
        sfeatures: support features
        pos_anno: positive annotation, same spatial size as sfeatures
        neg_anno: negative annotation; only shape-checked, it does not
                  contribute to the result
        nc: unused (kept for interface compatibility)
        update: unused (kept for interface compatibility)
    Give
        tensor concatenating qfeats and the tiled positive guide on the last axis
    Raise
        ValueError: if either annotation does not match the spatial size of
                    sfeatures
    """

    # mask support by the positive annotation
    pos_feats = mask_feat(sfeatures, pos_anno)
    # The negative annotation is not part of the fused feature; keep only its
    # shape validation (scale=False adds no ops to the graph).
    mask_feat(sfeatures, neg_anno, scale = False)

    # global pool support + features
    # Note: This code implements one-shot learning. When solving for few-shot learning, the pooling is done across batch dimension(0) as well
    pos_vec = tf.reduce_sum(pos_feats, axis = (1,2))

    h, w = qfeats.shape[1:-1]

    # Tile the pooled features across the image feature
    pos_glob = tf.tile(tf.reshape(pos_vec, shape = (-1, 1, 1, tf.shape(pos_vec)[-1])), (1, h, w, 1))
    x = tf.concat([qfeats, pos_glob], axis=-1)

    return x

In [0]:
# Function that retrieves mean Intersection Over Union score given two masks
def get_miou(y, y_pred):
  """
  Mean IoU between a ground-truth mask and predicted logits.

  y_pred is passed through a sigmoid; both masks are then binarised at 0.5.
  Intersection and union are counted per sample over axes (1, 2, 3) and the
  per-sample ratios are averaged.
  """
  sample_axes = (1, 2, 3)
  truth_fg = tf.greater_equal(tf.convert_to_tensor(y), 0.5)
  pred_fg = tf.greater_equal(tf.nn.sigmoid(y_pred), 0.5)

  either_fg = tf.cast(tf.logical_or(truth_fg, pred_fg), dtype = 'int64')
  both_fg = tf.cast(tf.logical_and(truth_fg, pred_fg), dtype = 'int64')
  union = tf.reduce_sum(either_fg, axis = sample_axes)
  intersection = tf.reduce_sum(both_fg, axis = sample_axes)

  return tf.reduce_mean(tf.divide(intersection, union))

In [0]:
# Upsampler: bilinear x32 interpolation of the head's score map.
# The channel count follows num_classes because the head emits one score
# channel per class.
decoder = Interpolator(num_classes, 32, False)

In [0]:
# Downsampler: single-channel x32 bilinear downsampling applied to the
# annotation masks.
anno_enc = Downsampler(filters=1, rate=32, odd=False)

In [0]:
# Revolver model
# Small constant added to the annotation sum to avoid dividing by zero.
c = K.constant(1e-6)


def _normalize_annotation(anno):
    """Divide an annotation by its per-sample sum (plus c) over axes 1-3."""
    per_sample_total = tf.reduce_sum(anno, axis=(1, 2, 3))
    return tf.divide(anno, tf.math.add(per_sample_total, c)[..., None, None, None])


# Support branch: features plus downsampled, sum-normalised annotations.
sfeatures = encoder(simg_batch)
pos_anno = _normalize_annotation(anno_enc(smskp_batch))
neg_anno = _normalize_annotation(anno_enc(smskn_batch))

# Query branch: features fused with the guide, scored by the head and
# upsampled by the decoder.
qfeatures = encoder(qimg_batch)
combined = get_combined_feat(qfeatures, sfeatures, pos_anno, neg_anno, num_classes, update = False)
score = head(combined)
segmentation_mask = decoder(score)

In [0]:
# Loss function: sigmoid cross-entropy between the query mask and the predicted
# logits, averaged over all elements.
elementwise_loss = tf.nn.sigmoid_cross_entropy_with_logits(
    labels=qmsk_batch,
    logits=segmentation_mask,
)
loss = tf.reduce_mean(elementwise_loss)

# Define an optimizer 
optimizer = tf.train.AdamOptimizer(learning_rate=1e-5)
train_op = optimizer.minimize(loss)

In [0]:
# Colab visualization code
#from tensorboardcolab import TensorBoardColab

# Scalars logged to TensorBoard at every training step.
tf.summary.scalar('loss', loss)
tf.summary.scalar('max', tf.reduce_max(segmentation_mask))
tf.summary.scalar('min', tf.reduce_min(segmentation_mask))
tf.summary.scalar('mIOU', get_miou(qmsk_batch, segmentation_mask))
summary = tf.summary.merge_all()
#tbc = TensorBoardColab()
#writer = tbc.get_writer()
# The session is only created in the training cell further down, so `sess`
# does not exist yet when this cell runs top-to-bottom. The model is built in
# the default graph, so pass that graph to the writer directly.
writer = tf.summary.FileWriter('/content/summary/train', tf.get_default_graph())

In [0]:
# Training
# Use the session managed by the Keras backend: the pretrained VGG16 weights
# were loaded into that session when the model was created. A fresh
# tf.Session() followed by global_variables_initializer() would replace them
# with randomly initialised values.
sess = K.get_session()

# Initialise only the variables that do not hold a value yet (e.g. the
# optimizer slots), leaving already-loaded weights untouched.
all_vars = tf.global_variables()
init_flags = sess.run([tf.is_variable_initialized(v) for v in all_vars])
pending_vars = [v for v, ready in zip(all_vars, init_flags) if not ready]
if pending_vars:
    sess.run(tf.variables_initializer(pending_vars))

sess.run(init_training)

# The Dropout layers only drop units when the Keras learning phase is set to
# "training"; without feeding it they silently act as identity.
learning_phase = K.learning_phase()
train_feed = {learning_phase: True} if isinstance(learning_phase, tf.Tensor) else {}

for i in range(50000):
    s, loss_value, _ = sess.run([summary, loss, train_op], feed_dict=train_feed)
    writer.add_summary(s,i)
# Make sure the last buffered summaries reach the event file.
writer.flush()

In [0]:
from mrf_depth import improve_segmentation_with_depth

def calc_iou(pred, out):
  """
  Calculates Intersection over Union between predicted mask and ground truth.

  Both arrays are binarised with a strict > 0.5 threshold; intersection and
  union are counted per sample over axes (1, 2, 3), so one IoU value is
  returned for each element along the first axis.
  """
  truth_fg = out > 0.5
  guess_fg = pred > 0.5
  sample_axes = (1, 2, 3)
  overlap = np.logical_and(truth_fg, guess_fg).sum(axis=sample_axes)
  covered = np.logical_or(truth_fg, guess_fg).sum(axis=sample_axes)
  return overlap / covered
  
def measure_iou(sess, num_of_batches=-1):
  """
  Runs the network over the dataset currently bound to the iterator and
  measures its performance in terms of mean IoU.

  Take
    sess: session used to evaluate the graph
    num_of_batches: stop after this many batches; with the default (-1) the
                    loop runs until the iterator is exhausted
  Give
    (mean IoU of the network output, mean IoU after depth-based refinement);
    both are NaN when no sample was evaluated
  """
  count = 0
  n = 0
  iou = 0
  iou_wd = 0
  while True:
    # Only the graph evaluation can signal the end of the dataset.
    try:
      p, o, d = sess.run([segmentation_mask, qmsk_batch, qdepth_batch])
    except tf.errors.OutOfRangeError:
      break

    # segmentation_mask holds logits (it is the `logits` argument of the
    # training loss), while calc_iou thresholds at 0.5. Convert to
    # probabilities first, as get_miou does. tanh form of the sigmoid avoids
    # overflow for large-magnitude logits.
    prob = 0.5 * (1.0 + np.tanh(0.5 * p))

    # NOTE(review): the depth refinement still receives the raw network output,
    # as before - confirm what value range improve_segmentation_with_depth expects.
    pwd = [improve_segmentation_with_depth(p[i][:, :, 0], d[i][:, :, 0])/255.0
           for i in range(len(p))]
    pwd = np.expand_dims(pwd, -1)

    n += len(p)
    iou += np.sum( calc_iou(prob, o) )
    iou_wd +=  np.sum( calc_iou(pwd, o) )
    count += 1
    if count == num_of_batches:
      break

  if n == 0:
    # Nothing was evaluated (iterator already exhausted / empty dataset).
    return float('nan'), float('nan')
  return iou/n, iou_wd/n

In [0]:
# Testing
# Point the shared iterator at test_ds and evaluate it; measure_iou returns
# (mean IoU, mean IoU after the depth-based refinement).
sess.run(init_testing)
m = measure_iou(sess)
print(f"Testing Mean IOU: {m[0]}, using MRFs: {m[1]}")
# Same evaluation on ktest_ds (reported as the "known classes" set).
sess.run(init_ktesting)
m = measure_iou(sess)
print(f"Testing (known classes) Mean IOU: {m[0]}, using MRFs: {m[1]}")