<a href="https://colab.research.google.com/github/scpepper69/ml-learning-materials/blob/master/Tensorflow_Objectetection_M2Det_draft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
###### Download MSCOCO 2017 DataSet 
# !!! Attentioin !!! Images zip is over 19Gbyte. It takes about 20 minutes at Google Colboratory.

# Train Images
!wget http://images.cocodataset.org/zips/train2017.zip
!unzip -q train2017.zip

# Validatioin Images
!wget http://images.cocodataset.org/zips/val2017.zip
!unzip -q val2017.zip

# Test Images
#!wget http://images.cocodataset.org/zips/test2017.zip
#!unzip -q test2017.zip

# Annotations
!wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip
!unzip -q annotations_trainval2017.zip

!mkdir output


In [0]:
###### DataSet Label Matching
# (COCO category id, category name) pairs in ascending id order.
# COCO ids are sparse (12, 26, 29, 30, ... are unused); the position of a pair
# in this table is the contiguous class index used for training.
_MSCOCO2017_CATEGORIES = [
    (1, 'person'), (2, 'bicycle'), (3, 'car'), (4, 'motorcycle'),
    (5, 'airplane'), (6, 'bus'), (7, 'train'), (8, 'truck'),
    (9, 'boat'), (10, 'traffic light'), (11, 'fire hydrant'), (13, 'stop sign'),
    (14, 'parking meter'), (15, 'bench'), (16, 'bird'), (17, 'cat'),
    (18, 'dog'), (19, 'horse'), (20, 'sheep'), (21, 'cow'),
    (22, 'elephant'), (23, 'bear'), (24, 'zebra'), (25, 'giraffe'),
    (27, 'backpack'), (28, 'umbrella'), (31, 'handbag'), (32, 'tie'),
    (33, 'suitcase'), (34, 'frisbee'), (35, 'skis'), (36, 'snowboard'),
    (37, 'sports ball'), (38, 'kite'), (39, 'baseball bat'), (40, 'baseball glove'),
    (41, 'skateboard'), (42, 'surfboard'), (43, 'tennis racket'), (44, 'bottle'),
    (46, 'wine glass'), (47, 'cup'), (48, 'fork'), (49, 'knife'),
    (50, 'spoon'), (51, 'bowl'), (52, 'banana'), (53, 'apple'),
    (54, 'sandwich'), (55, 'orange'), (56, 'broccoli'), (57, 'carrot'),
    (58, 'hot dog'), (59, 'pizza'), (60, 'donut'), (61, 'cake'),
    (62, 'chair'), (63, 'couch'), (64, 'potted plant'), (65, 'bed'),
    (67, 'dining table'), (70, 'toilet'), (72, 'tv'), (73, 'laptop'),
    (74, 'mouse'), (75, 'remote'), (76, 'keyboard'), (77, 'cell phone'),
    (78, 'microwave'), (79, 'oven'), (80, 'toaster'), (81, 'sink'),
    (82, 'refrigerator'), (84, 'book'), (85, 'clock'), (86, 'vase'),
    (87, 'scissors'), (88, 'teddy bear'), (89, 'hair drier'), (90, 'toothbrush'),
]

# COCO category id -> [contiguous class index, category name]
mscoco2017 = {
    category_id: [class_index, category_name]
    for class_index, (category_id, category_name) in enumerate(_MSCOCO2017_CATEGORIES)
}


In [0]:
##### Prepare Libraries
# from Prepare DataSet
import argparse
import json
import cv2
import os

# from M2Det Class
import numpy as np
import tensorflow as tf
#from utils.layer import *

# from Data Class
import cv2
import glob
#import os
import multiprocessing
import time
#import numpy as np
#from utils.generate_priors import generate_priors
#from utils.assign_boxes import assign_boxes
#from utils.augment import augment


In [0]:
# Prepare MSCOCO 2017 DataSet for Training

def prerape_mscoco(image_dir, annotation_path, output_dir):
    """Convert MSCOCO instance annotations into one label file per image.

    For every annotation whose image file exists in ``image_dir`` a line
    ``<class index>\\t<x1>\\t<y1>\\t<x2>\\t<y2>`` is produced, where the class
    index comes from ``mscoco2017`` and the corner coordinates are the COCO
    ``bbox`` (x, y, width, height) divided by the image width/height.
    Lines are written to ``<output_dir>/<12-digit image id>.txt``.

    Args:
        image_dir: Directory containing the ``<12-digit image id>.jpg`` files.
        annotation_path: Path to a COCO ``instances_*.json`` annotation file.
        output_dir: Directory receiving the label files; created if missing.

    Raises:
        KeyError: If the JSON has no ``annotations`` entry, or an annotation
            uses a category id that is not in ``mscoco2017``.

    Notes:
        * Image sizes are taken from the JSON ``images`` metadata when it is
          available, so images are not decoded once per annotation. Only when
          the metadata is missing is the file read with ``cv2.imread`` (once
          per image); unreadable images are skipped.
        * Each label file is written in a single pass with mode ``'w'``, so
          calling the function again replaces the files instead of appending
          duplicate lines.
    """
    with open(annotation_path) as f:
        data = json.load(f)

    if 'annotations' not in data:
        raise KeyError(
            "no 'annotations' entry in {}; an instances_*.json file is required".format(annotation_path))

    os.makedirs(output_dir, exist_ok=True)

    # image id -> (width, height) as recorded in the annotation file itself.
    declared_sizes = {}
    for info in data.get('images', []):
        if 'id' in info and info.get('width') and info.get('height'):
            declared_sizes[info['id']] = (info['width'], info['height'])

    # image id -> (width, height), or None when the image is missing/unreadable.
    resolved_sizes = {}
    # output file name -> label lines, in annotation order.
    labels_by_file = {}

    for annotation in data['annotations']:
        catid = annotation['category_id']
        clsid = mscoco2017[catid][0]

        image_id = annotation['image_id']
        image_filename = '{0:012d}'.format(image_id) + '.jpg'

        if image_id not in resolved_sizes:
            src = os.path.join(image_dir, image_filename)
            size = None
            if os.path.exists(src):
                size = declared_sizes.get(image_id)
                if size is None:
                    # Metadata missing: fall back to decoding the image.
                    img = cv2.imread(src)
                    if img is not None:
                        img_h, img_w = img.shape[:2]
                        size = (img_w, img_h)
            resolved_sizes[image_id] = size

        size = resolved_sizes[image_id]
        if size is None:
            continue
        w, h = size

        bbox = annotation['bbox']
        x1 = bbox[0] / w
        y1 = bbox[1] / h
        x2 = (bbox[0] + bbox[2]) / w
        y2 = (bbox[1] + bbox[3]) / h

        label = [str(clsid), str(x1), str(y1), str(x2), str(y2)]

        output_filename = os.path.splitext(image_filename)[0] + '.txt'
        labels_by_file.setdefault(output_filename, []).append('\t'.join(label) + '\n')

    for output_filename, lines in labels_by_file.items():
        dst = os.path.join(output_dir, output_filename)
        with open(dst, 'w') as f:
            f.writelines(lines)


# Correctly spelled alias; the original (misspelled) name is kept for existing callers.
prepare_mscoco = prerape_mscoco


In [0]:
# The bounding boxes for train2017 live in instances_train2017.json (shipped in
# annotations_trainval2017.zip). image_info_unlabeled2017.json is not part of that
# archive and carries no 'annotations' entry.
prerape_mscoco(image_dir="./train2017", annotation_path='./annotations/instances_train2017.json', output_dir="./output" )


In [0]:
!mkdir weights

In [0]:
##################################################
# M2Det Model Class
##################################################
class M2Det:
    """TensorFlow 1.x graph definition of the M2Det detector.

    Constructing the object builds the graph immediately; the result is
    exposed as ``self.prediction``, a tensor reshaped to
    ``[-1, num_boxes, 4 + num_classes + 1]`` (4 regression values followed by
    the softmax class scores, background included).

    The layer helpers used here (``vgg_layer``, ``conv2d_layer``,
    ``batch_norm``, ``tum``, ``flatten_layer``) are not defined in this class
    and must be available in the enclosing namespace.

    Args:
        inputs: Input image tensor fed to the first VGG layer.
        is_training: Flag forwarded to every helper layer.
        num_classes: Number of object classes, excluding background.
        use_sfam: When True, the per-scale features are re-weighted by the
            SFAM attention branch before prediction.
    """

    def __init__(self, inputs, is_training, num_classes, use_sfam=False):
        self.num_classes = num_classes + 1 # for background class
        self.use_sfam = use_sfam
        # Number of TUM passes stacked in build().
        self.levels = 8
        # Number of feature maps requested from each TUM.
        self.scales = 6
        # Predictions per feature-map location (multiplies the head's filters).
        self.num_priors = 9
        self.build(inputs, is_training)

    def build(self, inputs, is_training):
        """Create the backbone, the multi-level features and the prediction head.

        Sets ``self.prediction``; returns nothing.
        """
        with tf.variable_scope('VGG16'):
            net = inputs
            net = vgg_layer(net, is_training, 64, 2)
            net = vgg_layer(net, is_training, 128, 2)
            net = vgg_layer(net, is_training, 256, 3)
            net = vgg_layer(net, is_training, 512, 3, pooling=False)
            # First backbone tap: output of the 512-filter stage (built with pooling=False).
            feature1 = net
            net = vgg_layer(net, is_training, 1024, 3)
            # Second backbone tap: output of the 1024-filter stage.
            feature2 = net

        with tf.variable_scope('M2Det'):
            with tf.variable_scope('FFMv1'):
                feature1 = conv2d_layer(feature1, filters=256, kernel_size=3, strides=1)
                feature1 = tf.nn.relu(batch_norm(feature1, is_training))
                feature2 = conv2d_layer(feature2, filters=512, kernel_size=1, strides=1)
                feature2 = tf.nn.relu(batch_norm(feature2, is_training))
                # Bring feature2 to feature1's spatial size so both can be
                # concatenated along the channel axis.
                feature2 = tf.image.resize_images(feature2, tf.shape(feature1)[1:3], 
                                                  method=tf.image.ResizeMethod.BILINEAR)
                base_feature = tf.concat([feature1, feature2], axis=3)

            # outs[level] is whatever tum() returns for that level; it is indexed
            # by scale below, so it is expected to be a sequence of self.scales maps.
            outs = []
            for i in range(self.levels):
                if i == 0:
                    # The first level only sees the base feature.
                    net = conv2d_layer(base_feature, filters=256, kernel_size=1, strides=1)
                    net = tf.nn.relu(batch_norm(net, is_training))
                else:
                    with tf.variable_scope('FFMv2_{}'.format(i+1)):
                        net = conv2d_layer(base_feature, filters=128, kernel_size=1, strides=1)
                        net = tf.nn.relu(batch_norm(net, is_training))
                        # `out` is the previous level's tum() result (not `outs`):
                        # its last element is fused with the base feature.
                        # NOTE(review): presumably that is the map matching
                        # base_feature's resolution - confirm against tum().
                        net = tf.concat([net, out[-1]], axis=3)
                with tf.variable_scope('TUM{}'.format(i+1)):
                    out = tum(net, is_training, self.scales)
                outs.append(out)

            features = []
            for i in range(self.scales):
                # Same-scale maps of all levels, stacked along the channel axis.
                feature = tf.concat([outs[j][i] for j in range(self.levels)], axis=3)

                if self.use_sfam:
                    with tf.variable_scope('SFAM'):
                        # Channel attention: spatial mean -> dense(64, relu)
                        # -> dense(1024, sigmoid) -> multiply.
                        # NOTE(review): units=1024 has to match the channel
                        # count of `feature` for the product to broadcast;
                        # presumably levels * 128 - confirm against tum().
                        attention = tf.reduce_mean(feature, axis=[1, 2], keepdims=True)
                        attention = tf.layers.dense(inputs=attention, units=64, 
                                                    activation=tf.nn.relu, name='fc1_{}'.format(i+1))
                        attention = tf.layers.dense(inputs=attention, units=1024,
                                                    activation=tf.nn.sigmoid, name='fc2_{}'.format(i+1))
                        feature = feature * attention

                # Prepending reverses the order relative to the scale index i.
                features.insert(0, feature)

            all_cls = []
            all_reg = []
            with tf.variable_scope('prediction'):
                for i, feature in enumerate(features):
                    # Printed once per scale while the graph is being built.
                    print(i+1, feature.shape)
                    cls = conv2d_layer(feature, self.num_priors * self.num_classes, 3, 1, use_bias=True)
                    cls = batch_norm(cls, is_training) # activation function is identity
                    cls = flatten_layer(cls)
                    all_cls.append(cls)
                    reg = conv2d_layer(feature, self.num_priors * 4, 3, 1, use_bias=True)
                    reg = batch_norm(reg, is_training) # activation function is identity
                    reg = flatten_layer(reg)
                    all_reg.append(reg)
                all_cls = tf.concat(all_cls, axis=1)
                all_reg = tf.concat(all_reg, axis=1)
                # Four regression values per box; relies on the static last
                # dimension being known (TF1 Dimension.value).
                num_boxes = int(all_reg.shape[-1].value / 4)
                all_cls = tf.reshape(all_cls, [-1, num_boxes, self.num_classes])
                all_cls = tf.nn.softmax(all_cls)
                all_reg = tf.reshape(all_reg, [-1, num_boxes, 4])
                # Last axis layout: [4 regression values | class scores].
                self.prediction = tf.concat([all_reg, all_cls], axis=-1)

#if __name__ == '__main__':
#    inputs = tf.placeholder(tf.float32, [None, 320, 320, 3])
#    is_training = tf.constant(False)
#    num_classes = 80
#    m2det = M2Det(inputs, is_training, num_classes)


In [0]:
##################################################
# utils/generate_priors.py
##################################################
def generate_priors(num_scales=3, anchor_scale=2.0, image_size=320, shapes=[40, 20, 10, 5, 3, 1]):
    """Build the prior (anchor) boxes for every feature-map size in ``shapes``.

    For each entry of ``shapes`` the stride is ``image_size / shape`` and box
    centres are placed at ``stride / 2, 3 * stride / 2, ...`` below
    ``image_size`` on both axes. Every centre receives
    ``num_scales * 3`` boxes (``num_scales`` octave scales times the three
    aspect ratios).

    Args:
        num_scales: Number of octave scales per aspect ratio.
        anchor_scale: Multiplier applied to the stride to get the base size.
        image_size: Side length of the (square) network input in pixels.
        shapes: Feature-map side lengths, one per pyramid level.

    Returns:
        ``np.ndarray`` of shape ``(num_boxes, 4)``; each row is
        ``(cy - h/2, cx - w/2, cy + h/2, cx + w/2)`` divided by ``image_size``.
        NOTE(review): ``Data.put`` builds its boxes as
        ``[xmin, ymin, xmax, ymax]`` - confirm the axis order consumers expect.
    """
    aspect_ratios = [(1, 1), (1.41, 0.71), (0.71, 1.41)]

    # Keyed by shape, so a repeated shape contributes a single level.
    settings_by_shape = {
        shape: [
            (image_size / shape, octave / float(num_scales), ratio)
            for octave in range(num_scales)
            for ratio in aspect_ratios
        ]
        for shape in shapes
    }

    levels = []
    for settings in settings_by_shape.values():
        per_setting = []
        for stride, octave_scale, (ratio_x, ratio_y) in settings:
            side = anchor_scale * stride * (2 ** octave_scale)
            half_w = side * ratio_x / 2.0
            half_h = side * ratio_y / 2.0

            centers = np.arange(stride / 2, image_size, stride)
            grid_x, grid_y = np.meshgrid(centers, centers)
            cx = grid_x.reshape(-1)
            cy = grid_y.reshape(-1)

            corners = np.stack(
                [cy - half_h, cx - half_w, cy + half_h, cx + half_w], axis=1)
            # (positions, 1, 4) so the settings can be interleaved per position.
            per_setting.append(corners[:, np.newaxis, :])

        level = np.concatenate(per_setting, axis=1) / image_size
        levels.append(level.reshape([-1, 4]))

    return np.vstack(levels)


In [0]:
##################################################
# utils/assign_boxes.py
##################################################
def encode_box(box, priors, assignment_threshold):
    """Encode one ground-truth box against every prior.

    Args:
        box: 1-D array with the four corner coordinates of the box.
        priors: Array of shape ``(num_priors, 4)`` in the same coordinate layout.
        assignment_threshold: Minimum IoU for a prior to be matched to the box.

    Returns:
        Flat array of length ``num_priors * 5``. For a matched prior the five
        values are the centre offset divided by the prior size and by 0.1
        (2 values), the log size ratio divided by 0.2 (2 values) and the IoU.
        Unmatched priors keep five zeros.
    """
    # Intersection of the box with each prior.
    lower = np.maximum(priors[:, :2], box[:2])
    upper = np.minimum(priors[:, 2:], box[2:])
    overlap = np.maximum(upper - lower, 0)
    intersection = overlap[:, 0] * overlap[:, 1]

    box_area = (box[2] - box[0]) * (box[3] - box[1])
    prior_area = priors[:, 2] - priors[:, 0]
    prior_area *= (priors[:, 3] - priors[:, 1])
    iou = intersection / (box_area + prior_area - intersection)

    matched = iou >= assignment_threshold
    matched_priors = priors[matched]
    prior_center = 0.5 * (matched_priors[:, :2] + matched_priors[:, 2:])
    prior_size = matched_priors[:, 2:4] - matched_priors[:, :2]
    box_center = 0.5 * (box[:2] + box[2:])
    box_size = box[2:] - box[:2]

    encoded = np.zeros((len(priors), 5))
    encoded[matched, -1] = iou[matched]

    # Centre offsets, normalised by the prior size and variance0 = 0.1.
    encoded[matched, :2] = box_center - prior_center
    encoded[matched, :2] /= prior_size
    encoded[matched, :2] /= 0.1

    # Log size ratios, normalised by variance1 = 0.2.
    encoded[matched, 2:4] = np.log(box_size / prior_size)
    encoded[matched, 2:4] /= 0.2

    return encoded.ravel()

def assign_boxes(boxes, priors, num_classes, threshold=0.5):
    """Build the per-prior training target for one image.

    Args:
        boxes: Array of shape ``(num_boxes, 4 + num_classes)``: four box
            coordinates followed by the class columns.
        priors: Array of shape ``(num_priors, 4)``.
        num_classes: Number of object classes, excluding background.
        threshold: IoU threshold forwarded to ``encode_box``.

    Returns:
        Array of shape ``(num_priors, 4 + num_classes + 2)`` with columns
        ``[encoded box (4) | background | classes (num_classes) | objectness]``.
        Priors without a matching box keep a zero box, background 1 and
        objectness 0.
    """
    prior_count = len(priors)
    class_columns = num_classes + 1  # background included

    targets = np.zeros((prior_count, 4 + class_columns + 1))
    targets[:, 4] = 1.0  # every prior starts as background

    encoded = np.apply_along_axis(encode_box, 1, boxes[:, :4], priors, threshold)
    encoded = encoded.reshape(-1, prior_count, 5)

    # encode_box leaves IoU at 0 below the threshold, so "> 0" means "matched".
    iou_per_box = encoded[:, :, -1]
    has_match = iou_per_box.max(axis=0) > 0
    winning_box = iou_per_box.argmax(axis=0)[has_match]

    matched_encoded = encoded[:, has_match, :]
    matched_columns = np.arange(len(winning_box))

    targets[has_match, :4] = matched_encoded[winning_box, matched_columns, :4]
    targets[has_match, 4] = 0  # no longer background
    targets[has_match, 5:-1] = boxes[winning_box, 4:]
    targets[has_match, -1] = 1  # objectness
    return targets


In [0]:
##################################################
# utils/augment.py
##################################################
def normalize(img):
    """Return ``(img - 127.5) / 128.0``, i.e. pixel values centred around zero."""
    return (img - 127.5) / 128.0

def random_crop(img, boxes):
    """Randomly trim up to 20% from each image side and remap the boxes.

    With probability about one half the inputs are returned untouched.
    Otherwise four margins are drawn uniformly from [0, 0.2), the image is
    cropped accordingly, boxes entirely outside the crop are dropped and the
    remaining ones are re-expressed (and clipped) in crop-relative coordinates.

    Args:
        img: Image array indexed as ``[row, column, channel]``.
        boxes: List of lists ``[xmin, ymin, xmax, ymax, ...extra]`` with
            coordinates relative to the image size.

    Returns:
        Tuple ``(image, boxes)``.
    """
    if np.random.uniform() > 0.5:
        return img, boxes

    left, right, top, bottom = np.random.uniform(low=0.0, high=0.20, size=4)
    height, width = img.shape[:2]

    col_from = int(left * width)
    col_to = int((1.0 - right) * width)
    row_from = int(top * height)
    row_to = int((1.0 - bottom) * height)
    img = img[row_from:row_to, col_from:col_to, :]

    kept = []
    for box in boxes:
        xmin, ymin, xmax, ymax = box[:4]
        outside = any((
            left >= xmax,
            xmin >= 1.0 - right,
            top >= ymax,
            ymin >= 1.0 - bottom,
        ))
        if outside:
            continue
        # Re-express in crop coordinates; max(..., 0.0) clips to the crop edges.
        new_xmin = max((xmin - left) / (1.0 - left - right), 0.0)
        new_xmax = 1.0 - max((1.0 - xmax - right) / (1.0 - left - right), 0.0)
        new_ymin = max((ymin - top) / (1.0 - top - bottom), 0.0)
        new_ymax = 1.0 - max((1.0 - ymax - bottom) / (1.0 - top - bottom), 0.0)
        kept.append([new_xmin, new_ymin, new_xmax, new_ymax] + box[4:])

    return img, kept

def random_flip(img, boxes):
    """Mirror the image horizontally with probability about one half.

    Args:
        img: Image array passed to ``cv2.flip(img, 1)`` when flipping.
        boxes: List of lists ``[xmin, ymin, xmax, ymax, ...extra]`` with
            relative coordinates.

    Returns:
        Tuple ``(image, boxes)``. When flipped, each box becomes
        ``[1 - xmax, ymin, 1 - xmin, ymax, ...extra]``; otherwise both inputs
        are returned unchanged.
    """
    if not (np.random.uniform() > 0.5):
        return img, boxes

    mirrored_img = cv2.flip(img, 1)
    mirrored_boxes = [
        [1.0 - xmax, ymin, 1.0 - xmin, ymax] + extra
        for (xmin, ymin, xmax, ymax), extra in ((box[:4], box[4:]) for box in boxes)
    ]
    return mirrored_img, mirrored_boxes

def down_sample(img):
    """Degrade the image by shrinking it by a random factor and scaling it back.

    The factor is ``max(int(np.random.normal(loc=2.0)), 1)``; a factor of 1
    returns the input unchanged. Both resizes use cubic interpolation and the
    result has the original width and height.
    """
    height, width = img.shape[:2]
    factor = max(int(np.random.normal(loc=2.0)), 1)
    if factor <= 1:
        return img

    reduced_size = (int(width / factor), int(height / factor))
    reduced = cv2.resize(img, reduced_size, interpolation=cv2.INTER_CUBIC)
    return cv2.resize(reduced, (width, height), interpolation=cv2.INTER_CUBIC)

def multi_scale(img, boxes):
    """Randomly pad the image with a grey border, which shrinks the objects.

    With probability about one half the inputs are returned untouched.
    Otherwise a margin is drawn per side from a normal distribution (mean 0
    for left/right, mean 0.1 for top/bottom), clamped to [0, 0.5] of the
    image size, and the image is pasted onto a canvas filled with 127.

    Args:
        img: Image array indexed as ``[row, column, channel]`` (3 channels).
        boxes: List of lists ``[xmin, ymin, xmax, ymax, ...extra]`` with
            relative coordinates.

    Returns:
        Tuple ``(padded image, boxes relative to the padded image)``.
    """
    if np.random.uniform() > 0.5:
        return img, boxes

    def clamp(value):
        # Keep the drawn fraction inside [0, 0.5].
        return min(max(value, 0.0), 0.5)

    img_h, img_w = img.shape[:2]
    # Draw order matters for reproducibility: left, right, top, bottom.
    pad_left = int(clamp(np.random.normal()) * img_w)
    pad_right = int(clamp(np.random.normal()) * img_w)
    pad_top = int(clamp(np.random.normal(loc=0.1)) * img_h)
    pad_bottom = int(clamp(np.random.normal(loc=0.1)) * img_h)

    canvas_w = img_w + pad_left + pad_right
    canvas_h = img_h + pad_top + pad_bottom
    canvas = np.full((canvas_h, canvas_w, 3), 127, dtype=np.uint8)
    canvas[pad_top:pad_top + img_h, pad_left:pad_left + img_w, :] = img

    shifted = []
    for box in boxes:
        xmin, ymin, xmax, ymax = box[:4]
        moved = [
            (pad_left + xmin * img_w) / canvas_w,
            (pad_top + ymin * img_h) / canvas_h,
            (pad_left + xmax * img_w) / canvas_w,
            (pad_top + ymax * img_h) / canvas_h,
        ]
        shifted.append(moved + box[4:])

    return canvas, shifted

def scale(img, labels, img_size):
    """Letterbox the image into an ``img_size`` x ``img_size`` square.

    The image is resized so that its longer side equals ``img_size`` (aspect
    ratio preserved, cubic interpolation) and centred on a canvas filled with
    127. Box coordinates are remapped to the canvas.

    Args:
        img: Image array indexed as ``[row, column, channel]`` (3 channels).
        labels: List of lists ``[xmin, ymin, xmax, ymax, ...extra]`` with
            coordinates relative to ``img``.
        img_size: Side length of the square output.

    Returns:
        Tuple ``(uint8 canvas, labels relative to the canvas)``.
    """
    src_h, src_w = img.shape[:2]
    shrink = max(src_h, src_w) / img_size
    fit_h = int(src_h / shrink)
    fit_w = int(src_w / shrink)
    # Offsets that centre the resized image on the canvas.
    offset_x = (img_size - fit_w) // 2
    offset_y = (img_size - fit_h) // 2

    resized = cv2.resize(img, (fit_w, fit_h), interpolation=cv2.INTER_CUBIC)
    canvas = np.full((img_size, img_size, 3), 127, dtype=np.uint8)
    canvas[offset_y:offset_y + fit_h, offset_x:offset_x + fit_w, :] = resized

    remapped = []
    for label in labels:
        xmin, ymin, xmax, ymax = label[0:4]
        corners = [
            (xmin * fit_w + offset_x) / img_size,
            (ymin * fit_h + offset_y) / img_size,
            (xmax * fit_w + offset_x) / img_size,
            (ymax * fit_h + offset_y) / img_size,
        ]
        remapped.append(corners + label[4:])

    return canvas, remapped

def augment(img, boxes, input_size):
    """Run the training-time augmentation pipeline on one sample.

    Steps, in order: random crop, random horizontal flip, letterbox scaling to
    ``input_size`` and pixel normalisation. ``multi_scale`` and ``down_sample``
    exist but are currently not part of the pipeline.

    Returns:
        Tuple ``(normalised image, boxes)``.
    """
    for step in (random_crop, random_flip):
        img, boxes = step(img, boxes)
    # Disabled steps:
    #   img, boxes = multi_scale(img, boxes)
    #   img = down_sample(img)
    img, boxes = scale(img, boxes, input_size)
    return normalize(img), boxes


In [0]:
##################################################
# Data Class
##################################################
class Data:
    """Background loader producing augmented ``(image, target)`` training pairs.

    A worker process (see ``start``) endlessly picks image/label pairs in
    random order, augments them, converts the boxes into per-prior targets
    with ``assign_boxes`` and pushes ``[image, target]`` onto a
    ``multiprocessing.Queue``; ``get`` assembles batches from that queue.

    Args:
        image_dir: Directory with the ``<name>.jpg`` images.
        label_dir: Directory with the matching ``<name>.txt`` label files
            (tab-separated ``class_index, xmin, ymin, xmax, ymax`` per line).
        num_classes: Number of object classes, excluding background.
        input_size: Side length of the square network input.
        shapes: Feature-map sizes forwarded to ``generate_priors``.
        assignment_threshold: IoU threshold forwarded to ``assign_boxes``.
    """

    def __init__(self, image_dir, label_dir, num_classes, input_size, shapes, assignment_threshold):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.num_classes = num_classes
        self.input_size = input_size
        self.assignment_threshold = assignment_threshold
        self.priors = generate_priors(image_size=self.input_size, shapes=shapes)
        # Number of usable image/label pairs at construction time.
        self.size = self.get_size()

    def start(self):
        """Create the queue and launch the worker process that fills it."""
        self.q = multiprocessing.Queue()
        # daemon=True: the endless worker must not keep the interpreter alive
        # (or linger) once the parent process is done.
        worker = multiprocessing.Process(target=self.put, args=(self.q,), daemon=True)
        worker.start()
        # Stored only after start(): Process objects cannot be pickled, and
        # `self` is pickled while starting under the "spawn" start method.
        self.process = worker

    def get_paths(self):
        """Return ``[image_path, label_path]`` pairs for labels whose image exists."""
        paths = []
        for bb_path in glob.glob(os.path.join(self.label_dir, '*.txt')):
            im_path = os.path.join(self.image_dir, os.path.splitext(os.path.basename(bb_path))[0] + '.jpg')
            if os.path.exists(im_path):
                paths.append([im_path, bb_path])
        return paths

    def get_size(self):
        """Return the number of usable image/label pairs."""
        return len(self.get_paths())

    def put(self, q):
        """Worker loop: push ``[image, target]`` items onto ``q`` forever.

        Every pass over the dataset visits each pair once, in random order.
        Samples whose image cannot be decoded, or that have no boxes left
        after augmentation, are skipped.

        Raises:
            RuntimeError: If no image/label pair can be found.
        """
        queue_max_size = 1000
        # Row i is the one-hot vector of class i; built once instead of per label.
        onehot_table = np.eye(self.num_classes)
        paths = []
        while True:
            if len(paths) == 0:
                paths = self.get_paths()
                if len(paths) == 0:
                    raise RuntimeError(
                        'no image/label pairs found in {!r} / {!r}'.format(self.image_dir, self.label_dir))
            if q.qsize() >= queue_max_size:
                # Back-pressure: wait for the consumer to drain the queue.
                time.sleep(0.1)
                continue

            path_index = np.random.randint(0, len(paths))
            im_path, bb_path = paths.pop(path_index)
            # fromfile + imdecode instead of imread (as in the original code).
            npimg = np.fromfile(im_path, dtype=np.uint8)
            img = cv2.imdecode(npimg, cv2.IMREAD_COLOR) if npimg.size else None
            if img is None:
                # Empty or corrupt file: skip it rather than crash the worker,
                # which would leave get() waiting forever.
                continue
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            with open(bb_path) as f:
                lines = f.read().splitlines()

            boxes = []
            for line in lines:
                if not line.strip():
                    continue
                class_index, xmin, ymin, xmax, ymax = line.split('\t')
                onehot_label = onehot_table[int(class_index)]
                box = [float(xmin), float(ymin), float(xmax), float(ymax)] + onehot_label.tolist()
                boxes.append(box)

            img, boxes = augment(img, boxes, self.input_size)

            if len(boxes) == 0:
                continue
            boxes = np.array(boxes)
            assignment = assign_boxes(boxes, self.priors, self.num_classes, self.assignment_threshold)
            q.put([img, assignment])

    def get(self, batch_size):
        """Block until ``batch_size`` samples are available and return them.

        Returns:
            Tuple ``(images, targets)`` of stacked ``np.ndarray`` batches.

        Raises:
            RuntimeError: If the queue is empty and the worker started by
                ``start`` is no longer running.
        """
        import queue  # local import: only needed for the Empty exception

        x_batch = []
        t_batch = []
        for _ in range(batch_size):
            while True:
                try:
                    img, assignment = self.q.get(timeout=1.0)
                except queue.Empty:
                    worker = getattr(self, 'process', None)
                    if worker is not None and not worker.is_alive():
                        raise RuntimeError('data worker process stopped; no more samples will arrive')
                    continue
                x_batch.append(img)
                t_batch.append(assignment)
                break
        return np.asarray(x_batch), np.asarray(t_batch)
