Welcome to the world where fashion meets computer vision! This is a starter kernel that applies Mask R-CNN with COCO pretrained weights to the task of [iMaterialist (Fashion) 2019 at FGVC6](https://www.kaggle.com/c/imaterialist-fashion-2019-FGVC6).

In [1]:
import os
import gc
import sys
import json
import glob
import random
from pathlib import Path

import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import imutils
import itertools
from tqdm import tqdm

from imgaug import augmenters as iaa
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedStratifiedKFold

In [2]:
DATA_DIR = Path('/home/ubuntu/efs/kaggle/imaterialist/')
ROOT_DIR = Path('/home/ubuntu/efs/kaggle/imaterialist/maskrcnn/logs')

# For demonstration purpose, the classification ignores attributes (only categories),
# and the image size is set to 512, which is the same as the size of submission masks
NUM_CATS = 46
IMAGE_SIZE = 512

In [3]:
print(ROOT_DIR/'Mask_RCNN')
sys.path.append("/home/ubuntu/github/Mask_RCNN/")
#sys.path.append(ROOT_DIR/'Mask_RCNN')
from mrcnn.config import Config
from mrcnn import utils
import mrcnn.model as modellib
from mrcnn import visualize
from mrcnn.model import log

model_path = '/home/ubuntu/efs/kaggle/imaterialist/maskrcnn/logs/fashion20190602T1444/mask_rcnn_fashion_0016.h5'

/home/ubuntu/efs/kaggle/imaterialist/maskrcnn/logs/Mask_RCNN


Using TensorFlow backend.


# Dataset

In [4]:
# Since the submission system does not permit overlapped masks, we have to fix them
def refine_masks(masks, rois):
    areas = np.sum(masks.reshape(-1, masks.shape[-1]), axis=0)
    mask_index = np.argsort(areas)
    union_mask = np.zeros(masks.shape[:-1], dtype=bool)
    for m in mask_index:
        masks[:, :, m] = np.logical_and(masks[:, :, m], np.logical_not(union_mask))
        union_mask = np.logical_or(masks[:, :, m], union_mask)
    for m in range(masks.shape[-1]):
        mask_pos = np.where(masks[:, :, m]==True)
        if np.any(mask_pos):
            y1, x1 = np.min(mask_pos, axis=1)
            y2, x2 = np.max(mask_pos, axis=1)
            rois[m, :] = [y1, x1, y2, x2]
    return masks, rois

In [5]:
def resize_image(image_path):
    img = cv2.imread(image_path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (IMAGE_SIZE, IMAGE_SIZE), interpolation=cv2.INTER_AREA)  
    return img  
    
class FashionDataset(utils.Dataset):

    def __init__(self, df):
        super().__init__(self)
        
        # Add classes
        for i, name in enumerate(label_names):
            self.add_class("fashion", i+1, name)
        
        # Add images 
        for i, row in df.iterrows():
            self.add_image("fashion", 
                           image_id=row.name, 
                           path=str(DATA_DIR/'train'/row.name), 
                           labels=row['CategoryId'],
                           annotations=row['EncodedPixels'], 
                           height=row['Height'], width=row['Width'])

    def image_reference(self, image_id):
        info = self.image_info[image_id]
        return info['path'], [label_names[int(x)] for x in info['labels']]
    
    def load_image(self, image_id):
        return resize_image(self.image_info[image_id]['path'])

    def load_mask(self, image_id):
        info = self.image_info[image_id]
                
        mask = np.zeros((IMAGE_SIZE, IMAGE_SIZE, len(info['annotations'])), dtype=np.uint8)
        labels = []
        
        for m, (annotation, label) in enumerate(zip(info['annotations'], info['labels'])):
            sub_mask = np.full(info['height']*info['width'], 0, dtype=np.uint8)
            annotation = [int(x) for x in annotation.split(' ')]
            
            for i, start_pixel in enumerate(annotation[::2]):
                sub_mask[start_pixel: start_pixel+annotation[2*i+1]] = 1

            sub_mask = sub_mask.reshape((info['height'], info['width']), order='F')
            sub_mask = cv2.resize(sub_mask, (IMAGE_SIZE, IMAGE_SIZE), interpolation=cv2.INTER_NEAREST)
            
            mask[:, :, m] = sub_mask
            labels.append(int(label)+1)
            
        return mask, np.array(labels)

In [6]:
with open(DATA_DIR/"label_descriptions.json") as f:
    label_descriptions = json.load(f)

label_names = [x['name'] for x in label_descriptions['categories']]


segment_df = pd.read_csv(DATA_DIR/"train.csv")
segment_df['CategoryId'] = segment_df['ClassId'].str.split('_').str[0]
image_df = segment_df.groupby('ImageId')['EncodedPixels', 'CategoryId'].agg(lambda x: list(x))
size_df = segment_df.groupby('ImageId')['Height', 'Width'].mean()
image_df = image_df.join(size_df, on='ImageId')



In [7]:
dataset = FashionDataset(image_df)
dataset.prepare()

# Set Config

Mask R-CNN has a load of hyperparameters. I only adjust some of them.

In [8]:
class FashionConfig(Config):
    NAME = "fashion"
    NUM_CLASSES = NUM_CATS + 1 # +1 for the background class
    
    GPU_COUNT = 1
    IMAGES_PER_GPU = 1 # a memory error occurs when IMAGES_PER_GPU is too high
    
    BACKBONE = 'resnet101'
    
    IMAGE_MIN_DIM = IMAGE_SIZE
    IMAGE_MAX_DIM = IMAGE_SIZE    
    IMAGE_RESIZE_MODE = 'none'
    RPN_ANCHOR_SCALES = (16, 32, 64, 128, 256)
    
config = FashionConfig()
config.display()


Configurations:
BACKBONE                       resnet101
BACKBONE_STRIDES               [4, 8, 16, 32, 64]
BATCH_SIZE                     1
BBOX_STD_DEV                   [0.1 0.1 0.2 0.2]
COMPUTE_BACKBONE_SHAPE         None
DETECTION_MAX_INSTANCES        100
DETECTION_MIN_CONFIDENCE       0.7
DETECTION_NMS_THRESHOLD        0.3
FPN_CLASSIF_FC_LAYERS_SIZE     1024
GPU_COUNT                      1
GRADIENT_CLIP_NORM             5.0
IMAGES_PER_GPU                 1
IMAGE_CHANNEL_COUNT            3
IMAGE_MAX_DIM                  512
IMAGE_META_SIZE                59
IMAGE_MIN_DIM                  512
IMAGE_MIN_SCALE                0
IMAGE_RESIZE_MODE              none
IMAGE_SHAPE                    [512 512   3]
LEARNING_MOMENTUM              0.9
LEARNING_RATE                  0.001
LOSS_WEIGHTS                   {'rpn_class_loss': 1.0, 'rpn_bbox_loss': 1.0, 'mrcnn_class_loss': 1.0, 'mrcnn_bbox_loss': 1.0, 'mrcnn_mask_loss': 1.0}
MASK_POOL_SIZE                 14
MASK_SHAPE               

# Predict

In [9]:
class InferenceConfig(FashionConfig):
    GPU_COUNT = 1
    IMAGES_PER_GPU = 1
    DETECTION_MIN_CONFIDENCE = 0.7
    #DETECTION_NMS_THRESHOLD = 0.1 # default 0.3
    TEST_MODE = "inference"
    
inference_config = InferenceConfig()


In [10]:

model = modellib.MaskRCNN(mode='inference', 
                          config=inference_config,
                          model_dir=ROOT_DIR)


assert model_path != '', "Provide path to trained weights"
print("Loading weights from ", model_path)
model.load_weights(model_path, by_name=True)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Loading weights from  /home/ubuntu/efs/kaggle/imaterialist/maskrcnn/logs/fashion20190607T1342/mask_rcnn_fashion_0001.h5
Re-starting from epoch 1


In [10]:
# Compute VOC-style Average Precision
random.seed(888)
image_ids = np.random.choice(dataset.image_ids, 1000)
def compute_batch_ap(image_ids):
    APs = []
    for image_id in tqdm(image_ids):
        # Load image
        image, image_meta, gt_class_id, gt_bbox, gt_mask =\
            modellib.load_image_gt(dataset, config,
                                   image_id, use_mini_mask=False)
        # Run object detection
        results = model.detect([image], verbose=0)
        # Compute AP
        r = results[0]
        AP, precisions, recalls, overlaps =\
            utils.compute_ap(gt_bbox, gt_class_id, gt_mask,
                              r['rois'], r['class_ids'], r['scores'], r['masks'])
        APs.append(AP)
    return APs

# Pick a set of random images
#image_ids = np.random.choice(dataset.image_ids, 1000)
#APs = compute_batch_ap(image_ids)
#print("mAP @ IoU=50: ", np.mean(APs))

In [11]:
mAPs = {}
for confidence in [0.95, 0.9, 0.8]:
    inference_config.DETECTION_MIN_CONFIDENCE = confidence
    model = modellib.MaskRCNN(mode='inference', 
                              config=inference_config,
                              model_dir=ROOT_DIR)
    model.load_weights(model_path, by_name=True)
    APs = compute_batch_ap(image_ids)
    mAPs[confidence] = APs
    print("Confidence:",confidence," mAP @ IoU=50: ", np.mean(APs))
    
for k,v in mAPs.items():
    print(k,v)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


  0%|          | 0/1000 [00:00<?, ?it/s]

Re-starting from epoch 16


100%|██████████| 1000/1000 [07:40<00:00,  2.17it/s]


Confidence: 0.95  mAP @ IoU=50:  0.4400024407328065


  0%|          | 0/1000 [00:00<?, ?it/s]

Re-starting from epoch 16


100%|██████████| 1000/1000 [07:17<00:00,  2.49it/s]


Confidence: 0.9  mAP @ IoU=50:  0.4593510657719566


  0%|          | 0/1000 [00:00<?, ?it/s]

Re-starting from epoch 16


100%|██████████| 1000/1000 [07:23<00:00,  2.53it/s]

Confidence: 0.8  mAP @ IoU=50:  0.48137578739686576
0.95 [0.2777777860562006, 0.0, 0.6370370553599464, 0.5, 0.1428571492433548, 0.03333333507180214, 0.27272728085517883, 1.0, 0.7142857313156128, 0.6666666865348816, 0.5000000186264515, 0.4238095283508301, 0.6000000238418579, 0.3333333333333333, 0.3800000071525574, 0.25, 0.800000011920929, 0.6000000238418579, 0.22407408017251226, 0.25, 0.0, 0.2857142984867096, 0.75, 0.28630952380952385, 1.0, 0.19230769574642181, 0.3333333432674408, 0.3333333432674408, 0.10000000149011612, 0.3809523979822794, 0.7500000149011612, 0.7499999850988389, 0.7499999850988389, 0.2222222238779068, 0.5454545617103577, 0.2571428596973419, 0.800000011920929, 0.4514285743236542, 0.25, 0.2703703731298447, 1.0, 0.375, 1.0, 0.14814814925193787, 0.2222222238779068, 0.75, 0.6333333440124989, 0.0, 0.75, 0.1875, 0.20833333333333331, 0.1111111119389534, 0.333333338300387, 0.75, 0.3454545557498932, 0.4291486406868154, 0.125, 0.800000011920929, 0.4328042368094126, 0.375, 0.16666


