# Maskrcnn

This notebook uses a Mask R-CNN model based on the one in the torchvision instance-segmentation fine-tuning tutorial:  
https://colab.research.google.com/github/pytorch/vision/blob/temp-tutorial/tutorials/torchvision_finetuning_instance_segmentation.ipynb#scrollTo=at-h4OWK0aoc 

In [20]:
import os
import numpy as np, pandas as pd
from matplotlib import colors
import torch, torchvision
import torch.utils.data
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
from PIL import Image

import sys
sys.path.append('/home/jupyter/fastai_dev/dev')
from local.test import *
from local.basics import *
from local.callback.all import *
from local.vision.all import *

### Data

In [4]:
# Shell escape: list what is available in the data directory.
! ls data/

sample_submission.csv  train.csv     train_images.zip
test_images.zip        train_images  understanding_cloud_organization.zip


In [5]:
# Collect the training image paths. `get_image_files` is not imported by name,
# so it presumably comes from one of the `local.*` star imports above.
items = get_image_files('data/train_images/')
# Bare expression: display the collection in the notebook.
items

(#5546) [data/train_images/0a7a247.jpg,data/train_images/2f52d76.jpg,data/train_images/6b272fe.jpg,data/train_images/01eecc1.jpg,data/train_images/f3dad96.jpg,data/train_images/93aafb4.jpg,data/train_images/f157992.jpg,data/train_images/4fa9d86.jpg,data/train_images/c71b0dc.jpg,data/train_images/547ad87.jpg...]

In [6]:
def load_train_annotation(fpath):
    """Read the training CSV and split `Image_Label` into separate columns.

    The part of `Image_Label` before the first '.' becomes `Image` and the
    lower-cased part after the first '_' becomes `Label`.

    :param fpath: path of the CSV file holding `Image_Label` and `EncodedPixels`
    Returns a DataFrame with the columns Image, Label, EncodedPixels (in that order)
    """
    def _image_id(image_label):
        return image_label.split('.')[0]

    def _cloud_label(image_label):
        return image_label.split('_')[1].lower()

    raw = pd.read_csv(fpath)
    table = raw.assign(Image=raw.Image_Label.map(_image_id),
                       Label=raw.Image_Label.map(_cloud_label))
    return table[['Image', 'Label', 'EncodedPixels']]

# Parse the training annotations into a DataFrame with Image, Label and EncodedPixels columns.
annots = load_train_annotation('data/train.csv')

In [7]:
def rle_decode(mask_rle: str = '', shape: tuple = (1400, 2100)):
    '''
    Decode rle encoded mask.
    
    :param mask_rle: run-length as string formatted (start length); starts are
        1-based offsets into the column-major flattened image. ``None`` and NaN
        (how pandas represents a missing annotation) are treated like ``''``.
    :param shape: (height, width) of array to return 
    Returns numpy uint8 array, 1 - mask, 0 - background
    Raises ValueError if the string does not hold complete (start, length) pairs.
    
    Copied from https://www.kaggle.com/artgor/segmentation-in-pytorch-using-convenient-tools
    '''
    # A missing annotation means "no mask", not an error.
    if mask_rle is None or (isinstance(mask_rle, float) and np.isnan(mask_rle)):
        mask_rle = ''
    tokens = mask_rle.split()
    # An odd token count would otherwise broadcast silently (e.g. 3 tokens) and
    # paint a wrong mask, so reject it up front.
    if len(tokens) % 2:
        raise ValueError(f'RLE string must hold (start, length) pairs, got {len(tokens)} tokens')
    starts = np.asarray(tokens[0::2], dtype=int) - 1  # RLE starts are 1-based
    lengths = np.asarray(tokens[1::2], dtype=int)
    img = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for lo, hi in zip(starts, starts + lengths):
        img[lo:hi] = 1
    # Runs are laid out column by column, hence the Fortran-order reshape.
    return img.reshape(shape, order='F')


class CloudTypesImage(Tuple):
    """An (image id, image, {cloud type: mask}) triple that can draw itself."""

    def show(self, ax=None, figsize=None):
        """Draw the image with every non-empty cloud mask overlaid in its colour.

        :param ax: matplotlib axes to draw on; a new figure is created when None
        :param figsize: figure size, only used when a new figure is created
        """
        imgid, img, masks = self
        if ax is None:
            _, ax = plt.subplots(figsize=figsize)
        ax.imshow(img)

        # Overlay only the masks that contain something.
        drawable = {name: mask for name, mask in masks.items() if mask.sum() != 0}
        for name, mask in drawable.items():
            # Hide the background so that only mask pixels get tinted.
            overlay = np.ma.masked_where(mask < 1, mask)
            single_colour = colors.ListedColormap([COLORS[name]])
            ax.imshow(overlay, alpha=.7, cmap=single_colour)

        present_clouds = [name for name, mask in masks.items() if mask.sum() > 0]
        ax.set_title(f"{imgid}:{','.join(present_clouds)}")
        ax.axis('off')
        
            
class CloudTypesTfm(Transform):
    """Turn an item index into an (image id, image, {cloud type: mask}) triple."""

    def __init__(self, items, annots):
        # Keep the annotation table on the instance; `encodes` reads it from
        # here rather than from a module-level global.
        self.items, self.annots = items, annots

    def encodes(self, i):
        """Load image `i` and decode one mask per cloud label listed for it.

        Returns (imgid, img, masks) where `masks` maps each label to the decoded
        array; a label whose annotation is missing maps to an all-zero mask.
        """
        fn = self.items[i]
        img = PILImage.create(fn)

        imgid = fn.stem
        df = self.annots[self.annots.Image == imgid]
        # Decode straight from the filtered rows instead of writing new columns
        # into a slice of the shared table.
        masks = {}
        for label, rle in zip(df.Label, df.EncodedPixels.fillna('')):
            if label not in masks:  # keep the first entry per label
                masks[label] = rle_decode(rle, shape=img.shape)
        return imgid, img, masks

    def decodes(self, o):
        """Wrap the encoded triple so that it can be shown."""
        return CloudTypesImage(*o)

In [8]:
# Label vocabulary for the four cloud types. Judging by the outputs further down
# ('flower' maps to 2, and the rebuilt heads have 5 classes), `add_na=True`
# appears to reserve one extra leading entry — confirm against Category.create.
CATS = Category.create(['fish', 'flower', 'gravel', 'sugar'], add_na=True)
# Matplotlib colour code used for each cloud type's mask overlay.
COLORS = dict(fish='b', flower='r', gravel='y', sugar='c')

def get_random_cmap(length):
    """Return a ListedColormap holding `length` randomly drawn RGB colours."""
    palette = []
    for _ in range(length):
        palette.append(np.random.rand(3,))
    return colors.ListedColormap(palette)

In [9]:
# Look up the integer code assigned to the 'flower' label.
CATS('flower')

tensor(2)

In [10]:
class MaskRTargetTfm(Transform):
    """Build the Mask R-CNN target dict (boxes, labels, masks, ...) for an item index."""

    def __init__(self, items, annots, cats):
        self.items, self.annots, self.cats = items, annots, cats

    def encodes(self, i):
        """Return the target dict for image `i`.

        Only rows with a non-null `EncodedPixels` contribute an object. Keys:
        image_id, labels, boxes ([xmin, ymin, xmax, ymax] per object), masks,
        area and iscrowd. An image without any annotated cloud yields empty
        tensors of the matching shapes.
        """
        fn = self.items[i]
        img = PILImage.create(fn)
        shape = img.shape

        imgid = fn.stem
        # Use the table handed to the constructor, not the notebook-level global.
        annots = self.annots
        df = annots[(annots.Image == imgid) & (annots.EncodedPixels.notnull())]
        num_objs = len(df)

        boxes, masks, labels = [], [], []
        for _, r in df.iterrows():
            mask = rle_decode(r.EncodedPixels, shape=shape)

            # Tight bounding box around the mask pixels.
            pos = np.where(mask)
            xmin, xmax = pos[1].min(), pos[1].max()
            ymin, ymax = pos[0].min(), pos[0].max()

            labels.append(self.cats(r.Label))
            boxes.append([xmin, ymin, xmax, ymax])
            masks.append(mask)

        labels = torch.as_tensor(labels, dtype=torch.int64)
        # reshape keeps the (N, 4) layout even when N == 0.
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        if masks:
            # Stack first: converting a list of arrays element by element is slow.
            masks = torch.as_tensor(np.stack(masks), dtype=torch.uint8)
        else:
            masks = torch.zeros((0, shape[0], shape[1]), dtype=torch.uint8)

        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])

        target = {}
        target['image_id'] = torch.tensor([i])
        target['labels'] = labels
        target['boxes'] = boxes
        target['masks'] = masks
        target['area'] = area
        # 'iscrowd' (no underscore) matches CloudDataset and the key read by the
        # torchvision detection reference scripts.
        target['iscrowd'] = torch.zeros((num_objs,), dtype=torch.int64)
        return target

    def decodes(self, o):
        """Targets need no decoding; return them unchanged."""
        return o

In [55]:
class CloudDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding (image, target) pairs for Mask R-CNN training.

    :param items: indexable collection of image paths (each needs a `.stem`)
    :param annots: DataFrame with Image, Label and EncodedPixels columns
    :param cats: callable mapping a label string to its integer class
    :param transforms: optional callable applied as `transforms(img, target)`
    """

    def __init__(self, items, annots, cats, transforms=None):
        self.items, self.annots, self.cats = items, annots, cats
        self.transforms = transforms

    @staticmethod
    def _build_target(idx, labels, masks, shape):
        """Assemble the target dict from already decoded, non-empty masks.

        `shape` is (height, width) and is only needed to size the mask tensor
        when there are no objects at all.
        """
        boxes = []
        for mask in masks:
            # Tight [xmin, ymin, xmax, ymax] box around the mask pixels.
            ys, xs = np.where(mask)
            boxes.append([int(xs.min()), int(ys.min()), int(xs.max()), int(ys.max())])

        num_objs = len(masks)
        if num_objs:
            # Stack first: converting a list of arrays element by element is slow.
            mask_tensor = torch.as_tensor(np.stack(masks), dtype=torch.uint8)
        else:
            mask_tensor = torch.zeros((0, shape[0], shape[1]), dtype=torch.uint8)
        # reshape keeps the (N, 4) layout even when N == 0.
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)

        return {
            'image_id': torch.tensor([idx]),
            'labels': torch.as_tensor(labels, dtype=torch.int64),
            'boxes': boxes,
            'masks': mask_tensor,
            'area': (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0]),
            'iscrowd': torch.zeros((num_objs,), dtype=torch.int64),
        }

    def __getitem__(self, idx):
        """Load image `idx` and build its target from the non-null annotations."""
        fn = self.items[idx]
        img = Image.open(fn).convert('RGB')
        # Plain PIL images expose height/width; `.shape` exists only when a
        # library patches it on.
        shape = (img.height, img.width)

        imgid = fn.stem
        # Use the table handed to the constructor, not the notebook-level global.
        annots = self.annots
        df = annots[(annots.Image == imgid) & (annots.EncodedPixels.notnull())]

        labels = [self.cats(label) for label in df.Label]
        masks = [rle_decode(rle, shape=shape) for rle in df.EncodedPixels]
        target = self._build_target(idx, labels, masks, shape)

        if self.transforms is not None:
            img, target = self.transforms(img, target)
        return img, target

    def __len__(self):
        return len(self.items)
    

In [15]:
# Dataset without transforms, used below to inspect a raw sample.
clouds_dataset = CloudDataset(items, annots, CATS)

In [19]:
# Inspect one (image, target) sample.
# NOTE(review): the output recorded below shows a PILImage and an 'is_crowd' key,
# which does not match the CloudDataset cell above (In [55], run later than this
# In [19] cell) — the output looks stale; re-run to refresh it.
clouds_dataset[9]

(<local.vision.core.PILImage image mode=RGB size=2100x1400 at 0x7FEC4B489B90>,
 {'image_id': tensor([9]),
  'labels': tensor([1, 2]),
  'boxes': tensor([[  28.,  288.,  515.,  920.],
          [ 306.,    7., 2098., 1368.]]),
  'masks': tensor([[[0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0],
           ...,
           [0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0]],
  
          [[0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0],
           ...,
           [0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0]]], dtype=torch.uint8),
  'area': tensor([ 307784., 2438912.]),
  'is_crowd': tensor([0, 0])})

### Model

In [6]:
# Load torchvision's Mask R-CNN (ResNet-50 FPN) with pretrained weights, to look at its heads.
model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)

In [7]:
# Display the RoI heads; the printed box predictor has 91 output classes.
model.roi_heads

RoIHeads(
  (box_roi_pool): MultiScaleRoIAlign()
  (box_head): TwoMLPHead(
    (fc6): Linear(in_features=12544, out_features=1024, bias=True)
    (fc7): Linear(in_features=1024, out_features=1024, bias=True)
  )
  (box_predictor): FastRCNNPredictor(
    (cls_score): Linear(in_features=1024, out_features=91, bias=True)
    (bbox_pred): Linear(in_features=1024, out_features=364, bias=True)
  )
  (mask_roi_pool): MultiScaleRoIAlign()
  (mask_head): MaskRCNNHeads(
    (mask_fcn1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu1): ReLU(inplace=True)
    (mask_fcn2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu2): ReLU(inplace=True)
    (mask_fcn3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu3): ReLU(inplace=True)
    (mask_fcn4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu4): ReLU(inplace=True)
  )
  (mask_predictor): MaskRCNNPredictor(
    (conv5_mask): ConvTr

In [21]:
def get_instance_segmentation_model(num_classes):
    """Return a pretrained Mask R-CNN whose box and mask predictors output `num_classes` classes.

    :param num_classes: number of classes for the new prediction heads
    """
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
    heads = model.roi_heads

    # Replace the box classifier/regressor, keeping its input width.
    box_in_features = heads.box_predictor.cls_score.in_features
    heads.box_predictor = FastRCNNPredictor(box_in_features, num_classes)

    # Replace the mask predictor, keeping its input channels and a 256-wide hidden layer.
    mask_in_channels = heads.mask_predictor.conv5_mask.in_channels
    heads.mask_predictor = MaskRCNNPredictor(mask_in_channels, 256, num_classes)
    return model

In [27]:
# Sanity check: the rebuilt heads should have len(CATS.vocab) outputs (5 in the output below).
get_instance_segmentation_model(len(CATS.vocab)).roi_heads

RoIHeads(
  (box_roi_pool): MultiScaleRoIAlign()
  (box_head): TwoMLPHead(
    (fc6): Linear(in_features=12544, out_features=1024, bias=True)
    (fc7): Linear(in_features=1024, out_features=1024, bias=True)
  )
  (box_predictor): FastRCNNPredictor(
    (cls_score): Linear(in_features=1024, out_features=5, bias=True)
    (bbox_pred): Linear(in_features=1024, out_features=20, bias=True)
  )
  (mask_roi_pool): MultiScaleRoIAlign()
  (mask_head): MaskRCNNHeads(
    (mask_fcn1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu1): ReLU(inplace=True)
    (mask_fcn2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu2): ReLU(inplace=True)
    (mask_fcn3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu3): ReLU(inplace=True)
    (mask_fcn4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu4): ReLU(inplace=True)
  )
  (mask_predictor): MaskRCNNPredictor(
    (conv5_mask): ConvTran

### Train and evaluation functions

In [30]:
# Fetch the torchvision repository; the detection reference scripts are copied out of it below.
! git clone https://github.com/pytorch/vision.git

Cloning into 'vision'...
remote: Enumerating objects: 5598, done.[K
remote: Total 5598 (delta 0), reused 0 (delta 0), pack-reused 5598[K
Receiving objects: 100% (5598/5598), 9.42 MiB | 0 bytes/s, done.
Resolving deltas: 100% (3716/3716), done.


In [31]:
# Step into the clone so that the relative `references/...` paths in the next cell resolve.
# This changes the working directory of the whole kernel until it is changed back.
os.chdir('vision')

In [32]:
# Copy the detection helper modules one level up (the directory the kernel was in
# before the chdir) so that they can be imported as plain modules.
! cp references/detection/utils.py ../
! cp references/detection/transforms.py ../
! cp references/detection/coco_eval.py ../
! cp references/detection/engine.py ../
! cp references/detection/coco_utils.py ../

In [33]:
# Go back up to the directory the helper modules were copied into.
os.chdir('../.')

In [34]:
from engine import train_one_epoch, evaluate
import utils
import transforms as T

In [36]:
def get_transform(train):
    """Compose the (image, target) transform pipeline.

    :param train: accepted for API symmetry; no train-only step is added at the
        moment, so the same pipeline is returned either way
    """
    pipeline = [T.ToTensor()]
    return T.Compose(pipeline)

### Putting everything together

In [56]:
# Two dataset objects over the same items, one built with the train transform and
# one with the evaluation transform.
dataset = CloudDataset(items, annots, CATS, transforms=get_transform(train=True))
dataset_test = CloudDataset(items, annots, CATS, transforms=get_transform(train=False))

# Fixed seed so that the shuffled split is reproducible; only the first 200
# shuffled indices are used.
torch.manual_seed(1)
indices = torch.randperm(len(dataset)).tolist()
indices = indices[:200]

# Of those 200: all but the last 50 (150 images) train, the last 50 evaluate.
dataset = torch.utils.data.Subset(dataset, indices[:-50])
dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])

# `utils.collate_fn` is the collate function from the copied torchvision
# reference module `utils`.
data_loader = torch.utils.data.DataLoader(
    dataset, batch_size=2, shuffle=True, num_workers=4,
    collate_fn=utils.collate_fn)

data_loader_test = torch.utils.data.DataLoader(
    dataset_test, batch_size=1, shuffle=False, num_workers=4,
    collate_fn=utils.collate_fn)

In [57]:
# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# One output class per vocabulary entry.
num_classes = len(CATS.vocab)

model = get_instance_segmentation_model(num_classes)
model.to(device)

# Optimise only the parameters that require gradients.
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

# Multiply the learning rate by 0.1 every 3 scheduler steps (stepped once per epoch below).
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

In [58]:
# Number of passes over the training subset.
num_epochs = 1

for epoch in range(num_epochs):
    # One training pass, logging every 10 iterations.
    train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
    # Advance the learning-rate schedule once per epoch.
    lr_scheduler.step()
    # Evaluate on the held-out subset after each epoch.
    evaluate(model, data_loader_test, device=device)

Epoch: [0]  [ 0/75]  eta: 0:03:10  lr: 0.000073  loss: 4.0856 (4.0856)  loss_classifier: 1.2874 (1.2874)  loss_box_reg: 0.1296 (0.1296)  loss_mask: 2.3939 (2.3939)  loss_objectness: 0.2328 (0.2328)  loss_rpn_box_reg: 0.0419 (0.0419)  time: 2.5389  data: 1.7019  max mem: 5210
Epoch: [0]  [10/75]  eta: 0:01:01  lr: 0.000748  loss: 2.0427 (2.7665)  loss_classifier: 0.5894 (0.6758)  loss_box_reg: 0.1166 (0.0978)  loss_mask: 1.2456 (1.8540)  loss_objectness: 0.0606 (0.0960)  loss_rpn_box_reg: 0.0419 (0.0428)  time: 0.9438  data: 0.1717  max mem: 5210
Epoch: [0]  [20/75]  eta: 0:00:47  lr: 0.001422  loss: 1.1770 (1.9503)  loss_classifier: 0.2613 (0.4512)  loss_box_reg: 0.1009 (0.1011)  loss_mask: 0.7020 (1.2736)  loss_objectness: 0.0379 (0.0673)  loss_rpn_box_reg: 0.0474 (0.0571)  time: 0.7812  data: 0.0158  max mem: 5210
Epoch: [0]  [30/75]  eta: 0:00:37  lr: 0.002097  loss: 0.9944 (1.6273)  loss_classifier: 0.1835 (0.3655)  loss_box_reg: 0.1187 (0.1082)  loss_mask: 0.6007 (1.0442)  loss_ob

KeyboardInterrupt: 