# Keras Segmentation Models
In this kernel, we learn how to install and use pretrained Keras segmentation models from GitHub [here][1] with docs [here][2]. We also plot EDA showing training examples, UNET prediction examples, and UNET error.

# Data Generator
First let's restructure the `train.csv` dataframe and build a data generator. We will need to feed our neural network `X_train` (images) and `y_train` (masks). For efficiency, we downsample every image by a factor of 2 along each axis, converting 256x1600 RGB to 128x800 RGB.

[1]: https://github.com/qubvel/segmentation_models
[2]: https://segmentation-models.readthedocs.io/en/latest/tutorial.html

In [None]:
import numpy as np, pandas as pd, os, gc
import matplotlib.pyplot as plt, time
from PIL import Image 
import warnings
#import tensorflow as tf
#from tensorflow.keras.losses import binary_crossentropy
warnings.filterwarnings("ignore")
# Root folder that holds all input datasets.
path = '../input/'

# Long-format table: one row per (image, class) pair.
train = pd.read_csv('../input/submission1/submission (1).csv')
train['ImageId'] = train['ImageId_ClassId'].map(
    lambda image_class: image_class.split('.')[0] + '.jpg'
)

# Wide-format table: one row per image, with the four class encodings
# spread over columns e1..e4. Every 4th row (offset by the class position)
# belongs to the same class.
train2 = pd.DataFrame({'ImageId': train['ImageId'][::4]})
for class_offset in range(4):
    train2['e' + str(class_offset + 1)] = train['EncodedPixels'][class_offset::4].values


# Disabled alternative that selects the encodings by ClassId instead:
#train2 = pd.DataFrame({'ImageId':train['ImageId']}) 
#train2['e1'] = train.loc[train['ClassId'] == 1, ['EncodedPixels']]
#train2['e2'] = train.loc[train['ClassId'] == 2, ['EncodedPixels']]
#train2['e3'] = train.loc[train['ClassId'] == 3, ['EncodedPixels']]
#train2['e4'] = train.loc[train['ClassId'] == 4, ['EncodedPixels']]


# Modify the frame in place and discard the old (0, 4, 8, ...) index.
train2.reset_index(inplace=True, drop=True)
# Missing encodings become empty strings so they can be compared with ''.
train2.fillna('', inplace=True)
# Per image, count the non-empty columns after ImageId (axis=1 sums across
# the columns of each row).
non_empty = train2.iloc[:, 1:] != ''
train2['count'] = np.sum(non_empty, axis=1).values
train2.head()

In [None]:

# https://www.kaggle.com/ateplyuk/pytorch-starter-u-net-resnet
# https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly
import keras
import math
class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` yielding batches of images resized to 128x800.

    For ``subset == "train"`` every batch is a tuple ``(X, y)`` where ``X`` is
    a float32 array of shape ``(n, 128, 800, 3)`` and ``y`` an int8 array of
    shape ``(n, 128, 800, 4)`` holding one mask channel per ``e1``..``e4``
    column. For any other subset only ``X`` is returned. ``n`` equals
    ``batch_size`` except for the last batch, which holds the remaining rows.

    Args:
        df: DataFrame with an ``ImageId`` column and, for the train subset,
            the columns ``e1``..``e4`` with run-length encoded masks.
        batch_size: Number of samples per batch.
        subset: ``"train"`` or ``"test"``; selects the image folder and
            whether masks are produced.
        shuffle: Whether to reshuffle the sample order at every epoch end.
        preprocess: Optional callable applied to the image batch ``X``.
        info: Optional dict that is filled in place with
            ``{global sample position: image file name}`` as batches are
            generated. A new dict is created per instance when omitted.
    """

    def __init__(self, df, batch_size = 16, subset="train", shuffle=False, 
                 preprocess=None, info=None):
        super().__init__()
        self.df = df
        self.shuffle = shuffle
        self.subset = subset
        self.batch_size = batch_size
        self.preprocess = preprocess
        # Use ``is None`` (not truthiness) so that an empty dict passed by the
        # caller is kept and filled in place, while the default no longer
        # shares one dict between all instances.
        self.info = {} if info is None else info
        
        if self.subset == "train":
            self.data_path = path + 'severstal-steel-defect-detection/train_images/'
        if self.subset == "test":
            self.data_path = path + 'severstal-steel-defect-detection/test_images/'
        self.on_epoch_end()

    def __fillna__(self):
        """Replace missing values of the underlying DataFrame with '0' in place."""
        self.df.fillna('0',inplace=True)

    def __len__(self):
        """Return the number of batches per epoch (last one may be partial)."""
        return int(math.ceil(len(self.df) / self.batch_size))
    
    def on_epoch_end(self):
        """Reset the sample order, reshuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.df))
        if self.shuffle:
            np.random.shuffle(self.indexes)
    
    def __getitem__(self, index): 
        """Build and return batch number ``index``."""
        # Positional row indexes of the samples that belong to this batch.
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        # Size the arrays by the real sample count: __len__ rounds up, so the
        # last batch may be shorter than batch_size and np.empty would
        # otherwise leave uninitialised rows in the output.
        n_samples = len(indexes)
        is_train = self.subset == 'train'

        X = np.empty((n_samples,128,800,3),dtype=np.float32)
        if is_train:
            y = np.empty((n_samples,128,800,4),dtype=np.int8)

        for i,row in enumerate(indexes):
            f = self.df['ImageId'].iloc[row]
            self.info[index*self.batch_size+i]=f
            # Context manager closes the image file once it has been resized.
            with Image.open(self.data_path + f) as img:
                X[i,] = img.resize((800,128))
            if is_train: 
                for j in range(4):
                    y[i,:,:,j] = rle2maskResize(self.df['e'+str(j+1)].iloc[row])

        if self.preprocess is not None: X = self.preprocess(X)
        if is_train: return X, y
        else: return X

# Utility Functions
Next we'll need some utility functions. The first converts a run-length encoding (RLE) string to a mask. The second converts a mask to its contour. The third enlarges a mask. Used together, the second and third leave a little blank space between a defect and its drawn contour for better visualization.

In [None]:
# https://www.kaggle.com/titericz/building-and-visualizing-masks
def rle2maskResize(rle):
    """Decode a run-length encoding into a half-resolution binary mask.

    ``rle`` is a whitespace separated string of ``start length`` pairs with
    1-based starts over a 256x1600 image flattened column by column. The
    decoded mask is subsampled by taking every second row and column, giving
    a (128, 800) uint8 array. A null or empty ``rle`` yields an all-zero mask.
    """
    # Missing encoding: return an empty mask of the same output shape.
    if pd.isnull(rle) or rle == '':
        return np.zeros((128, 800), dtype=np.uint8)

    full_height, full_width = 256, 1600
    flat = np.zeros(full_width * full_height, dtype=np.uint8)

    values = np.asarray([int(token) for token in rle.split()])
    run_starts = values[0::2] - 1  # starts are 1-based; shift to 0-based
    run_lengths = values[1::2]
    for run in range(len(run_starts)):
        begin = int(run_starts[run])
        end = int(run_starts[run] + run_lengths[run])
        flat[begin:end] = 1

    # Column-major reshape restores the image; the strides halve both axes.
    full_mask = flat.reshape((full_height, full_width), order='F')
    return full_mask[::2, ::2]

def mask2contour(mask, width=3):
    """Return a boolean outline of a 2-D ``mask``.

    The mask is compared (XOR) with a copy of itself shifted ``width`` pixels
    to the left and with a copy shifted ``width`` pixels up; pixels where
    either comparison differs form the contour.
    """
    n_rows = mask.shape[0]
    n_cols = mask.shape[1]

    # Shift left / up, padding the vacated columns / rows with zeros.
    shifted_left = np.concatenate(
        [mask[:, width:], np.zeros((n_rows, width))], axis=1
    )
    shifted_up = np.concatenate(
        [mask[width:, :], np.zeros((width, n_cols))], axis=0
    )

    # XOR is true exactly where the mask and its shifted copy disagree.
    horizontal_edges = np.logical_xor(mask, shifted_left)
    vertical_edges = np.logical_xor(mask, shifted_up)
    return np.logical_or(horizontal_edges, vertical_edges)

def mask2pad(mask, pad=2):
    """Enlarge a 2-D ``mask`` so it includes more space around the defect.

    For every offset ``k`` in ``range(1, pad, 2)`` the mask is OR-ed with a
    copy of itself shifted by ``k`` pixels, first for all upward shifts, then
    downward, left and right. Each shift is applied to the mask accumulated
    so far. When no offset exists (``pad <= 1``) the input is returned as is.
    """
    n_rows = mask.shape[0]
    n_cols = mask.shape[1]

    # One shifting function per direction, in the order they are applied.
    shifters = (
        # up
        lambda m, k: np.concatenate([m[k:, :], np.zeros((k, n_cols))], axis=0),
        # down
        lambda m, k: np.concatenate([np.zeros((k, n_cols)), m[:-k, :]], axis=0),
        # left
        lambda m, k: np.concatenate([m[:, k:], np.zeros((n_rows, k))], axis=1),
        # right
        lambda m, k: np.concatenate([np.zeros((n_rows, k)), m[:, :-k]], axis=1),
    )

    for shift in shifters:
        for offset in range(1, pad, 2):
            mask = np.logical_or(mask, shift(mask, offset))

    return mask

# Train EDA
Let's confirm our Data Generator works and view some training images. We will only show examples with defects. Note that all mask contours are plotted with a little blank space around the defect to aid visualization. Below we show examples of each type but note that in the training set only 7.1%, 2.0%, 41.0%, 6.4% of images have defects 1, 2, 3, 4 respectively.

In [None]:
# Bar chart: percentage of images that contain each defect class.
plt.figure(figsize=(13.5, 2.5))  # figure width and height, in inches
defect_classes = [1, 2, 3, 4]
# Columns 1..4 of train2 are e1..e4; a non-empty string means "has defect".
percent_with_defect = 100 * np.mean(train2.iloc[:, 1:5] != '', axis=0)
bar = plt.bar(defect_classes, percent_with_defect)
plt.title('Percent Training Images with Defect', fontsize=16)
plt.ylabel('Percent of Images')
plt.xlabel('Defect Type')
plt.xticks(defect_classes)
# Write the percentage on top of every bar.
for rect in bar:
    height = rect.get_height()
    label_x = rect.get_x() + rect.get_width() / 2.0
    plt.text(label_x, height, '%.1f %%' % height,
             ha='center', va='bottom', fontsize=16)
plt.ylim((0, 100))  # upper limit changed from 50 to 100
plt.show()

In [None]:
# DEFECTIVE IMAGE SAMPLES
# Sample a few images per defect class. The same image can be drawn for more
# than one class, so the number of distinct images may be smaller than 6.
filenames = {}
defects = list(train2[train2['e1']!=''].sample(1).index)
defects += list(train2[train2['e2']!=''].sample(1).index)
defects += list(train2[train2['e3']!=''].sample(2).index)
defects += list(train2[train2['e4']!=''].sample(2).index)

# DATA GENERATOR
# `filenames` is filled in place by the generator with
# {global sample position: image file name}.
defect_rows = train2[train2.index.isin(defects)]
train_batches = DataGenerator(defect_rows,shuffle=True,info=filenames)
print('Images and masks from our Data Generator')
print('KEY: yellow=defect1, green=defect2, blue=defect3, magenta=defect4')

# DISPLAY IMAGES WITH DEFECTS
for i,batch in enumerate(train_batches):
    # Only the sampled images are real data: fewer images than one full batch
    # are sampled, so looping over a fixed 16 would index past the entries
    # recorded in `filenames`. Bound the loop by the rows that actually
    # belong to this batch.
    n_valid = min(len(batch[0]), len(defect_rows) - train_batches.batch_size*i)
    plt.figure(figsize=(14,50)) #20,18
    for k in range(n_valid):
        # Keep one grid row per batch slot so each image is drawn at the
        # same size regardless of how many samples are shown.
        plt.subplot(train_batches.batch_size,1,k+1)
        img = batch[0][k,]
        img = Image.fromarray(img.astype('uint8'))
        img = np.array(img)
        extra = '  has defect'
        for j in range(4):
            msk = batch[1][k,:,:,j]
            # Pad first, then take the contour, so the outline is drawn with
            # a little blank space around the defect.
            msk = mask2pad(msk,pad=3)
            msk = mask2contour(msk,width=2)
            if np.sum(msk)!=0: extra += ' '+str(j+1)
            if j==0: # yellow
                img[msk==1,0] = 235 
                img[msk==1,1] = 235
            elif j==1: img[msk==1,1] = 210 # green
            elif j==2: img[msk==1,2] = 255 # blue
            elif j==3: # magenta
                img[msk==1,0] = 255
                img[msk==1,2] = 255
        plt.title(filenames[train_batches.batch_size*i+k]+extra)
        plt.axis('off') 
        plt.imshow(img)
    plt.subplots_adjust(wspace=0.05)
    plt.show()