# SSDD Ensemble Notebook

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

In [None]:
from pathlib import Path

In [None]:
# Root directory under which the input datasets are looked up.
INPUT = Path("/kaggle/input")

In [None]:
# Path of the competition's sample submission CSV, resolved under INPUT.
sample_submission_path = INPUT.joinpath(
    "severstal-steel-defect-detection", "sample_submission.csv"
)

In [None]:
# Dataset folders (under INPUT) holding the submissions to ensemble.
DATASETS = ["submission-ssdd-001", "submission-ssdd-002"]

# Gather the full path of every entry listed in each dataset folder.
SUB_PATHS = []
for d in DATASETS:
    SUB_PATHS.extend(INPUT / d / entry for entry in os.listdir(INPUT / d))

### Show All Submission Files

In [None]:
# Show the collected submission paths together with how many there are.
print(SUB_PATHS, len(SUB_PATHS))
# Stop early unless more than two submission files were found.
assert len(SUB_PATHS) >2,"too few submission csv to ensemble"

#### Preview Data

In [None]:
# Load the first collected submission file as-is for a quick look.
test_df = pd.read_csv(SUB_PATHS[0])

In [None]:
def augDf(df_path):
    """Load a submission csv and reshape it to one row per image.

    The 'ImageId_ClassId' key is split on '_' into an image id and an
    integer class id. The frame is then pivoted so that each class id
    becomes a column holding that class's 'EncodedPixels' value, indexed
    by 'ImageId'. A trailing 'defects' column counts the non-missing
    entries of each row.
    """
    table = pd.read_csv(df_path)
    image_ids, class_ids = zip(*table['ImageId_ClassId'].str.split('_'))
    table = table.assign(ImageId=image_ids, ClassId=class_ids)
    table = table.astype({'ClassId': int})
    wide = table.pivot(index='ImageId', columns='ClassId', values='EncodedPixels')
    wide['defects'] = wide.count(axis=1)
    return wide

In [None]:
# Reload the first submission in the pivoted per-image layout and preview it.
test_df = augDf(SUB_PATHS[0])
test_df.head()

### Helper Functions

In [None]:
from tqdm import tqdm_notebook as tqdm
from bcolz import carray
import random

In [None]:
from torch.utils.data import Dataset,DataLoader

class TestDataset(Dataset):
    '''Dataset yielding the unique image file names found in a submission frame.'''

    def __init__(self, df):
        # The ImageId column is written onto the frame that was passed in
        # (not a copy), so the caller's frame gains that column as well.
        df['ImageId'] = [key.split('_')[0] for key in df['ImageId_ClassId']]
        self.fnames = df['ImageId'].unique().tolist()
        self.num_samples = len(self.fnames)

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        # Each item is the file name paired with an empty string.
        return self.fnames[idx], ""

# Read the sample submission and wrap its image names in a batched loader.
df = pd.read_csv(sample_submission_path)
testset = DataLoader(
    dataset=TestDataset(df),
    batch_size=64,
    num_workers=4,
    shuffle=False,
    pin_memory=True,
)

In [None]:
def mask2rle(img):
    '''
    Encode a mask as a run-length string.

    The mask is transposed and flattened, then each run is written as a
    "start length" pair, with starts counted from 1.

    img: numpy array, 1 -> mask, 0 -> background
    Returns the pairs joined by single spaces ('' when nothing is set).
    '''
    column_major = img.T.flatten()
    # Zero sentinels on both ends make runs that touch a border still
    # produce both a rising and a falling edge.
    padded = np.concatenate([[0], column_major, [0]])
    edges = np.flatnonzero(padded[1:] != padded[:-1]) + 1
    # Odd slots hold run ends; subtracting the starts leaves run lengths.
    edges[1::2] -= edges[::2]
    return ' '.join(map(str, edges))

def make_mask(row_id, df):
    '''Given a row index, return image_id and mask (256, 1600, 4) from the dataframe `df`.

    row_id: positional index of the row to decode.
    df: frame whose index holds the image id and whose first four columns
        hold the run-length strings of classes 1-4 (missing value = no mask).

    Returns (fname, masks), where masks is a float32 array of shape
    (256, 1600, 4) with channel k holding the decoded mask of class k+1.

    Run starts are treated as 1-based, which is how `mask2rle` in this
    file writes them, so encoding and decoding are exact inverses.
    '''
    row = df.iloc[row_id]
    fname = row.name
    labels = row.iloc[:4]
    masks = np.zeros((256, 1600, 4), dtype=np.float32) # float32 is V.Imp
    # 4:class 1~4 (ch:0~3)

    for idx, label in enumerate(labels.values):
        # pd.isna catches any missing marker (NaN or None), not just the
        # np.nan singleton that an identity check would match.
        if pd.isna(label):
            continue
        tokens = label.split()
        positions = map(int, tokens[0::2])
        lengths = map(int, tokens[1::2])
        mask = np.zeros(256 * 1600, dtype=np.uint8)
        for pos, le in zip(positions, lengths):
            # RLE starts are 1-based; shift to a 0-based offset.
            start = pos - 1
            mask[start:start + le] = 1
        # Runs are column-major, hence the Fortran-order reshape.
        masks[:, :, idx] = mask.reshape(256, 1600, order='F')
    return fname, masks

def dfToNumpy(df_path):
    """
    Decode every row of a submission csv into masks collected in a bcolz carray.

    The csv is loaded through `augDf`, each row is turned into a mask array
    by `make_mask`, and the masks are appended one by one to a carray whose
    rootdir is a randomly named folder under INPUT. The carray is returned.
    """
    frame = augDf(df_path)
    store = carray(
        np.zeros((0, 256, 1600, 4)),
        rootdir=INPUT / hex(random.randint(10000, 20000)),
    )
    print("Running on csv:", str(df_path))
    for row_id in tqdm(range(len(frame))):
        _, masks = make_mask(row_id, frame)
        # Add a leading axis so the mask is appended as a single row.
        store.append(masks[np.newaxis])
        store.flush()
    return store

### Convert All Run-Length Encodings to Masks

In [None]:
# One mask store per submission file, in SUB_PATHS order.
arrs = [dfToNumpy(p) for p in SUB_PATHS]

In [None]:
# Average the masks of all submissions image by image into a new carray.
n = len(arrs)
final_bc = carray(
    np.zeros((0, 256, 1600, 4)),
    rootdir=INPUT / hex(random.randint(10000, 20000)),
)
for i in tqdm(range(len(arrs[0]))):
    # Slicing i:i+1 keeps the leading axis, so the mean is appended as one row.
    pred = np.stack([arr[i:i+1] for arr in arrs]).sum(axis=0) / n
    final_bc.append(pred)
    final_bc.flush()

In [None]:
# Inspect the shape of the averaged mask store.
final_bc.shape

In [None]:
# Unique image ids of the sample submission; the ImageId column was added to
# `df` by TestDataset.__init__.
# NOTE(review): these ids are later paired by position with rows of final_bc,
# whose order comes from the pivoted submission frames - confirm both orders match.
img_arr = df.ImageId.unique()

### Post-processing function:
* Get binary output according to threshold
* Enforce a minimum component size

In [None]:
import cv2

def post_process(probability, threshold, min_size):
    '''Binarise a predicted mask and drop its small connected components.

    probability: 2-D array of per-pixel scores.
    threshold: pixels strictly above this value count as foreground.
    min_size: a component is kept only if it has more than `min_size` pixels.

    Returns (predictions, num): a float32 0/1 mask with the same shape as
    the input, and the number of components that were kept.
    '''
    mask = cv2.threshold(probability, threshold, 1, cv2.THRESH_BINARY)[1]
    num_component, component = cv2.connectedComponents(mask.astype(np.uint8))
    # One pass over the label map yields every component's pixel count.
    sizes = np.bincount(component.ravel(), minlength=num_component)
    keep = sizes > min_size
    # Label 0 is the background and is never part of the prediction.
    keep[0] = False
    # Index the keep table with the label map, so the output takes its
    # shape from the input rather than from a fixed (256, 1600).
    predictions = keep[component].astype(np.float32)
    num = int(keep.sum())
    return predictions, num

In [None]:
# Binarise each averaged class mask and collect one submission row per
# (image, class) pair.
results = []
for i in tqdm(range(len(img_arr))):
    for c in range(4):
        pred = final_bc[i][:, :, c]
        final_bc.flush() # flush back from the memory to hard drive
        pred_b, _ = post_process(pred, 0.5, 0)
        results.append({
            "ImageId_ClassId": "%s_%s" % (img_arr[i], c + 1),
            "EncodedPixels": mask2rle(pred_b),
        })
        

### Output to CSV file

In [None]:
# Assemble the collected rows and write them out without the index column.
sub_df = pd.DataFrame(data=results)
sub_df.to_csv(path_or_buf="submission.csv", index=False)