In [1]:
# import torch
import pandas as pd
import numpy as np
import pycm
# from tqdm import tqdm_notebook as tqdm
# from tqdm import tqdm
from matplotlib import pyplot as plt

%matplotlib inline

In [2]:
import psutil
import os
def show_ram_usage():
    py = psutil.Process(os.getpid())
    print('RAM usage: {} GB'.format(py.memory_info()[0]/2. ** 30))

show_ram_usage()

RAM usage: 0.09289169311523438 GB


In [3]:
def run_length_encode(component):
    component = component.T.flatten()
    start = np.where(component[1:] > component[:-1])[0]+1
    end = np.where(component[:-1] > component[1:])[0]+1
    length = end-start

    rle = []
    for i in range(len(length)):
        if i == 0:
            rle.extend([start[0], length[0]])
        else:
            rle.extend([start[i]-end[i-1], length[i]])

    rle = ' '.join([str(r) for r in rle])
    return rle

def run_length_decode(rle, height=1024, width=1024, fill_value=1):

    component = np.zeros((height, width), np.float32)
    component = component.reshape(-1)
    rle = np.array([int(s) for s in rle.strip().split(' ')])
    rle = rle.reshape(-1, 2)

    start = 0
    for index, length in rle:
        start = start+index
        end = start+length
        component[start: end] = fill_value
        start = end

    component = component.reshape(width, height).T
    return component



In [4]:
def compute_dice(im1, im2, empty_score=1.0):
    """
    Computes the Dice coefficient, a measure of set similarity.
    Parameters
    ----------
    im1 : array-like, bool
        Any array of arbitrary size. If not boolean, will be converted.
    im2 : array-like, bool
        Any other array of identical size. If not boolean, will be converted.
    Returns
    -------
    dice : float
        Dice coefficient as a float on range [0,1].
        Maximum similarity = 1
        No similarity = 0
        Both are empty (sum eq to zero) = empty_score

    Notes
    -----
    The order of inputs for `dice` is irrelevant. The result will be
    identical if `im1` and `im2` are switched.
    """
    im1 = np.asarray(im1).astype(np.bool)
    im2 = np.asarray(im2).astype(np.bool)

    if im1.shape != im2.shape:
        raise ValueError("Shape mismatch: im1 and im2 must have the same shape.")

    im_sum = im1.sum() + im2.sum()
    if im_sum == 0:
        return empty_score

    # Compute Dice coefficient
    intersection = np.logical_and(im1, im2)

    return 2. * intersection.sum() / im_sum



### Ensemble

In [6]:
# from sklearn.model_selection import train_test_split, StratifiedKFold
# df = pd.read_csv('stage_2_train.csv')
# df = df.drop_duplicates('ImageId')
# df = df.sample(frac=1, random_state=69)
# fold = 1
# total_folds = 5
# kfold = StratifiedKFold(total_folds, shuffle=True, random_state=69)
# train_idx, val_idx = list(kfold.split(df["ImageId"], df["has_mask"]))[fold]
# train_df, val_df = df.iloc[train_idx], df.iloc[val_idx]
# df = val_df.copy()


# res34 = pd.read_csv('stage2_train_preds/res34_train_model.csv')
# dn121 = pd.read_csv('stage2_train_preds/unetdn121_train_ckpt30.csv')
# inres = pd.read_csv('stage2_train_preds/inresv2_train_ckpt30.csv')

# res34 = res34.loc[df.index]
# dn121 = dn121.loc[df.index]
# inres = inres.loc[df.index]
# df.head()

In [5]:
res34 = pd.read_csv('stage2_test_preds/res34_test_model.csv')
dn121 = pd.read_csv('stage2_test_preds/unetdn121_test_ckpt30.csv')
inres = pd.read_csv('stage2_test_preds/inresv2_test_ckpt30.csv')

In [6]:
def get_mask(rle):
    if rle!='-1':
        mask = run_length_decode(rle)
        return mask
    else:
        return np.zeros([1024, 1024], dtype='uint8')


res34_masks = []
dn121_masks = []
inres_masks = []

for i in range(len(res34)):
    rs = res34.iloc[i]['EncodedPixels']
    dn = dn121.iloc[i]['EncodedPixels']
    ir = inres.iloc[i]['EncodedPixels']    
    res34_masks.append(get_mask(rs))
    dn121_masks.append(get_mask(dn))
    inres_masks.append(get_mask(ir))
    
show_ram_usage()

# res34_masks = np.array(res34_masks)
# dn121_masks = np.array(dn121_masks)
# inres_masks = np.array(inres_masks)

RAM usage: 12.974700927734375 GB


In [10]:
# gt_masks = []
# for i in range(len(res34)):
#     gt = df.iloc[i]['EncodedPixels']
#     gt_masks.append(get_mask(gt))
# show_ram_usage()


RAM usage: 13.296497344970703 GB


In [17]:
compute_dice(res34_masks, dn121_masks) # all train: 0.7805, val: 0.7385

0.7385570897539164

In [18]:
compute_dice(res34_masks, inres_masks) # all train: 0.78299, val: 0.744

0.7442366452071161

In [20]:
compute_dice(inres_masks, dn121_masks) # all train: 0.80, val:0.77,  test: 0.77

0.7715407769952831

As the stats of test matches with the val, we gotta select what's the best for val

In [21]:
print(compute_dice(gt_masks, res34_masks))
print(compute_dice(gt_masks, dn121_masks))
print(compute_dice(gt_masks, inres_masks))

0.5268914118979333
0.5335597924444545
0.5346357042829183


In [9]:
inter_masks = []
union_masks = []
for i in range(len(res34_masks)):
#     intersection = np.logical_and(res34_masks[i], dn121_masks[i], inres_masks[i])
    intersection = res34_masks[i] * dn121_masks[i] * inres_masks[i]
    union = ((res34_masks[i] + dn121_masks[i] + inres_masks[i]) > 0).astype('uint8')
#     union = np.logical_or(res34_masks[i], dn121_masks[i], inres_masks[i])
    inter_masks.append(intersection)
    union_masks.append(union)
#np.logical_and take only 2 args

In [63]:
# i = 0
# plt.imshow(res34_masks[i]); plt.show()
# plt.imshow(dn121_masks[i]); plt.show()
# plt.imshow(inres_masks[i]); plt.show()
# plt.imshow(inter_masks[i]); plt.show()
# plt.imshow(union_masks[i]); plt.show()

In [10]:
show_ram_usage()

RAM usage: 21.86492919921875 GB


In [47]:
print(compute_dice(gt_masks, inter_masks)) # 0.515 val dice
print(compute_dice(gt_masks, union_masks)) # 0.542 val dice

0.5150624553729506
0.5421760361790672


Union scores better than intersection 

In [7]:
mv_masks = [] # pixel level majority vote i.e, if atleast two masks say a pixel is 1 then it is 1, else 0
for i in range(len(res34_masks)):
#     intersection = np.logical_and(res34_masks[i], dn121_masks[i], inres_masks[i])
#     intersection = res34_masks[i] * dn121_masks[i] * inres_masks[i]
#     union = ((res34_masks[i] + dn121_masks[i] + inres_masks[i]) > 0).astype('uint8')
#     union = np.logical_or(res34_masks[i], dn121_masks[i], inres_masks[i])
    mv = ((res34_masks[i] + dn121_masks[i] + inres_masks[i]) >= 2).astype('uint8')
    mv_masks.append(mv)
#np.logical_and take only 2 args

In [12]:
compute_dice(gt_masks, mv_masks) #DAYUMMMMMM 0.5492 val dice

0.5492432168229313

In [11]:
# union_rle = []
# for mask in union_masks:
#     rle = '-1'
#     if len(np.unique(mask))>1:
#         rle = run_length_encode(mask)
#     union_rle.append(rle)
    
# inter_rle = []
# for mask in inter_masks:
#     rle = '-1'
#     if len(np.unique(mask))>1:
#         rle = run_length_encode(mask)
#     inter_rle.append(rle)

In [8]:
mv_rle = []
for mask in mv_masks:
    rle = '-1'
    if len(np.unique(mask))>1:
        rle = run_length_encode(mask)
    mv_rle.append(rle)

In [11]:
sub = pd.DataFrame()
sub['ImageId'] = res34['ImageId']
sub['EncodedPixels'] = mv_rle

In [12]:
sub.head()

Unnamed: 0,ImageId,EncodedPixels
0,ID_0011fe81e,-1
1,ID_9ca06e9b9,-1
2,ID_6f00d6ce6,-1
3,ID_9258110b0,-1
4,ID_fa01c9546,297260 10 39 13 959 13 34 25 949 17 29 33 943 ...


In [13]:
# sub.to_csv('res34_dn121_inres_mv_ensemble.csv', index=False)

In [49]:
df = pd.DataFrame()
df['ImageId'] = res34['ImageId']
df['res34'] = res34['EncodedPixels']
df['dn121'] = dn121['EncodedPixels']
df['inres'] = inres['EncodedPixels']
# df['gt'] = pd.read_csv('stage_2_train.csv')['EncodedPixels']
df['gt'] = val_df['EncodedPixels']

In [50]:
df.head()

Unnamed: 0,ImageId,res34,dn121,inres,gt
1477,1.2.276.0.7230010.3.1.4.8323329.11619.15178752...,-1,232645 4 1018 7 1016 8 1016 8 1015 8 1015 9 10...,230597 1 1020 4 1020 4 1019 4 1020 4 1018 6 10...,-1
875,1.2.276.0.7230010.3.1.4.8323329.11800.15178752...,-1,-1,-1,-1
3713,1.2.276.0.7230010.3.1.4.8323329.3061.151787517...,-1,-1,-1,-1
11897,1.2.276.0.7230010.3.1.4.8323329.3189.151787517...,-1,-1,-1,-1
5463,1.2.276.0.7230010.3.1.4.8323329.4362.151787518...,590002 10 1012 13 1009 16 1006 18 1004 20 1003...,-1,578745 2 1020 5 1018 6 1014 12 1006 19 1005 19...,593079 1 1020 5 1016 9 1014 11 1012 13 1011 14...


In [51]:
TNs = df.query('res34 == "-1" and dn121 == "-1" and inres == "-1"')
TPs = df.query('res34 != "-1" and dn121 != "-1" and inres != "-1"')
Ts = pd.concat([TPs, TNs])
diff = df[df.apply(lambda x: x[0] not in Ts['ImageId'].tolist(), axis=1)]

In [52]:
df.shape, TNs.shape, TPs.shape, Ts.shape, diff.shape

((2410, 5), (1851, 5), (351, 5), (2202, 5), (208, 5))

### Mask visualization

In [53]:
diff.head()

Unnamed: 0,ImageId,res34,dn121,inres,gt
1477,1.2.276.0.7230010.3.1.4.8323329.11619.15178752...,-1,232645 4 1018 7 1016 8 1016 8 1015 8 1015 9 10...,230597 1 1020 4 1020 4 1019 4 1020 4 1018 6 10...,-1
5463,1.2.276.0.7230010.3.1.4.8323329.4362.151787518...,590002 10 1012 13 1009 16 1006 18 1004 20 1003...,-1,578745 2 1020 5 1018 6 1014 12 1006 19 1005 19...,593079 1 1020 5 1016 9 1014 11 1012 13 1011 14...
11575,1.2.276.0.7230010.3.1.4.8323329.11474.15178752...,-1,495715 7 1015 11 1011 14 1009 15 1007 18 1005 ...,-1,-1
12950,1.2.276.0.7230010.3.1.4.8323329.11296.15178752...,399496 4 1017 8 1014 11 1011 14 1009 15 1009 1...,-1,-1,426110 2 1021 3 1020 4 1019 4 1019 5 1018 5 10...
4277,1.2.276.0.7230010.3.1.4.8323329.3999.151787518...,-1,556187 4 1016 9 1014 10 1012 13 1009 15 1008 1...,-1,556182 17 1005 24 999 27 996 28 995 29 994 30 ...


RAM usage: 66.44263076782227 GB


In [27]:
# gt_masks = np.array(gt_masks)
# res34_masks = np.array(res34_masks)
# dn121_masks = np.array(dn121_masks)
# inres_masks = np.array(inres_masks)

KeyboardInterrupt: 

In [28]:
show_ram_usage()

RAM usage: 64.61175155639648 GB


In [29]:
diff_idx = diff.index.tolist()
dice = 0
for idx in diff_idx:
    dice += compute_dice(gt_masks[idx], res34_masks[idx])
print(dice / len(diff_idx))

0.2748269962263053


In [30]:
dice = 0
for idx in diff_idx:
    dice += compute_dice(gt_masks[idx], dn121_masks[idx])
print(dice / len(diff_idx))

0.3388629685424114


In [31]:
dice = 0
for idx in diff_idx:
    dice += compute_dice(gt_masks[idx], inres_masks[idx])
print(dice / len(diff_idx))

0.372467691762776


In [16]:
# i = 0
i += 1
row = diff.iloc[i]
image_id = row['ImageId']
path = f'npy_test_stage2/{image_id}.npy'
img = np.load(path)
idx = row.name
rs = res34_masks[idx]
dn = dn121_masks[idx]
ir = inres_masks[idx]

plt.figure(figsize=(20, 20))
plt.subplot(1, 3, 1)
plt.imshow(img[:, :, 0], cmap='bone')
plt.imshow(rs, alpha=0.2)
plt.subplot(1, 3, 2)
plt.imshow(img[:, :, 0], cmap='bone')
plt.imshow(dn, alpha=0.2)
plt.subplot(1, 3, 3)
plt.imshow(img[:, :, 0], cmap='bone')
plt.imshow(ir, alpha=0.2)
plt.show()


IndexError: single positional indexer is out-of-bounds

### Ideas of ensembling

Test on train/val set:
* Take the union of the three preds
* Take the intersection of the three preds.


In [4]:
train2 = pd.read_csv('stage_2_train.csv')
test1 = pd.read_csv('test_model.csv')
#test1['preds']
train2['isTest'] = train2.apply(lambda x: 1 if x[0] in test1.ImageId.tolist() else 0, axis=1) # bool for whether the image belongs to stage1 test set or not

train2['isTest'].value_counts()

train2 = train2.query('isTest == 1') # get the test images out

train2.query('EncodedPixels == "-1"').shape,  test1.query('EncodedPixels == "-1"').shape, 

train2.head()

test1 = test1.rename(columns={'EncodedPixels': 'preds'}) # renaming

df = pd.merge(train2, test1, on="ImageId") # merge ground truth and predictions in one dataframe

df.head() # EncodedPixels is ground truth, preds is predicted mask


Unnamed: 0,ImageId,EncodedPixels,isTest,preds
0,1.2.276.0.7230010.3.1.4.8323329.6904.151787520...,-1,1,-1
1,1.2.276.0.7230010.3.1.4.8323329.6340.151787519...,-1,1,-1
2,1.2.276.0.7230010.3.1.4.8323329.6873.151787520...,-1,1,-1
3,1.2.276.0.7230010.3.1.4.8323329.5994.151787519...,-1,1,-1
4,1.2.276.0.7230010.3.1.4.8323329.6840.151787520...,-1,1,-1


In [7]:

def get_mask(rle):
    if rle!='-1':
        mask = run_length_decode(rle)
        return mask
    else:
        return np.zeros([1024, 1024], dtype='uint8')

mask = get_mask('-1')
mask.shape, mask.dtype

labels = []
predictions = []
for itr, row in df.iterrows():
    gt = row['EncodedPixels']
    pred = row['preds']
    labels.append(get_mask(gt))
    predictions.append(get_mask(pred))

labels = np.array(labels)
predictions = np.array(predictions)
labels.shape, predictions.shape

labels = torch.from_numpy(labels)
predictions = torch.from_numpy(predictions)


RAM usage: 10.872844696044922 GB


In [11]:
dice, dice_neg, dice_pos, num_neg, num_pos = metric(predictions, labels, threshold=0.5, reduction='none')

In [12]:
show_ram_usage()

RAM usage: 10.877708435058594 GB


In [13]:
dice

0.8542532920837402

In [14]:
dice_neg

0.9565619230270386

In [18]:
dice_pos

0.4725361764431

### further analysis

In [100]:
# bin_df = pd.read_csv('bin_cls_test_model.csv')
# bin_df = pd.read_csv('bin_oldckpt20test.csv')
bin_df = pd.read_csv('test_ckpt_po1.csv')

In [101]:
bin_df.head()

Unnamed: 0,ImageId,EncodedPixels,label
0,1.2.276.0.7230010.3.1.4.8323329.6067.151787519...,-1,0
1,1.2.276.0.7230010.3.1.4.8323329.6214.151787519...,-1,0
2,1.2.276.0.7230010.3.1.4.8323329.6177.151787519...,-1,1
3,1.2.276.0.7230010.3.1.4.8323329.6655.151787519...,-1,0
4,1.2.276.0.7230010.3.1.4.8323329.6359.151787519...,-1,0


In [102]:
df.head()

Unnamed: 0,ImageId,EncodedPixels,isTest,preds
0,1.2.276.0.7230010.3.1.4.8323329.6904.151787520...,-1,1,-1
1,1.2.276.0.7230010.3.1.4.8323329.6340.151787519...,-1,1,-1
2,1.2.276.0.7230010.3.1.4.8323329.6873.151787520...,-1,1,-1
3,1.2.276.0.7230010.3.1.4.8323329.5994.151787519...,-1,1,-1
4,1.2.276.0.7230010.3.1.4.8323329.6840.151787520...,-1,1,-1


In [103]:
df_cls = df.copy()
df_cls['EncodedPixels'] = df_cls['EncodedPixels'].apply(lambda x: 0 if x == '-1' else 1)
df_cls['preds'] = df_cls['preds'].apply(lambda x: 0 if x == '-1' else 1)
df_cls['bin_preds'] = bin_df['label']

In [104]:
df_cls.head()

Unnamed: 0,ImageId,EncodedPixels,isTest,preds,bin_preds
0,1.2.276.0.7230010.3.1.4.8323329.6904.151787520...,0,1,0,0
1,1.2.276.0.7230010.3.1.4.8323329.6340.151787519...,0,1,0,0
2,1.2.276.0.7230010.3.1.4.8323329.6873.151787520...,0,1,0,1
3,1.2.276.0.7230010.3.1.4.8323329.5994.151787519...,0,1,0,0
4,1.2.276.0.7230010.3.1.4.8323329.6840.151787520...,0,1,0,0


In [105]:
cm = pycm.ConfusionMatrix(df_cls['EncodedPixels'].values, df_cls['preds'].values)

In [106]:
cm.print_matrix()

Predict    0          1          
Actual
0          1035       47         

1          75         215        




In [107]:
bin_cm = pycm.ConfusionMatrix(df_cls['EncodedPixels'].values, df_cls['bin_preds'].values)
bin_cm.print_matrix()

Predict   0         1         
Actual
0         853       229       

1         236       54        


