In [None]:
import os
import numpy as np 
import pandas as pd 
from collections import Counter
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import seaborn as sns
from glob import glob
import time
import random
import tensorflow as tf
import torch
import glob
from tqdm.notebook import tqdm
tqdm.pandas()
import cv2
from joblib import Parallel, delayed
import shutil

## **Creating the Dataset**

In [None]:
# Read the training CSV file and take a first look at it.
train = pd.read_csv('../input/uw-madison-gi-tract-image-segmentation/train.csv')
print(train.columns)
# Rows per segmentation class, first as text and then as a bar chart.
print(train['class'].value_counts())
train['class'].value_counts().plot.bar()
print(train.shape)

In [None]:
#Adding the columns

def from_id(x):
    """Add `case`, `day` and `slice` entries parsed from ``x['id']``.

    The id is split on underscores. The first token (minus the text
    ``case``) and the second token (minus the text ``day``) are stored as
    ints; the last token is stored unchanged as the slice, so it stays a
    string.

    x: mapping-like row (e.g. a DataFrame row) with an ``'id'`` entry.
    Returns the same object `x`, modified in place.
    """
    tokens = x['id'].split('_')
    parsed = {
        'case': int(tokens[0].replace('case', '')),
        'day': int(tokens[1].replace('day', '')),
        'slice': tokens[-1],
    }
    # Write the fields only after every one of them has parsed successfully.
    for key, value in parsed.items():
        x[key] = value
    return x

def from_path(x):
    """Add case/day/slice/height/width entries parsed from ``x['image_paths']``.

    The path is expected to end in
    ``.../case<case>/case<case>_day<day>/<folder>/slice_<slice>_<width>_<height>_...``.
    All extracted values are kept as strings, exactly as they appear in the
    path.

    x: mapping-like row (e.g. a DataFrame row) with an ``'image_paths'`` entry.
    Returns the same object `x`, modified in place.
    """
    parts = x['image_paths'].split('/')
    name_tokens = parts[-1].split('_')
    day_tokens = parts[-3].split('_')

    x['case'] = parts[-4].replace('case', '')
    x['slice'] = name_tokens[1]
    # Per the competition's data description the two integers in the file
    # name are slice width then height, so height is the SECOND of them.
    # Reading them the other way round transposes the mask shape of every
    # non-square scan.
    x['height'] = name_tokens[3]
    x['width'] = name_tokens[2]
    x['day'] = day_tokens[1].replace('day', '')

    return x

In [None]:
# Load the training CSV and derive case/day/slice columns from every id.
train = pd.read_csv('../input/uw-madison-gi-tract-image-segmentation/train.csv')
train = train.progress_apply(from_id, axis=1)
# `head` has to be called: printing the bare attribute shows the bound
# method's repr rather than the first rows.
print(train.head())

In [None]:
# Persist the augmented frame; a later cell reads it back from this file.
train.to_csv('train_.csv',index=False)

In [None]:
# Collect every file four directory levels below train/ into a one-column frame.
# `glob` here is the module (the later `import glob` in the first cell wins
# over `from glob import glob`).
path = glob.glob('../input/uw-madison-gi-tract-image-segmentation/train/*/*/*/*')
path_df = pd.DataFrame(path,columns=['image_paths'])
print(path_df)

In [None]:
# Derive case/day/slice/height/width columns from every image path.
path_df = path_df.progress_apply(from_path, axis=1)
# `head` has to be called: printing the bare attribute shows the bound
# method's repr rather than the first rows.
print(path_df.head())

In [None]:
# Persist the parsed path table; a later cell reads it back from this file.
path_df.to_csv('path.csv',index=False)

In [None]:
# Compare the columns of both frames before merging them.
print(train.columns)
print(path_df.columns)

In [None]:
# Reload both frames from the CSV files written above.
# NOTE(review): presumably this round trip is what gives the case/day/slice
# key columns matching inferred dtypes in both frames — confirm before removing.
train = pd.read_csv('./train_.csv')
path_df = pd.read_csv('./path.csv')

In [None]:
# Merge both datasets: keep only rows whose (case, day, slice) key appears in both.
train__ = train.merge(path_df,how='inner', on=['case','day','slice'])
print(train__.head(6))

In [None]:
# Save the merged frame as a CSV file.
train__.to_csv('train_final.csv',index=False)

In [None]:
# Sanity-check the merged frame: size, missing values per column, column names.
print(train__.shape)
print(train__.isna().sum())
print(train__.columns)

## **Let's Visualize the mask manually**

In [None]:
# Pick one random row that has a segmentation and unpack its RLE string.
image_df = pd.read_csv('../input/gi-tract/train_final.csv')
s = image_df[~image_df['segmentation'].isna()].sample()
paths = s.image_paths.to_list()
print(paths)
# `s` is rebound twice below: first to a one-element list holding the RLE
# string, then to that string's space-separated tokens converted to ints.
s = s['segmentation'].tolist()
print(s)
s = list(map(int,s[0].split(' ')))
print(s)

In [None]:
# The token list alternates "start, length, start, length, ...": positions
# 0, 2, 4, ... (counting from 0) are run starts and positions 1, 3, 5, ...
# are run lengths. Slicing expresses that directly instead of using a list
# comprehension only for its append side effects.
pixel, count = s[0::2], s[1::2]
print(pixel)
print(count)

In [None]:
# Expand every (start, length) pair from the two lists above into the explicit
# list of pixel numbers it covers: start, start+1, ..., start+length-1.
rle_pixels = [list(range(pixel[i],pixel[i]+count[i])) for i in range(0, len(pixel))]
print('rle_pixels\n:', rle_pixels[:10])

In [None]:
# Flatten the list of per-run lists into one list of pixel numbers.
# A nested comprehension is linear in the number of pixels, whereas
# sum(list_of_lists, []) re-copies the growing result for every run.
rle_mask_pixels = [px for run in rle_pixels for px in run]
print('rle mask pixels:\n', rle_mask_pixels[:10])

In [None]:
# Load and display the scan that belongs to the sampled segmentation.
p = paths[0]
# NOTE(review): load_images is defined in a later cell (under "Visualize the
# Images"), so on a fresh kernel this cell only works after that cell has run.
image = load_images(p)
plt.imshow(image, cmap = 'bone')

In [None]:
# show_images() draws `mask` with plt.imshow, which needs an image-shaped
# array; the flat list of pixel numbers is therefore painted into one first.
overlay = np.zeros(image.size, dtype=np.uint8)
# RLE pixel numbers are 1-based (rle2mask subtracts 1 in the same way).
overlay[np.asarray(rle_mask_pixels, dtype=int) - 1] = 255
show_images(image, overlay.reshape(image.shape))

# **Creating the Mask**

In [None]:
#Define the RLE (run-length encoding) functions
# ref: https://www.kaggle.com/paulorzp/run-length-encode-and-decode
def rle2mask(mask_rle, shape):
    '''
    Decode a run-length string into a binary mask.

    mask_rle: whitespace-separated "start length start length ..." string,
              with 1-based starts
    shape: (height, width) of the array to return
    Returns a uint8 numpy array of that shape: 1 - mask, 0 - background
    '''
    tokens = np.asarray(mask_rle.split(), dtype=int)
    flat = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    # Starts are shifted by one to become 0-based indices into the flat array.
    for begin, run in zip(tokens[0::2] - 1, tokens[1::2]):
        flat[begin:begin + run] = 1
    # Row-major reshape, matching the direction in which the runs were counted.
    return flat.reshape(shape)

# ref.: https://www.kaggle.com/stainsby/fast-tested-rle
def mask2rle(img):
    """
    Encode a binary mask as a run-length string.

    img: numpy array, 1 - mask, 0 - background
    Returns a space-separated "start length start length ..." string, where
    each start is the 1-based position (in flattened order) at which a run
    of mask pixels begins and length is the number of pixels in that run.
    """
    # Pad with a background pixel at the front and at the end so that runs
    # touching either border still produce a value change.
    padded = np.concatenate([[0], img.flatten(), [0]])

    # 1-based positions at which the value differs from the previous pixel:
    # these alternate between run starts and the positions just past a run.
    runs = np.flatnonzero(padded[1:] != padded[:-1]) + 1
    # Turn every "just past the run" position into the run's length.
    runs[1::2] -= runs[::2]
    return ' '.join(map(str, runs))

In [None]:
#Convert id to mask
def idtomask(idz, frame=None):
    """Build the 3-channel mask for one image id.

    idz: value of the ``id`` column to look up.
    frame: DataFrame with ``id``, ``class``, ``segmentation``, ``height`` and
        ``width`` columns. Defaults to the notebook-level ``df``; pass it
        explicitly to avoid depending on whatever ``df`` currently holds
        (later cells reload it and drop the ``class`` column).

    Returns a uint8 array of shape (height, width, 3). Channel 0 is
    large_bowel, 1 is small_bowel, 2 is stomach; a channel stays all zero
    when the class has no row or its segmentation is missing.
    Raises IndexError if no row has the given id.
    """
    if frame is None:
        frame = df
    rows = frame[frame['id'] == idz]
    dims = rows[['height', 'width']].iloc[0]
    shape = (int(dims.height), int(dims.width), 3)
    mask = np.zeros(shape, dtype=np.uint8)
    for channel, class_name in enumerate(['large_bowel', 'small_bowel', 'stomach']):
        class_rows = rows[rows['class'] == class_name]
        if class_rows.empty:
            continue
        # Take the first row explicitly: squeeze() only yields a scalar when
        # there is exactly one row for the class.
        rle = class_rows['segmentation'].iloc[0]
        if not pd.isna(rle):
            mask[..., channel] = rle2mask(rle, shape[:2])
    return mask

#Convert rgb to gray
#def rgbtogray(mask):
    #pad_mask = 

# **Visualize the Images**

In [None]:
def load_images(image):
    """Read a scan from disk and min-max scale it to an 8-bit grayscale array.

    image: path of the image file.
    Returns a 2-D uint8 array in which the darkest input pixel maps to 0 and
    the brightest to 255; an image with a single constant value maps to all
    zeros.
    Raises FileNotFoundError if OpenCV cannot read the file.
    """
    # NOTE(review): flag 0 (IMREAD_GRAYSCALE) makes OpenCV return 8-bit data;
    # if the scans are stored with more depth, consider cv2.IMREAD_ANYDEPTH.
    pixels = cv2.imread(image, 0)
    if pixels is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError('cv2 could not read image: {!r}'.format(image))
    pixels = pixels.astype('float32')
    lowest = pixels.min()
    value_range = pixels.max() - lowest
    if value_range == 0:
        # Constant image: avoid dividing by zero.
        return np.zeros(pixels.shape, dtype=np.uint8)
    # Min-max normalisation divides by the range; subtracting the range
    # instead yields only non-positive values, which wrap around in uint8.
    scaled = (pixels - lowest) / value_range * 255
    return scaled.astype(np.uint8)

def show_images(image, mask=None):
    """Plot `image` after CLAHE contrast enhancement, optionally with a mask on top.

    image: array accepted by ``cv2.CLAHE.apply``.
    mask: optional array drawn semi-transparently over the image; when given,
        a legend for the three organ classes is added as well.
    Draws on the current matplotlib axes and hides the axis.
    """
    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
    plt.imshow(equalizer.apply(image), cmap='bone')

    if mask is None:
        plt.axis('off')
        return

    plt.imshow(mask, alpha=0.5)
    # One legend patch per class, in the same order as the labels.
    legend_colors = [(0.667,0.0,0.0), (0.0,0.667,0.0), (0.0,0.0,0.667)]
    handles = []
    for rgb in legend_colors:
        handles.append(Rectangle((0,0),1,1, color=rgb))
    labels = [ "Large Bowel", "Small Bowel", "Stomach"]
    plt.legend(handles,labels)
    plt.axis('off')

In [None]:
# Load the table of images and masks from a CSV file.
# `df` is the notebook-level frame that idtomask() and save_mask() read.
df = pd.read_csv('../input/gi-tract/train_mask.csv')
print(df.columns)

In [None]:
# Show one randomly chosen image.
# NOTE(review): `random` is already imported in the first cell; this import is redundant.
import random
# NOTE(review): this rebinds `path_df` (a DataFrame in earlier cells) to a
# plain list of paths, and `path` to a one-element list.
path_df = df.image_paths.tolist()
path = random.sample(path_df,1)
print(path)
# Read directly with OpenCV (flag 0 = grayscale) rather than via load_images().
img = cv2.imread(path[0],0)
show_images(img)

In [None]:
# Show row*col randomly chosen images that have a segmentation, with their masks.
row=1; col=4
plt.figure(figsize=(5*col,5*row))
for i, id_ in enumerate(df[~df.segmentation.isna()].sample(frac=1.0)['id'].unique()[:row*col]):
    img = load_images(df[df['id']==id_].image_paths.iloc[0])
    mask = idtomask(id_)*255
    # enumerate() already advances i, so no manual increment is needed.
    plt.subplot(row, col, i+1)
    show_images(img, mask=mask)
# Lay the grid out once, after every subplot exists, not once per panel.
plt.tight_layout()

# **Mask Data**

In [None]:
# Print the frame that the mask-saving cells below iterate over.
print(df)

In [None]:
#Function to save the mask
def save_mask(id_):
    """Write the mask of one image id to disk as both a PNG and a .npy file.

    The output paths mirror the image path from the notebook-level ``df``,
    with the leading ``../input/`` replaced by ``./png/`` and ``./np/``
    respectively (and ``.png`` replaced by ``.npy`` for the array file).
    Missing folders are created.

    id_: value of the ``id`` column to look up.
    Returns the path of the PNG file.
    """
    rows = df[df['id']==id_]
    mask = idtomask(id_)*255
    source = rows.image_paths.iloc[0]

    # PNG copy
    png_target = source.replace('../input/','./png/')
    os.makedirs(png_target.rsplit('/',1)[0], exist_ok=True)
    cv2.imwrite(png_target, mask, [cv2.IMWRITE_PNG_COMPRESSION, 1])

    # NumPy copy
    npy_target = source.replace('../input/','./np/').replace('.png','.npy')
    os.makedirs(npy_target.rsplit('/',1)[0], exist_ok=True)
    np.save(npy_target, mask)

    return png_target

In [None]:
# Save the masks of every unique id in the working directory.
# joblib runs save_mask on a thread pool (n_jobs=-1: use all available
# workers); tqdm shows progress as ids are handed out.
ids = df['id'].unique()
_ = Parallel(n_jobs=-1, backend='threading')(delayed(save_mask)(id_)\
                                             for id_ in tqdm(ids, total=len(ids)))

In [None]:
# List the contents of the ./np output folder (shell command).
!ls np

In [None]:
# Check one saved mask from the ./png folder against its image.
i = 250
img = load_images(df.image_paths.iloc[i]) 
# Same path rewrite that save_mask() uses for the PNG copy.
mask_path = df['image_paths'].iloc[i].replace('../input/','./png/')
mask = cv2.imread(mask_path, cv2.IMREAD_UNCHANGED)
plt.figure(figsize=(5,5))
show_images(img, mask=mask)

In [None]:
# Check one saved mask from the ./np folder against its image.
i = 250
img = load_images(df.image_paths.iloc[i])
# Same path rewrite that save_mask() uses for the .npy copy.
mask_path = df['image_paths'].iloc[i].replace('../input/','./np/').replace('.png','.npy')
mask = np.load(mask_path)
plt.figure(figsize=(5,5))
show_images(img, mask=mask)

In [None]:
# Save a new CSV file in which mask_path points at the PNG masks.
df = df.drop(['mask_path'],axis=1)
# regex=False: '../input' is a literal path prefix. Treated as a regular
# expression, each '.' would match any character, and the default for
# `regex` differs between pandas versions.
df['mask_path'] = df['image_paths'].str.replace('../input', '../input/gi-tract/png', regex=False)
print(df['mask_path'].head(n=6))
df.to_csv('train.csv', index=False)

In [None]:
# Compress the PNG masks: creates ./png.zip containing the
# 'uw-madison-gi-tract-image-segmentation' folder, taken relative to ./png.
shutil.make_archive('./png',
                    'zip',
                    './png',
                    'uw-madison-gi-tract-image-segmentation')

In [None]:
# Compress the .npy masks: creates ./np.zip containing the
# 'uw-madison-gi-tract-image-segmentation' folder, taken relative to ./np.
shutil.make_archive('./np',
                    'zip',
                    './np',
                    'uw-madison-gi-tract-image-segmentation')

# **Check the newly created Masks**

In [None]:
# Reload the mask table from the input dataset (replaces the `df` modified above).
df = pd.read_csv('../input/gi-tract/train_mask.csv')
df.head(n=6)

In [None]:
# Count rows without a mask path and rows without a segmentation.
print(df.mask_path.isna().sum())
print(df.segmentation.isna().sum())

In [None]:
# Display one randomly chosen mask file.
# `img` is first a one-element list of paths, then the decoded image.
img = df['mask_path'].sample(1).tolist()
print(img)
# Flag 1 asks OpenCV for a 3-channel colour image.
img = cv2.imread(img[0],1)
#print(img)
#show_images(img)
plt.imshow(img)

In [None]:
#Overlap the two images
def images(img,mask=None):
    """Display the image file at path `img`, optionally with a mask file on top.

    img: path of the image file; read with cv2.IMREAD_UNCHANGED.
    mask: optional path of a mask file, read the same way and drawn
        semi-transparently over the image.
    Draws on the current matplotlib axes and hides the axis.
    """
    base = cv2.imread(img, cv2.IMREAD_UNCHANGED)
    plt.imshow(base, cmap='bone')

    if mask is None:
        plt.axis('off')
        return

    plt.imshow(cv2.imread(mask, cv2.IMREAD_UNCHANGED), alpha=0.6)
    plt.axis('off')

In [None]:
# Overlay one image with its mask.
i = 250
# images() reads both files itself, so it must be given the two paths;
# handing it already-decoded arrays makes its cv2.imread calls fail.
img = df.image_paths.iloc[i]
mask = df.mask_path.iloc[i]
images(img,mask=mask)

# **Creating final DataFrame**

In [None]:
# Start again from the mask table to build the final per-image frame.
df = pd.read_csv('../input/gi-tract/train_mask.csv')

In [None]:
# Columns available before aggregating.
print(df.columns)

In [None]:
# Path of the .npy copy of every mask: same path with the extension swapped.
# regex=False: '.png' must be matched literally. As a regular expression the
# leading '.' matches any character, so a '/png' directory component (as in
# the mask paths built earlier in this notebook) would be rewritten as well.
df['mask_path_npy']  = df['mask_path'].str.replace('.png', '.npy', regex=False)

In [None]:
# Replace missing segmentations with '' so that every row has a string,
# then record the character length of each RLE string (0 for missing ones).
df['segmentation'] = df.segmentation.fillna('')
df['rle_length'] = df['segmentation'].map(len)

In [None]:
# One row per id, with that id's segmentation strings collected into a list.
df2 = df.groupby(['id'])['segmentation'].agg(list).to_frame().reset_index()
print(df2)

In [None]:
# One row per id, with the total RLE string length over that id's rows.
# The aggregation is named as the string 'sum' rather than passing Python's
# builtin sum, which recent pandas versions warn about.
df1 = df.groupby(['id'])['rle_length'].agg('sum').to_frame().reset_index()
df1.head()

In [None]:
# Combine the per-id segmentation lists with the per-id total RLE length.
df2 = df2.merge(df1, on = ['id'])
df2.head()

In [None]:
# Columns of the per-row frame before the per-class columns are dropped.
df.columns

In [None]:
# Drop the per-class columns; their per-id aggregates live in df2.
# NOTE(review): not re-runnable — a second run fails because the columns are gone.
df = df.drop(['segmentation','rle_length','class'], axis = 1)
#Select 1 row from each group
df = df.groupby(['id']).head(1).reset_index(drop=True)
df.head()

In [None]:
# Size of the frame after keeping one row per id.
df.shape

In [None]:
# Attach the per-id segmentation list and total RLE length to the per-id rows.
df = df.merge(df2, on = ['id'])
df.head()

In [None]:
# Remove the faulty items: every row whose id contains one of these two
# case/day markers is dropped.
# NOTE(review): why these two are considered faulty is not recorded here — document the reason.
fault1 = 'case7_day0'
fault2 = 'case81_day30'
df = df[ ~df['id'].str.contains(fault1) & ~df['id'].str.contains(fault2)].reset_index(drop=True)
print(df.shape)

In [None]:
# Flag ids whose total RLE length is 0, i.e. all of their segmentation strings are empty.
df['empty'] = (df['rle_length'] == 0)

In [None]:
# Write the final per-id training table.
df.to_csv('final_train.csv', index=False)

# **Check Images**

In [None]:
# Check one row of the final table: overlay its mask on its image.
i = 90
df = pd.read_csv('../input/gi-tract/final_train.csv')
# images() takes file paths, so the two path strings are passed as they are.
img = df.image_paths.iloc[i]
mask = df.mask_path.iloc[i]
print(mask)
images(img,mask)

In [None]:
# Overlay image and mask for 50 consecutive rows (positions 64..113) in a 10x5 grid.
plt.figure(figsize=(20, 20))
for i, j in enumerate(range(64,114)):
    img = df.image_paths.iloc[j]
    mask = df.mask_path.iloc[j]
    # enumerate() already advances i, so no manual increment is needed.
    plt.subplot(10,5,i+1)
    images(img, mask)
# Lay the grid out once, after every subplot exists, not once per panel.
plt.tight_layout()