# **Please upvote my notebook if you liked the contents**

## Load the libraries

In [None]:
from pathlib import Path
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from skimage import color
import seaborn as sns
from glob import glob
from PIL import Image

## Load the data

* CSV data available is arranged in the following format.
    1. ID
    2. Class
    3. Segmentation
* Here we take a peek at some samples from the data.
* Some values in the data are NaN, so the data needs to be cleaned before use.
* The image sizes are embedded in the file names, so we have to extract them.

In [None]:
# Locate the competition dataset and read the training annotations.
root_dir = Path('../input/uw-madison-gi-tract-image-segmentation')
train_dir = root_dir.joinpath('train')
df_train = pd.read_csv(root_dir.joinpath('train.csv'))

# Show the first few rows as the cell output.
df_train.head()

## Checking number of data instances
* We have the same number of samples for
    1. large bowel
    2. small bowel
    3. stomach
* No data cleaning is needed since we have the same number of samples for all the classes
* Note: there is no need to use any class weighting.

In [None]:
# Count how many rows of train.csv belong to each class.
df_train.loc[:, 'class'].value_counts()

## Creating metadata for training
* Why we have to create extra metadata
    1. we need to extract cases to match with the segmentation masks
    2. Need to extract the days from file names
    3. Slice number is also embedded in the filename
    
**Example**

Sample file name from the csv : case123_day20_slice_0002
* case number : 123
* day : 20
* slice : 0002

Sample scan file name : ../input/uw-madison-gi-tract-image-segmentation/train/case101/case101_day20/scans/slice_0001_266_266_1.50_1.50.png

**Parameters needed**
* case number
* day
* slice number

**So we need to extract all these details from the train.csv file to generate the file name**

In [None]:
# Generate the list of scan images.
# Sorting makes the row order (and therefore any index-based sampling done
# on this frame) independent of the order in which the filesystem lists files.
images_list = sorted(glob('../input/uw-madison-gi-tract-image-segmentation/train/*/*/scans/*.png'))

# One row per image; every other column is derived from the path.
images_metadata = pd.DataFrame({'Path': images_list})

# Read the path components relative to the END of the path
# (.../<case>_<day>/scans/<file>.png) so the parsing keeps working no matter
# how many directories precede the dataset root.
scan_paths = images_metadata['Path'].map(Path)
images_metadata['CaseNum_Day'] = scan_paths.map(lambda p: p.parent.parent.name)
images_metadata['SliceNum'] = scan_paths.map(lambda p: p.name)

# 'case101_day20' -> Case 101, Day 20 (strip the 'case' / 'day' prefixes).
case_split = images_metadata['CaseNum_Day'].str.split('_', n=2, expand=True)
images_metadata['Case'] = case_split[0].str[4:].astype(int)
images_metadata['Day'] = case_split[1].str[3:].astype(int)

# 'slice_0001_266_266_1.50_1.50.png' -> slice number and image size.
fileName_split = images_metadata['SliceNum'].str.split('_', n=6, expand=True)
images_metadata['Slice'] = fileName_split[1].astype(int)
# Per the competition's data description the two size fields in the file name
# are width first, then height, so field 3 is the height and field 2 the width.
images_metadata['Height'] = fileName_split[3].astype(int)
images_metadata['Width'] = fileName_split[2].astype(int)

images_metadata.head()

### Unique values from the metadata

* There are 85 cases with 35 different dates
* 4 types of heights and widths

In [None]:
# Report how many distinct values each extracted metadata column takes.
for label, column in (('Unique case numbers ', 'Case'),
                      ('Unique Days ', 'Day'),
                      ('Unique Heights ', 'Height'),
                      ('Unique Widths ', 'Width')):
    print(label, images_metadata[column].nunique(dropna=False))

## Visualizing the MRI samples
1. Pixel values in the MRI scans are not in the range 0-255
2. So we need to normalize the values to visualize them properly
3. We can use np.interp to normalize the values

In [None]:
# Plot 12 randomly chosen scans on a 4x3 grid.
# The grid is created up front so every image gets its own Axes; the previous
# plt.subplots() + plt.subplot(4, 3, i + 1) combination first created a single
# full-figure Axes and then relied on pyplot to deal with the overlap.
fig, axes = plt.subplots(4, 3, figsize=(12, 16))
for ax in axes.flat:
    # Pick a random row; .loc works with a position here because the frame
    # was built with the default 0..n-1 index.
    index = np.random.randint(0, images_metadata.shape[0])
    image = np.array(Image.open(images_metadata.loc[index, 'Path']))

    title = (images_metadata.loc[index, 'CaseNum_Day'] +
             '_Slice_' + str(images_metadata.loc[index, 'Slice']))
    ax.set_title(title)
    # Stretch the raw intensities linearly onto 0-255 for display.
    ax.imshow(np.interp(image, [np.min(image), np.max(image)], [0, 255]))
plt.show()

## Visualizing the Masks
* The masks provided are in the RLE format encoded in the train.csv file
* we need to parse the format to create the masks
* [Notebook1](https://www.kaggle.com/code/abhishek123maurya/1-visualization-and-decoding-rle/) and [Notebook2](https://www.kaggle.com/code/subinek/2-understanding-plotting-rle-bounding-boxes/) helped in providing the essential code.
* I would recommend saving [Notebook2](https://www.kaggle.com/code/subinek/2-understanding-plotting-rle-bounding-boxes/) for your future challenges and notebooks

In [None]:
#Helper functions

# Keep only the train.csv rows whose 'segmentation' value is present.
mask_encoding = df_train.loc[df_train['segmentation'].notna()]

# Row labels of those rows, as a plain Python list.
mask_index = mask_encoding.index.tolist()

#function for getting pixel location
def get_pixel_loc(rle_string, img_shape):
    """Decode a run-length-encoded mask into a list of pixel coordinates.

    Parameters
    ----------
    rle_string : str
        Whitespace-separated ``start length start length ...`` pairs.
        Starts are treated as 1-based positions in the row-major flattened
        image (the Kaggle RLE convention). A non-string value (e.g. NaN for
        a missing mask) or an empty string yields an empty list.
    img_shape : sequence of int
        ``(height, width)`` of the image, e.g. ``image.shape``.

    Returns
    -------
    list of tuple
        One ``(x, y)`` pair, i.e. (column, row), per masked pixel.
    """
    if not isinstance(rle_string, str):
        return []

    rle = [int(token) for token in rle_string.split()]
    width = img_shape[1]
    p_loc = []     #   Pixel Locations

    for start, length in zip(rle[0::2], rle[1::2]):
        first = start - 1   # RLE positions are 1-based; array positions are 0-based
        for p_pos in range(first, first + length):
            # Column and row must both be derived from the row length (width);
            # dividing by the height only works for square images.
            p_loc.append((p_pos % width, p_pos // width))
    return p_loc

#function for getting the mask
def get_mask(mask, img_shape):
    """Rasterise a list of pixel coordinates into a binary mask array.

    Parameters
    ----------
    mask : sequence of tuple
        ``(x, y)`` pairs, i.e. (column, row), of the pixels to switch on.
    img_shape : sequence of int
        ``(height, width)`` of the mask to create.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``img_shape`` holding 1 at the listed pixels
        and 0 everywhere else.
    """
    # Work on the transposed canvas so it can be indexed as canvas[x, y].
    canvas = np.zeros(img_shape).T
    # With no pixels, tuple(zip(*mask)) is the empty index (), which would
    # select the WHOLE array and turn every pixel on, so skip the assignment.
    if len(mask) > 0:
        canvas[tuple(zip(*mask))] = 1
    return canvas.T

#applying the mask
def apply_mask(image, mask, img_shape):
    """Stack a scan and its mask into a 3-channel array for display.

    Parameters
    ----------
    image : numpy.ndarray
        2-D scan; it is scaled by its maximum before stacking.
    mask : sequence of tuple
        ``(x, y)`` pixel coordinates, as accepted by ``get_mask``.
    img_shape : sequence of int
        ``(height, width)`` passed through to ``get_mask``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(height, width, 3)``: channel 0 is the scaled image,
        channels 1 and 2 are both the binary mask.
    """
    peak = image.max()
    # An all-zero slice has max 0; dividing by it would fill the channel with
    # NaN, so fall back to an all-zero float channel instead.
    image = image / peak if peak != 0 else np.zeros(image.shape)
    # Build the mask once and reuse it for both overlay channels.
    mask_layer = get_mask(mask, img_shape)
    return np.dstack((image, mask_layer, mask_layer))

In [None]:
#Plotting random masks from the dataset
for i in range(5):
    # randint's upper bound is exclusive, so pass len(mask_index) itself;
    # subtracting 1 made the last annotated row impossible to sample.
    index = mask_index[np.random.randint(0, len(mask_index))]
    curr_id = mask_encoding.loc[index, 'id']
    class_of_scan = mask_encoding.loc[index, 'class']

    # The id looks like 'case123_day20_slice_0002': split it into its parts
    # and look up the matching scan in the image metadata.
    splits = curr_id.split('_')
    x = images_metadata[(images_metadata['Case']==int(splits[0][4:]))
                      &(images_metadata['Day']==int(splits[1][3:]))
                      &(images_metadata['Slice']==int(splits[3]))]

    image = np.array(Image.open(x['Path'].values[0]))
    k = image.shape
    # Decode the RLE string into pixel coordinates for this image size.
    rle_string = mask_encoding.loc[index, 'segmentation']
    p_loc = get_pixel_loc(rle_string, k)

    # One row per sample: raw scan, mask alone, mask overlaid on the scan.
    fig, ax = plt.subplots(1,3, figsize=(12,16))
    ax[0].set_title('Image')
    ax[0].imshow(image)

    ax[1].set_title('Mask')
    ax[1].imshow(get_mask(p_loc, k))

    ax[2].set_title(f'{class_of_scan} Segmented')
    ax[2].imshow(apply_mask(image, p_loc, k))
    plt.show()



**Thank you for reviewing my notebook**