In [None]:
import numpy as np 
import pandas as pd 
import os
import glob

import cv2 
import matplotlib.pyplot as plt

### Understanding directories

In [None]:
# Print the working directory, then list each folder on the way to the
# competition data so the directory layout is visible.
print('Current directory is', os.getcwd())

folders_to_inspect = [
    ('Previous folder contains --> ', '..'),
    ('Input folder contains --> ', '../input/'),
    ('Data competition folder contains --> ', '../input/uw-madison-gi-tract-image-segmentation'),
]
for label, folder in folders_to_inspect:
    print(label, os.listdir(folder))

### First approach to each file

#### 1. "train" folder

This folder contains cases. Each case corresponds to a patient, and each patient has undergone several radiotherapy sessions on different days. A different number of images is acquired in each session.

Showing some info associated to a random case. 

In [None]:
# keep only the first five entries of the train folder as a sample
train_dir = '../input/uw-madison-gi-tract-image-segmentation/train/'
cases = os.listdir(train_dir)[:5]
print('Some cases are --> ', cases)

In [None]:
# each sub-folder of a case is one session (day) of that patient
case36_dir = '../input/uw-madison-gi-tract-image-segmentation/train/case36'
days_case36 = os.listdir(case36_dir)
print('Sessions of case 36 are --> ', days_case36)

In [None]:
# list the image files stored for day 0 of case 36
scans_dir_day0_case36 = '../input/uw-madison-gi-tract-image-segmentation/train/case36/case36_day0/scans/'
slices_day0_case36 = os.listdir(scans_dir_day0_case36)
n_slices_day0_case36 = len(slices_day0_case36)
print('Some slices of day 0 of case 36 --> ', slices_day0_case36[:2])
print(f'There are {n_slices_day0_case36} slices.')

In [None]:
# showing a slice
ima_path = '../input/uw-madison-gi-tract-image-segmentation/train/case36/case36_day16/scans/slice_0028_266_266_1.50_1.50.png'

# cv2.imread defaults to IMREAD_COLOR, which converts the file to 8-bit BGR.
# IMREAD_UNCHANGED keeps the stored bit depth and channel count instead.
# NOTE(review): the dataset page describes the scans as 16-bit grayscale PNGs - confirm.
ima = cv2.imread(ima_path, cv2.IMREAD_UNCHANGED)
if ima is None:
    # cv2.imread signals a missing/unreadable file by returning None, not by raising
    raise FileNotFoundError(f'Could not read image: {ima_path}')

# normalization to [0, 1]; the floor of 1.0 avoids dividing by zero on an all-black slice
ima = ima / max(float(np.max(ima)), 1.0)
plt.imshow(ima, cmap='gray')
plt.axis('off')
plt.show()

#### 2. "train.csv" file

In [None]:
# load the annotation table shipped with the competition data
file_path = '../input/uw-madison-gi-tract-image-segmentation/train.csv'
df = pd.read_csv(file_path)
# preview the first rows
df.head()

In [None]:
# preview only the rows whose 'segmentation' value is not NaN
df[df['segmentation'].notna()].head()

"train.csv" file contains info related to:

* id: identifies each segmented region and links it to a particular study, day and slice
* class: indicates which organ is segmented. For each slice, three regions are always considered (stomach, large_bowel, small_bowel)
* segmentation: indicates which pixels are part of the segmented region in rle encoding format

#### 3. "sample_submission.csv" file

Indicates the structure that the submission file must have.

In [None]:
# read the sample submission only to find out which columns are expected
submission_file_path = '../input/uw-madison-gi-tract-image-segmentation/sample_submission.csv'
sub_df = pd.read_csv(submission_file_path)
cols = sub_df.columns

# cols.values is the array of column names
print('Submission file must contain ', cols.values)

### Going deeper with "train.csv" file. Exploratory data analysis (EDA).

In [None]:
# first ten rows of the table loaded from train.csv
df.head(10)

In [None]:
# summary statistics of the columns
df.describe()

In [None]:
# column dtypes and non-null counts
df.info()

#### 1. Checking NaN values

In [None]:
# number of NaN values in each column
df.isna().sum()

As can be seen, the dataframe has NaN values in the 'segmentation' column. Why could this be happening? Let's zoom in on a particular case.

In [None]:
# Selecting all the rows associated to one case and day.
# A prefix match with the trailing underscore is used instead of str.contains:
# contains() does an unanchored regex search, so it would also pick up ids such
# as 'case123_day200_...'.
case123_day20 = df[df['id'].str.startswith('case123_day20_')]
n_rows = len(case123_day20)  # one row per (slice, class) pair, not per slice
n_nan = case123_day20['segmentation'].isna().sum()

print(f'The number of rows is {n_rows} and the number of NaN is {n_nan}.')

Consequently, we can see that having a NaN value in a particular study does not mean that we have to discard the whole study. Most likely, a NaN value in a row means that there was no organ of interest to segment in that slice. In other words, a NaN value in the 'segmentation' column means that the organ indicated by the 'class' column does not appear in the slice identified by the 'id' column. Therefore, we can discard these rows.

For example, for 'id' = slice_0001_266_266_1, it is not possible to segment any of the target organs as they do not appear in the image. 

In [None]:
ima_path = '../input/uw-madison-gi-tract-image-segmentation/train/case123/case123_day20/scans/slice_0001_266_266_1.50_1.50.png'

# cv2.imread defaults to IMREAD_COLOR, which converts the file to 8-bit BGR.
# IMREAD_UNCHANGED keeps the stored bit depth and channel count instead.
# NOTE(review): the dataset page describes the scans as 16-bit grayscale PNGs - confirm.
ima = cv2.imread(ima_path, cv2.IMREAD_UNCHANGED)
if ima is None:
    # cv2.imread signals a missing/unreadable file by returning None, not by raising
    raise FileNotFoundError(f'Could not read image: {ima_path}')

# normalization to [0, 1]; the floor of 1.0 avoids dividing by zero on an all-black slice
ima = ima / max(float(np.max(ima)), 1.0)
plt.imshow(ima, cmap='gray')
plt.title('slice_0001_266_266_1')
plt.axis('off')
plt.show()

Since they carry no segmentation information, we can remove the NaN rows.

In [None]:
# keep only the annotated rows and renumber them from zero
has_segmentation = df['segmentation'].notna()
df = df.loc[has_segmentation].reset_index(drop=True)
df.head()

#### 2. Restructuring the dataframe
Let's extract all the info included in the 'id' and create new columns to store it.

In [None]:
# Split every id on '_' and pull out the numeric parts in a single pass.
ids = []     # list of lists with the '_'-separated tokens of each id
cases = []   # case number: first token without its 'case' prefix
days = []    # day of the study: second token without its 'day' prefix
slices = []  # slice number: fourth token
for identifier in df['id']:
    tokens = identifier.split('_')
    ids.append(tokens)
    cases.append(tokens[0][4:])
    days.append(tokens[1][3:])
    slices.append(tokens[3])

In [None]:
# attach the parsed id parts to the dataframe, one new column per part
new_columns = (('case', cases), ('day', days), ('slice', slices))
for column_name, column_values in new_columns:
    df[column_name] = pd.Series(column_values)

In [None]:
# check the 'case', 'day' and 'slice' columns that were just added
df.head()

#### 3. Adding info from image paths

To access the images, a path pattern is defined using asterisks, which match every file and folder at that level of the directory tree. We only accept paths that end with the *png* extension, because this is the image format. Then, using the *glob* library, we can list each image path.

In [None]:
# one wildcard per directory level below train/, then every png file
images_path = '../input/uw-madison-gi-tract-image-segmentation/train/*/*/*/*.png'
paths = glob.glob(images_path)
# preview the first three matches
paths[:3]

We can extract some information from each path, such as image size and pixel spacing. To store this data, we can define a new DataFrame. However, it makes no sense to keep information about the same images in two independent data structures, so we must merge them. To do that, both DataFrames need common columns, so we save the case, day and slice again.

In [None]:
# get info from path
# the third component from the end is split into its case and day tokens,
# and the file name into its '_'-separated fields
cases_days_info = []
slices_info = []
for full_path in paths:
    components = full_path.split('/')
    cases_days_info.append(components[-3].split('_'))
    slices_info.append(components[-1].split('_'))

column_names = ('case', 'day', 'slice', 'x_size', 'y_size',
                'px_spacing_x', 'px_spacing_y', 'image_path', 'image_name')
path_info = {column_name: [] for column_name in column_names}

for path, case_day_info, slice_info in zip(paths, cases_days_info, slices_info):
    record = {
        'case': case_day_info[0][4:],                # case number
        'day': case_day_info[1][3:],                 # day number
        'slice': slice_info[1],                      # slice number
        'x_size': int(slice_info[2]),                # image x size
        'y_size': int(slice_info[3]),                # image y size
        'px_spacing_x': float(slice_info[4]),        # pixel spacing x axis
        'px_spacing_y': float(slice_info[5][:-4]),   # pixel spacing y axis ('.png' removed)
        'image_path': path,
        'image_name': path.split('/')[-1],
    }
    for column_name, value in record.items():
        path_info[column_name].append(value)

In [None]:
# turn the dict of equally long lists into a table, one row per image path
path_info_df = pd.DataFrame(path_info)
path_info_df.head()

In [None]:
# merge path info with previous df
# inner join on the three keys both tables share
merge_keys = ['case', 'day', 'slice']
df = df.merge(path_info_df, how='inner', on=merge_keys)

In [None]:
# first twenty rows of the merged table
df.head(20)

#### 4. Extraction of information

In [None]:
n_cases = df['case'].nunique()
n_slices = df['image_path'].nunique()  # number of distinct slices

# number of distinct slices in which each organ appears, as [organ, count] pairs
slices_per_class = df.groupby('class')['image_path'].nunique()
n_slices_per_class = [[organ, int(count)] for organ, count in slices_per_class.items()]

# Distinct (x, y) combinations that actually occur together in the data.
# Zipping two independent .unique() arrays would pair unrelated values and
# silently drop entries whenever the two columns have a different number of
# unique values.
img_sizes = df[['x_size', 'y_size']].drop_duplicates().values.tolist()
px_spacing = df[['px_spacing_x', 'px_spacing_y']].drop_duplicates().values.tolist()

In [None]:
# report the summary values computed in the previous cell
print(f'There are {n_cases} cases.')
print(f'There are {n_slices} slices in total.')  # the noun was missing from this message
print('The number of appearances of each organ in the slices is:', n_slices_per_class)
print('Image sizes are: ', img_sizes)
print('Pixel spacings are: ', px_spacing)

### Visualization 

Let's work on patient 123.

In [None]:
# 'case' and 'day' hold strings, so the comparison values are strings too
is_case_123 = df['case'] == '123'
is_day_20 = df['day'] == '20'
df_123_20 = df[is_case_123 & is_day_20]
df_123_20.head(20)

In [None]:
# Plan for the visualization:
#   1. take each slice and read its image from the image_path column
#   2. overlay the segmentation of the current row
#   3. if the next row has the same slice number, overlay that region too;
#      otherwise move on to the next slice

# let's visualize slice 0075 and its associated segmentation
is_slice_0075 = df_123_20['slice'].eq('0075')
df_123_20_75 = df_123_20.loc[is_slice_0075]
df_123_20_75

In [None]:
def rle_to_mask(rle, height, width, label=1):
    """Decode a run-length-encoded string into a 2-D label mask.

    Parameters
    ----------
    rle : str
        Space-separated ``start length start length ...`` pairs.
    height, width : int
        Shape of the image the encoding refers to.
    label : int, optional
        Value written into the pixels covered by the runs (background stays 0).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(height, width)``.

    Raises
    ------
    ValueError
        If the string does not hold an even number of tokens.
    """
    tokens = np.array(rle.split(), dtype=int)
    if tokens.size % 2:
        raise ValueError('RLE string must contain start/length pairs')

    # Start positions in the competition's RLE format are one-indexed (per its
    # evaluation description), so shift them to numpy's zero-based indexing.
    starts = tokens[0::2] - 1
    lengths = tokens[1::2]

    flat = np.zeros(height * width)
    # write each run directly; there is no need to visit every pixel of the image
    for start, length in zip(starts, lengths):
        flat[start:start + length] = label
    return flat.reshape(height, width)


regions = df_123_20_75['segmentation']
masks = []

# from rle to numpy
if len(regions) > 0:
    # All rows of one slice point to the same image, so the size is read once
    # from the first row instead of hard-coding 266 x 266.
    # assumes y_size is the number of rows and x_size the number of columns - TODO confirm
    height = int(df_123_20_75['y_size'].iloc[0])
    width = int(df_123_20_75['x_size'].iloc[0])

    # each region gets its own label: 1 for the first row, 2 for the second, ...
    masks = [rle_to_mask(region, height, width, label=reg_id + 1)
             for reg_id, region in enumerate(regions.values)]

In [None]:
# combining all the regions in one image
# NOTE(review): each mask holds its own label (1, 2, ...), so wherever two regions
# overlap the sum is a value that may coincide with another region's label.
full_mask = np.sum(masks, axis=0)

In [None]:
# for visualization purposes a true/false mask is generated:
# the `.mask` attribute of each masked array is True exactly where the region's
# label (its position in `masks`, counted from 1) is present
visu_masks = [
    np.ma.masked_where(label_mask == region_label, label_mask)
    for region_label, label_mask in enumerate(masks, start=1)
]

In [None]:
# Every row of the selected slice points to the same image, so take the first
# one by position; a hard-coded index label such as 10 only exists by accident.
path_ima = df_123_20_75['image_path'].iloc[0]

# cv2.imread defaults to IMREAD_COLOR, which converts the file to 8-bit BGR.
# IMREAD_UNCHANGED keeps the stored bit depth and channel count instead.
# NOTE(review): the dataset page describes the scans as 16-bit grayscale PNGs - confirm.
ima = cv2.imread(path_ima, cv2.IMREAD_UNCHANGED)
if ima is None:
    # cv2.imread signals a missing/unreadable file by returning None, not by raising
    raise FileNotFoundError(f'Could not read image: {path_ima}')

# the floor of 1.0 avoids dividing by zero on an all-black slice
ima = ima / max(float(np.max(ima)), 1.0)
plt.imshow(ima, cmap='gray')

# Overlay every region, however many the slice has. The background of each
# overlay is masked out so that only the segmented pixels tint the image, and a
# shared colour scale gives each label its own colour.
for visu_mask in visu_masks:
    region = visu_mask.mask  # True where the organ is present
    overlay = np.ma.masked_where(~region, visu_mask.data)
    plt.imshow(overlay, alpha=0.5, vmin=0, vmax=len(visu_masks))

plt.axis('off')
plt.show()

In [None]:
# sanity check: distinct values found in the boolean mask of the second region
np.unique(visu_masks[1].mask)

In [None]:
# number of regions decoded for this slice
len(masks)

In [None]:
# draw the first region alone, hiding the background (pixels equal to 0)
first_region = masks[0]
first_region_only = np.ma.masked_where(first_region == 0, first_region)
plt.imshow(first_region_only)