This notebook tries to explore the provided data for the UW-Madison GI Tract Image Segmentation competition.

# Preparation

Import the necessary libraries and define some constants.

In [None]:
import numpy as np
import pandas as pd
import os
from PIL import Image
import matplotlib.pyplot as plt
import glob

In [None]:
# Root of the competition data, relative to the notebook's working directory.
DATA_DIR = os.path.join('..', 'input', 'uw-madison-gi-tract-image-segmentation')
# The training scans live in the 'train' sub-directory of DATA_DIR.
TRAIN_DIR = os.path.join(DATA_DIR, 'train')

# train.csv

Let's take a look at train.csv.

In [None]:
# Read train.csv from the data directory into a DataFrame.
train_csv_path = os.path.join(DATA_DIR, 'train.csv')
train_df = pd.read_csv(filepath_or_buffer=train_csv_path)

train_df

## ID

Each ID can be divided into case, day, and slice.
The [extract](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.extract.html) method below takes out the
capture groups denoted by '(...)' in the specified regex
into DataFrame columns.

In [None]:
# The three capture groups of the regex become columns 0, 1 and 2,
# which are then renamed to 'case', 'day' and 'slice'.
case_day_slice_df = (
    train_df['id']
    .str.extract(r'(case\d\d*)_(day\d\d*)_(slice_\d\d*)')
    .rename(columns={0: 'case', 1: 'day', 2: 'slice'})
)

case_day_slice_df

### Case

How many unique cases?

In [None]:
# Distinct case identifiers, in order of first appearance in train.csv.
unique_cases = case_day_slice_df['case'].unique()

print("Number of unique cases:", len(unique_cases))
unique_cases

### Day

Check number of days for each case.

In [None]:
# For every case, count how many distinct 'day' values occur.
days_per_case_df = (
    case_day_slice_df[['case', 'day']]
    .groupby('case')
    .nunique()
)

days_per_case_df

Number of days for a case:
* minimum 1 day
* maximum 6 days
* average 3.2 days

In [None]:
# Summary statistics of the number of distinct days per case.
days_per_case_df['day'].describe()

In [None]:
# One point per case (x axis) showing its number of distinct days.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(days_per_case_df['day'])
ax.set_title("Number of days per case")
ax.set_xlabel("case")
ax.set_ylabel("number of days")
# Case names are long, so draw the tick labels vertically.
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlim(0, len(days_per_case_df) - 1)
plt.show()

### Slice

Check number of slices for each day.

In [None]:
# For every (case, day) pair, count the distinct values of the remaining
# column ('slice').
slices_per_day_df = (
    case_day_slice_df
    .groupby(['case', 'day'])
    .nunique()
)

slices_per_day_df

Most days consist of 144 slices.
Some have 80.

In [None]:
# How many (case, day) pairs have each distinct slice count.
slices_per_day_df['slice'].value_counts()

Check number of slices for each case.

In [None]:
# Add up the per-day slice counts of each case.
slices_per_case_df = (
    slices_per_day_df
    .reset_index()[['case', 'slice']]
    .groupby('case')
    .sum()
)

slices_per_case_df

Number of slices for a case:
* Minimum 144
* Maximum 864
* Average 452.9

In [None]:
# Summary statistics of the total number of slices per case.
slices_per_case_df['slice'].describe()

In [None]:
# One point per case (x axis) showing its total number of slices.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(slices_per_case_df['slice'])
ax.set_title("Number of slices per case")
ax.set_xlabel("case")
ax.set_ylabel("number of slices")
# Case names are long, so draw the tick labels vertically.
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlim(0, len(slices_per_case_df) - 1)
plt.show()

## Class

There are 3 classes: 'large_bowel', 'small_bowel', and 'stomach'.
Each of them has 38,496 rows.

In [None]:
# Number of rows in train.csv for each class label.
train_df['class'].value_counts()

## Segmentation

There are 81,575 NaN values and 33,913 non-NaN values.

In [None]:
# Print the row count, then how many segmentations are missing / present.
for label, value in [
        ("Total:  ", len(train_df['segmentation'])),
        ("NaN:    ", train_df['segmentation'].isna().sum()),
        ("Not NaN:", train_df['segmentation'].notna().sum())]:
    print(label, value)

Some not NaN examples are as follows:

In [None]:
# Keep only the rows whose 'segmentation' value is present.
train_df[train_df['segmentation'].notna()]

# Segmentation per case/class

Check what percentage of segmentations is NaN or not NaN for each case/class.
First, make a DataFrame that consists of case, class, and flags for whether the segmentation is NaN or not NaN.

In [None]:
# Put case and class side by side, then add one boolean flag column for
# "segmentation is missing" and one for "segmentation is present".
case_class_seg_df = pd.concat(
    [case_day_slice_df['case'], train_df['class']], axis=1
).assign(
    seg_isna=train_df['segmentation'].isna(),
    seg_notna=train_df['segmentation'].notna(),
)

case_class_seg_df

From the DataFrame made at the previous step, take the columns of 'case', 'seg_isna', and 'seg_notna'. Then, calculate 'total' and 'notna_ratio'.

In [None]:
# Summing the boolean flags per case yields the NaN / not-NaN row counts;
# 'total' and 'notna_ratio' are derived from those two counts.
seg_na_per_case_df = (
    case_class_seg_df[['case', 'seg_isna', 'seg_notna']]
    .groupby('case')
    .sum()
    .assign(total=lambda df: df['seg_isna'] + df['seg_notna'])
    .assign(notna_ratio=lambda df: df['seg_notna'] / df['total'])
)

seg_na_per_case_df

The ratio of not NaN for each case:
* Minimum: 0.168
* Maximum: 0.432
* Average: 0.296

In [None]:
# Summary statistics of the per-case ratio of present segmentations.
seg_na_per_case_df['notna_ratio'].describe()

In [None]:
# One point per case (x axis) showing its ratio of present segmentations.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(seg_na_per_case_df['notna_ratio'])
ax.set_title("Ratio of not NaN segments per case")
ax.set_xlabel("case")
ax.set_ylabel("Ratio of not NaN segments")
# Case names are long, so draw the tick labels vertically.
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlim(0, len(seg_na_per_case_df) - 1)
plt.show()

Check the ratio of not NaN segmentations for each class.
* large_bowel: 0.366
* small_bowel: 0.291
* stomach: 0.224

In [None]:
# Same aggregation as for the cases, but grouped by class: sum the boolean
# flags, then derive 'total' and 'notna_ratio' from the two counts.
seg_na_per_class_df = (
    case_class_seg_df[['class', 'seg_isna', 'seg_notna']]
    .groupby('class')
    .sum()
    .assign(total=lambda df: df['seg_isna'] + df['seg_notna'])
    .assign(notna_ratio=lambda df: df['seg_notna'] / df['total'])
)

seg_na_per_class_df

In [None]:
# One point per class (x axis) showing its ratio of present segmentations.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(seg_na_per_class_df['notna_ratio'])
ax.set_title("Ratio of not NaN segments per class")
ax.set_xlabel("class")
ax.set_ylabel("Ratio of not NaN segments")
plt.show()

# Segmentation per case/day

Check segmentation is NaN or not NaN for each day of each case.
Make a DataFrame which consists of case, day, and flags for whether the segmentation is NaN or not NaN.

In [None]:
# Build the frame with .assign(), which returns a new DataFrame.  Writing
# columns onto the bare `case_day_slice_df[['case', 'day']]` selection
# instead would raise SettingWithCopyWarning, because pandas cannot tell
# whether the selection is a view of case_day_slice_df or a copy.
case_day_seg_df = case_day_slice_df[['case', 'day']].assign(
    seg_isna=train_df['segmentation'].isna(),
    seg_notna=train_df['segmentation'].notna(),
)

case_day_seg_df

For the DataFrame above, sum up 'seg_isna' and 'seg_notna' for each day of each case. Then, calculate their 'total' and 'notna_ratio'.

In [None]:
# Summing the boolean flags per (case, day) yields the NaN / not-NaN row
# counts; 'total' and 'notna_ratio' are derived from those two counts.
seg_na_per_case_day_df = (
    case_day_seg_df
    .groupby(['case', 'day'])
    .sum()
    .assign(total=lambda df: df['seg_isna'] + df['seg_notna'])
    .assign(notna_ratio=lambda df: df['seg_notna'] / df['total'])
)

seg_na_per_case_day_df

The ratio of not NaN segmentation for each day of each case:
* Minimum: 0.130
* Maximum: 0.533
* Average: 0.297

In [None]:
# Summary statistics of the per-(case, day) ratio of present segmentations.
seg_na_per_case_day_df['notna_ratio'].describe()

# Train directory

The directory structure looks like this:

<pre>
train<br>
  |<br>
  +-- case123<br>
  |      |<br>
  |      +-- case123_day20<br>
  |      |         |<br>
  |      |         +-- scans<br>
  |      |               |<br>
  |      |               +-- slice_0001_266_266_1.50_1.50.png<br>
  |      |               |<br>
  |      |               +-- slice_0002_266_266_1.50_1.50.png<br>
  |      |               |<br>
        . . . . .
</pre>

In [None]:
# Entries directly below the train directory.  os.listdir() returns them in
# arbitrary order, so the five names previewed here may vary between runs.
train_contents = os.listdir(TRAIN_DIR)

print("Number of train contents:", len(train_contents))
train_contents[:5]

In [None]:
# Look inside one case directory ('case123') as an example.
case123_dir = os.path.join(TRAIN_DIR, 'case123')
case123_contents = os.listdir(case123_dir)

print("Number of 'case123' contents:", len(case123_contents))
case123_contents

In [None]:
# Descend one level further, into the 'case123_day20' directory.
case123_day20_dir = os.path.join(case123_dir, 'case123_day20')
case123_day20_contents = os.listdir(case123_day20_dir)

print("Number of 'case123_day20' contents:", len(case123_day20_contents))
case123_day20_contents

In [None]:
# List the 'scans' directory.  The names are sorted so that the first and
# last five entries printed below are deterministic.
case123_day20_scans_dir = os.path.join(case123_day20_dir, 'scans')
case123_day20_scans_contents = sorted(os.listdir(case123_day20_scans_dir))

print("Number of 'scans' contents:", len(case123_day20_scans_contents))
print(case123_day20_scans_contents[:5])
print(case123_day20_scans_contents[-5:])

# File name

Find all the '*.png' files under the 'train' directory.

In [None]:
# '**' combined with recursive=True matches any directory depth below
# TRAIN_DIR.
file_path_pattern = os.path.join(TRAIN_DIR, '**', '*.png')
# glob returns matches in arbitrary, filesystem-dependent order.  Sorting
# makes the preview below and the row order of every frame derived from
# file_paths reproducible, like the sorted() directory listing above.
file_paths = sorted(glob.glob(file_path_pattern, recursive=True))

print("Number of matched files:", len(file_paths))
file_paths[:5]

Make a DataFrame for the found file paths.

In [None]:
# Single-column frame holding one file path per row.
file_path_df = pd.DataFrame(file_paths, columns=["file_path"])

file_path_df

Extract information from the file paths by using the [extract](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.extract.html) method.

In [None]:
# Parse case, day, slice, image size and pixel spacing out of every path.
# All pattern fragments are raw strings: in a normal string literal '\d' and
# '\.' are invalid escape sequences, which Python reports as a
# DeprecationWarning / SyntaxWarning.  The resulting regex is unchanged.
file_name_info_df = (
    file_path_df['file_path']
    .str.extract(
        r'(case\d\d*)_(day\d\d*)'
        r'/scans'
        r'/(slice_\d\d*)_(\d\d*_\d\d*)_(\d\d*\.\d\d*_\d\d*\.\d\d*)')
    .rename(columns={
        0: 'case', 1: 'day', 2: 'slice', 3: 'size', 4: 'spacing'})
)

file_name_info_df

For image file size, there are 4 types: 266x266, 360x310, 276x276, and 234x234.

In [None]:
# Number of files for each 'size' string parsed from the file names.
file_name_info_df['size'].value_counts()

For image file spacing, there are 2 types, 1.50x1.50 and 1.63x1.63.

In [None]:
# Number of files for each 'spacing' string parsed from the file names.
file_name_info_df['spacing'].value_counts()

# Image

Load and draw an image. It looks dark.

In [None]:
# Open slice 65 of case123 / day20 as the sample image used in the
# following cells.
case123_day20_slice_0065_path = os.path.join(
    case123_day20_scans_dir, 'slice_0065_266_266_1.50_1.50.png')
sample_img = Image.open(case123_day20_slice_0065_path)

sample_img

Check the minimum and maximum value in the image.
They are 0 and 13,452.

In [None]:
# Convert the PIL image to a numpy array and report its shape and value range.
sample_np = np.array(sample_img)

print("Shape:  ", sample_np.shape)
print("Minimum:", sample_np.min())
print("Maximum:", sample_np.max())

Draw distribution of values in the image.

In [None]:
# Histogram over all pixel values of the sample image (100 bins).
fig, ax = plt.subplots(figsize=(12, 5))
ax.hist(sample_np.ravel(), bins=100)
ax.set_title('Distribution of values in the image')
ax.set_xlabel("value")
ax.set_ylabel("count")
plt.show()

Rescale the values to the range 0..255 to make the image lighter.

In [None]:
# Min-max scale the pixel values to 0..255 in float32, then truncate to
# uint8 so that PIL can build a displayable image from the array.
sample_np_f = sample_np.astype(np.float32)
sample_img_min = sample_np_f.min()
sample_img_max = sample_np_f.max()
sample_img_range = sample_img_max - sample_img_min
sample_np_ui8 = (
    (sample_np_f - sample_img_min) / sample_img_range * 255.0
).astype(np.uint8)
sample_scaled_img = Image.fromarray(sample_np_ui8)

sample_scaled_img

# Mask

Check the segmentation mask.

In [None]:
# NOTE(review): 194 is a hard-coded row label.  Presumably it is a row that
# belongs to the sample image loaded above (case123_day20, slice 65) --
# confirm against train_df.loc[194, ['id', 'class']].
sample_seg = train_df.at[194, 'segmentation']

sample_seg

Split the string into a list of numbers, then take a look.

In [None]:
# Split the segmentation string on whitespace; the items are still strings.
sample_seg_list = sample_seg.split()

print("Length:", len(sample_seg_list))
sample_seg_list[:10]

The odd numbered numbers (1st, 3rd, 5th, ...) show the start position of the mask, and even numbered numbers (2nd, 4th, 6th, ...) indicate the length of the mask from the start.

In [None]:
# Even list positions (0, 2, 4, ...) hold the run starts and odd positions
# (1, 3, 5, ...) hold the run lengths.
start_list = sample_seg_list[0::2]
length_list = sample_seg_list[1::2]

print("Length:", len(start_list), len(length_list))
print(start_list[:5])
print(length_list[:5])

Make a mask image by using pairs of the starts and the lengths.

In [None]:
# Decode the run-length encoding into a flat mask of 266 * 266 pixels (the
# size given in the sample image's file name), then fold it into rows.
sample_mask = np.zeros(266 * 266, dtype=np.uint8)

for start, length in zip(start_list, length_list):
    # The competition's evaluation page describes RLE start positions as
    # 1-based ('1 3' covers pixels 1, 2, 3), so subtract one to obtain the
    # 0-based array index.  Using the start unchanged shifts the whole mask
    # by one pixel.
    start = int(start) - 1
    length = int(length)
    sample_mask[start:start + length] = 255

sample_mask = np.reshape(sample_mask, (266, 266))
sample_mask_img = Image.fromarray(sample_mask)

sample_mask_img

Overlay the mask with the image. 

In [None]:
# Image.blend() needs both images in the same mode, so convert the scan and
# the mask to RGBA before mixing them half and half.
sample_scaled_rgba_img = sample_scaled_img.convert("RGBA")
sample_mask_rgba_img = sample_mask_img.convert("RGBA")
sample_overlay_img = Image.blend(
    im1=sample_scaled_rgba_img,
    im2=sample_mask_rgba_img,
    alpha=0.5)

sample_overlay_img

# Min/Max Value of Images

What are the minimum/maximum values of the images? All images have the same min/max values of 0 and 13,452.

In [None]:
def get_min_max_value_in(file_path):
    """Return the smallest and largest pixel value of one image file.

    Args:
        file_path: Path of an image file that PIL can open.

    Returns:
        pd.Series with the entries 'min_value' and 'max_value'.
    """
    # Read the pixels of the file that was just opened.  Converting the
    # notebook-level `sample_img` here instead would report the sample
    # image's values for every path.  The context manager closes the file
    # handle, which matters because this function is applied to every file
    # path in the training set.
    with Image.open(file_path) as img:
        img_np = np.array(img)
    return pd.Series({
        "min_value": np.min(img_np),
        "max_value": np.max(img_np)})

# The helper returns a Series per path, so apply() expands the results into
# a DataFrame with the columns 'min_value' and 'max_value'.
min_max_value_df = (
    file_path_df['file_path']
    .apply(get_min_max_value_in)
)

min_max_value_df

In [None]:
# How often each (min_value, max_value) combination occurs.
min_max_value_df.value_counts()

# Slices And Masks

There must be some relation between slices and mask areas.

In [None]:
def calc_mask_length(segmentation):
    """Return the total number of masked pixels in a segmentation string.

    The string holds whitespace-separated numbers in (start, length) pairs;
    only the lengths (every second item) are added up.  An empty string
    yields 0.
    """
    tokens = segmentation.split()
    return sum(int(run_length) for run_length in tokens[1::2])

# Missing segmentations become empty strings, which count as length 0.
segment_len_ser = (
    train_df['segmentation']
    .fillna('')
    .apply(calc_mask_length)
    .rename('mask_len')
)

segment_len_ser[segment_len_ser > 0]

Combine the calculated mask length with id and class.

In [None]:
# Attach the mask length (column 'mask_len') to the id and class columns.
# Both sides share train_df's index.
case_slice_mask_len_df = train_df[['id', 'class']].join(segment_len_ser)

case_slice_mask_len_df

Unstack the DataFrame to make large_bowel, small_bowel, and stomach rows to columns.

In [None]:
# Pivot the class level of the (id, class) index into columns, giving one
# row per id.  This cell rebinds case_slice_mask_len_df, so it can only be
# re-run after re-running the previous cell.
case_slice_mask_len_df = (
    case_slice_mask_len_df
    .set_index(['id', 'class'])
    .unstack()
    .reset_index()
)
# Replace the two-level column labels produced by unstack() with flat names.
case_slice_mask_len_df.columns = [
    'id', 'large_bowel', 'small_bowel', 'stomach']

case_slice_mask_len_df

Extract slice numbers from 'id's.

In [None]:
# Reduce every id to the digits that follow 'slice_', convert them to int,
# and keep only the slice number plus the three per-class mask lengths.
slice_mask_len_df = (
    case_slice_mask_len_df
    .assign(
        slice_no=lambda df: df['id']
        .str.replace(pat=r'.*slice_(\d\d*)', repl=r'\1', regex=True)
        .astype(int))
    [['slice_no', 'large_bowel', 'small_bowel', 'stomach']]
)

slice_mask_len_df

Sum up the mask length for each class by slice number.

In [None]:
# Total mask length of each class per slice number, over all cases and days.
slice_mask_len_grp = slice_mask_len_df.groupby('slice_no').sum()

slice_mask_len_grp

Plot the calculated sum for each class for each slice number.
As the slice number increases, the mask area becomes large in the order of stomach, small bowel, and large bowel.

In [None]:
fig, ax = plt.subplots(figsize=(12, 5))
ax.set_title("Sum of mask length for each slice number")
# Overlay one semi-transparent bar series per class on the same axes.
for organ in ('large_bowel', 'small_bowel', 'stomach'):
    ax.bar(
        slice_mask_len_grp.index,
        slice_mask_len_grp[organ], alpha=0.5, label=organ)
ax.set_xlabel("slice number")
ax.set_ylabel("Sum of mask length")
ax.legend()
ax.set_xlim(slice_mask_len_grp.index[0], slice_mask_len_grp.index[-1])
plt.show()

# How Many Contiguous Masks in Each Set of Slices

In [None]:
# Cut every id down to its leading 'case<N>_day<M>' part and keep that key
# together with the three per-class mask lengths.
case_day_mask_len_df = (
    case_slice_mask_len_df
    .assign(
        case_day=lambda df: df['id']
        .str.replace(pat=r'(case\d+_day\d+).*', repl=r'\1', regex=True))
    [['case_day', 'large_bowel', 'small_bowel', 'stomach']]
)

case_day_mask_len_df

For each case_day, join mask length for all slices. This makes one row for each one case_day.

In [None]:
def join_col_values(case_day_df):
    """Collapse every column of a group's DataFrame into one string.

    The (string) values of each column are joined with single spaces, so the
    result is a Series with one joined string per column.
    """
    return case_day_df.apply(' '.join, axis=0)

# Join the per-slice mask lengths of every case_day into one string per
# class.  The value columns are selected explicitly after the groupby, so
# the result does not depend on whether groupby.apply() hands the grouping
# column to the applied function.  That behaviour is deprecated in recent
# pandas, and dropping 'case_day' afterwards fails once the column is no
# longer passed through.
case_day_mask_len_df = (
    case_day_mask_len_df
    .astype(str)
    .groupby('case_day')[['large_bowel', 'small_bowel', 'stomach']]
    .apply(join_col_values)
)

case_day_mask_len_df

In [None]:
def detect_one_count(joined_mask_len):
    """Count the contiguous runs of masked slices in one slice sequence.

    Args:
        joined_mask_len: Whitespace-separated mask lengths, one per slice,
            in slice order (e.g. '0 5 3 0 2').

    Returns:
        The number of positions where a positive mask length follows a zero
        (or starts the sequence), as an int.  An empty string yields 0.
    """
    curr_mask_len = np.array(
        [int(value) for value in joined_mask_len.split()], dtype=np.int64)
    if curr_mask_len.size == 0:
        # No slices at all, hence no runs.
        return 0

    # Mask length of the preceding slice; the first slice is treated as
    # being preceded by an unmasked one.
    prev_mask_len = np.concatenate(([0], curr_mask_len[:-1]))

    # A run starts wherever a masked slice follows an unmasked one.
    run_starts = (prev_mask_len == 0) & (curr_mask_len > 0)
    return int(np.count_nonzero(run_starts))

def detect_contiguous_mask_count(row):
    """Apply detect_one_count to the three class columns of one row.

    Returns a pd.Series with the entries 'large_bowel', 'small_bowel' and
    'stomach', each holding the count for that class.
    """
    return pd.Series({
        organ: detect_one_count(row[organ])
        for organ in ('large_bowel', 'small_bowel', 'stomach')
    })

# Row-wise apply: one row of counts per case_day.
contiguous_mask_count_df = case_day_mask_len_df.apply(
    detect_contiguous_mask_count, axis=1)

contiguous_mask_count_df

Some sets of slices for a case/day have more than one contiguous mask segment. The maximum count is 14, in the large bowel for case138_day0.

In [None]:
# Largest counts first; ties on 'large_bowel' are broken by 'small_bowel',
# then by 'stomach'.
contiguous_mask_count_df.sort_values(
    by=['large_bowel', 'small_bowel', 'stomach'], ascending=False)

In [None]:
# Joined per-slice mask lengths of the large bowel for case138_day0.
case_day_mask_len_df.loc['case138_day0', 'large_bowel']

In [None]:
# Joined per-slice mask lengths of the stomach for case7_day0.
case_day_mask_len_df.loc['case7_day0', 'stomach']