# Imports and Setup

In [None]:
import os
import cv2
import glob
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

from PIL import Image
import tensorflow as tf

In [None]:
# Weights & Biases is used further down to log images, masks and tables.
import wandb

# Read the API key from Kaggle's secret store (secret named "WANDB_KEY") so the
# key itself never appears in the notebook source, then authenticate.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
wandb_key = user_secrets.get_secret("WANDB_KEY")
wandb.login(key = wandb_key)

# Quick EDA

## Dir Structure

In [None]:
# Root of the competition dataset; the trailing slash matters because later
# cells build paths by plain string concatenation (ROOT_DIR + 'train', ...).
ROOT_DIR = '../input/uw-madison-gi-tract-image-segmentation/'
os.listdir(ROOT_DIR)

> There aren't any test samples.

> There should be some case-level overlap between the train and test datasets. This is also evident from this line in the competition description - "The goal of this competition is to be able to generalize to both partially and wholly unseen cases."

In [None]:
# Each entry directly under `train` is one case directory.
train_dir = ROOT_DIR + 'train'
print('Number of case dirs: ', len(os.listdir(train_dir)))

## Load the train.csv file

In [None]:
# Load the annotation table, one row per (slice, class) pair.
train_csv_path = ROOT_DIR + 'train.csv'
df = pd.read_csv(train_csv_path)
print('Length of dataframe: ', df.shape[0])
df.head()

## NaN rows

In [None]:
# Per-column count of missing values.
df.isnull().sum()

> There are 81,575 rows with no segmentation RLE encoded masks.

In [None]:
# Keep only the rows that carry an RLE mask. The previous row labels are kept
# as a regular `index` column by `reset_index()`.
has_mask = df['segmentation'].notna()
df = df.loc[has_mask].reset_index()
df.head()

## Number of Classes with num samples

In [None]:
# Number of annotated rows per class, most frequent first.
df['class'].value_counts()

> There are three classes to segment - large_bowel (large intestine), small_bowel (small intestine), stomach.

## Number of cases (verifying from dataframe)

In [None]:
def get_case_str(row):
    """Return the case token of `row.id`, i.e. the text before the first underscore."""
    case_token, _, _ = row.id.partition('_')
    return case_token

def get_case_id(row):
    """Return the numeric case id of `row.id` (the case token with its first 4 characters removed)."""
    case_token = row.id.partition('_')[0]
    digits = case_token[4:]  # skip the 4-character prefix, e.g. "case"
    return int(digits)

# Derive the case columns row by row from `id`; the helpers already take a
# row, so they are passed to `apply` directly.
df['case_str'] = df.apply(get_case_str, axis=1)
df['case_id'] = df.apply(get_case_id, axis=1)
df.head()

In [None]:
# Count distinct case strings (NaN, if any, counted as its own value).
print('Number of cases: ', df['case_str'].nunique(dropna=False))

In [None]:
# Number of annotated rows per case, ordered by case id.
df.case_id.value_counts().sort_index()

> So some cases have more images than others. This may be because they have more slices per day or more scan days per case. Let's see.

## Get day number

In [None]:
def get_day_str(row):
    """Return the second underscore-separated token of `row.id` (the day token)."""
    id_tokens = row.id.split('_')
    return id_tokens[1]

def get_day_id(row):
    """Return the numeric day of `row.id` (the day token with its first 3 characters removed)."""
    day_token = row.id.split('_')[1]
    digits = day_token[3:]  # skip the 3-character prefix, e.g. "day"
    return int(digits)

# Derive the day columns row by row from `id`.
df['day_str'] = df.apply(get_day_str, axis=1)
df['day_id'] = df.apply(get_day_id, axis=1)

df.head()

In [None]:
# Count distinct day strings (NaN, if any, counted as its own value).
print('Number of unique days a scan was taken: ', df['day_str'].nunique(dropna=False))

In [None]:
# Number of annotated rows per day id, ordered by day.
df.day_id.value_counts().sort_index()

> Not sure what physical meaning `day` has but here's a guess - 
> "Of these patients, about half are eligible for radiation therapy, usually delivered over 10-15 minutes a day for 1-6 weeks......oncologists are able to visualize the daily position of the tumor and intestines, which can vary day to day. " Each case receives radiotherapy and is scanned over a period of 1-6 weeks (days 0 - 41).

> Note that, in the data available, a case is not scanned frequently (i.e. not almost every day).

In [None]:
# Number of annotated rows for each (case, day) pair; first 20 pairs only.
df.groupby(['case_id', 'day_id'])['class'].count().head(20)

## Get Slice Ids

In [None]:
def get_slice_str(row):
    """Return 'slice_<n>' where <n> is the text after the last underscore of `row.id`."""
    slice_number = row.id.rpartition('_')[2]
    return 'slice_' + slice_number

# Derive the slice key row by row from `id`.
df['slice_str'] = df.apply(get_slice_str, axis=1)
df.head()

## Add path

In [None]:
# Collect every file four directory levels below `train`.
scan_pattern = f'{ROOT_DIR}train/*/*/*/*'
filepaths = glob.glob(scan_pattern)
filepaths[:5]

In [None]:
# Collect one record per scan file and build the frame in a single call.
# Appending with `file_df.loc[idx] = ...` inside the loop reallocates the
# frame on every row, which gets slow for tens of thousands of files.
file_records = []
for filepath in tqdm(filepaths):
    # The third-from-last path component is named `<case>_<day>`. Taking it
    # relative to the file (instead of a fixed index from the left) keeps this
    # working if ROOT_DIR is changed to a path of different depth.
    case_day_str = os.path.basename(os.path.dirname(os.path.dirname(filepath)))
    case_str, day_str = case_day_str.split('_')

    # The second underscore-separated token of the file name is the slice number.
    filename = os.path.basename(filepath)
    slice_id = filename.split('_')[1]
    slice_str = f'slice_{slice_id}'

    file_records.append([case_str, day_str, slice_str, filename, filepath])

file_df = pd.DataFrame(
    file_records,
    columns=['case_str', 'day_str', 'slice_str', 'filename', 'filepath'],
)

file_df.head()

In [None]:
# Attach filename/filepath to each annotation row via the shared
# (case, day, slice) key; the default inner join is used.
merge_keys = ['case_str', 'day_str', 'slice_str']
df = df.merge(file_df, on=merge_keys)
df.head()

## Extract other information from filename

> Note that the image filenames include 4 numbers (ex. 276_276_1.63_1.63.png). These four numbers are slice height / width (integers in pixels) and height/width pixel spacing (floating points in mm). The first two define the resolution of the slide. The last two record the physical size of each pixel.

From filename we will get -
* height of image,
* width of image,
* pixel height,
* pixel width

In [None]:
def get_image_height(row):
    """Return, as int, the third underscore-separated field of `row.filename` (last 4 characters stripped)."""
    stem = row.filename[:-4]  # drop the 4-character extension, e.g. ".png"
    fields = stem.split('_')
    return int(fields[2])
    
def get_image_width(row):
    """Return, as int, the fourth underscore-separated field of `row.filename` (last 4 characters stripped)."""
    stem = row.filename[:-4]  # drop the 4-character extension, e.g. ".png"
    fields = stem.split('_')
    return int(fields[3])

def get_pixel_height(row):
    """Return, as float, the fifth underscore-separated field of `row.filename` (last 4 characters stripped)."""
    stem = row.filename[:-4]  # drop the 4-character extension, e.g. ".png"
    fields = stem.split('_')
    return float(fields[4])

def get_pixel_width(row):
    """Return, as float, the sixth underscore-separated field of `row.filename` (last 4 characters stripped)."""
    stem = row.filename[:-4]  # drop the 4-character extension, e.g. ".png"
    fields = stem.split('_')
    return float(fields[5])

# Parse the four numeric fields of each filename into their own columns.
df['img_height'] = df.apply(get_image_height, axis=1)
df['img_width'] = df.apply(get_image_width, axis=1)
df['pixel_height (mm)'] = df.apply(get_pixel_height, axis=1)
df['pixel_width (mm)'] = df.apply(get_pixel_width, axis=1)

df.head()

In [None]:
# Remove the leftover `index` column produced by the earlier
# `reset_index(drop=False)`. `errors='ignore'` keeps this cell safe to re-run:
# the in-place version raised KeyError on the second execution because the
# column was already gone.
df = df.drop(columns='index', errors='ignore')

# Analyze - Case, Day, Slice Level

If you think about the data, each image (slice) can be reached by traversing three levels -
* Case - which case the scan belongs to,
* Day - which day the scan was taken/produced,
* Slice - in the context of the csv file provided, each slice can have multiple rows where each row has a unique segmentation mask (RLE encoded).

In [None]:
# Level 1: group all rows by case and pick one case ('case123') to drill into.
by_case = df.groupby('case_str')
case_df = by_case.get_group('case123')
case_df.head()

In [None]:
# Number of annotated rows per scan day for the selected case.
case_df.day_str.value_counts()

In [None]:
# Level 2: within the selected case, group by day and pick 'day0'.
# `by_day` is reused later to iterate over every day of this case.
by_day = case_df.groupby('day_str')
day_df = by_day.get_group('day0')
day_df.head()

In [None]:
# Number of rows (one per annotated class) for each slice of the selected day.
day_df.slice_str.value_counts()

In [None]:
# Level 3: within the selected day, group by slice and pick 'slice_0075'.
# Each row of `slice_df` holds the RLE mask of one class for that slice.
by_slice = day_df.groupby('slice_str')
slice_df = by_slice.get_group('slice_0075')
slice_df.head()

## Visualize Segmentation Masks

In [None]:
# All rows of `slice_df` belong to the same slice, so the first path is enough.
filepath = slice_df.filepath.values[0]
# IMREAD_UNCHANGED keeps the file's original bit depth and channel layout.
image = cv2.imread(filepath, cv2.IMREAD_UNCHANGED)
# cv2.imread does not raise on a missing/unreadable file, it returns None;
# fail here with a clear message instead of an AttributeError on `.shape`.
if image is None:
    raise FileNotFoundError(f'Could not read image: {filepath}')
image.shape

In [None]:
# Show the raw scan in grayscale on an explicit 10x10 inch figure.
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(image, cmap='gray');

In [None]:
def rle2mask(rles, class_names, height, width, class_dict):
    """Decode run-length encoded masks into a single integer label mask.

    Args:
        rles: iterable of RLE strings, each "start length start length ..."
            with 1-based starts into the flattened image.
        class_names: iterable of class names, paired element-wise with `rles`.
        height, width: the two image dimensions; the flat buffer holds
            `height * width` pixels.
        class_dict: mapping from class name to the integer label to paint.

    Returns:
        A uint16 array of shape `(width, height)`; pixels not covered by any
        run are 0. Where runs overlap, the later entry in `rles` wins.

    NOTE(review): the buffer is reshaped to `(width, height)`, not
    `(height, width)` - confirm this matches the loaded image's shape for
    non-square slices.
    """
    flat = np.zeros(height * width, dtype=np.uint16)

    for rle, class_name in zip(rles, class_names):
        tokens = rle.split(' ')
        # Even positions are 1-based run starts, odd positions are run lengths.
        run_starts = np.asarray(tokens[0::2], dtype=int) - 1
        run_lengths = np.asarray(tokens[1::2], dtype=int)
        run_ends = run_starts + run_lengths

        for begin, end in zip(run_starts, run_ends):
            flat[begin:end] = class_dict[class_name]

    return flat.reshape((width, height))

In [None]:
# Label 0 is reserved for background, so class ids start at 1 and follow the
# order in which classes first appear in `df`.
class2id = {
    class_name: class_idx
    for class_idx, class_name in enumerate(df['class'].unique(), start=1)
}
# Inverse lookup: integer label -> class name.
id2class = {class_idx: class_name for class_name, class_idx in class2id.items()}
id2class

In [None]:
# Decode every class mask of the selected slice into one label image. All
# rows share the same image, so the size is taken from the first row.
mask = rle2mask(
    rles=slice_df['segmentation'].values,
    class_names=slice_df['class'].values,
    height=slice_df['img_height'].values[0],
    width=slice_df['img_width'].values[0],
    class_dict=class2id,
)

In [None]:
# Show the decoded label mask on an explicit 10x10 inch figure.
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(mask);

## Visualize Segmentation Mask using Weights and Biases

In [None]:
# 1. Generate a dict of mask data to log
wandb_mask = {
    'gt_mask':{
        'mask_data': mask,
        'class_labels': id2class
    }
}

In [None]:
# Log the scan with its ground-truth mask overlay to a W&B run.
run = wandb.init(project='UW-Madison-Viz')
try:
    wandb.log({'Ground Truth Segmentation': wandb.Image(image, masks=wandb_mask)})
finally:
    # Always close the run, even if logging fails, so no run is left open
    # in the kernel.
    wandb.finish()
run

# Visualize a case day-wise using W&B Tables.

We will use the case selected above.

In [None]:
# A dict with key id and name for logging segmentation mask as W&B Tables.
wandb_class_set = wandb.Classes([{
                     'id': id,
                     'name': name
                  } for id, name in id2class.items()])

In [None]:
# For every scan day of the selected case, log one W&B run containing a table
# with one row per slice (slice name + image with ground-truth mask overlay).
for day, day_df in by_day:
    print('The day the scan was taken: ', day)

    # 1. Initialize a W&B Run
    run = wandb.init(project='UW-Madison-Viz', group='case123-viz')

    # The run is closed in `finally`, so a failure on one slice does not
    # leave a run open in the kernel.
    try:
        # 2. Initialize a W&B Table
        data_at = wandb.Table(columns=['slice', 'image'])

        # Group by slice
        by_slice = day_df.groupby('slice_str')

        # Iterate through each slice, open the image, and get mask
        for slice_num, slice_df in tqdm(by_slice):
            # Open the image
            filepath = slice_df.filepath.values[0]
            image = cv2.imread(filepath, cv2.IMREAD_UNCHANGED)
            # cv2.imread returns None instead of raising on an unreadable file.
            if image is None:
                raise FileNotFoundError(f'Could not read image: {filepath}')
            # Convert the 16-bit scan to a float16 tensor for logging.
            image = tf.convert_to_tensor(image, dtype=tf.uint16)
            image = tf.image.convert_image_dtype(image, dtype=tf.float16)

            # Get mask (all rows of a slice share the same image size)
            mask = rle2mask(slice_df.segmentation.values,
                            slice_df['class'].values,
                            slice_df.img_height.values[0],
                            slice_df.img_width.values[0],
                            class2id)

            # 3. Generate a dict of mask data to log
            wandb_mask = {
                'gt_mask': {
                    'mask_data': mask,
                    'class_labels': id2class
                }
            }

            # 4. Add the data as a new row
            data_at.add_data(
                slice_num,
                wandb.Image(image, masks=wandb_mask, classes=wandb_class_set)
            )

        # 5. Log the table onto W&B dashboard
        wandb.log({f'Segmentation Viz {day}': data_at})
    finally:
        # 6. Close the W&B run
        wandb.finish()

![img](https://i.imgur.com/mSjIJW3.mp4)

# WIP