# このコードの目的
- マスク情報を統計分析する
- 大腸・小腸・胃の位置関係を、データから理解する

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
from PIL import Image
import os, shutil
from tqdm import tqdm

# 1. train画像データの情報構造化
id、ファイル名から分かる情報を構造化<br>
参考：https://www.kaggle.com/code/ammarnassanalhajali/uwmgi-unet-pytorch-train-with-eda

In [None]:
# Load the training labels; the 'class' column is renamed to 'class_name'.
df_train = pd.read_csv('../input/uw-madison-gi-tract-image-segmentation/train.csv')
df_train = df_train.rename(columns={'class': 'class_name'})

# Split every id on '_' and keep the case number, day number and slice token.
id_tokens = [image_id.split('_') for image_id in df_train['id']]
df_train['case'] = [int(tokens[0].replace('case', '')) for tokens in id_tokens]
df_train['day'] = [int(tokens[1].replace('day', '')) for tokens in id_tokens]
df_train['slice'] = [tokens[3] for tokens in id_tokens]

# The id alone only gives the beginning of the file name: the four trailing
# numbers of each png are unknown, so a partial path is built first.
TRAIN_DIR = '../input/uw-madison-gi-tract-image-segmentation/train'
all_train_images = glob(os.path.join(TRAIN_DIR, '**', '*.png'), recursive=True)
train_root = all_train_images[0].rsplit('/', 4)[0]  # ../input/uw-madison-gi-tract-image-segmentation/train

df_train['path_partial'] = [
    os.path.join(train_root,
                 f'case{case}',
                 f'case{case}_day{day}',
                 'scans',
                 f'slice_{slice_}')
    for case, day, slice_ in zip(df_train['case'], df_train['day'], df_train['slice'])
]

# Take the real paths from the input folder and key them by the same partial
# path (everything before the last four '_'-separated fields).
path_lookup = pd.DataFrame({
    'path_partial': [str(image_path.rsplit('_', 4)[0]) for image_path in all_train_images],
    'path': all_train_images,
})

# Attach the full path to every label row, then drop the helper key.
df_train = df_train.merge(path_lookup, on='path_partial').drop(columns=['path_partial'])

# The four trailing fields of the file name (extension stripped) are read as
# width, height, width spacing and height spacing.
size_tokens = [image_path[:-4].rsplit('_', 4) for image_path in df_train['path']]
df_train['width'] = [int(tokens[1]) for tokens in size_tokens]
df_train['height'] = [int(tokens[2]) for tokens in size_tokens]
df_train['width_space'] = [float(tokens[3]) for tokens in size_tokens]
df_train['height_space'] = [float(tokens[4]) for tokens in size_tokens]

del id_tokens, train_root, path_lookup, size_tokens

# Collapse the per-class rows into a single row per id.
# NOTE(review): the stride-3 slicing assumes every id contributes exactly three
# consecutive rows ordered large_bowel, small_bowel, stomach - confirm against train.csv.
long_df = df_train
segmentation = long_df['segmentation']

wide_columns = {
    'id': long_df['id'][::3].values,
    'large_bowel': segmentation[::3].values,
    'small_bowel': segmentation[1::3].values,
    'stomach': segmentation[2::3].values,
}
# Per-image metadata is taken from the first row of each group of three.
for column in ('path', 'case', 'day', 'slice',
               'width', 'height', 'width_space', 'height_space'):
    wide_columns[column] = long_df[column][::3].values

df_train = pd.DataFrame(wide_columns)

del long_df, segmentation, wide_columns, column

# Missing segmentations become empty strings so they can be tested with != ''.
df_train = df_train.fillna('')

# Number of labels present in each image (columns 1-3 are the three organ columns).
df_train['count'] = (df_train.iloc[:, 1:4] != '').sum(axis=1).values

In [None]:
# Peek at five randomly chosen rows of the per-image table.
df_train.sample(5)

In [None]:
# Summary statistics with pandas' default column selection.
df_train.describe()

In [None]:
# Summary statistics restricted to the object-dtype columns.
df_train.describe(include='object')

In [None]:
# Number of distinct cases.
n_cases = len(df_train['case'].unique())
print('case（患者）の数：{}'.format(n_cases))

# Number of distinct combinations of columns 5-6 (case, day).
n_scans = len(df_train.iloc[:, 5:7].value_counts())
print('case × dayの数 = 3次元画像数：{}'.format(n_scans))

In [None]:
# How many (case, day) scans have each number of slices.
# The columns are selected by name instead of position, and the count column is
# named through Series.rename: the previous rename(columns={0: ...}) only worked
# when the value_counts result was unnamed, and silently did nothing otherwise.
slices_per_scan = df_train[['case', 'day']].value_counts()
(slices_per_scan.value_counts()
    .rename('value_count')
    .rename_axis('num of slice')
    .reset_index())

# 2. 各ラベルが出現する頻度確認

In [None]:
# スライス画像データに出現するラベルの頻度
# Percentage of slice images in which each label column is non-empty.
has_mask = df_train.iloc[:, 1:4] != ''
percent_with_mask = 100 * np.mean(has_mask, axis=0)

plt.figure(figsize=(6, 4))
bars = plt.bar([1, 2, 3], percent_with_mask)
plt.title('Percent Training Images with Mask', fontsize=16)
plt.ylabel('Percent of Images')
plt.xlabel('Class Type')
plt.xticks([1, 2, 3])

labels = ['large bowel', 'small bowel', 'stomach']
for rect, lbl in zip(bars, labels):
    bar_height = rect.get_height()
    center_x = rect.get_x() + rect.get_width() / 2
    # Class name just above the bar, percentage in the middle of it.
    plt.text(center_x, bar_height + 2, lbl,
             ha='center', va='bottom', fontsize=12)
    plt.text(center_x, bar_height / 2, '%.1f %%' % bar_height,
             ha='center', va='center', fontsize=12, color='white')

plt.ylim((0, 50))
plt.show()

大腸・小腸・胃の順に、マスクが存在するスライス画像の割合が小さくなっていく。

In [None]:
# スライス画像データに出現するラベル数
# Percentage of slice images containing 0, 1, 2 or 3 labels.
# The counts are reindexed onto 0..3 so that each bar is guaranteed to sit at
# its own label count: value_counts(sort=False) does not order its result by
# value, and it leaves out values that never occur.
label_counts = [0, 1, 2, 3]
count_percent = (df_train['count']
                 .value_counts(normalize=True)
                 .reindex(label_counts, fill_value=0) * 100)

bar = plt.bar(label_counts, count_percent)
plt.xlabel('label count')
plt.xticks(label_counts)
for rect in bar:
    height = rect.get_height()
    # Percentage printed in the middle of each bar.
    plt.text(rect.get_x() + rect.get_width() / 2, height / 2, '%.1f %%' % height,
             ha='center', va='center', fontsize=12, color='white')

plt.show()

半分以上は、ラベルが1つも存在しない画像。<br>
次いで、2つの臓器が同時に存在する画像が多い。

# 3. 各ラベルが存在する数を、ベン図で可視化

In [None]:
# ベン図可視化用のライブラリ
!pip install matplotlib-venn

In [None]:
from matplotlib_venn import venn3

In [None]:
# 全体集合数表示
# Size of the universal set: the number of rows in df_train.
print('U = {}'.format(len(df_train)))

# Index values of the rows in which each organ has a non-empty label.
large_bowel_set = df_train.index[df_train['large_bowel'] != ''].to_numpy()
small_bowel_set = df_train.index[df_train['small_bowel'] != ''].to_numpy()
stomach_set = df_train.index[df_train['stomach'] != ''].to_numpy()

# Draw the three index sets as a Venn diagram.
organ_sets = [set(large_bowel_set), set(small_bowel_set), set(stomach_set)]
plt.figure(figsize=(8, 6))
venn3(subsets=organ_sets, set_labels=('large bowel', 'small bowel', 'stomach'))
plt.show()

大腸・小腸は同じ画像に出現しやすく、胃は離れていることが分かる

# 4. slice位置とラベルの関係

In [None]:
# Long table with one row per (case, day, slice, label) where that label exists.
# 'label' is the position of the organ in `labels` (0, 1, 2).
labels = ['large_bowel', 'small_bowel', 'stomach']
key_columns = ['case', 'day', 'slice']

# One frame per organ, concatenated once at the end. Growing an initially empty
# DataFrame inside the loop left every column as object dtype and re-copied the
# accumulated rows on each pass.
# The inner merge back onto the (case, day, slice) key table was dropped: it
# returns the same rows as long as those keys are unique in df_train.
per_label_frames = [
    df_train.loc[df_train[name] != '', key_columns]
            .assign(label=code)
            .reset_index(drop=True)
    for code, name in enumerate(labels)
]
df_label_exist = pd.concat(per_label_frames)

del key_columns, per_label_frames

# Slice numbers are needed as integers for plotting.
df_label_exist['slice'] = df_label_exist['slice'].astype('int')

In [None]:
# case day 組み合わせ確認・指定用
# Sorted table of the distinct (case, day) pairs, used to look up or pick a scan.
case_day_pairs = df_train[["case", "day"]].drop_duplicates()
df_case_day_unique = case_day_pairs.sort_values(['case', 'day']).reset_index()
del case_day_pairs
df_case_day_unique

In [None]:
def show_label_slice(df, CASE, DAY):
    """Scatter-plot which slices carry which label for one scan.

    Args:
        df: Table with 'label' (x axis) and 'slice' (y axis) columns.
        CASE: Case number, used for the title and for the y-axis range.
        DAY: Day number, used for the title and for the y-axis range.

    The upper end of the y range is the largest 'slice' value of this
    (CASE, DAY) in the module-level ``df_train``; the axis is then inverted
    so that slice 0 is drawn at the top.
    """
    fig = plt.figure(figsize=(6, 4))
    ax = fig.add_subplot(1, 1, 1)

    ax.scatter(x=df['label'], y=df['slice'], alpha=1, s=2)
    ax.set_title('case {} day {}'.format(CASE, DAY))
    ax.set_xticks([0, 1, 2])
    ax.set_xticklabels(['large_bowel', 'small_bowel', 'stomach'])
    ax.set_xlabel('')
    ax.set_ylabel('slice')
    ax.set_xlim(-0.5, 2.5)

    in_scan = (df_train['case'] == CASE) & (df_train['day'] == DAY)
    last_slice = int(df_train[in_scan]['slice'].max())
    ax.set_ylim(0, last_slice)
    ax.invert_yaxis()

    plt.show()

In [None]:
# ランダムに表示
# Plot ten (case, day) scans drawn at random (with replacement).
random_rows = np.random.randint(0, len(df_case_day_unique), 10)
for row in random_rows:
    CASE = df_case_day_unique.loc[row, 'case']
    DAY = df_case_day_unique.loc[row, 'day']
    in_scan = (df_label_exist['case'] == CASE) & (df_label_exist['day'] == DAY)
    show_label_slice(df_label_exist[in_scan], CASE, DAY)

In [None]:
# 確認したい case day の指定
# Scans picked by hand for a closer look.
for CASE, DAY in [(15, 0), (138, 0)]:
    in_scan = (df_label_exist['case'] == CASE) & (df_label_exist['day'] == DAY)
    show_label_slice(df_label_exist[in_scan], CASE, DAY)

対象の消化器官が、スライスの端にあることはない。<br>
比較的に、小腸と大腸は足側、胃は頭側に位置する。<br>
大腸は患者によって、小腸と同じ範囲だったり、胃の位置まで存在したり、ばらついている。<br>
まれに、大腸のラベルが途中で途切れている。例：case 15 day 0, case 138 day 0

# 5. ラベルの途切れている画像の確認

In [None]:
def rle_decode(mask_rle, shape):
    """Decode a run-length string into a binary mask.

    Args:
        mask_rle: Whitespace-separated "start length" pairs; starts are
            1-based positions in the flattened mask.
        shape: (height, width) of the array to return.

    Returns:
        uint8 array of the given shape, 1 inside the runs and 0 elsewhere.
    """
    tokens = np.asarray(mask_rle.split(), dtype=int)
    run_starts = tokens[0::2] - 1  # convert to 0-based offsets
    run_lengths = tokens[1::2]

    flat = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for start, length in zip(run_starts, run_lengths):
        flat[start:start + length] = 1
    return flat.reshape(shape)

def rle_encode(img):
    """Encode a binary mask as a run-length string.

    Args:
        img: numpy array, 1 for mask and 0 for background.

    Returns:
        Space-separated "start length" pairs with 1-based starts over the
        flattened array; an empty string when the mask has no foreground.
    """
    # Zero-pad both ends so runs touching the borders still produce an edge.
    padded = np.concatenate([[0], img.flatten(), [0]])
    edges = np.flatnonzero(padded[1:] != padded[:-1]) + 1

    starts = edges[::2]
    lengths = edges[1::2] - starts
    pairs = np.column_stack((starts, lengths)).ravel()
    return ' '.join(map(str, pairs))

In [None]:
import tensorflow as tf
tqdm.pandas()
from matplotlib.patches import Rectangle
import cv2
import math

In [None]:
def get_metadata(row):
    """Add 'case', 'day' and 'slice' integers parsed from ``row['id']``.

    The id is split on '_': the first token carries the case number, the
    second the day number and the last one the slice number. ``row`` is
    modified in place and returned.
    """
    tokens = row['id'].split('_')
    parsed = {
        'case': int(tokens[0].replace('case', '')),
        'day': int(tokens[1].replace('day', '')),
        'slice': int(tokens[-1]),
    }
    for key, value in parsed.items():
        row[key] = value
    return row

def path2info(row):
    """Add image size and scan identifiers parsed from ``row['image_path']``.

    The path is split on '/'. The file name (last component) is split on '_'
    to give slice, width and height; the third component from the end gives
    case and day. ``row`` is modified in place and returned.
    """
    components = row['image_path'].split('/')
    file_name = components[-1]
    slice_number = int(file_name.split('_')[1])

    scan_dir = components[-3]
    case_number = int(scan_dir.split('_')[0].replace('case', ''))
    day_number = int(scan_dir.split('_')[1].replace('day', ''))

    img_width = int(file_name.split('_')[2])
    img_height = int(file_name.split('_')[3])

    # Keys are written in this order so the column order of an applied
    # DataFrame stays height, width, case, day, slice.
    parsed = {
        'height': img_height,
        'width': img_width,
        'case': case_number,
        'day': day_number,
        'slice': slice_number,
    }
    for key, value in parsed.items():
        row[key] = value
    return row

In [None]:
def id2mask(id_):
    """Build a 3-channel mask for one image id.

    Rows are looked up in the module-level ``df_train`` by 'id'. Channel i
    holds the decoded segmentation of the i-th class in
    (large_bowel, small_bowel, stomach); a channel stays zero when the class
    has no row or its segmentation is missing.

    Returns:
        uint8 array of shape (height, width, 3).
    """
    rows = df_train[df_train['id'] == id_]
    size = rows[['height', 'width']].iloc[0]
    plane_shape = (size.height, size.width)

    mask = np.zeros(plane_shape + (3,), dtype=np.uint8)
    for channel, class_name in enumerate(['large_bowel', 'small_bowel', 'stomach']):
        class_rows = rows[rows['class'] == class_name]
        if not len(class_rows):
            continue
        rle = class_rows.segmentation.squeeze()
        if pd.isna(rle):
            continue
        mask[..., channel] = rle_decode(rle, plane_shape)
    return mask

def rgb2gray(mask):
    """Collapse a (H, W, C) channel mask into a 2-D index map.

    A zero channel is prepended on the last axis, then the argmax over that
    axis is taken, so a pixel with no channel set maps to 0 and channel i
    maps to i + 1.
    """
    with_background = np.pad(mask, pad_width=((0, 0), (0, 0), (1, 0)))
    return np.argmax(with_background, axis=-1)

def gray2rgb(mask):
    """Expand an index map into a channel mask.

    The input is one-hot encoded into four classes and the first class
    is dropped; the result is cast back to the dtype of ``mask``.
    """
    one_hot = tf.keras.utils.to_categorical(mask, num_classes=4)
    without_background = one_hot[..., 1:]
    return without_background.astype(mask.dtype)

In [None]:
def load_img(path):
    """Read an image and min-max rescale it to uint8 in [0, 255].

    Args:
        path: Path of the image file, read with cv2.IMREAD_UNCHANGED.

    Returns:
        uint8 array with the same shape as the stored image. An image whose
        pixels are all equal is returned as all zeros.

    Raises:
        OSError: If cv2.imread returns None for the path.
    """
    img = cv2.imread(path, cv2.IMREAD_UNCHANGED)
    if img is None:
        # Fail with the offending path instead of an AttributeError on None.
        raise OSError('failed to read image: {}'.format(path))

    img = img.astype('float32')
    lowest, highest = img.min(), img.max()
    if highest == lowest:
        # A constant image has zero range; dividing would produce NaN.
        return np.zeros(img.shape, dtype='uint8')

    img = (img - lowest) / (highest - lowest) * 255.0
    return img.astype('uint8')

def display_xy_imgs_masked(CASE, DAY, slice_list, df_train):
    """Show selected slices of one scan with their masks overlaid.

    Args:
        CASE: Case number (string or int) used to build the scan folder path.
        DAY: Day number (string or int) used to build the scan folder path.
        slice_list: 1-based positions into the sorted list of png files of
            the scan. Nothing is drawn when it is empty.
        df_train: Table with 'id' and 'image_path' columns, used to find the
            id of each displayed image.

    Raises:
        IndexError: If a slice position is outside 1..number of images.

    NOTE(review): id2mask reads the module-level df_train, not this argument.
    """
    slice_list = list(slice_list)
    if not slice_list:
        return

    # Collect the png paths of the requested scan, ordered by file name.
    scan_dir = ('../input/uw-madison-gi-tract-image-segmentation/train/'
                'case{0}/case{0}_day{1}/scans/'.format(CASE, DAY))
    train_images = sorted(glob(os.path.join(scan_dir, '**', '*.png'), recursive=True))

    n_cols = 5
    n_rows = math.ceil(len(slice_list) / n_cols)

    # These do not depend on the slice, so they are built once.
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    legend_colors = [(0.667, 0.0, 0.0), (0.0, 0.667, 0.0), (0.0, 0.0, 0.667)]
    legend_handles = [Rectangle((0, 0), 1, 1, color=color) for color in legend_colors]
    legend_labels = ["Large Bowel", "Small Bowel", "Stomach"]

    plt.figure(figsize=(40, 10 * n_rows))
    for position, slice_i in enumerate(slice_list, start=1):
        if not 1 <= slice_i <= len(train_images):
            # Slice 0 would otherwise wrap around to the last image.
            raise IndexError('slice {} is outside 1..{}'.format(slice_i, len(train_images)))
        path = train_images[slice_i - 1]

        img = clahe.apply(load_img(path))
        image_id = df_train.loc[df_train['image_path'] == path, 'id'].iloc[0]
        mask = id2mask(image_id) * 255

        plt.subplot(n_rows, n_cols, position)
        plt.imshow(img, cmap='gray')
        plt.imshow(mask, alpha=0.5)
        plt.legend(legend_handles, legend_labels)
        plt.title('slice {}'.format(slice_i))
    plt.show()

In [None]:
# Reload the raw training CSV into df_train, replacing the table built earlier,
# and add the case/day/slice values parsed from each id by get_metadata.
df_train = pd.read_csv('../input/uw-madison-gi-tract-image-segmentation/train.csv')
df_train = df_train.progress_apply(get_metadata, axis=1)
df_train.head()

In [None]:
# Collect every file four levels below train/, parse its path with path2info,
# and join the result onto the label table by (case, day, slice).
paths = glob('../input/uw-madison-gi-tract-image-segmentation/train/*/*/*/*')
path_df = pd.DataFrame({'image_path': paths}).progress_apply(path2info, axis=1)
df_train = pd.merge(df_train, path_df, on=['case', 'day', 'slice'])
df_train.head()

In [None]:
# Case 15, day 0: slices 50-59 (slice numbers are 1-based).
CASE, DAY = '15', '0'
slice_list = range(50, 60)
display_xy_imgs_masked(CASE, DAY, slice_list, df_train)

In [None]:
# Case 138, day 0: slices 70-119 (slice numbers are 1-based).
CASE, DAY = '138', '0'
slice_list = range(70, 120)
display_xy_imgs_masked(CASE, DAY, slice_list, df_train)

case15  day0 :大腸のラベルが抜けている？<br>
case138 day0 :明らかに大腸のラベルが抜けている。また、大腸と小腸のマスクが被っている箇所がある（Slice97）。

今回は、たまたま目についたラベル（マスク）情報のミスを見つけたが、次回は網羅的にラベルのおかしい箇所を洗い出したい。