In [None]:
from PIL import Image
from tqdm import tqdm
import pandas as pd
import numpy as np
import os
from glob import glob
import gc
from sklearn.model_selection import KFold
# Turn on automatic garbage collection. CPython enables it by default, so this
# only has an effect if collection was switched off earlier in the session.
gc.enable()

In [None]:
df = pd.read_csv("../input/uw-madison-gi-tract-image-segmentation/train.csv")

# Each id is cut on its first two underscores into three pieces:
# case token, day token, and the remaining slice identifier.
# Split once and reuse the pieces for every derived column.
_id_parts = df["id"].str.split("_", n=2, expand=True)

# 1. Get Case as a column (raw token and numeric value)
df["case_id_str"] = _id_parts[0]
df["case_id"] = _id_parts[0].str.replace("case", "", regex=False).astype("int64")

# 2. Get Day as a column (raw token and numeric value)
df["day_num_str"] = _id_parts[1]
df["day_num"] = _id_parts[1].str.replace("day", "", regex=False).astype("int64")

# 3. Get Slice Identifier as a column
df["slice_id"] = _id_parts[2]

TRAIN_DIR = '../input/uw-madison-gi-tract-image-segmentation/train'

# 4. Attach the on-disk image path to every annotation row.
# Collect every PNG below TRAIN_DIR, at any depth.
all_train_images = glob(os.path.join(TRAIN_DIR, "**", "*.png"), recursive=True)

# Prefix of the first image path with its last four "/"-separated components
# removed; used as the root when rebuilding the join key from the id columns.
scan_root = all_train_images[0].rsplit("/", 4)[0]

# Join key on the annotation side: <root>/<case>/<case>_<day>/scans/<slice_id>
df["_partial_ident"] = [
    os.path.join(scan_root, case_str, case_str + "_" + day_str, "scans", slice_id)
    for case_str, day_str, slice_id in zip(
        df["case_id_str"].values, df["day_num_str"].values, df["slice_id"].values
    )
]

# Join key on the file side: the image path with its last four "_"-separated
# fields (parsed later as height, width and the two pixel spacings) removed.
_tmp_merge_df = pd.DataFrame()
_tmp_merge_df['_partial_ident'] = [str(path.rsplit("_", 4)[0]) for path in all_train_images]
_tmp_merge_df['f_path'] = all_train_images

# merge() defaults to an inner join, so annotation rows without a matching file are dropped.
df = df.merge(_tmp_merge_df, on="_partial_ident").drop(columns=["_partial_ident"])

# The last four "_"-separated fields of the file name (extension removed) are
# read as: height, width, pixel spacing (h), pixel spacing (w).
# Split every path once and pick the fields by position.
_name_fields = [path[:-4].rsplit("_", 4) for path in df["f_path"]]

# 5. Get slice dimensions from filepath (int in pixels)
df["slice_h"] = [int(fields[1]) for fields in _name_fields]
df["slice_w"] = [int(fields[2]) for fields in _name_fields]

# 6. Pixel spacing from filepath (float in mm)
df["px_spacing_h"] = [float(fields[3]) for fields in _name_fields]
df["px_spacing_w"] = [float(fields[4]) for fields in _name_fields]


# 7. Collapse the three per-class rows of every slice into a single row with
#    one segmentation column per class.
#
# The split is purely positional: rows 0, 3, 6, ... feed the large-bowel column,
# rows 1, 4, 7, ... the small-bowel column and rows 2, 5, 8, ... the stomach
# column. That is only correct when each slice occupies three consecutive rows
# in a fixed class order, so the assumption is verified before it is relied on.
df1 = df[df.index % 3 == 0]
df2 = df[df.index % 3 == 1]
df3 = df[df.index % 3 == 2]

# Each positional part must contain a single class ...
for _label, _part in (("large_bowel", df1), ("small_bowel", df2), ("stomach", df3)):
    if _part["class"].nunique() > 1:
        raise ValueError(
            "Rows selected for the %s column contain more than one class; "
            "the every-third-row layout does not hold." % _label
        )

# ... and the three parts must describe the same slices in the same order,
# otherwise the positional `.values` assignment below would mix up slices.
if not (
    len(df1) == len(df2) == len(df3)
    and (df1["id"].values == df2["id"].values).all()
    and (df1["id"].values == df3["id"].values).all()
):
    raise ValueError(
        "The three per-class row groups do not line up by id; "
        "cannot merge the segmentations positionally."
    )

# Keep one row per slice and replace the generic class/segmentation pair with
# three explicit columns (appended in this order).
df = df1.drop(columns=["class", "segmentation"]).reset_index(drop=True)
df['large_bowel_segmentation'] = df1['segmentation'].values
df['small_bowel_segmentation'] = df2['segmentation'].values
df['stomach_segmentation'] = df3['segmentation'].values

# Release the intermediate frames before the next steps.
del df1, df2, df3
gc.collect()

# 8. Assign a cross-validation fold to every row, grouped by case so that all
#    slices of one case end up in the same fold.
N_FOLDS = 5  # single source of truth for the number of folds

# Unique case ids and how many rows each one contributes.
un, co = np.unique(df['case_id'], return_counts=True)
pdf = pd.DataFrame({'cases': un, 'count': co})

# Plain KFold without shuffling: folds are contiguous blocks of the sorted
# case ids, and the result is deterministic. KFold ignores any target array,
# so the per-case counts are not passed to split().
# NOTE(review): the variable name suggests a stratified split may have been
# intended — confirm whether KFold is the desired splitter.
skf = KFold(n_splits=N_FOLDS)
for fold, (_, test_index) in enumerate(skf.split(un)):
    pdf.loc[test_index, "kfold"] = fold

# Propagate the per-case fold to every slice row; -1 marks "unassigned".
df['kfold'] = -1
for fold in range(N_FOLDS):
    fold_cases = pdf.loc[pdf['kfold'] == fold, 'cases'].values
    df.loc[df['case_id'].isin(fold_cases), 'kfold'] = fold
print(df['kfold'].value_counts())

# 9. Derive the mask (.npy) and image (.png) path of every slice from its
#    metadata columns. Both share the same file stem:
#    <slice_id>_<height>_<width>_<spacing_h>_<spacing_w>, spacings with 2 decimals.
rle_paths = []
image_paths = []
_path_fields = zip(
    df['case_id_str'], df['day_num_str'], df['slice_id'],
    df['slice_h'], df['slice_w'], df['px_spacing_h'], df['px_spacing_w'],
)
for case_str, day_str, slice_id, height, width, spacing_h, spacing_w in tqdm(_path_fields, total=df.shape[0]):
    scan_dir = case_str + '_' + day_str
    stem = '_'.join([
        slice_id,
        str(height),
        str(width),
        format(spacing_h, '.2f'),
        format(spacing_w, '.2f'),
    ])
    rle_paths.append(os.path.join("train_masks", scan_dir, "scans", stem + '.npy'))
    image_paths.append(os.path.join("../input/uw-madison-gi-tract-image-segmentation/train", case_str, scan_dir, "scans", stem + '.png'))
df['mask_path'] = rle_paths
df['image_path'] = image_paths
df

In [None]:
# Store every scan as a uint8 RGB array in ./train_masks/<id>.npy.
#
# NOTE(review): the files are written to ./train_masks/<id>.npy, which is not
# the location recorded in df['mask_path'] — confirm which one downstream code
# is expected to read.
# NOTE(review): despite the directory name, the saved array is the scan image
# itself, not a mask decoded from the segmentation columns — confirm intent.
# NOTE(review): convert("RGB") produces 8-bit channels; if the source PNGs
# have a higher bit depth, intensity precision is lost here — confirm.
os.makedirs('./train_masks', exist_ok=True)  # no separate exists() check needed
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    # The context manager closes the underlying file as soon as the pixels
    # have been copied into the array.
    with Image.open(row['image_path']) as img:
        data = np.array(img.convert("RGB"), dtype='uint8')
    np.save('./train_masks/' + row['id'] + '.npy', data)