# Process Data

This notebook:
- Loads the raw images from the Kaggle dataset
- Converts them to a uniform size without distorting them
- Generates the mask images for the test and validation images given the encoded solutions
- Packages the images, their masks and the image IDs into a dictionary and saves them.

## File Paths

In [2]:
# Paths are grouped per dataset split: the image directory first, then the
# CSV holding the encoded masks for that split.

# Training split. The "withmeta" CSV is the one written by
# add_info_to_train_csv (labels plus Height/Width/Usage columns).
train_dir = 'kaggle_raw/stage1_train'
train_mask_csv = 'kaggle_raw/stage1_train_labels_withmeta.csv'

# Validation split.
val_dir = 'kaggle_raw/stage1_test'
val_mask_csv = 'kaggle_raw/stage1_solution.csv'

# Test split.
test_dir = 'kaggle_raw/stage2_test_final'
test_mask_csv = 'kaggle_raw/stage2_solution_final.csv'

## Imports

In [3]:
import numpy as np
import torch
from imageio import imread
import pandas as pd
import os
from create_masks import CreateMask
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from skimage.transform import rescale

## Data Loading Functions

In [21]:
def img_2_square(img, new_size=128):
    '''Crops and resizes an image into one or more square images without
    distorting the contents of the image.

    The image is first cut along its longer side into ``round(long/short)``
    equally sized pieces. Each piece is then rescaled uniformly so that its
    own shorter side is exactly ``new_size`` pixels, and whatever exceeds
    ``new_size`` along the other side is cropped away.

    Args:
        img: a numpy array or tuple containing the image. The first two
            axes are treated as the spatial dimensions; any further axes
            (e.g. colour channels) are left unscaled.
        new_size (int): An integer representing the length and width
            of the output images

    Returns:
        output_images (list): A list containing the output images, each
            ``new_size`` x ``new_size`` along its first two axes. If an
            image is highly rectangular, then it is split into multiple
            sub images in order to preserve as much data as possible
    '''
    img = np.array(img)
    dim = img.shape

    # Split along the longer side; the +0.5 rounds the aspect ratio to the
    # nearest integer number of pieces (always at least one).
    if (dim[0] > dim[1]):
        num_splits = int(dim[0]/dim[1] + 0.5)
        step = int(dim[0]/num_splits)
        img_list = [img[step*i: step*(i + 1), :] for i in range(num_splits)]
    else:
        num_splits = int(dim[1]/dim[0] + 0.5)
        step = int(dim[1]/num_splits)
        img_list = [img[:, step*i: step*(i + 1)] for i in range(num_splits)]

    output_images = []
    for im in img_list:
        # The factor must come from the piece itself, not from the original
        # image: when the aspect ratio was rounded up, the piece's shorter
        # side is ``step``, which is smaller than the original shorter side,
        # and scaling by the original would leave the output non-square.
        factor = new_size/np.min(im.shape[0:2])
        # A scalar scale is applied by rescale to every axis, which would
        # also resize the channel axis of a colour image. Give one factor
        # per axis instead and pin every non-spatial axis to 1.
        scale_factors = (factor, factor) + (1,)*(im.ndim - 2)
        im = rescale(im, scale=scale_factors)
        # Crop the excess along the longer side of the piece.
        im = im[:new_size, :new_size]
        output_images.append(im)

    return output_images

def add_info_to_train_csv(img_dir, labels_csv, save_path):
    '''Writes a copy of a labels CSV extended with image size and usage.

    For every distinct ``ImageId`` in ``labels_csv`` the matching image is
    read from ``<img_dir>/<ImageId>/images/<ImageId>.png`` to obtain its
    height and width. Each ``EncodedPixels`` entry of that image then
    becomes one output row, tagged with the constant usage ``'Public'``.

    Args:
        img_dir (str): Directory containing one sub-directory per image id
        labels_csv (str): Path to a CSV with ``ImageId`` and
            ``EncodedPixels`` columns
        save_path (str): Where the extended CSV is written (without index)

    Returns:
        None
    '''
    label_table = pd.read_csv(labels_csv)
    records = []

    for image_id in tqdm(label_table['ImageId'].unique()):
        # The image is only opened to learn its dimensions.
        image = imread(os.path.join(img_dir, image_id, 'images', image_id + '.png'))
        height, width = image.shape[0], image.shape[1]

        belongs_to_image = label_table['ImageId'] == image_id
        encodings = label_table.loc[belongs_to_image]['EncodedPixels'].values
        records.extend(
            [image_id, encoding, height, width, 'Public'] for encoding in encodings
        )

    out_columns = ['ImageId', 'EncodedPixels', 'Height', 'Width', 'Usage']
    pd.DataFrame(records, columns=out_columns).to_csv(save_path, index=False)


def get_data(img_dir, labels_csv, train=True):
    '''Loads every labelled image and pairs its square crops with mask crops.

    Masks are produced by ``CreateMask().generate_masks(labels_csv)``; the
    result is iterated row by row and must expose ``ImageId`` and ``Mask``
    entries. Each image is read as RGB, and both the image and its mask are
    passed through ``img_2_square``. Crops are paired by position.

    Args:
        img_dir (str): Directory containing one sub-directory per image id
        labels_csv (str): Path to the CSV handed to the mask generator
        train (bool): Selects the on-disk layout. If True the image is read
            from ``<img_dir>/<id>/images/<id>.png``, otherwise from
            ``<img_dir>/<id>/<id>.png``

    Returns:
        pandas.DataFrame: One row per square crop with the columns
            ``image_id``, ``image``, ``mask`` and ``num_splits`` (the number
            of crops the source image was split into)
    '''
    mask_table = CreateMask().generate_masks(labels_csv, iterator=tqdm)
    records = []

    for _, entry in tqdm(mask_table.iterrows(), total=len(mask_table.index)):
        image_id = entry['ImageId']
        full_mask = entry['Mask']

        if train:
            img_path = os.path.join(img_dir, image_id, 'images', image_id + '.png')
        else:
            img_path = os.path.join(img_dir, image_id, image_id + '.png')
        full_image = imread(img_path, pilmode='RGB')

        image_crops = img_2_square(full_image)
        mask_crops = img_2_square(full_mask)
        crop_count = len(image_crops)

        # Pair the n-th image crop with the n-th mask crop.
        for position, image_crop in enumerate(image_crops):
            records.append([image_id, image_crop, mask_crops[position], crop_count])

    return pd.DataFrame(records, columns=['image_id', 'image', 'mask', 'num_splits'])

In [22]:
# train_mask_csv is not part of the raw Kaggle download: it is the raw labels
# file extended with Height/Width/Usage columns by add_info_to_train_csv
# (presumably what CreateMask needs to size the masks -- confirm against
# create_masks). Build it on first use so that a fresh "Restart & Run All"
# works, and skip the per-image pass once the file exists.
if not os.path.exists(train_mask_csv):
    add_info_to_train_csv(train_dir, labels_csv='kaggle_raw/stage1_train_labels.csv', save_path=train_mask_csv)
train = get_data(train_dir, train_mask_csv, train=True)
    

  0%|          | 0/670 [00:00<?, ?it/s]

  0%|          | 0/670 [00:00<?, ?it/s]

In [24]:
print(train.keys())

# Preview the first few samples, one per row: the image on the left and its
# mask on the right. The right-hand panel was previously created but left
# empty, so the masks were never actually inspected.
num_preview = min(5, len(train))
# squeeze=False keeps `axes` two-dimensional even for a single row.
fig, axes = plt.subplots(max(num_preview, 1), 2, squeeze=False)
axes[0, 0].set_title('image')
axes[0, 1].set_title('mask')
for k in range(num_preview):
    axes[k, 0].imshow(train['image'].iloc[k])
    axes[k, 1].imshow(train['mask'].iloc[k])
for ax in axes.ravel():
    # Pixel tick labels add nothing to a visual sanity check.
    ax.axis('off')

Index(['image_id', 'image', 'mask', 'num_splits'], dtype='object')
