This notebook shows you how to resize your images to a smaller size (e.g. 128px, 256px) and apply various transforms to obtain a specific shape (e.g. pad or stretch to square). It also saves the data into a `tar.gz` file that you can use to create a dataset.

For the list of datasets created from (variations of) this notebook, check out [this discussion post](https://www.kaggle.com/c/bms-molecular-translation/discussion/223477). You will be able to find various shapes, sizes and file formats (png and jpg).



By using the output of this notebook, you are accepting the [competition rules](https://www.kaggle.com/c/bms-molecular-translation/rules).

In [None]:
import os
from multiprocessing import Pool

from PIL import Image, ImageOps
import pandas as pd
from tqdm.auto import tqdm
import numpy as np

In [None]:
def convert_to_path(split: str):
    """Build a function that maps an image id to its PNG path for ``split``.

    The returned callable takes an ``image_id`` and nests the file under its
    first three characters:
    ``../input/bms-molecular-translation/<split>/<c0>/<c1>/<c2>/<image_id>.png``.
    Ids shorter than three characters raise ``IndexError``.
    """
    # Modified from: https://www.kaggle.com/ihelon/molecular-translation-exploratory-data-analysis
    template = "../input/bms-molecular-translation/{}/{}/{}/{}/{}.png"

    def to_path(image_id: str) -> str:
        # The first three characters of the id name the three directory levels.
        first, second, third = image_id[0], image_id[1], image_id[2]
        return template.format(split, first, second, third, image_id)

    return to_path

In [None]:
def pillow_pad(im, desired_size, color=0, resample=Image.LANCZOS, copy=False):
    """Fit ``im`` inside a square canvas and centre it on a solid background.

    Args:
        im: PIL image to fit. It is shrunk in place by ``thumbnail()`` unless
            ``copy`` is true.
        desired_size: Side length, in pixels, of the square output image.
        color: Fill colour of the padding, passed to ``Image.new``.
        resample: Resampling filter passed to ``Image.thumbnail``.
        copy: When true, work on a copy so the caller's image is untouched.

    Returns:
        A new ``"RGB"`` image of size ``(desired_size, desired_size)`` with the
        (possibly shrunk) input pasted at its centre.
    """
    # Source: https://jdhao.github.io/2017/11/06/resize-image-to-square-with-padding/

    if copy:
        im = im.copy()

    width, height = im.size  # PIL sizes are (width, height)

    ratio = float(desired_size) / max(width, height)
    # Clamp each side to at least 1px: a zero-sized bounding box makes
    # thumbnail() divide by zero for images with an extreme aspect ratio.
    new_size = (max(1, int(width * ratio)), max(1, int(height * ratio)))

    # thumbnail() works in place, keeps the aspect ratio and never enlarges,
    # so the resulting size may be smaller than ``new_size``.
    im.thumbnail(new_size, resample)

    # Centre using the size the image actually ended up with. Using the
    # requested ``new_size`` would shift images that thumbnail() left smaller
    # (e.g. inputs already below ``desired_size``) away from the centre.
    fitted_width, fitted_height = im.size
    offset = ((desired_size - fitted_width) // 2,
              (desired_size - fitted_height) // 2)

    new_im = Image.new("RGB", (desired_size, desired_size), color=color)
    new_im.paste(im, offset)

    return new_im

In [None]:
def resize(im, size, mode='keep_ratio', color=0, resample=Image.LANCZOS, inplace=True):
    """Resize a PIL image according to ``mode``.

    Args:
        im: PIL image to resize.
        size: Target side length in pixels.
        mode: ``'keep_ratio'`` shrinks the image to fit within ``size`` x
            ``size`` via ``thumbnail``; ``'pad'`` delegates to ``pillow_pad``
            to produce a padded square; ``'stretch'`` calls
            ``im.resize((size, size))`` and ignores the aspect ratio.
        color: Padding colour, only used by ``'pad'``.
        resample: Resampling filter forwarded to the PIL call.
        inplace: When false, operate on a copy of ``im`` instead of ``im``
            itself.

    Returns:
        The resized image.

    Raises:
        ValueError: If ``mode`` is not one of the three supported values.
    """
    # Source: https://www.kaggle.com/xhlulu/bms-molecular-resizing-and-reshaping
    target = im if inplace else im.copy()

    if mode == 'keep_ratio':
        # thumbnail() modifies ``target`` itself and returns None.
        target.thumbnail((size, size), resample)
        return target

    if mode == "pad":
        return pillow_pad(target, desired_size=size, color=color, resample=resample, copy=False)

    if mode == 'stretch':
        return target.resize((size, size), resample)

    raise ValueError("Invalid mode. Please choose 'keep_ratio', 'pad', or 'stretch'.")

In [None]:
%%time
# Load the id tables for both splits; only the `image_id` column is used here.
train = pd.read_csv('../input/bms-molecular-translation/train_labels.csv')
test = pd.read_csv('../input/bms-molecular-translation/sample_submission.csv')

# Turn every image id into the path of its PNG file on disk.
train_paths = train["image_id"].apply(convert_to_path('train'))
test_paths = test["image_id"].apply(convert_to_path('test'))

# Plain lists keyed by split name, ready to be handed to the worker pool.
paths = {
    split_name: series.tolist()
    for split_name, series in (("train", train_paths), ("test", test_paths))
}

In [None]:
%%time
# Convert every image of each split to a 128x128 white-padded JPEG under
# /kaggle/tmp/<split>/, spreading the work over a pool of worker processes.
for split in ['train', 'test']:
    save_dir = f'/kaggle/tmp/{split}/'
    os.makedirs(save_dir, exist_ok=True)

    def process_fn(path):
        """Resize the image at ``path`` and save it as a JPEG in ``save_dir``."""
        # Swap only the file extension; a blanket str.replace("png", "jpg")
        # would also rewrite any "png" inside the file name itself.
        stem, _ = os.path.splitext(os.path.basename(path))
        # The context manager releases the source file handle once we are
        # done; saving happens inside it so the image data is still available.
        with Image.open(path) as im:
            resized = resize(im, 128, mode='pad', color=(255, 255, 255))
            resized.save(os.path.join(save_dir, stem + ".jpg"))

    # The pool is created after process_fn/save_dir are (re)defined for this
    # split, so the workers see the current definitions.
    with Pool() as p:
        p.map(process_fn, paths[split])

In [None]:
%%time
# Pack the contents of /kaggle/tmp/ into a gzip-compressed archive, data.tar.gz,
# in the current working directory (-C switches into the directory first, so
# archive members are stored relative to it).
!tar -zcf data.tar.gz -C "/kaggle/tmp/" .
# Alternatively, skip gzip and write an uncompressed archive:
# !tar -cf data.tar -C "/kaggle/tmp/" .