In [None]:
import joblib
import json
import multiprocessing
import os
import shutil
import zipfile
import warnings
warnings.simplefilter('ignore', UserWarning)

import tqdm.auto as tqdm

import skimage
import skimage.io
import skimage.color
import skimage.transform

Add the Kaggle "Painter by Numbers" competition dataset to the session before running the notebook.

The COCO dataset is downloaded from [here](https://cocodataset.org/#download).

The Painter by Numbers competition dataset is available [here](https://www.kaggle.com/c/painter-by-numbers/data).

If you run this locally, download the data beforehand and adjust the paths appropriately.

In [None]:
# Upload mode for the final cell: False creates a new Kaggle dataset,
# True pushes a new version of an already existing one.
UPDATE = False # set to True if this is an update to the dataset

In [None]:
# Working directory where the archives are unpacked and the dataset is assembled.
DATASET_STORAGE_PATH = "/kaggle/tmp/coco_wikiart_nst_dataset"
# exist_ok=True keeps the cell re-runnable; without it a second run raises FileExistsError.
os.makedirs(DATASET_STORAGE_PATH, exist_ok=True)

In [None]:
# Edge length in pixels; every image is resized to IMAGE_SIZE x IMAGE_SIZE.
IMAGE_SIZE = 512
# Cap used when extracting and resizing images.
# This fits into the Kaggle /tmp folder, but barely.
DATASET_SIZE = 50_000

In [None]:
!wget http://images.cocodataset.org/zips/unlabeled2017.zip -O unlabeled.zip
with zipfile.ZipFile("unlabeled.zip", "r") as archive:
    for i, member in enumerate(tqdm.tqdm(archive.namelist(), desc="Extracting", unit="files", unit_scale=False)):
        if i > DATASET_SIZE:
            break
        archive.extract(member, f"{DATASET_STORAGE_PATH}")

In [None]:
# Unpack the leading members of the Painter by Numbers train archive:
# members at index 0..DATASET_SIZE (inclusive) are extracted.
with zipfile.ZipFile("../input/painter-by-numbers/train.zip", "r") as train_zip:
    progress = tqdm.tqdm(train_zip.namelist(), desc="Extracting", unit="files", unit_scale=False)
    for position, entry in enumerate(progress):
        if position > DATASET_SIZE:
            break
        train_zip.extract(entry, path=DATASET_STORAGE_PATH)

In [None]:
# Number of entries the train archive produced; the test archive extraction
# uses this as its starting offset against DATASET_SIZE.
extracted = len(os.listdir(f"{DATASET_STORAGE_PATH}/train"))
print(f"Already extracted {extracted} files")

In [None]:
# Top up from the test archive: stop as soon as the train count plus the
# current member index exceeds DATASET_SIZE.
with zipfile.ZipFile("../input/painter-by-numbers/test.zip", "r") as test_zip:
    progress = tqdm.tqdm(test_zip.namelist(), desc="Extracting", unit="files", unit_scale=False)
    for position, entry in enumerate(progress):
        if extracted + position > DATASET_SIZE:
            break
        test_zip.extract(entry, path=DATASET_STORAGE_PATH)

In [None]:
def resize_image(file, target_dir):
    """
    Resize one image to IMAGE_SIZE x IMAGE_SIZE and save it as an RGB JPEG.

    The result is written into ``target_dir`` under the source file's base
    name with a ``.jpg`` extension. A file that cannot be read is deleted
    from disk and skipped.

    :param file: path to the source image.
    :param target_dir: directory where the resized image is stored.
    :return: None.
    """
    file = os.path.abspath(file)
    stem = os.path.splitext(os.path.basename(file))[0]
    try:
        image = skimage.io.imread(file)
    except Exception:
        # Best effort: an unreadable file is removed so it cannot end up in the dataset.
        os.remove(file)
        return
    resized = skimage.transform.resize(image, (IMAGE_SIZE, IMAGE_SIZE), anti_aliasing=True)
    if image.ndim == 2:
        # Grayscale input: replicate the single channel into RGB.
        resized = skimage.color.gray2rgb(resized)
    elif image.ndim == 3 and image.shape[-1] == 4:
        # RGBA input: drop the alpha channel. The ndim check matters because a
        # 2-D grayscale image that is 4 pixels wide also has shape[-1] == 4.
        resized = skimage.color.rgba2rgb(resized)
    skimage.io.imsave(os.path.join(target_dir, stem + ".jpg"), skimage.img_as_ubyte(resized))

In [None]:
def copy_and_resize_images(paths, target, size):
    """
    Resize the images found under several directories into a single folder.

    Every directory in ``paths`` is walked recursively and each file is handed
    to ``resize_image``, which writes the resized copy into ``target``.
    Processing stops once ``size`` files in total have been handed over.

    :param paths: list of paths to all directories with images.
    :param target: target directory where to store all images.
    :param size: maximum total number of files to process across all ``paths``.
    """
    os.makedirs(target, exist_ok=True)
    tqdm_wrapped = tqdm.tqdm(paths, desc="Moving", unit="directory", unit_scale=False)
    # Running total shared by all directories, so `size` is a global cap
    # rather than a per-directory one.
    processed = 0
    for source_dir in tqdm_wrapped:
        source_dir = os.path.abspath(source_dir)
        tqdm_wrapped.set_description(f"Moving files from {source_dir}")
        for root, _dirs, files in os.walk(source_dir):
            remaining = size - processed
            if remaining <= 0:
                # Quota reached: nothing more to take from this directory tree.
                break
            batch = [os.path.join(root, name) for name in files[:remaining]]
            if not batch:
                continue
            # Resize the batch in parallel on all available cores.
            joblib.Parallel(n_jobs=-1)(
                joblib.delayed(resize_image)(path, target)
                for path in tqdm.tqdm(batch, desc="Resizing", unit="images", unit_scale=False)
            )
            processed += len(batch)

Copy the entire contents of your `kaggle.json` into a session Secret named `kaggle_key` (`Add-ons -> Secrets`).

(Not applicable for local runs)

In [None]:
# Only when running inside a Kaggle kernel: materialize the API credentials
# stored in the "kaggle_key" session secret as /root/.kaggle/kaggle.json.
if os.environ.get('KAGGLE_KERNEL_RUN_TYPE','') != '':
    # Imported here because kaggle_secrets is only needed on the Kaggle branch.
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    kaggle_key = user_secrets.get_secret("kaggle_key")
    os.makedirs("/root/.kaggle", exist_ok=True)
    with open("/root/.kaggle/kaggle.json", "w") as f:
        f.write(kaggle_key)
    # Do not keep the secret around in the kernel namespace.
    del kaggle_key
    # The mode must be an octal literal: decimal 600 is 0o1130, not rw-------.
    os.chmod("/root/.kaggle/kaggle.json", 0o600)

In [None]:
# Build the style set from the extracted Painter by Numbers train and test folders.
copy_and_resize_images(
    paths=[f"{DATASET_STORAGE_PATH}/train", f"{DATASET_STORAGE_PATH}/test"],
    target=f"{DATASET_STORAGE_PATH}/style",
    size=DATASET_SIZE,
)

In [None]:
# Drop the raw extracted folders once they have been resized into "style".
# The isdir guard keeps the cell re-runnable and tolerates a folder that was
# never created; real deletion errors are still raised.
for leftover in ("train", "test"):
    if os.path.isdir(f"{DATASET_STORAGE_PATH}/{leftover}"):
        shutil.rmtree(f"{DATASET_STORAGE_PATH}/{leftover}")

In [None]:
# From here on the cap is the number of entries actually present in "style".
DATASET_SIZE = len(os.listdir(f"{DATASET_STORAGE_PATH}/style"))

In [None]:
# Build the content set from the extracted COCO unlabeled2017 folder.
copy_and_resize_images(
    paths=[f"{DATASET_STORAGE_PATH}/unlabeled2017"],
    target=f"{DATASET_STORAGE_PATH}/content",
    size=DATASET_SIZE,
)

In [None]:
# Drop the raw COCO folder once it has been resized into "content".
# The isdir guard keeps the cell re-runnable after the folder is already gone.
if os.path.isdir(f"{DATASET_STORAGE_PATH}/unlabeled2017"):
    shutil.rmtree(f"{DATASET_STORAGE_PATH}/unlabeled2017")

In [None]:
# List what is left in the dataset directory before it is uploaded.
!ls {DATASET_STORAGE_PATH}

In [None]:
# Run the Kaggle CLI "datasets init" command on the dataset folder; the
# dataset-metadata.json in that folder is then (over)written below.
!kaggle datasets init -p {DATASET_STORAGE_PATH}
# Metadata serialized to dataset-metadata.json for the upload.
# NOTE(review): the description literal uses backslash line continuations
# inside the string, so the leading indentation of each continued line
# becomes part of the published description text.
METADATA = {
        "title": "COCO/WikiArt NST Dataset",
        "id": "shaorrran/coco-wikiart-nst-dataset-512-100000",
        "description": "Dataset for Neural Style Transfer consisting of COCO2017 images \
        and Kaggle competition \"Painter by Numbers\" dataset.\nThe number of COCO images and style images is the same.\n\
        Intended for use with NST using Adaptive Instance Normalization.\n\
        All respective licenses for used datasets apply to corresponding parts of this dataset.",
        "licenses": [{"name": "unknown"}],
    }
# ensure_ascii=False writes non-ASCII characters as-is, hence the explicit UTF-8 encoding.
with open(f"{DATASET_STORAGE_PATH}/dataset-metadata.json", "w", encoding="utf-8") as f:
    json.dump(METADATA, f, ensure_ascii=False, indent=4)
# UPDATE selects between the CLI "version" command (with an update message)
# and the "create" command.
if UPDATE:
    !kaggle datasets version -p {DATASET_STORAGE_PATH} -m "update dataset" -r zip
else:
    !kaggle datasets create -p {DATASET_STORAGE_PATH} -r zip