In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Both variables are placed in the process environment ahead of the
# `import tensorflow` statement that follows, so the native libraries see them
# when they initialise.
# KMP_SETTINGS: presumably stops the OpenMP runtime from printing its settings
# banner at start-up — TODO confirm against the OpenMP runtime documentation.
os.environ["KMP_SETTINGS"] = "false"
# TF_CPP_MIN_LOG_LEVEL=3 hides TensorFlow's C++ INFO, WARNING and ERROR log lines.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
import tensorflow as tf
import numpy as np
from zipfile import ZipFile

In [None]:
# Root directory of the dataset inside the Kaggle input mount.
DS_PATH = '/kaggle/input/painter-by-numbers'


def ds_path_of(x):
    """Return the path of dataset member ``x`` underneath ``DS_PATH``."""
    return os.path.join(DS_PATH, x)
# Load the metadata table; later cells read its 'in_train', 'new_filename'
# and 'artist' columns.
info_csv_path = ds_path_of('all_data_info.csv')
df = pd.read_csv(info_csv_path)

In [None]:
# Shuffle all rows with a fixed seed so shard contents are reproducible.
# NOTE: this rebinds `df`; re-running the cell shuffles the already-shuffled frame.
df = df.sample(frac=1, random_state=27)

# Encode artists over the FULL table before splitting. Encoding each split on
# its own assigns codes from that split's own category list, so the same artist
# would receive different integers in train and test whenever the two artist
# sets differ.
artist_codes = df['artist'].astype("category").cat.codes

train_mask = df['in_train'] == True
test_mask = df['in_train'] == False

train_df = df[train_mask]
test_df = df[test_mask]
train_filenames = train_df['new_filename']
test_filenames = test_df['new_filename']
# Labels share the frame's index, so the boolean masks select matching rows.
train_labels = artist_codes[train_mask]
test_labels = artist_codes[test_mask]

In [None]:
# Spatial size every image is resized to before being written.
IMG_WIDTH = 224
IMG_HEIGHT = 224
NUM_EXAMPLES = len(df)
# Per-example size budget: raw RGB bytes plus 8 bytes for the int64 label.
# Images are stored JPEG-encoded, so real records are typically smaller than
# this estimate.
EXAMPLE_SIZE = (IMG_WIDTH * IMG_HEIGHT * 3) + 8
SHARD_SIZE = 150 * int(2**20)  # 150 MiB budget per shard
EXAMPLES_PER_SHARD = SHARD_SIZE // EXAMPLE_SIZE
# Each count is taken from its own split (they were previously swapped).
TRAIN_EXAMPLES = len(train_filenames)
TEST_EXAMPLES = len(test_filenames)

# Number of completely filled shards per split (floor division).
TRAIN_SHARDS = TRAIN_EXAMPLES // EXAMPLES_PER_SHARD
TEST_SHARDS = TEST_EXAMPLES // EXAMPLES_PER_SHARD

# Category codes are zero-based, so the class count is the largest code + 1.
# Both splits are considered so that every emitted label is < NUM_CLASSES.
NUM_CLASSES = int(max(train_labels.max(), test_labels.max())) + 1

print(f'Num train examples: {TRAIN_EXAMPLES}')
print(f'Num test examples: {TEST_EXAMPLES}')
print(f'Num classes: {NUM_CLASSES}')
print(f'Num examples per shard is {EXAMPLES_PER_SHARD}')
print(f'Dataset size is: {(EXAMPLE_SIZE * NUM_EXAMPLES)/(2**30):.0f} GB')

In [None]:
from multiprocessing import Process, Semaphore, cpu_count

def _bytes_feature(value):
  """Wrap one bytes/string value (or an eager tensor holding one) in a tf.train.Feature."""
  eager_tensor_type = type(tf.constant(0))
  if isinstance(value, eager_tensor_type):
    # BytesList cannot unpack an EagerTensor directly; pull out the raw value first.
    value = value.numpy()
  bytes_list = tf.train.BytesList(value=[value])
  return tf.train.Feature(bytes_list=bytes_list)

def _int64_feature(value):
  """Wrap one bool / enum / int / uint value in a tf.train.Feature backed by an Int64List."""
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)

def write_shard(shard, zippath, output_filename, num_zeros, n_shards, shard_size, subdir, filenames, labels):
    """Write one TFRecord shard of resized, JPEG-encoded images with their labels.

    Each record holds two features: 'label' (int64) and 'image' (JPEG bytes of
    the image resized to IMG_HEIGHT x IMG_WIDTH).

    Args:
        shard: Zero-based index of the shard to write.
        zippath: Path of the zip archive that contains the source images.
        output_filename: Prefix of the shard file; the suffix
            ``.<shard+1 zero-padded>-of-<n_shards>`` is appended.
        num_zeros: Width to which the 1-based shard number is zero-padded.
        n_shards: Total number of shards (used only in the file name).
        shard_size: Number of examples per shard.
        subdir: Prefix prepended to each file name to form the archive member name.
        filenames: Sliceable sequence of image file names.
        labels: Sliceable sequence of integer labels, parallel to ``filenames``.

    Errors while writing are printed rather than raised. An image that cannot
    be read, decoded or encoded is skipped, so one bad file does not discard
    the rest of the shard.
    """
    with ZipFile(zippath) as input_zip:
        shard_filename = f'{output_filename}.{str(shard+1).zfill(num_zeros)}-of-{n_shards}'
        start = shard_size * shard
        end = shard_size * (shard+1)
        try:
            with tf.io.TFRecordWriter(shard_filename) as writer:
                for filename, label in zip(filenames[start:end], labels[start:end]):
                    try:
                        raw = input_zip.read(subdir + filename)
                        img = tf.io.decode_image(raw, channels=3, expand_animations=False)
                        # resize() returns float32 that is still in the [0, 255] range.
                        img = tf.image.resize(img, (IMG_HEIGHT, IMG_WIDTH))
                        # convert_image_dtype() would treat the floats as [0, 1] and
                        # rescale them again, so round and saturate-cast instead.
                        img = tf.saturate_cast(tf.round(img), tf.uint8)
                        img_bytes = tf.io.encode_jpeg(img)

                        example = tf.train.Example(features=tf.train.Features(feature={
                            # Plain int: labels may arrive as numpy integer scalars.
                            'label' : _int64_feature(int(label)),
                            'image' : _bytes_feature(img_bytes)
                        }))
                        serialized = example.SerializeToString()
                    except Exception as e:
                        # Best effort per image: report which file failed and keep going.
                        print(f'{shard_filename}: skipping {filename}: {e}')
                        continue

                    writer.write(serialized)
        except Exception as e:
            # Writer-level failure (e.g. the output file cannot be created or written).
            print(e)
            
def worker(shard, zippath, output_filename, num_zeros, n_shards, shard_size, subdir, filenames, labels, sema):
    """Process entry point: write one shard, then release the caller's semaphore slot.

    All arguments except ``sema`` are forwarded unchanged to ``write_shard``.
    ``sema`` is released even when ``write_shard`` raises; otherwise the slot
    would be lost and a caller blocked in ``sema.acquire()`` could wait forever.
    The exception itself still propagates.
    """
    try:
        write_shard(shard, zippath, output_filename, num_zeros, n_shards, shard_size, subdir, filenames, labels)
    finally:
        sema.release()
                    
def write_dataset(filenames, labels, input_file, subdir, output_filename, shard_size):
    """Write ``filenames``/``labels`` as TFRecord shards, one process per shard.

    Args:
        filenames: Sliceable sequence of image file names.
        labels: Sliceable sequence of labels, parallel to ``filenames``.
        input_file: Name of the zip archive, resolved through ``ds_path_of``.
        subdir: Prefix of the image members inside the archive.
        output_filename: Prefix of the shard files that are written.
        shard_size: Maximum number of examples per shard.

    The shard count is rounded up, so a trailing partial shard is written
    instead of dropping the remaining examples. At most ``cpu_count()`` shard
    processes run at once: a semaphore slot is taken before each start and the
    worker gives it back when it finishes.
    """
    # Ceiling division: the last shard may hold fewer than shard_size examples.
    n_shards = -(-len(filenames) // shard_size)
    num_zeros = len(str(n_shards))
    zippath = ds_path_of(input_file)
    sema = Semaphore(cpu_count())
    processes = []
    for shard in range(n_shards):
        # Blocks until one of the running workers releases its slot.
        sema.acquire()
        print(f'\rShard {shard+1}/{n_shards} started',end="")
        proc = Process(target=worker, args=(shard, zippath, output_filename, num_zeros, n_shards, shard_size, subdir, filenames, labels, sema))
        processes.append(proc)
        proc.start()

    for proc in processes:
        proc.join()
    print('Done')
            

In [None]:
# Directory that receives the generated TFRecord shards.
save_path = '/kaggle/working/tfrecords/'

# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create race.
os.makedirs(save_path, exist_ok=True)

In [None]:
# Convert the training split into TFRecord shards under save_path.
write_dataset(
    filenames=train_filenames,
    labels=train_labels,
    input_file='train.zip',
    subdir='train/',
    output_filename=os.path.join(save_path, 'train.tfrecord'),
    shard_size=EXAMPLES_PER_SHARD,
)

In [None]:
# Convert the test split into TFRecord shards under save_path.
write_dataset(
    filenames=test_filenames,
    labels=test_labels,
    input_file='test.zip',
    subdir='test/',
    output_filename=os.path.join(save_path, 'test.tfrecord'),
    shard_size=EXAMPLES_PER_SHARD,
)