# Convert Places365 data to pngs

To show overhead of loading individual files. This is intended to be used (and should be run before) the `PyTorch_profiling.ipynb` notebook.

In [47]:
# From https://github.com/mttk/STL10/blob/master/stl10_input.py

from __future__ import print_function

import sys
import os, sys, tarfile, errno
import numpy as np
import matplotlib.pyplot as plt
import pickle
import time

from multiprocessing import Pool, Manager
    
if sys.version_info >= (3, 0, 0):
    import urllib.request as urllib # ugly but works
else:
    import urllib

from PIL import Image
#try:
#    from imageio import imsave
#except:
#    from scipy.misc import imsave

print(sys.version_info) 

# image shape
HEIGHT = 96
WIDTH = 96
DEPTH = 3

# size of a single image in bytes
SIZE = HEIGHT * WIDTH * DEPTH

# path to the directory with the data
DATA_DIR = os.getenv('TEACHER_DIR', os.getcwd()) + '/JHL_data'

# url of the binary data
DATA_URL = 'http://ai.stanford.edu/~acoates/stl10/stl10_binary.tar.gz'

# path to the binary train file with image data
DATA_PATH = os.path.join(DATA_DIR, 'stl10_binary/train_X.bin')

# path to the binary train file with labels
LABEL_PATH = os.path.join(DATA_DIR, 'stl10_binary/train_y.bin')

def read_labels(path_to_labels):
    """
    :param path_to_labels: path to the binary file containing labels from the STL-10 dataset
    :return: an array containing the labels
    """
    with open(path_to_labels, 'rb') as f:
        labels = np.fromfile(f, dtype=np.uint8)
        return labels


def read_all_images(path_to_data):
    """
    :param path_to_data: the file containing the binary images from the STL-10 dataset
    :return: an array containing all the images
    """

    with open(path_to_data, 'rb') as f:
        # read whole file in uint8 chunks
        everything = np.fromfile(f, dtype=np.uint8)

        # We force the data into 3x96x96 chunks, since the
        # images are stored in "column-major order", meaning
        # that "the first 96*96 values are the red channel,
        # the next 96*96 are green, and the last are blue."
        # The -1 is since the size of the pictures depends
        # on the input file, and this way numpy determines
        # the size on its own.

        images = np.reshape(everything, (-1, 3, 96, 96))

        # Now transpose the images into a standard image format
        # readable by, for example, matplotlib.imshow
        # You might want to comment this line or reverse the shuffle
        # if you will use a learning algorithm like CNN, since they like
        # their channels separated.
        images = np.transpose(images, (0, 3, 2, 1))
        return images


def read_single_image(image_file):
    """
    CAREFUL! - this method uses a file as input instead of the path - so the
    position of the reader will be remembered outside of context of this method.
    :param image_file: the open file containing the images
    :return: a single image
    """
    # read a single image, count determines the number of uint8's to read
    image = np.fromfile(image_file, dtype=np.uint8, count=SIZE)
    # force into image matrix
    image = np.reshape(image, (3, 96, 96))
    # transpose to standard format
    # You might want to comment this line or reverse the shuffle
    # if you will use a learning algorithm like CNN, since they like
    # their channels separated.
    image = np.transpose(image, (2, 1, 0))
    return image


def plot_image(image):
    """
    :param image: the image to be plotted in a 3-D matrix format
    :return: None
    """
    plt.imshow(image)
    plt.show()

def save_image(image, name):
    #imsave("%s.png" % name, image, format="png")
    pil_image = Image.fromarray(image)
    pil_image.save(f'{name}')

def download_and_extract():
    """
    Download and extract the STL-10 dataset
    :return: None
    """
    dest_directory = DATA_DIR
    if not os.path.exists(dest_directory):
        os.makedirs(dest_directory)
    filename = DATA_URL.split('/')[-1]
    filepath = os.path.join(dest_directory, filename)
    if not os.path.exists(filepath):
        def _progress(count, block_size, total_size):
            sys.stdout.write('\rDownloading %s %.2f%%' % (filename,
                float(count * block_size) / float(total_size) * 100.0))
            sys.stdout.flush()
        filepath, _ = urllib.urlretrieve(DATA_URL, filepath, reporthook=_progress)
        print('Downloaded', filename)
        tarfile.open(filepath, 'r:gz').extractall(dest_directory)

def save_img(img, label, j, iteration):
    
def save_images(images, labels):
    print("Saving images to disk")
    label_filename = os.path.join(DATA_DIR, "stl10_labels.pkl")
    img_dir = os.path.join(DATA_DIR, "stl10_png")
  
    # Create 1000 copies of this dataset, so we have a total of 95 GB of data, to avoid the FS caching everything
    j = 0
    
    with Manager() as manager:
        label_dict = manager.dict()
    for image in images:
        tic = time.perf_counter()
        label = labels[j]
        #print(f"Label: {label}")
        pil_image = Image.fromarray(image)
        for iteration in range(1000):
            filename = str(label) + "_" + str(j) + "_" str(iteration) + str(".png")
            #print(f"Saving file: {filename}")
    
            full_filepath = os.path.join(img_dir, filename)
            #print(f"At filepath: {full_filepath}")
    
            pil_image.save(f'{full_filepath}')
        
            # Store label in dictionary
            label_dict[filename] = label
    
        j = j+1
        
        print(f"{100*j/5000}%")
                
        toc = time.perf_counter()
        print(f"Image took {toc - tic:0.4f} seconds")
    
    print(f"Storing labels at: {label_filename}")
    with open(label_filename, "wb") as f:
        pickle.dump(label_dict, f)
        
        # print(filename)
        # save_image_and_label(image, filename, label)

    


IndentationError: expected an indented block (3412865834.py, line 140)

In [53]:
# From https://github.com/mttk/STL10/blob/master/stl10_input.py

from __future__ import print_function

import sys
import os, sys, tarfile, errno
import numpy as np
import matplotlib.pyplot as plt
import pickle
import time

from multiprocessing import Pool, Manager
    
if sys.version_info >= (3, 0, 0):
    import urllib.request as urllib # ugly but works
else:
    import urllib

from PIL import Image
#try:
#    from imageio import imsave
#except:
#    from scipy.misc import imsave

print(sys.version_info) 

# image shape
HEIGHT = 96
WIDTH = 96
DEPTH = 3

# size of a single image in bytes
SIZE = HEIGHT * WIDTH * DEPTH

# path to the directory with the data
DATA_DIR = os.getenv('TEACHER_DIR', os.getcwd()) + '/JHL_data'

# url of the binary data
DATA_URL = 'http://ai.stanford.edu/~acoates/stl10/stl10_binary.tar.gz'

# path to the binary train file with image data
DATA_PATH = os.path.join(DATA_DIR, 'stl10_binary/train_X.bin')

# path to the binary train file with labels
LABEL_PATH = os.path.join(DATA_DIR, 'stl10_binary/train_y.bin')

def read_labels(path_to_labels):
    """
    :param path_to_labels: path to the binary file containing labels from the STL-10 dataset
    :return: an array containing the labels
    """
    with open(path_to_labels, 'rb') as f:
        labels = np.fromfile(f, dtype=np.uint8)
        return labels


def read_all_images(path_to_data):
    """
    :param path_to_data: the file containing the binary images from the STL-10 dataset
    :return: an array containing all the images
    """

    with open(path_to_data, 'rb') as f:
        # read whole file in uint8 chunks
        everything = np.fromfile(f, dtype=np.uint8)

        # We force the data into 3x96x96 chunks, since the
        # images are stored in "column-major order", meaning
        # that "the first 96*96 values are the red channel,
        # the next 96*96 are green, and the last are blue."
        # The -1 is since the size of the pictures depends
        # on the input file, and this way numpy determines
        # the size on its own.

        images = np.reshape(everything, (-1, 3, 96, 96))

        # Now transpose the images into a standard image format
        # readable by, for example, matplotlib.imshow
        # You might want to comment this line or reverse the shuffle
        # if you will use a learning algorithm like CNN, since they like
        # their channels separated.
        images = np.transpose(images, (0, 3, 2, 1))
        return images


def read_single_image(image_file):
    """
    CAREFUL! - this method uses a file as input instead of the path - so the
    position of the reader will be remembered outside of context of this method.
    :param image_file: the open file containing the images
    :return: a single image
    """
    # read a single image, count determines the number of uint8's to read
    image = np.fromfile(image_file, dtype=np.uint8, count=SIZE)
    # force into image matrix
    image = np.reshape(image, (3, 96, 96))
    # transpose to standard format
    # You might want to comment this line or reverse the shuffle
    # if you will use a learning algorithm like CNN, since they like
    # their channels separated.
    image = np.transpose(image, (2, 1, 0))
    return image


def plot_image(image):
    """
    :param image: the image to be plotted in a 3-D matrix format
    :return: None
    """
    plt.imshow(image)
    plt.show()

def save_image(image, name):
    #imsave("%s.png" % name, image, format="png")
    pil_image = Image.fromarray(image)
    pil_image.save(f'{name}')

def download_and_extract():
    """
    Download and extract the STL-10 dataset
    :return: None
    """
    dest_directory = DATA_DIR
    if not os.path.exists(dest_directory):
        os.makedirs(dest_directory)
    filename = DATA_URL.split('/')[-1]
    filepath = os.path.join(dest_directory, filename)
    if not os.path.exists(filepath):
        def _progress(count, block_size, total_size):
            sys.stdout.write('\rDownloading %s %.2f%%' % (filename,
                float(count * block_size) / float(total_size) * 100.0))
            sys.stdout.flush()
        filepath, _ = urllib.urlretrieve(DATA_URL, filepath, reporthook=_progress)
        print('Downloaded', filename)
        tarfile.open(filepath, 'r:gz').extractall(dest_directory)

def save_img(img, label_dict, label, j, iteration):
    filename = str(label) + "_" + str(j) + "_" + str(iteration) + str(".png")
            #print(f"Saving file: {filename}")
    
    full_filepath = os.path.join(img_dir, filename)
            #print(f"At filepath: {full_filepath}")
    
    img.save(f'{full_filepath}')
        
            # Store label in dictionary
    label_dict[filename] = label
    
def save_images(images, labels):
    print("Saving images to disk")
    label_filename = os.path.join(DATA_DIR, "stl10_labels.pkl")
    img_dir = os.path.join(DATA_DIR, "stl10_png")
  
    # Create 1000 copies of this dataset, so we have a total of 95 GB of data, to avoid the FS caching everything
    j = 0
    
    with Manager() as manager:
        label_dict = manager.dict()
    pool = Pool(processes=6)
    
    for image in images:
        tic = time.perf_counter()
        label = labels[j]
        #print(f"Label: {label}")
        pil_image = Image.fromarray(image)  
        pool.starmap(save_img, ((pil_image, label_dict, label, j, iteration) for iteration in range(1000)))
        #for iteration in range(1000):
            #filename = str(label) + "_" + str(j) + "_" str(iteration) + str(".png")
            #print(f"Saving file: {filename}")
    
            #full_filepath = os.path.join(img_dir, filename)
            #print(f"At filepath: {full_filepath}")
    
            #pil_image.save(f'{full_filepath}')
        
            # Store label in dictionary
            #label_dict[filename] = label
    
        j = j+1
        
        print(f"{100*j/5000}%")
                
        toc = time.perf_counter()
        print(f"Image took {toc - tic:0.4f} seconds")
    
    print(f"Storing labels at: {label_filename}")
    with open(label_filename, "wb") as f:
        pickle.dump(label_dict, f)
        
        # print(filename)
        # save_image_and_label(image, filename, label)

    


sys.version_info(major=3, minor=9, micro=5, releaselevel='final', serial=0)


In [54]:
# download data if needed
# download_and_extract()

# test to check if the image is read correctly
with open(DATA_PATH) as f:
    image = read_single_image(f)
    plot_image(image)

# test to check if the whole dataset is read correctly
images = read_all_images(DATA_PATH)
print(images.shape)

labels = read_labels(LABEL_PATH)
print(labels.shape)

# save images to disk
save_images(images, labels)

(5000, 96, 96, 3)
(5000,)
Saving images to disk


Process ForkPoolWorker-66:
Process ForkPoolWorker-67:
Process ForkPoolWorker-70:
Process ForkPoolWorker-65:
Process ForkPoolWorker-69:
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-68:
  File "/sw/arch/Debian10/EB_production/2021/software/Python/3.9.5-GCCcore-10.3.0/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/sw/arch/Debian10/EB_production/2021/software/Python/3.9.5-GCCcore-10.3.0/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/sw/arch/Debian10/EB_production/2021/software/Python/3.9.5-GCCcore-10.3.0/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/sw/arch/Debian10/EB_production/2021/software/Python/3.9.5-GCCcore-10.3.0/lib/python3.9/multiprocessing/

KeyboardInterrupt: 

  File "/sw/arch/Debian10/EB_production/2021/software/Python/3.9.5-GCCcore-10.3.0/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/sw/arch/Debian10/EB_production/2021/software/Python/3.9.5-GCCcore-10.3.0/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/sw/arch/Debian10/EB_production/2021/software/Python/3.9.5-GCCcore-10.3.0/lib/python3.9/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/sw/arch/Debian10/EB_production/2021/software/Python/3.9.5-GCCcore-10.3.0/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/sw/arch/Debian10/EB_production/2021/software/Python/3.9.5-GCCcore-10.3.0/lib/python3.9/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/sw/arch/Debian10/EB_production/2021/software/Python/3.9.5-GCCcore-10.3.0/lib/python3.9/multiprocessing/process.py", line 108, in run


In [None]:
print(labels)

Create path to where pngs should be stored

In [47]:
DATA_PATH = os.getenv('TEACHER_DIR', os.getcwd()) + '/JHL_data'

png_path=os.path.join(DATA_PATH,'places365_png')
if not os.path.exists(png_path):
    os.makedirs(png_path)

print(f"Path to store places365 data: {DATA_PATH}")
print(f"Path to store pngs: {png_path}")

Path to store cifar10 data: /project/jhlsrf016/JHL_data
Path to store pngs: /project/jhlsrf016/JHL_data/cifar10_png


Make sure Cifar10 data is present in the DATA_PATH