In [1]:
import os
import PIL
from PIL import Image
import gzip
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
def load_data_from_idx_file(path, idx=1):
    """Load a gzip-compressed IDX file into a NumPy array.

    The layout read here is: a 4-byte magic number whose third byte is the
    element-type code, a 4-byte big-endian size for the first dimension,
    ``idx - 1`` further 4-byte big-endian dimension sizes, and then the raw
    element data.

    Parameters
    ----------
    path : str
        Path to the ``.gz`` IDX file.
    idx : int, default 1
        Total number of dimensions stored in the file, i.e. how many size
        fields follow the magic number (3 for an ``idx3`` file, 1 for an
        ``idx1`` file).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(data_count, *dims)``. It is created with
        ``np.frombuffer`` over the bytes read from the file, so it is not
        writable. Multi-byte element types carry an explicit big-endian
        dtype (e.g. ``>i2``); single-byte types are unaffected.

    Raises
    ------
    KeyError
        If the type code in the magic number is not in ``DATA_TYPE_MAP``.
    ValueError
        If the payload size does not match the dimensions from the header
        (raised by ``np.frombuffer`` / ``reshape``).
    """
    # Element-type codes defined by the IDX format (third byte of the magic number).
    DATA_TYPE_MAP = {
        8: np.uint8,     # 0x08
        9: np.int8,      # 0x09
        11: np.int16,    # 0x0B
        12: np.int32,    # 0x0C
        13: np.float32,  # 0x0D
        14: np.float64,  # 0x0E
    }
    print("Load data from :", path)

    with gzip.open(path, 'r') as f:
        magic_byte = f.read(4)
        magic_number = int.from_bytes(magic_byte, 'big')
        print("\t magic_number:", magic_number)

        # The first size field is the number of items in the file.
        data_count = int.from_bytes(f.read(4), 'big')
        print("\t data_count:", data_count)

        # Remaining size fields describe the shape of a single item.
        dims = [int.from_bytes(f.read(4), 'big') for _ in range(1, idx)]
        print("\t dims:", dims)

        # IDX stores elements most-significant-byte first, so the dtype must be
        # big-endian; a native dtype would byte-swap int16/int32/float values
        # on little-endian machines.
        data_type = np.dtype(DATA_TYPE_MAP[magic_byte[2]]).newbyteorder('>')
        data_arr = np.frombuffer(f.read(), dtype=data_type).reshape((data_count, *dims))
        print("\t data_arr.shape:", data_arr.shape, "\n")
    return data_arr

In [3]:
def convert_and_save(
    images_gz_path,
    labels_gz_path,
    prefix,
    dst_dir
):
    """Export an IDX image/label pair as PNG files plus a CSV of labels.

    Each image is written to ``<dst_dir>/<prefix>/<prefix>_<index>.png``
    (index zero-padded to five digits) and a table mapping file name to label
    is written to ``<dst_dir>/<prefix>_labels.csv``.

    Parameters
    ----------
    images_gz_path : str
        Path to the gzip-compressed IDX file holding the images; it is read
        with ``idx=3``.
    labels_gz_path : str
        Path to the gzip-compressed IDX file holding the labels; it is read
        with ``idx=1``.
    prefix : str
        Name of the image sub-directory and prefix of every generated file.
    dst_dir : str
        Output root directory; created if missing.

    Returns
    -------
    pandas.DataFrame
        One row per image with columns ``name`` and ``label``.

    Raises
    ------
    ValueError
        If the image and label files contain different numbers of entries.
    """
    img_save_dir = os.path.join(dst_dir, prefix)
    # exist_ok avoids the check-then-create race and keeps re-runs idempotent.
    os.makedirs(img_save_dir, exist_ok=True)

    image_arr = load_data_from_idx_file(images_gz_path, idx=3)
    label_arr = load_data_from_idx_file(labels_gz_path, idx=1)

    # zip() would silently drop the surplus entries of the longer array, so a
    # mismatched pair of files is reported instead of producing a partial export.
    if len(image_arr) != len(label_arr):
        raise ValueError(
            f"image/label count mismatch: {len(image_arr)} images "
            f"vs {len(label_arr)} labels"
        )

    data_list = []
    pbar = tqdm(zip(image_arr, label_arr), total=len(image_arr))
    for i, (img, label) in enumerate(pbar):
        save_name = f"{prefix}_{i:05}.png"
        save_path = os.path.join(img_save_dir, save_name)
        PIL.Image.fromarray(img).save(save_path)
        data_list.append({
            "name": save_name,
            "label": label
        })

    # Fixing the columns keeps the CSV header present even for empty input.
    metadata_df = pd.DataFrame(data_list, columns=["name", "label"])
    metadata_df.to_csv(os.path.join(dst_dir, f"{prefix}_labels.csv"), index=False)
    return metadata_df

In [4]:
# Export the test split: PNGs are written under <dst_dir>/test/ and the
# name-to-label table to <dst_dir>/test_labels.csv. The returned DataFrame
# is the cell's displayed value.
convert_and_save(
    dst_dir="/DATA_1/Projects/exercise/mnist_pytorch/mnist",
    prefix="test",
    images_gz_path="/DATA_1/dataset/MNIST/t10k-images-idx3-ubyte.gz",
    labels_gz_path="/DATA_1/dataset/MNIST/t10k-labels-idx1-ubyte.gz",
)

  3%|███▍                                                                                                               | 302/10000 [00:00<00:03, 3018.95it/s]

Load data from : /DATA_1/dataset/MNIST/t10k-images-idx3-ubyte.gz
	 magic_number: 2051
	 data_count: 10000
	 dims: [28, 28]
	 data_arr.shape: (10000, 28, 28) 

Load data from : /DATA_1/dataset/MNIST/t10k-labels-idx1-ubyte.gz
	 magic_number: 2049
	 data_count: 10000
	 dims: []
	 data_arr.shape: (10000,) 



100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:03<00:00, 3011.70it/s]


Unnamed: 0,name,label
0,test_00000.png,7
1,test_00001.png,2
2,test_00002.png,1
3,test_00003.png,0
4,test_00004.png,4
...,...,...
9995,test_09995.png,2
9996,test_09996.png,3
9997,test_09997.png,4
9998,test_09998.png,5


In [5]:
# Export the training split: PNGs are written under <dst_dir>/train/ and the
# name-to-label table to <dst_dir>/train_labels.csv. The returned DataFrame
# is the cell's displayed value.
convert_and_save(
    dst_dir="/DATA_1/Projects/exercise/mnist_pytorch/mnist",
    prefix="train",
    images_gz_path="/DATA_1/dataset/MNIST/train-images-idx3-ubyte.gz",
    labels_gz_path="/DATA_1/dataset/MNIST/train-labels-idx1-ubyte.gz",
)

Load data from : /DATA_1/dataset/MNIST/train-images-idx3-ubyte.gz
	 magic_number: 2051
	 data_count: 60000
	 dims: [28, 28]


  1%|▌                                                                                                                  | 315/60000 [00:00<00:18, 3146.27it/s]

	 data_arr.shape: (60000, 28, 28) 

Load data from : /DATA_1/dataset/MNIST/train-labels-idx1-ubyte.gz
	 magic_number: 2049
	 data_count: 60000
	 dims: []
	 data_arr.shape: (60000,) 



100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 60000/60000 [00:20<00:00, 2979.63it/s]


Unnamed: 0,name,label
0,train_00000.png,5
1,train_00001.png,0
2,train_00002.png,4
3,train_00003.png,1
4,train_00004.png,9
...,...,...
59995,train_59995.png,8
59996,train_59996.png,3
59997,train_59997.png,5
59998,train_59998.png,6
