In [1]:
pip install lmdb


Note: you may need to restart the kernel to use updated packages.


In [2]:
import lmdb
import os
import pandas as pd
from PIL import Image
import io
import random

In [3]:
# Dataset root; every other location below is resolved relative to it.
data_dir = 'data/imagenet_dataset'

# Image folders of the train and validation splits.
train_dir, val_dir = (
    os.path.join(data_dir, split_subdir)
    for split_subdir in ('ILSVRC/Data/CLS-LOC/train', 'ILSVRC/Data/CLS-LOC/val')
)

# Synset mapping file and the validation solution CSV.
mapping_file, val_solution_file = (
    os.path.join(data_dir, file_name)
    for file_name in ('LOC_synset_mapping.txt', 'LOC_val_solution.csv')
)

In [4]:
def load_label_mapping(mapping_file):
    """Build the wnid -> class-index mapping from a synset mapping file.

    The first whitespace-separated token of every non-blank line is taken as
    the WordNet ID (wnid). IDs are numbered from 0 in the order in which they
    appear in the file.

    Blank or whitespace-only lines (for example a stray empty line at the end
    of the file) are skipped and do not consume an index.

    Args:
        mapping_file: Path of the mapping file (e.g. LOC_synset_mapping.txt).

    Returns:
        dict mapping each wnid string to its integer index. If a wnid occurs
        more than once, the index of its last occurrence wins.
    """
    wnid_to_idx = {}
    with open(mapping_file, 'r') as f:
        # Iterate the file lazily and drop blank lines up front so that
        # split()[0] can never fail on an empty token list.
        non_blank_lines = (line for line in f if line.strip())
        for idx, line in enumerate(non_blank_lines):
            wnid_to_idx[line.split()[0]] = idx
    return wnid_to_idx


In [5]:
def image_to_bytes(image_path, size=(224, 224)):
    """Load an image, normalise it, and return it re-encoded as JPEG bytes.

    The image is converted to RGB when it is in any other mode, resized to
    exactly ``size`` (no cropping or padding is applied, so the aspect ratio
    is not preserved), and serialised as JPEG into an in-memory buffer.

    Args:
        image_path: Path of the image file to read.
        size: Target ``(width, height)`` passed to ``Image.resize``.
            Defaults to ``(224, 224)``.

    Returns:
        bytes: The JPEG-encoded image.
    """
    with Image.open(image_path) as img:
        if img.mode != 'RGB':
            img = img.convert('RGB')  # Force conversion to RGB
        img = img.resize(size)
        buffer = io.BytesIO()
        img.save(buffer, format='JPEG')
        return buffer.getvalue()

def create_lmdb(dataset, lmdb_path, map_size=200 * 1024**3, commit_interval=25000):
    """Write an iterable of (image_path, label) pairs into an LMDB database.

    Record ``idx`` (0-based position in ``dataset``) is stored under two keys:
    ``image-{idx:09d}`` holds the bytes returned by ``image_to_bytes`` and
    ``label-{idx:09d}`` holds ``str(label)`` encoded as bytes.

    Records are committed every ``commit_interval`` items rather than in one
    transaction spanning the whole dataset, so an interruption or a failing
    image loses at most the current uncommitted batch. The environment is
    always closed, also when an error occurs; the error itself is re-raised.

    Args:
        dataset: Iterable of ``(image_path, label)`` pairs.
        lmdb_path: Location passed to ``lmdb.open``.
        map_size: Maximum database size in bytes passed to ``lmdb.open``.
        commit_interval: Number of records written per transaction.

    Raises:
        ValueError: If ``commit_interval`` is smaller than 1.
    """
    if commit_interval < 1:
        raise ValueError(f"commit_interval must be >= 1, got {commit_interval}")

    env = lmdb.open(lmdb_path, map_size=map_size)
    try:
        txn = env.begin(write=True)
        try:
            for idx, (image_path, label) in enumerate(dataset):
                key_img = f'image-{idx:09d}'.encode()
                key_label = f'label-{idx:09d}'.encode()

                img_bytes = image_to_bytes(image_path)
                txn.put(key_img, img_bytes)
                txn.put(key_label, str(label).encode())

                if idx % 25000 == 0:
                    print(f"Stored {idx} images")

                # Persist the finished batch and start a fresh transaction so
                # completed work survives a later failure.
                if (idx + 1) % commit_interval == 0:
                    txn.commit()
                    txn = env.begin(write=True)
        except BaseException:
            # Also covers KeyboardInterrupt: drop only the in-flight batch.
            txn.abort()
            raise
        txn.commit()
    finally:
        env.close()
    print(f"Finished creating LMDB at {lmdb_path}")


In [6]:
# Build the wnid -> class-index lookup used when labelling train and val images
wnid_to_label = load_label_mapping(mapping_file=mapping_file)

In [7]:
# Train LMDB creation (shuffled): collect (image_path, label) pairs first.
# os.listdir() returns entries in arbitrary order, so both listings are sorted
# to make the collected list identical from run to run.
train_images_labels = []
for wnid in sorted(os.listdir(train_dir)):
    wnid_folder = os.path.join(train_dir, wnid)
    if os.path.isdir(wnid_folder):
        # Every image inside a class folder gets that folder's label.
        label = wnid_to_label[wnid]
        for img_file in sorted(os.listdir(wnid_folder)):
            img_path = os.path.join(wnid_folder, img_file)
            train_images_labels.append((img_path, label))

In [None]:
# Don't group same wnids together: shuffle before writing.
# A dedicated, seeded generator makes the permutation repeatable for a given
# input order and does not touch the global `random` state.
SHUFFLE_SEED = 0
random.Random(SHUFFLE_SEED).shuffle(train_images_labels)
create_lmdb(train_images_labels, os.path.join(data_dir, 'imagenet_train.lmdb'))

Stored 0 images
Stored 25000 images
Stored 50000 images


KeyboardInterrupt: 

: 

In [None]:

def load_val_annotations():
    """Return (image_path, label) pairs for the validation split.

    Reads the CSV at the module-level ``val_solution_file``. For each row the
    first whitespace-separated token of ``PredictionString`` is looked up in
    ``wnid_to_label`` to obtain the label, and the image path is ``val_dir``
    joined with ``ImageId`` plus the ``.JPEG`` extension.

    Returns:
        list of ``(img_path, label)`` tuples in CSV row order.
    """
    solutions = pd.read_csv(val_solution_file)
    # Only the leading token of each prediction string is used as the wnid.
    wnids = [prediction.split()[0] for prediction in solutions['PredictionString']]
    return [
        (os.path.join(val_dir, image_id + '.JPEG'), wnid_to_label[wnid])
        for image_id, wnid in zip(solutions['ImageId'], wnids)
    ]

In [None]:
# Validation LMDB creation (records are written in CSV row order, unshuffled)
create_lmdb(
    dataset=load_val_annotations(),
    lmdb_path=os.path.join(data_dir, 'imagenet_val.lmdb'),
)

Stored 0 images
Stored 25000 images
Finished creating LMDB at data/imagenet_dataset/imagenet_val.lmdb
