In [16]:
import rplanpy
import numpy as np
import multiprocessing as mp
import logging
import os
import glob
import h5py
from tqdm import tqdm

In [17]:
# Send INFO-and-above log records to batch_save.log.
logging.basicConfig(filename='batch_save.log', level=logging.INFO)
# Size of the worker pool created in process_batch. This is a fixed value;
# the host's core count is not queried, so adjust it by hand per machine.
NUM_PROCESSES = 8


In [18]:


def prepare_feature_single_image(image_path):
    """Extract input and target features from one floor-plan image.

    The image is loaded with ``rplanpy``, turned into its room graph, and each
    room bounding box is expressed as a fraction of the site bounding box
    (offset by the site's top-left corner, divided by site width/height).

    Args:
        image_path: Path of the floor-plan image to process.

    Returns:
        ``(input_features, target_features)`` on success, where

        * ``input_features`` has ``site_dim`` (``[width, height]`` of the site
          bounding box) and ``room_category`` (one entry per graph node);
        * ``target_features`` has ``room_area_norm`` (normalised box area per
          room), ``room_bb_norm`` (``[y0, x0, y1, x1]`` per room),
          ``edge_list`` (``(src, target)`` pairs), ``edge_door`` and
          ``edge_location`` (one entry per edge).

        ``None`` when anything fails; the exception and its traceback are
        written to the log so the cause is not lost.
    """
    try:
        logging.info(f"{image_path}")
        data = rplanpy.data.RplanData(image_path)
        data.set_graph()
        graph = data.get_graph()

        min_row, min_col, max_row, max_col = graph.graph['site_bounding_box']
        site_width = max_col - min_col
        site_height = max_row - min_row
        # Fail with a clear message instead of an anonymous ZeroDivisionError
        # (or silently negative coordinates) further down.
        if site_width <= 0 or site_height <= 0:
            raise ValueError(
                f"degenerate site bounding box: width={site_width}, height={site_height}"
            )

        # ---- input features (one set per file) ----
        site_dim = [site_width, site_height]  # [0] width, [1] height of the site
        room_categories = []                  # category of each room

        # ---- target features (one set per file) ----
        room_area_norm = []  # normalised bounding-box area of each room
        room_bb_norm = []    # [y0, x0, y1, x1] of each room, site-relative
        edge_list = []       # (source, target) index pair of each edge
        edge_location = []   # location of src w.r.t. target; mapping in rplanpy/util.py
        edge_door = []       # whether a door connects src and target

        for node in graph.nodes:
            attrs = graph.nodes[node]
            room_categories.append(attrs['category'])

            r_min_row, r_min_col, r_max_row, r_max_col = attrs['bounding_box']
            x0_norm = (r_min_col - min_col) / site_width
            y0_norm = (r_min_row - min_row) / site_height
            x1_norm = (r_max_col - min_col) / site_width
            y1_norm = (r_max_row - min_row) / site_height

            room_area_norm.append((y1_norm - y0_norm) * (x1_norm - x0_norm))
            room_bb_norm.append([y0_norm, x0_norm, y1_norm, x1_norm])

        for src, target, edge_data in graph.edges(data=True):
            # rplan node ids start at 1; shift to 0-based indices.
            # NOTE(review): this assumes ids are contiguous and in the same
            # order as the per-room lists above - confirm against rplanpy.
            edge_list.append((src - 1, target - 1))
            edge_location.append(edge_data['location'])
            edge_door.append(edge_data['door'])

        # Debug level keeps the per-image detail out of the notebook output.
        logging.debug(f"edge_list_for: {image_path} \n{edge_list}\n")

        input_feature_single_image = {'site_dim': site_dim,
                                      'room_category': room_categories}
        target_feature_single_image = {'room_area_norm': room_area_norm,
                                       'room_bb_norm': room_bb_norm,
                                       'edge_list': edge_list,
                                       'edge_door': edge_door,
                                       'edge_location': edge_location}

        return input_feature_single_image, target_feature_single_image

    except Exception as e:
        print(f"error in file: {image_path}")
        # Record the actual cause (with traceback); previously it was dropped.
        logging.exception(f"error in file: {image_path}: {e}")
        return None
    

In [19]:


def process_image_wrapper(image_path):
    """Pool-friendly wrapper around ``prepare_feature_single_image``.

    Always returns a 2-tuple so the caller can unpack every result uniformly.

    Args:
        image_path: Path of the image to process.

    Returns:
        ``(input_feature, target_feature)`` on success, ``(None, None)`` if
        feature extraction returned ``None`` or raised.
    """
    try:
        features = prepare_feature_single_image(image_path)
        # The extractor signals failure with None; unpacking it would raise a
        # TypeError and log a misleading "cannot unpack NoneType" error.
        if features is None:
            logging.warning(f"no features extracted for {image_path}")
            return (None, None)
        input_feature, target_feature = features
        return (input_feature, target_feature)
    except Exception as e:
        logging.error(f"error processing {image_path}: {e}")
        return (None, None)


def process_batch(image_paths, batch_index):
    """Extract features for one batch of images in parallel and save them.

    Images are processed by a pool of ``NUM_PROCESSES`` workers. Images whose
    extraction failed are skipped (and named in the log); the remaining
    results are written with ``save_batch``.

    Args:
        image_paths: Sequence of image paths belonging to this batch.
        batch_index: Index of the batch, used for the progress label and
            passed on to ``save_batch``.
    """
    # Avoid spawning a worker pool when there is nothing to do.
    if len(image_paths) == 0:
        logging.warning(f"Batch {batch_index} has no images; nothing to do.")
        return

    with mp.Pool(processes=NUM_PROCESSES) as pool:
        results = list(tqdm(
            pool.imap(process_image_wrapper, image_paths),
            total=len(image_paths),
            desc=f"Processing batch {batch_index}"
        ))

    input_batch = []
    target_batch = []
    # imap yields results in input order, so paths and results line up.
    for image_path, (input_data, target_data) in zip(image_paths, results):
        if input_data is not None and target_data is not None:
            input_batch.append(input_data)
            target_batch.append(target_data)
        else:
            logging.warning(f"Skipped {image_path} due to processing error.")

    if input_batch and target_batch:
        save_batch(input_batch, target_batch, batch_index)
        logging.info(f"Finished batch {batch_index}")
    else:
        # Make a fully failed batch visible instead of ending silently.
        logging.warning(f"Batch {batch_index} produced no usable images; nothing saved.")

In [20]:

def _pad_ragged(sequences, dtype, item_shape=()):
    """Zero-pad variable-length sequences into one dense array.

    Returns ``(padded, lengths)``: ``padded`` has shape
    ``(len(sequences), max_len) + item_shape`` and ``lengths`` holds the true
    length of every sequence as int32.
    """
    lengths = np.array([len(seq) for seq in sequences], dtype=np.int32)
    shape = (len(sequences), int(lengths.max())) + tuple(item_shape)
    padded = np.zeros(shape, dtype=dtype)
    for row, seq in enumerate(sequences):
        # Skip empty sequences: an empty list cannot be broadcast into a
        # (0, k) slice, and there is nothing to copy anyway.
        if len(seq):
            padded[row, :len(seq)] = seq
    return padded, lengths


def save_batch(input_batch, target_batch, batch_index, output_dir='preprocess_data'):
    """Write one batch of extracted features to ``<output_dir>/batch_<index>.h5``.

    Per-room features are variable-length, so they are zero-padded to the
    longest record in the batch and stored together with their true lengths.
    Edges of all records are concatenated; ``edge_offsets[i]:edge_offsets[i+1]``
    is the slice belonging to record ``i``.

    Datasets written: ``site_dim``, ``room_category``, ``room_category_lens``,
    ``room_area_norm``, ``room_area_lens``, ``room_bb_norm``,
    ``room_bb_norm_lens``, ``edges_src``, ``edges_tgt``, ``edges_door``,
    ``edges_location``, ``edge_offsets``.

    Args:
        input_batch: List of input-feature dicts (``site_dim``,
            ``room_category``), one per image.
        target_batch: List of target-feature dicts (``room_area_norm``,
            ``room_bb_norm``, ``edge_list``, ``edge_door``, ``edge_location``),
            aligned with ``input_batch``.
        batch_index: Index used in the output file name.
        output_dir: Directory for the HDF5 file; created if missing.

    Raises:
        ValueError: If the batch is empty or the two lists differ in length.
    """
    # Validate before touching the file system so a bad call cannot leave a
    # truncated HDF5 file behind.
    if len(input_batch) == 0 or len(target_batch) == 0:
        raise ValueError("save_batch requires a non-empty batch")
    if len(input_batch) != len(target_batch):
        raise ValueError(
            f"input_batch ({len(input_batch)}) and target_batch "
            f"({len(target_batch)}) must have the same length"
        )

    # ---- inputs ----
    site_dim = np.array([item['site_dim'] for item in input_batch], dtype=np.float32)
    room_categories_padded, room_categories_lens = _pad_ragged(
        [item['room_category'] for item in input_batch], np.int32)

    # ---- per-room targets ----
    room_areas_padded, room_areas_lens = _pad_ragged(
        [item['room_area_norm'] for item in target_batch], np.float32)
    room_bbs_padded, room_bbs_lens = _pad_ragged(
        [item['room_bb_norm'] for item in target_batch], np.float32, item_shape=(4,))

    # ---- edges: flattened across the batch, with per-record offsets ----
    edge_lists = [item['edge_list'] for item in target_batch]
    edge_doors = [item['edge_door'] for item in target_batch]
    edge_locations = [item['edge_location'] for item in target_batch]

    total_edges = sum(len(edges) for edges in edge_lists)
    edges_src = np.zeros(total_edges, dtype=np.int32)
    edges_tgt = np.zeros(total_edges, dtype=np.int32)
    edges_door = np.zeros(total_edges, dtype=np.bool_)
    edges_location = np.zeros(total_edges, dtype=np.int32)

    offsets = [0]
    idx = 0
    for edges, doors, locations in zip(edge_lists, edge_doors, edge_locations):
        for (src, target), door, loc in zip(edges, doors, locations):
            edges_src[idx] = src
            edges_tgt[idx] = target
            edges_door[idx] = door
            edges_location[idx] = loc
            idx += 1
        offsets.append(idx)
    # Explicit dtype so the stored type does not depend on the platform's
    # default integer width.
    edge_offsets = np.asarray(offsets, dtype=np.int64)

    os.makedirs(output_dir, exist_ok=True)
    filename = os.path.join(output_dir, f'batch_{batch_index}.h5')

    with h5py.File(filename, 'w') as hf:
        hf.create_dataset('site_dim', data=site_dim)

        hf.create_dataset('room_category', data=room_categories_padded)
        hf.create_dataset('room_category_lens', data=room_categories_lens)

        hf.create_dataset('room_area_norm', data=room_areas_padded)
        hf.create_dataset('room_area_lens', data=room_areas_lens)

        hf.create_dataset('room_bb_norm', data=room_bbs_padded)
        hf.create_dataset('room_bb_norm_lens', data=room_bbs_lens)

        hf.create_dataset('edges_src', data=edges_src)
        hf.create_dataset('edges_tgt', data=edges_tgt)
        hf.create_dataset('edges_door', data=edges_door)
        hf.create_dataset('edges_location', data=edges_location)
        hf.create_dataset('edge_offsets', data=edge_offsets)


In [21]:
def process_all_images(image_dir, batch_size):
    """Process every ``*.png`` in ``image_dir`` in fixed-size batches.

    Paths are sorted before batching so that batch composition is the same on
    every run and every platform (``glob`` itself returns files in arbitrary
    order). Each batch is handed to ``process_batch`` with its 0-based index.

    Args:
        image_dir: Directory containing the floor-plan images.
        batch_size: Number of images per batch; must be positive.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # NOTE: basicConfig does nothing when the root logger already has handlers
    # (for instance if logging was configured earlier in the session), so this
    # file name and format only apply on a not-yet-configured logger.
    logging.basicConfig(filename='data_processing.log', level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')

    # Collect all image paths in a deterministic order.
    image_paths = sorted(glob.glob(os.path.join(image_dir, '*.png')))  # Adjust the pattern as needed

    total_images = len(image_paths)
    total_batches = (total_images + batch_size - 1) // batch_size

    logging.info(f"Starting processing of {total_images} images in {total_batches} batches.")
    if total_images == 0:
        logging.warning(f"No '*.png' images found in {image_dir}; nothing to process.")
        return

    for batch_index, start_idx in enumerate(range(0, total_images, batch_size)):
        # Slicing clamps at the end of the list, so the last batch may be short.
        batch_image_paths = image_paths[start_idx:start_idx + batch_size]
        logging.info(f"Processing batch {batch_index + 1}/{total_batches} with {len(batch_image_paths)} images.")
        process_batch(batch_image_paths, batch_index)

In [None]:
# Directory that is searched for '*.png' floor-plan images.
IMAGE_DIR = '../dataset/floorplan_dataset'
# Alternative directory, kept for switching by hand:
#IMAGE_DIR =  './floorplan_dataset_temp'
# Number of images handed to process_batch (and written to one .h5 file) at a time.
BATCH_SIZE = 10



# Run the whole pipeline, then mark completion in both the log and the cell output.
process_all_images(IMAGE_DIR,BATCH_SIZE)
logging.info("-x-x-x-x-x-x-Finished-x-x-x-x-x-x-x-x-x")
print("-x-x-x-x-x-x-xx-Finished-x-x--x-x--x-x-x")