The whole process begins with the raw ModelNet10 data (.OFF files).
An .OFF file only contains the vertices of the model (i.e. the points at each corner).
In order to get points that are evenly spread across all surfaces of the model, we need the Point Cloud Library (PCL) to sample the model.
However, PCL only accepts .PLY files, so a conversion is needed.

Step one: convert the .OFF files to .PLY files.

In [1]:
import numpy as np
import pandas as pd
import h5py
import os
from sklearn import preprocessing

# The ten ModelNet10 class names; each one is also the name of a sub-folder
# below `path`.
categories = [
    'bathtub',
    'bed',
    'chair',
    'desk',
    'dresser',
    'monitor',
    'night_stand',
    'sofa',
    'table',
    'toilet',
]
# Root folder of the extracted dataset (keeps its trailing separator).
path = 'c:\\Users\\sean_\\Downloads\\ModelNet10\\'

def OFFtoPLY(path,categories,DataGroup):
    """Convert every .off mesh of one data split into an ASCII .ply file.

    For each category the folder ``<path>/<category>/<DataGroup>`` is scanned
    and every ``*.off`` file gets a sibling ``*.ply`` file holding the same
    vertex and face lines behind a PLY header.

    Args:
        path: Root folder of the dataset.
        categories: Iterable of category (sub-folder) names.
        DataGroup: Name of the split sub-folder, e.g. 'train' or 'test'.

    Raises:
        ValueError: If the vertex/face counts cannot be found in a file.
    """
    for cat in categories:
        folder = os.path.join(path, cat, DataGroup)
        for file in os.listdir(folder):
            if not file.endswith('.off'):
                continue
            # splitext keeps dots that are part of the base name.
            fileName = os.path.splitext(file)[0]
            with open(os.path.join(folder, file), 'r') as f:
                header = f.readline().strip()
                # Some files fuse the magic word and the counts on one line
                # ("OFF490 518 0"); take the counts from there when present,
                # otherwise they are on the following line.
                counts = header[3:].split() if header.upper().startswith('OFF') else []
                if not counts:
                    counts = f.readline().split()
                if len(counts) < 2:
                    raise ValueError('Cannot read vertex/face counts from ' + file)
                point_count, face_count = counts[0], counts[1]

                # Write the PLY header first, then copy the body unchanged:
                # the vertex lines and the face lines are passed through
                # as they appear in the OFF file.
                with open(os.path.join(folder, fileName + ".ply"), 'w') as plyFile:
                    plyFile.write('ply\nformat ascii 1.0\nelement vertex ')
                    plyFile.write(point_count)
                    plyFile.write('\nproperty float32 x\nproperty float32 y\nproperty float32 z\nelement face ')
                    plyFile.write(face_count)
                    plyFile.write('\nproperty list uint8 int32 vertex_indices\nend_header\n')
                    for _ in range(int(point_count) + int(face_count)):
                        plyFile.write(f.readline())

In [10]:
# Convert the .off meshes of both splits, training data first.
for data_group in ('train', 'test'):
    OFFtoPLY(path, categories, data_group)

Step two: call the tool "pcl_mesh_sampling_release.exe" (for PCL versions higher than 1.9.1) to convert all .PLY data to .PCD.

In [2]:
import subprocess

def PLYtoPCD(path,categories,DataGroup,
             exe_path='C:\\Users\\sean_\\Desktop\\PLYconv\\pcl_mesh_sampling_release.exe',
             n_samples=2200,leaf_size=0.1):
    """Run the PCL mesh-sampling tool on every .ply file of one data split.

    For each ``*.ply`` file in ``<path>/<category>/<DataGroup>`` the external
    tool is invoked with the .ply file as input and a sibling ``*.pcd`` file
    as output. A non-zero exit status is reported with ``print`` and the
    remaining files are still processed.

    Args:
        path: Root folder of the dataset.
        categories: Iterable of category (sub-folder) names.
        DataGroup: Name of the split sub-folder, e.g. 'train' or 'test'.
        exe_path: Location of the mesh-sampling executable.
        n_samples: Value passed to the tool's ``-n_samples`` option.
        leaf_size: Value passed to the tool's ``-leaf_size`` option.
    """
    for cat in categories:
        folder = os.path.join(path, cat, DataGroup)
        for file in os.listdir(folder):
            if not file.endswith('.ply'):
                continue
            source = os.path.join(folder, file)
            target = os.path.join(folder, os.path.splitext(file)[0] + ".pcd")
            status = subprocess.call([exe_path, source, target,
                                      '-no_vis_result',
                                      '-n_samples', str(n_samples),
                                      '-leaf_size', str(leaf_size)])
            if status != 0:
                # Keep going, but make the failure visible instead of
                # silently leaving a missing/broken .pcd behind.
                print('pcl_mesh_sampling failed (exit code %d) for %s' % (status, source))

In [4]:
# Sample point clouds from the .ply meshes of both splits, training data first.
for data_group in ('train', 'test'):
    PLYtoPCD(path, categories, data_group)

Step three: merge the converted .PCD files into one .h5 file. The shape of the data should be [n, 2048, 3].

In [2]:
def _read_pcd_points(pcd_path):
    """Return the first three columns of an ASCII PCD file as an (n, 3) float array."""
    declared = None
    rows = []
    with open(pcd_path, 'r') as f:
        # Header: look the entries up by keyword instead of counting lines,
        # so a missing comment line does not shift the parse.
        for line in f:
            fields = line.split()
            if not fields or fields[0].startswith('#'):
                continue
            keyword = fields[0].upper()
            if keyword == 'POINTS' and len(fields) > 1:
                declared = int(fields[1])
            elif keyword == 'DATA':
                if fields[1:] != ['ascii']:
                    raise ValueError('Only ASCII PCD files are supported: ' + pcd_path)
                break
        # Body: one point per line. Only the first three values are kept
        # (assumed to be x, y, z -- the FIELDS entry is not checked).
        for line in f:
            fields = line.split()
            if len(fields) < 3:
                continue
            rows.append([float(fields[0]), float(fields[1]), float(fields[2])])
    if declared is not None:
        rows = rows[:declared]
    return np.array(rows, dtype=np.float64).reshape(-1, 3)


def _resample_and_normalize(points, num_points):
    """Bring a non-empty cloud to exactly num_points rows, centre it and scale it to unit radius."""
    count = points.shape[0]
    if count < num_points:
        # Too few points: pad by repeating randomly chosen existing points.
        extra = np.random.randint(count, size=num_points - count)
        points = np.append(points, points[extra], axis=0)
    elif count > num_points:
        # Too many points: keep a random subset without repetition.
        keep = np.random.permutation(count)[:num_points]
        points = points[keep]
    points = points - np.mean(points, axis=0)
    radius = np.max(np.sqrt(np.sum(points ** 2, axis=1)))
    if radius > 0:
        # A cloud whose points all coincide has radius 0; leave it at the origin.
        points = points / radius
    return points


def PCDtoH5(path,categories,DataGroup,num_points=2048):
    """Merge all .pcd files of one data split into ``<path><DataGroup>.h5``.

    The output file is (re)created on every call and contains two resizable
    datasets: 'data' with shape (n, num_points, 3) and 'label' with shape
    (n, 1), where the label is the index of the category in ``categories``.
    Every cloud is randomly padded or subsampled to ``num_points`` points,
    centred on its centroid and scaled so that its farthest point has
    distance 1 from the origin. Files without any points are skipped.

    Args:
        path: Root folder of the dataset, including the trailing separator
            (the output file name is built as ``path + DataGroup + '.h5'``).
        categories: Sequence of category (sub-folder) names.
        DataGroup: Name of the split sub-folder, e.g. 'train' or 'test'.
        num_points: Number of points stored per model.

    Raises:
        ValueError: If a .pcd file is not stored in ASCII form.
    """
    # Open the output once and always start from an empty file, so the result
    # never depends on which category happens to contain the first model.
    with h5py.File(path + DataGroup + ".h5", "w") as hf:
        data_set = hf.create_dataset(name='data', shape=(0, num_points, 3),
                                     maxshape=(None, num_points, 3),
                                     dtype=np.float64, chunks=True)
        label_set = hf.create_dataset(name='label', shape=(0, 1),
                                      maxshape=(None, 1),
                                      dtype=np.dtype(int), chunks=True)
        for cat in categories:
            label = categories.index(cat)
            folder = os.path.join(path, cat, DataGroup)
            for file in os.listdir(folder):
                if not file.endswith('.pcd'):
                    continue
                pcd_path = os.path.join(folder, file)
                points = _read_pcd_points(pcd_path)
                if points.shape[0] == 0:
                    print('Skipping point cloud without points: ' + pcd_path)
                    continue
                cloud = _resample_and_normalize(points, num_points)

                # Grow both datasets by one row and fill the new row.
                row = data_set.shape[0]
                data_set.resize(row + 1, axis=0)
                data_set[row] = cloud
                label_set.resize(row + 1, axis=0)
                label_set[row] = label

In [None]:
# Build one .h5 file per split; the test split is written first.
for data_group in ('test', 'train'):
    PCDtoH5(path, categories, data_group)

The following cell checks the shape of the generated data.

In [None]:
import numpy as np
import pandas as pd
import h5py
import os
from sklearn import preprocessing
path = 'C:\\Users\\sean_\\Downloads\\ModelNet10\\'
with h5py.File(path +"test.h5", 'r') as hf:
    data = hf['data'] # <HDF5 dataset>
    # The dataset shape does not change while iterating, so print it once
    # instead of once per stored model.
    print(data.shape)
    if data.shape[0] > 0:
        # Keep the last stored model as a DataFrame for interactive inspection.
        model1 = pd.DataFrame(data[data.shape[0] - 1],columns=['x','y','z'])