---

# Data preprocessing


Code used for preprocessing the dataset presented in the paper "Pointwise deep learning for leaf-wood segmentation of tropical tree point clouds from terrestrial laser scanning"  

--- 

## Import libraries

In [1]:
import os
from tqdm import tqdm
import numpy as np
import open3d as o3d
# import open3d.ml as _ml3d
# import open3d.ml.torch as ml3d
import ml3d as _ml3d
import ml3d.torch as ml3d
from open3d.ml.torch.datasets import Custom3D
import numpy as np
import glob
import torch
from pclbox.models import CustomRandLANet

DATA_DIR = "/mnt/c/Users/wavdnbro/OneDrive - UGent/Documents/spacetwin/datasets/leaf_wood/"
DATA_PATH = DATA_DIR + 'preprocessed/'

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


## Preprocess dataset

In [3]:
# ---------------
# Code to preprocess the tropical leaf-wood data of Louise Terryn
#
# The manually labeled wood points did not match the original point cloud due to precision mismatch when saving the files.
# I thus saved the original point clouds with the same precision (%.3f). Then, I attributed a label to each point in the 
# original point cloud indicating whether the point is 'wood' or 'non-wood'. 
# 
# ---------------

PATH_TREE = os.path.join(DATA_DIR, 'tree_points')
PATH_WOOD = os.path.join(DATA_DIR, 'wood_points')
plot_names = ['DRO', 'OC', 'RC']

# Create different variables holding the filenames and paths
filenames = {plot_name: os.listdir(os.path.join(PATH_TREE, plot_name)) for plot_name in plot_names}

filenames_all = []
for plot_name in plot_names:
    filenames_all = filenames_all + filenames[plot_name]

filepaths_tree = []
for plot_name in plot_names:
    filepaths_tree = filepaths_tree + [os.path.join(PATH_TREE, plot_name, filename) for filename in filenames[plot_name]]

filepaths_wood = []
for plot_name in plot_names:
    filepaths_wood = filepaths_wood + [os.path.join(PATH_WOOD, plot_name, filename[:-6] + 'tls_0.02_wood.txt') for filename in filenames[plot_name]]


def decrease_precision():

    for file_in, filename_out in zip(filepaths_tree, filenames_all):
        # Read tree
        tree = np.loadtxt(file_in)   

        if not os.path.exists(os.path.join(DATA_DIR, 'tmp')):
                    os.makedirs(os.path.join(DATA_DIR, 'tmp'))

        # Write file with lower precision
        path_out = os.path.join(DATA_DIR, 'tmp', filename_out)
        np.savetxt(path_out, tree, fmt='%.3f')


def view1D(a, b): # a, b are arrays
    a = np.ascontiguousarray(a)
    b = np.ascontiguousarray(b)
    void_dt = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))
    return a.view(void_dt).ravel(),  b.view(void_dt).ravel()


def isin_nd(a,b):
    # a,b are the 3D input arrays to give us "isin-like" functionality across them
    A,B = view1D(a.reshape(a.shape[0],-1),b.reshape(b.shape[0],-1))
    return np.isin(A,B)


def add_label():

     for filename, path_wood in zip(filenames_all, filepaths_wood):
        # Read tree
        tree = np.loadtxt(os.path.join(DATA_DIR, 'tmp', filename)) 
        wood = np.loadtxt(path_wood) 
        wood = wood[:, :3]

        label = isin_nd(tree, wood)
        file_out = np.hstack((tree, label.reshape(-1, 1)))
     
        if not os.path.exists(os.path.join(DATA_DIR, 'preprocessed')):
            os.makedirs(os.path.join(DATA_DIR, 'preprocessed'))
     
        # Write file with label
        path_out = os.path.join(DATA_DIR, 'preprocessed', filename)
        np.savetxt(path_out, file_out, fmt='%.3f')


# decrease_precision()
# add_label()

In [76]:
## Preprocess data to the file structure necessary for open3d ml datasets
import random

def convert_dataset():
    DATA_PATH_IN = os.path.join(DATA_DIR, 'preprocessed') 
    DATA_PATH_OUT = os.path.join(DATA_DIR, 'preprocessed_open3d') 

    # Make required file structure
    path_train = os.path.join(DATA_PATH_OUT, 'train')
    path_val = os.path.join(DATA_PATH_OUT, 'val')
    path_test = os.path.join(DATA_PATH_OUT, 'test')

    for path in [path_train, path_val, path_test]:
        if not os.path.exists(path):
            os.makedirs(path)

    # Get all filenames of point clouds
    filenames = os.listdir(DATA_PATH_IN)

    # Randomly shuffle the filenames
    random.seed(42)
    random.shuffle(filenames)

    # Define train-val-test split
    n_files = len(filenames)
    split_train = 0.6
    split_test = 0.2

    # Get train-val-test files 
    files_train = filenames[:round(split_train*n_files)]
    files_val = filenames[round(split_train*n_files):round((split_train + split_test)*n_files)]
    files_test = filenames[round((split_train + split_test)*n_files):]


    for files, path_out in zip([files_train, files_val], [path_train, path_val]):
        for file in files:
            # Read file
            pcl = np.loadtxt(os.path.join(DATA_PATH_IN, file)) 
            # Write file
            filename_out = os.path.join(path_out, file[:-3] + 'npy')
            with open(filename_out, 'wb') as f:
                np.save(f, pcl)

    for file in files_test:
        # Read file
        pcl = np.loadtxt(os.path.join(DATA_PATH_IN, file)) 

        # Only retain xyz
        xyz = pcl[:, :3]
        labels = pcl[:, 3].astype(np.uint8)
        
        # Write file
        filename_out = os.path.join(path_test, file[:-3] + 'npy')
        with open(filename_out, 'wb') as f:
            np.save(f, xyz)

        filename_labels = os.path.join(path_test, file[:-4] + '_labels.txt')
        np.savetxt(filename_labels, labels, fmt='%1.i')

convert_dataset()

## Data exploration

In [13]:
DATA_DIR = "/mnt/c/Users/wavdnbro/OneDrive - UGent/Documents/spacetwin/datasets/leaf_wood/"
DATA_PATH = DATA_DIR + 'preprocessed_open3d/test/'

n_trees = len(os.listdir(DATA_PATH))
print('number of trees:', n_trees)

total_points = 0
wood_points = 0
max_points = 0
min_points = 100000000

for filename in os.listdir(DATA_PATH):
    # tree = np.load(DATA_PATH + filename)
    if filename[-3:] == 'txt':
        tree = np.loadtxt(DATA_PATH + filename) 
        # print(tree)
    
    total_points += len(tree)
    # wood_points += tree[:, 3].sum()
    # wood_points += tree.sum()
    max_points = max(max_points, len(tree))
    min_points = min(min_points, len(tree))


print('total number of points:', total_points)
print('average number of points per tree:', total_points / n_trees)
print('max number of points:', max_points)
print('min number of points:', min_points)
print('total number of woody points:', wood_points)
print('total_number of non-wood points:', total_points - wood_points)
print('fraction:', wood_points / total_points)

number of trees: 60
total number of points: 14364895
average number of points per tree: 239414.91666666666
max number of points: 1887667
min number of points: 25483
total number of woody points: 0
total_number of non-wood points: 14364895
fraction: 0.0


In [10]:
import pandas as pd

df = pd.read_csv("/mnt/c/Users/wavdnbro/OneDrive - UGent/Documents/spacetwin/datasets/leaf_wood/AUS_samenvatting.csv")

print('number of species:', len(df.Species.unique()))

number of species: 41
