In [1]:
# July 2023
# Implementing VoxNet for Autoencoder (not classification that explained previously):
# you can see the VoxNet's architecture at https://github.com/AutoDeep/VoxNet

In [2]:
# VoxnetEncoder is a name I myself selected here. I'm not sure if there is something similar out
# there for this or not...

# 1) Voxeling Modelnet10

In [3]:
dataset_addr = '/home/sbn/Downloads/datasets/data_/ModelNet10/'

In [60]:
import torch
from torch.utils.data import DataLoader
import numpy as np




# import sys
# sys.path.insert(0, dataset_addr)
from modelnet10 import ModelNet10

CLASSES = {
    0: 'bathtub',
    1: 'chair',
    2: 'dresser',
    3: 'night_stand',
    4: 'sofa',
    5: 'toilet',
    6: 'bed',
    7: 'desk',
    8: 'monitor',
    9: 'table'
}
N_CLASSES = len(CLASSES)



train_dataset = ModelNet10(data_root= dataset_addr, 
                           n_classes=N_CLASSES, 
                           idx2cls=CLASSES, 
                           split='train')

train_dataloader = DataLoader(train_dataset, 
                              batch_size=1, 
                              shuffle=True)




{'bathtub': 0, 'chair': 1, 'dresser': 2, 'night_stand': 3, 'sofa': 4, 'toilet': 5, 'bed': 6, 'desk': 7, 'monitor': 8, 'table': 9}


# Testing voxel

In [61]:
for sample in train_dataloader:
    voxel, cls_idx = sample['voxel'], sample['cls_idx']
    print(voxel.shape)
    break

torch.Size([1, 1, 32, 32, 32])


# VoxNet Encoder/ Decoder

<!-- ![image](images/VoxNet.png) -->

In [62]:
import torch
from torch import nn

In [63]:
# VoxNetDecoder = nn.Sequential(
#                 nn.Linear(3,25),
#                 nn.Linear(25, 128),
#                 nn.Linear(128, 1000),
#                 nn.Unflatten(dim=1, unflattened_size=(10,10,10)),

#                 nn.ConvTranspose3d(in_channels=1, out_channels=32, kernel_size=(7,7,7), dilation=2),
#                 nn.ReLU(),
#                 nn.ConvTranspose3d(in_channels=32, out_channels=32, kernel_size=(5,5,5), dilation=2),
#                 nn.ReLU(),
#                 nn.ConvTranspose3d(in_channels=32, out_channels=1, kernel_size=(5,5,5), dilation=1, 
#                                    output_padding=0,
#                                   padding=1,
#                                    stride=1),     
#                 nn.Tanh()
# )
    

In [64]:
# VoxNetAutoencoder = nn.Sequential(
#                 VoxNetEncoder(),
#                 VoxNetDecoder(),
# )

In [65]:
class BinaryActication(torch.autograd.Function):
    @staticmethod
    def forward(ctx, z):
        
        ctx.save_for_backward(z) # saves z in y=f(z). z is an array for all output neurons.
        y = torch.sign(z)        # y shows y in y=f(z). Here just I know the size is like z'size.              
        return y
    
    @staticmethod
    def backward(ctx, grad_output):
        z, = ctx.saved_tensors
        dy_dz = z.clone()
        dy_dz[(dy_dz>=-1) * (dy_dz<=1)] = 1
        dy_dz[(dy_dz!=1)] = 0
        # print('dy/dz=', dy_dz)
        return  dy_dz

In [66]:
torch.sign(torch.tensor(0.01))

tensor(1.)

In [76]:
class VoxNetAutoencoder(nn.Module):
    def __init__(self):
        super(VoxNetAutoencoder, self).__init__()
        self.VoxNetEncoder = nn.Sequential(
                nn.Conv3d(in_channels=1, out_channels=16, kernel_size=(5,5,5), stride=2),
                nn.BatchNorm3d(16),
                nn.ReLU(),
                nn.Conv3d(in_channels=16, out_channels=32, kernel_size=(3,3,3), stride=1),
                nn.BatchNorm3d(32),
                nn.ReLU(),
                nn.AvgPool3d(kernel_size=(2,2,2), stride=2),
                nn.Flatten(start_dim=1),
                nn.Linear(6912, 128),
                nn.ReLU(),
                nn.Linear(128, 25),
                nn.ReLU(),
                nn.Linear(25,3),
                nn.ReLU()
        )
        
        self.VoxNetDecoder = nn.Sequential(
                nn.Linear(3,25),
                nn.ReLU(),
                nn.Linear(25, 128), nn.ReLU(),
                nn.Linear(128, 1000), nn.ReLU(),
            nn.Linear(1000, 1000), nn.ReLU(),
                nn.Unflatten(dim=1, unflattened_size=(1,10,10,10)),
                nn.ConvTranspose3d(in_channels=1, out_channels=16, kernel_size=(7,7,7), dilation=2),
                nn.BatchNorm3d(16),
                nn.ReLU(),
                nn.ConvTranspose3d(in_channels=16, out_channels=16, kernel_size=(5,5,5), dilation=2),
                nn.BatchNorm3d(16),
                nn.ReLU(),
                nn.ConvTranspose3d(in_channels=16, out_channels=1, kernel_size=(5,5,5), dilation=1, 
                                   output_padding=0,
                                   padding=1,
                                   stride=1),
#                 nn.Tanh()
                
        )
        

                
    def forward(self, x):
        x = self.VoxNetEncoder(x)
        x = self.VoxNetDecoder(x)
        return BinaryActication.apply(x) 
#         return 
        

In [77]:
# VoxNetDecoder(VoxNetEncoder(voxel.float())).shape

In [78]:
model = VoxNetAutoencoder()

In [79]:
from torchsummary import summary

In [80]:
# the input size is = (5,1, 32,32,32), in which 5 is batch_size, 1 = channel. 
summary(model, (1,32,32,32), batch_size=5)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv3d-1        [5, 16, 14, 14, 14]           2,016
       BatchNorm3d-2        [5, 16, 14, 14, 14]              32
              ReLU-3        [5, 16, 14, 14, 14]               0
            Conv3d-4        [5, 32, 12, 12, 12]          13,856
       BatchNorm3d-5        [5, 32, 12, 12, 12]              64
              ReLU-6        [5, 32, 12, 12, 12]               0
         AvgPool3d-7           [5, 32, 6, 6, 6]               0
           Flatten-8                  [5, 6912]               0
            Linear-9                   [5, 128]         884,864
             ReLU-10                   [5, 128]               0
           Linear-11                    [5, 25]           3,225
             ReLU-12                    [5, 25]               0
           Linear-13                     [5, 3]              78
             ReLU-14                   

In [81]:
voxel.shape

torch.Size([1, 1, 32, 32, 32])

In [82]:
model = VoxNetAutoencoder()


In [83]:
model(voxel.float()).shape

torch.Size([1, 1, 32, 32, 32])

In [99]:
model.eval()

print('Test Sample:')
for sample in train_dataloader:
    sample_x, sample_y = sample['voxel'], sample['cls_idx']
#     sample_y_hat = torch.argmax(
    bottle_neck = model.VoxNetEncoder(sample_x.float())
    print(bottle_neck)
    print('Ground Truth  =', sample_y)
#     print('Predicted     =', sample_y_hat)
#     print(voxel.shape)
    
    reconstructed_sample = BinaryActication.apply(model.VoxNetDecoder(bottle_neck)).reshape(32,32,32)
    sample = np.empty((1,3))
    for i in range(32):
        for j in range(32):
            for k in range(32):
                if reconstructed_sample[i][j][k]>0:
                    print(reconstructed_sample[i][j][k]>0)
                if (reconstructed_sample[i][j][k]>0):
                    sample = np.vstack((sample, np.array([i,j,k])))
#                 print(i,j,k,': ', reconstructed_sample[i][j][k] )
    break
# print(sample.shape)    
print(reconstructed_sample.shape)
np.savetxt('sample.xyz',sample)    

Test Sample:
tensor([[0.0000, 1.5216, 0.0000]], grad_fn=<ReluBackward0>)
Ground Truth  = tensor([9])
torch.Size([32, 32, 32])


In [85]:
optimizer = torch.optim.SGD(model.parameters(), lr=0.003)

In [86]:
loss = nn.MSELoss()

In [101]:
print("Start training ...")
model.train()

for epoch in range(30):
    overall_loss = 0
    for batch_idx, sample in enumerate(train_dataloader):
        x,y =  sample['voxel'], sample['cls_idx']
        
        optimizer.zero_grad()
        y_hat = model(x.float()).float()
        ll = loss(y_hat, x.float())
#         print(ll)
        overall_loss += ll.item()
        ll.backward()
        optimizer.step()
        
#         break
        
#     print("\tEpoch", epoch + 1, "complete!", "\tAverage Loss: ", overall_loss / (batch_idx*batch_size))
    print(overall_loss)
    
print("Finish!!")

Start training ...
tensor(1.1876, grad_fn=<MseLossBackward0>)
1.187591552734375
tensor(1.1700, grad_fn=<MseLossBackward0>)
1.170013427734375
tensor(1.2194, grad_fn=<MseLossBackward0>)
1.2193603515625
tensor(1.0430, grad_fn=<MseLossBackward0>)
1.04302978515625
tensor(1.0530, grad_fn=<MseLossBackward0>)
1.053009033203125
tensor(1.3318, grad_fn=<MseLossBackward0>)
1.331787109375
tensor(1.0005, grad_fn=<MseLossBackward0>)
1.00054931640625
tensor(1.3253, grad_fn=<MseLossBackward0>)
1.325286865234375
tensor(1.2501, grad_fn=<MseLossBackward0>)
1.2501220703125
tensor(1.1415, grad_fn=<MseLossBackward0>)
1.14154052734375
tensor(1.2598, grad_fn=<MseLossBackward0>)
1.25982666015625
tensor(1.3057, grad_fn=<MseLossBackward0>)
1.305694580078125
tensor(1.1198, grad_fn=<MseLossBackward0>)
1.1197509765625
tensor(1.5615, grad_fn=<MseLossBackward0>)
1.561492919921875
tensor(2.1978, grad_fn=<MseLossBackward0>)
2.197784423828125
tensor(1.3864, grad_fn=<MseLossBackward0>)
1.386444091796875
tensor(1.0780, gra

# Test Sample:

In [96]:
model.eval()

print('Test Sample:')
for sample in train_dataloader:
    sample_x, sample_y = sample['voxel'], sample['cls_idx']
#     sample_y_hat = torch.argmax(
    bottle_neck = model.VoxNetEncoder(sample_x.float())
    print(bottle_neck)
    print('Ground Truth  =', sample_y)
#     print('Predicted     =', sample_y_hat)
#     print(voxel.shape)
    break

Test Sample:
tensor([[0.0000, 2.5145, 0.0000]], grad_fn=<ReluBackward0>)
Ground Truth  = tensor([4])


In [97]:
# Saving VoxNet model and its parameters.
torch.save(model.state_dict(), "model/AutoEncoderVoxNetmodel.params")
torch.save(model.state_dict(), "model/AutoEncoderVoxNetmodel.params.pt")
torch.save(model, "model/enitre_AutoEncoderVoxNetmodell")

In [98]:
model.eval()

print('Test Sample:')
for sample in train_dataloader:
    sample_x, sample_y = sample['voxel'], sample['cls_idx']
#     sample_y_hat = torch.argmax(
    bottle_neck = model.VoxNetEncoder(sample_x.float())
    print(bottle_neck)
    print('Ground Truth  =', sample_y)
#     print('Predicted     =', sample_y_hat)
#     print(voxel.shape)
    
    reconstructed_sample = BinaryActication.apply(model.VoxNetDecoder(bottle_neck)).reshape(32,32,32)
    sample = np.empty((1,3))
    for i in range(32):
        for j in range(32):
            for k in range(32):
                if reconstructed_sample[i][j][k]>0:
                    print(reconstructed_sample[i][j][k]>0)
                if (reconstructed_sample[i][j][k]>0):
                    sample = np.vstack((sample, np.array([i,j,k])))
#                 print(i,j,k,': ', reconstructed_sample[i][j][k] )
    break
# print(sample.shape)    
print(reconstructed_sample.shape)
np.savetxt('sample.xyz',sample)    

Test Sample:
tensor([[0.0000, 2.4059, 0.0000]], grad_fn=<ReluBackward0>)
Ground Truth  = tensor([4])
torch.Size([32, 32, 32])


In [None]:
#  (Uncomment this if you want):
# # Whenever it's needed you can do loading the trained model with*:
# model = torch.load("model/enitre_VoxNetmodel")
# model.eval()

In [None]:
# print('Test Sample:')
# for sample in train_dataloader:
#     sample_x, sample_y = sample['voxel'], sample['cls_idx']
#     sample_y_hat = torch.argmax(model(sample_x.float()))
#     print('Ground Truth  =', sample_y)
#     print('Predicted     =', sample_y_hat)
# #     print(voxel.shape)
#     break