In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from keras.applications import VGG16
from keras.models import Model
from keras.layers import Dense, GlobalAveragePooling2D, Dropout, Input
from tensorflow.keras.applications import MobileNetV2

def create_6DoF_model_quaternion(input_shape=(224, 224, 3)):
    base_model = MobileNetV2(input_shape=input_shape, include_top=False, weights='imagenet')
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.3)(x)

    # Predicting a quaternion (4 values) for rotation and a 3D vector for translation.
    pose_output = Dense(7, name='6DoF_output')(x)

    model = Model(inputs=base_model.input, outputs=pose_output)
    return model



In [4]:
from keras.applications.resnet50 import ResNet50
from keras.layers import GlobalAveragePooling2D, Dense, Concatenate

def create_resnet_6dof_model(input_shape=(224, 224, 3)):
    # Load the ResNet50 model with weights pre-trained on ImageNet
    base_model = ResNet50(weights='imagenet', include_top=False, input_shape=input_shape)

    # Freeze the layers of the ResNet model
    for layer in base_model.layers:
        layer.trainable = False

    # Extract features using the ResNet50 model
    x = base_model.output

    # Add some custom layers on top
    x = GlobalAveragePooling2D()(x)
    x = Dense(512, activation='relu')(x)

    # Predict the 3 translation values
    translation = Dense(3, name="translation")(x)

    # Predict the 4 quaternion values for rotation
    quaternion = Dense(4, activation='tanh', name="quaternion")(x)

    # Combine the translation and rotation into a single output
    final_output = Concatenate(name="6DoF_output")([translation, quaternion])

    # Construct the full model
    model = Model(inputs=base_model.input, outputs=final_output)

    return model


In [5]:
import os
import json
import cv2
import numpy as np

def load_images_from_folder(folder_path, img_size=(224, 224)):
    """Load images from a folder and resize them."""

    images = []
    image_names = sorted(os.listdir(folder_path))

    for image_name in image_names:
        img_path = os.path.join(folder_path, image_name)
        img = cv2.imread(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, img_size)
        images.append(img)

    return images

def load_json(file_path):
    """Load a JSON file."""

    with open(file_path, 'r') as f:
        data = json.load(f)

    return data
def process_gt_data(scene_gt, scene_camera):
    """Extract 6DoF pose and camera parameters from scene_gt and scene_camera."""

    poses = []
    camera_params = []

    for image_id, annotations in scene_gt.items():
        image_data = []
        for annotation in annotations:
            R = np.array(annotation["cam_R_m2c"]).reshape(3, 3)
            t = np.array(annotation["cam_t_m2c"]).reshape(3, 1)
            pose = np.hstack([R, t])
            image_data.append(pose)
        poses.append(image_data)

        cam_data = scene_camera[image_id]
        K = np.array(cam_data["cam_K"]).reshape(3, 3)
        camera_params.append(K)

    return poses, camera_params
from tensorflow.keras.preprocessing import image

def preprocess_images(images):
    """Normalize the images to [0, 1]."""
    return np.array(images, dtype=np.float32) / 255.0
!wget https://bop.felk.cvut.cz/media/data/bop_datasets/lm_test_all.zip
#!mkdir /content/drive/MyDrive/trainable_pose
#!mv /content/lm_test_all.zip /content/drive/MyDrive/trainable_pose/
!unzip lm_test_all.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: test/000014/mask_visib/001220_000000.png  
  inflating: test/000014/rgb/001221.png  
  inflating: test/000014/depth/001221.png  
  inflating: test/000014/mask/001221_000000.png  
  inflating: test/000014/mask_visib/001221_000000.png  
  inflating: test/000014/rgb/001222.png  
  inflating: test/000014/depth/001222.png  
  inflating: test/000014/mask/001222_000000.png  
  inflating: test/000014/mask_visib/001222_000000.png  
  inflating: test/000014/rgb/001223.png  
  inflating: test/000014/depth/001223.png  
  inflating: test/000014/mask/001223_000000.png  
  inflating: test/000014/mask_visib/001223_000000.png  
  inflating: test/000014/rgb/001224.png  
  inflating: test/000014/depth/001224.png  
  inflating: test/000014/mask/001224_000000.png  
  inflating: test/000014/mask_visib/001224_000000.png  
  inflating: test/000014/rgb/001225.png  
  inflating: test/000014/depth/001225.png  
  inflating: test/000014/

In [6]:
from scipy.spatial.transform import Rotation

def poses_to_quaternion_format(poses):
    """
    Convert a list of poses in the format:
    [[R11, R12, R13, tx],
     [R21, R22, R23, ty],
     [R31, R32, R33, tz]]
    into a list of poses in the format [tx, ty, tz, qx, qy, qz, qw].
    """
    quaternion_poses = []

    for pose in poses:
        pose_np = np.array(pose[0])  # Convert to numpy array.
        R = pose_np[:, :3]  # Extract the 3x3 rotation matrix.
        t = pose_np[:, 3]  # Extract the translation vector.

        # Convert rotation matrix to quaternion.
        rotation = Rotation.from_matrix(R)
        quaternion = rotation.as_quat()

        quaternion_pose = list(t) + list(quaternion)
        quaternion_poses.append(quaternion_pose)

    return quaternion_poses

In [7]:
# Paths
IMAGE_FOLDER = "test/000001/rgb"
SCENE_GT_PATH = "test/000001/scene_gt.json"
SCENE_CAMERA_PATH = "test/000001/scene_camera.json"

# Load images and JSON data
images = load_images_from_folder(IMAGE_FOLDER)
scene_gt = load_json(SCENE_GT_PATH)
scene_camera = load_json(SCENE_CAMERA_PATH)

# Process ground truth and camera parameters
poses, camera_params = process_gt_data(scene_gt, scene_camera)

# Preprocess images
images = preprocess_images(images)



In [8]:
quaternion_poses = poses_to_quaternion_format(poses)


In [9]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam

# Ensure TensorFlow is utilizing GPU
physical_devices = tf.config.experimental.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
else:
    print("No GPU found, model will be trained on CPU.")


In [10]:
# Create a MirroredStrategy.
strategy = tf.distribute.MirroredStrategy()

print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

# Open a strategy scope.
with strategy.scope():
    # Define and compile the model.
    #model = create_6DoF_model_quaternion()
    model = create_resnet_6dof_model()
    model.compile(optimizer=Adam(learning_rate=0.0001), loss='mse')


Number of devices: 1
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


In [11]:
from sklearn.model_selection import train_test_split

# Split the data into 70% training data, 15% validation data, and 15% test data

# First, separate out the test set (15% of the total data)
X_temp, X_test, y_temp, y_test_quat = train_test_split(images, quaternion_poses, test_size=0.15, random_state=42)

# Now, split the remaining data (X_temp, y_temp) into training and validation sets
X_train, X_val, y_train_quat, y_val_quat = train_test_split(X_temp, y_temp, test_size=0.1765, random_state=42)  # 0.1765 of 85% is roughly 15%

# Convert lists to numpy arrays
y_train_quat = np.array(y_train_quat)
y_val_quat = np.array(y_val_quat)
y_test_quat = np.array(y_test_quat)


In [12]:
history = model.fit(X_train, y_train_quat,
                    validation_data=(X_val, y_val_quat),
                    epochs=150, batch_size=32)


Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

In [13]:
from tensorflow.keras.preprocessing import image

def load_and_preprocess_image(img_path, target_size=(224, 224)):
    img = image.load_img(img_path, target_size=target_size)
    img_array = image.img_to_array(img)
    img_array = img_array / 255.0  # normalize to [0,1]
    img_array = np.expand_dims(img_array, axis=0)  # model.predict expects a batch of images
    return img_array


In [14]:
def get_ground_truth_pose_for_image(image_filename, json_path):
    """
    Retrieve the ground truth 6DoF pose for a given image from the LineMOD dataset.

    Parameters:
    - image_filename: The filename of the image, e.g., '000123.jpg'.
    - json_path: Path to the scene_gt.json file for the corresponding scene.

    Returns:
    - pose: A dictionary containing the ground truth pose (rotation and translation).
    """

    # Extract image ID from the filename
    image_id = int(image_filename.split('.')[0])

    # Load the JSON data
    with open(json_path, 'r') as f:
        data = json.load(f)

    # Extract pose for the given image ID
    pose_data = data[str(image_id)][0]  # Assuming one primary object per image in LineMOD
    rotation_matrix = pose_data['cam_R_m2c']
    translation_vector = pose_data['cam_t_m2c']

    pose = {
        'rotation': rotation_matrix,
        'translation': translation_vector
    }

    return pose
import numpy as np
from scipy.spatial.transform import Rotation

def dict_to_7d_format(pose_dict):
    """
    Convert a dictionary with 'rotation' and 'translation' keys to a 7D array format:
    [tx, ty, tz, qx, qy, qz, qw].
    """
    R = np.array(pose_dict['rotation']).reshape(3, 3)  # Convert rotation list to a 3x3 matrix.

    # Convert the rotation matrix to a quaternion.
    rotation = Rotation.from_matrix(R)
    quaternion = rotation.as_quat()

    # Combine translation and quaternion into a single array.
    pose_7d = np.concatenate([pose_dict['translation'], quaternion])

    return pose_7d
import numpy as np

def pose_error(gt_pose, pred_pose):
    """
    Calculate and print the translation and rotation error between a ground truth pose and a predicted pose.

    Parameters:
    - gt_pose: Ground truth pose as a 7D array [tx, ty, tz, qx, qy, qz, qw].
    - pred_pose: Predicted pose as a 7D array [tx, ty, tz, qx, qy, qz, qw].
    """

    # Translation error
    trans_error = np.linalg.norm(gt_pose[:3] - pred_pose[:3])

    # Rotation error
    q_gt = gt_pose[3:]
    q_pred = pred_pose[3:]
    dot_product = np.dot(q_gt, q_pred)

    # Clip to ensure dot_product is within the valid range for arccos
    dot_product = np.clip(dot_product, -1.0, 1.0)

    rotation_error = 2 * np.arccos(np.abs(dot_product))

    #print(f"Translation Error: {trans_error:.4f} units")
    #print(f"Rotation Error: {rotation_error:.4f} radians")

    return trans_error, rotation_error
img_path='test/000001/rgb/000150.png'
img_array = load_and_preprocess_image(img_path)
predicted_6dof = model.predict(img_array)[0]
print(predicted_6dof)
image_filename = '000150.jpg'
json_path = 'test/000001/scene_gt.json'
ground_truth_pose = get_ground_truth_pose_for_image(image_filename, json_path)
ground_truth_pose = dict_to_7d_format(ground_truth_pose)
print(ground_truth_pose)


[-3.6818153e+01  4.2227054e+00  8.8183917e+02  4.8923311e-01
  4.2052439e-01 -1.0961914e-01  2.7369320e-01]
[ 2.79832120e+01  1.61767631e+00  8.69436480e+02  7.24655619e-01
  5.35102823e-01 -2.17892054e-01  3.75582556e-01]


In [15]:
trans_error, rot_error = pose_error(ground_truth_pose, predicted_6dof)


In [16]:
model.save('drive/MyDrive/trainable_pose/model.h5')
model.save('drive/MyDrive/trainable_pose/model.keras')

  saving_api.save_model(


In [17]:
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Concatenate, Flatten
from keras.applications import ResNet50

def pose_refinement_network(input_shape=(224, 224, 3)):
    # Initial pose input (7D: [tx, ty, tz, qx, qy, qz, qw])
    initial_pose_input = Input(shape=(7,), name="initial_pose")

    # RGB Image input
    image_input = Input(shape=input_shape, name="image_input")
    base_model = ResNet50(weights='imagenet', include_top=False, input_shape=input_shape)
    for layer in base_model.layers:
        layer.trainable = False
    image_features = base_model(image_input)
    image_features = Flatten()(image_features)

    # Concatenate image features and initial pose
    x = Concatenate()([image_features, initial_pose_input])

    # Dense layers for refinement
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = Dense(256, activation='relu')(x)

    # Output layer for refined pose
    refined_pose = Dense(7, name="refined_pose")(x)

    model = Model(inputs=[image_input, initial_pose_input], outputs=refined_pose)

    return model


In [18]:
# Predict the initial poses using the trained regression model
initial_train_poses = model.predict(X_train)
initial_val_poses = model.predict(X_val)
initial_test_poses = model.predict(X_test)

# Create and compile the pose refinement network
refinement_model = pose_refinement_network()
refinement_model.compile(optimizer='adam', loss='mse')

# Train the refinement model
refinement_history = refinement_model.fit([X_train, initial_train_poses], y_train_quat,
                                         validation_data=([X_val, initial_val_poses], y_val_quat),
                                         epochs=50, batch_size=64)

# Optionally, evaluate the refinement model
final_test_predictions = refinement_model.predict([X_test, initial_test_poses])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [20]:
# Predict poses on the test set
y_pred_quat = model.predict(X_test)
y_pred_quat = refinement_model.predict([X_test,y_pred_quat])
# Initialize lists to store errors
trans_errors = []
rot_errors = []

# Compute errors for each prediction against the ground truth
for gt_pose, pred_pose in zip(y_test_quat, y_pred_quat):
    trans_error, rot_error = pose_error(gt_pose, pred_pose)
    trans_errors.append(trans_error)
    rot_errors.append(rot_error)

# Convert lists to numpy arrays for further analysis
trans_errors = np.array(trans_errors)
rot_errors = np.array(rot_errors)

# Report overall mean and standard deviation for both translation and rotation errors
print(f"Mean Translation Error: {trans_errors.mean():.4f} units")
print(f"Standard Deviation of Translation Error: {trans_errors.std():.4f} units")
print(f"Mean Rotation Error: {rot_errors.mean():.4f} radians")
print(f"Standard Deviation of Rotation Error: {rot_errors.std():.4f} radians")


Mean Translation Error: 79.5654 units
Standard Deviation of Translation Error: 42.2922 units
Mean Rotation Error: 0.3119 radians
Standard Deviation of Rotation Error: 0.7801 radians


In [21]:
refinement_model.save('drive/MyDrive/trainable_pose/refinement_model.h5')
refinement_model.save('drive/MyDrive/trainable_pose/refinement_model.keras')

  saving_api.save_model(
