### Example of keypoint selection using homography with a custom loss function

The idea is to select keypoints on both images that match well after applying a homography between the keypoints. The points that match up nicely are then used for calculating the fundamental matrix. I tested this approach and did not get better results than using the keypoints directly, because of the power of the random sample consensus (RANSAC) used in the OpenCV function that calculates the fundamental matrix. However, the idea might still be useful. Perhaps the best improvement can be gained by modifying the RANSAC itself (since the distance to the epipolar lines is used as its criterion, this criterion can be modified as well).

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import skimage.io as io
import cv2
import os
import numpy as np

# Scene directory that holds the images and the pair-level ground truth.
src = '/kaggle/input/image-matching-challenge-2022/train/grand_place_brussels'
covisibility = pd.read_csv(os.path.join(src, "pair_covisibility.csv"))

# The notebook works with the first listed pair only.
first_covisibility = covisibility['covisibility'][0]
print("Covisibility between the two pairs of images: {:.2f}".format(first_covisibility))

# A pair id is the two image ids joined by '-'.
pair = covisibility['pair'][0].split('-')
images_dir = os.path.join(src, "images")
img0 = io.imread(os.path.join(images_dir, pair[0]+'.jpg'))
img1 = io.imread(os.path.join(images_dir, pair[1]+'.jpg'))

# The ground-truth fundamental matrix is stored as 9 space-separated numbers.
f_values = covisibility['fundamental_matrix'][0].split(" ")
f_matrix = np.array(f_values, dtype="float").reshape((3, 3))
print('Fundamental Matrix:')
print(f_matrix)

# Show both images side by side.
fig, ax = plt.subplots(ncols=2)
ax[0].imshow(img0)
ax[1].imshow(img1)

We will use the SIFT detector to determine the keypoints, with the same parameters as the evaluation script that determines the leaderboard score. I found the FLANN matcher setup in one of the shared notebooks (I no longer remember which one).

In [None]:
# Load both images of the pair with OpenCV.
# NOTE: cv2.imread returns the channels in BGR order and returns None
# (instead of raising) when the file cannot be read.
img = cv2.imread(os.path.join(src, "images", pair[0]+'.jpg'))
img3 = cv2.imread(os.path.join(src, "images", pair[1]+'.jpg'))
if img is None or img3 is None:
    raise FileNotFoundError(
        'Could not read the images of pair {} from {}'.format(pair, os.path.join(src, "images")))
# Initiate SIFT detector
sift = cv2.SIFT_create()
# find the keypoints and descriptors with SIFT
kp1, des1 = sift.detectAndCompute(img, None)
kp2, des2 = sift.detectAndCompute(img3, None)
# The drawing colour is interpreted in the image's own channel order (BGR),
# so (0, 0, 255) is red.
img2 = cv2.drawKeypoints(img, kp1, None, color=(0, 0, 255), flags=0)
img4 = cv2.drawKeypoints(img3, kp2, None, color=(0, 0, 255), flags=0)

fig, ax = plt.subplots(ncols=2, figsize=(20, 10))
# matplotlib expects RGB: convert for display only, img2/img4 themselves stay
# BGR so later cells that reuse them are unaffected.
ax[0].imshow(cv2.cvtColor(img2, cv2.COLOR_BGR2RGB))
ax[1].imshow(cv2.cvtColor(img4, cv2.COLOR_BGR2RGB))
fig.suptitle('SIFT keypoints on image')

# FLANN matcher configured to use randomized KD-trees.
FLANN_INDEX_KDTREE = 1
index_params = dict(algorithm = FLANN_INDEX_KDTREE, trees = 5)
search_params = dict(checks = 50)

flann = cv2.FlannBasedMatcher(index_params, search_params)
matches = flann.knnMatch(des1, des2, k=2)
# store all the good matches as per Lowe's ratio test.
# knnMatch can return fewer than k neighbours for a query descriptor; such
# entries cannot be ratio-tested and are skipped instead of crashing the
# tuple unpacking.
good = [
    knn[0] for knn in matches
    if len(knn) == 2 and knn[0].distance < 0.7*knn[1].distance
]

# Matched keypoint locations, shaped (N, 1, 2) as OpenCV expects.
src_pts = np.float32([ kp1[m.queryIdx].pt for m in good ]).reshape(-1,1,2)
dst_pts = np.float32([ kp2[m.trainIdx].pt for m in good ]).reshape(-1,1,2)

# A homography has 8 degrees of freedom, so at least 4 correspondences are needed.
if len(good) < 4:
    raise ValueError('Need at least 4 good matches to fit a homography, got {}'.format(len(good)))

# Calculate homography using the keypoints of both images.
M, mask = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC,5.0)

print('Homography Matrix')
print(M)

# Warp image 1 into the frame of image 2 (output size = size of image 2).
im_dst2 = cv2.warpPerspective(img, M, (img4.shape[1], img4.shape[0]))

fig, ax = plt.subplots(ncols = 2, figsize=(20, 10))
# The images were read with cv2.imread (BGR); matplotlib expects RGB.
ax[0].imshow(cv2.cvtColor(im_dst2, cv2.COLOR_BGR2RGB))
ax[1].imshow(cv2.cvtColor(img3, cv2.COLOR_BGR2RGB))
fig.suptitle('Image after homography transform (OpenCV function)')

### FOLLOWING BLOCK OF CODE IS JUST COPIED FROM THE EVALUATION SCRIPT -> SKIP IT

In [None]:
"""
Evaluate performance -- COPIED FROM EVALUATION CODE
"""

import os
import numpy as np
import cv2
import csv
from glob import glob
import matplotlib.pyplot as plt
from collections import namedtuple
from copy import deepcopy
from tqdm import tqdm
import random

# Check that you're using a recent OpenCV version.
# The version is compared numerically: a plain string comparison would rank
# '4.10.0' below '4.5' and wrongly reject newer releases.
_cv2_major_minor = tuple(int(part) for part in cv2.__version__.split('.')[:2])
assert _cv2_major_minor >= (4, 5), 'Please use OpenCV 4.5 or later.'

# Some useful functions and definitions. You can skip this for now.

# A named tuple containing the intrinsics (calibration matrix K) and extrinsics (rotation matrix R, translation vector T) for a given camera.
Gt = namedtuple('Gt', ['K', 'R', 'T'])

# A small epsilon.
eps = 1e-15

# We use two different sets of thresholds over rotation and translation. Do not change this -- these are the values used by the scoring back-end.
thresholds_q = np.linspace(1, 10, 10)
thresholds_t = np.geomspace(0.2, 5, 10)

def LoadCalibration(filename):
    '''Read the per-camera ground-truth calibration from a csv file.

    The first row is treated as a header and skipped. In every other row,
    column 0 is the camera id and columns 1-3 are space-separated number
    lists for K, R and T respectively.

    Returns a dict mapping camera id to a Gt named tuple whose K and R are
    3x3 arrays and whose T is a flat array.
    '''

    def parse_floats(field):
        # One csv field holds a whole vector/matrix as space-separated numbers.
        return np.array([float(v) for v in field.split(' ')])

    with open(filename, 'r') as f:
        rows = csv.reader(f, delimiter=',')
        next(rows, None)  # Skip header.
        return {
            row[0]: Gt(K=parse_floats(row[1]).reshape([3, 3]),
                       R=parse_floats(row[2]).reshape([3, 3]),
                       T=parse_floats(row[3]))
            for row in rows
        }


def ReadCovisibilityData(filename):
    '''Read a csv file into a dict of {first column: float(second column)}.

    The first row is treated as a header and skipped; any columns after the
    second are ignored.
    '''

    with open(filename) as f:
        rows = csv.reader(f, delimiter=',')
        next(rows, None)  # Skip header.
        return {row[0]: float(row[1]) for row in rows}


def NormalizeKeypoints(keypoints, K):
    '''Shift keypoints by the principal point of K and divide by its focal lengths.

    keypoints is broadcast against a (1, 2) row, so an (N, 2) array of (x, y)
    locations yields an (N, 2) result. The principal point is read from
    K[0, 2], K[1, 2] and the focal lengths from K[0, 0], K[1, 1].
    '''
    principal_point = np.array([[K[0, 2], K[1, 2]]])
    focal_lengths = np.array([[K[0, 0], K[1, 1]]])
    return (keypoints - principal_point) / focal_lengths


def ComputeEssentialMatrix(F, K1, K2, kp1, kp2):
    '''Turn a Fundamental matrix into an Essential matrix and recover the relative pose.

    E is computed as K2^T * F * K1. The keypoints are normalized with their
    respective calibration matrix and handed to OpenCV's recoverPose, which
    solves the cheirality check. Note that participants are asked to estimate
    F, i.e., without relying on known intrinsics.

    Returns the tuple (E, R, T).
    '''

    # Warning! Old versions of OpenCV's RANSAC could return multiple F matrices, encoded as a single matrix size 6x3 or 9x3, rather than 3x3.
    # This is not accounted for here, as the modern RANSACs do not do this:
    # https://opencv.org/evaluating-opencvs-new-ransacs
    assert F.shape[0] == 3, 'Malformed F?'

    E = (K2.T @ F @ K1).astype(np.float64)

    # https://docs.opencv.org/4.5.4/d9/d0c/group__calib3d.html#gadb7d2dfcc184c1d2f496d8639f4371c0
    _, R, T, _ = cv2.recoverPose(E, NormalizeKeypoints(kp1, K1), NormalizeKeypoints(kp2, K2))

    return E, R, T


def ArrayFromCvKps(kps):
    '''Collect the `.pt` location of every keypoint into one numpy array (one row per keypoint).'''

    locations = []
    for keypoint in kps:
        locations.append(keypoint.pt)
    return np.array(locations)


def QuaternionFromMatrix(matrix):
    '''Transform a rotation matrix into a quaternion.

    matrix: array-like whose upper-left 3x3 block is the rotation matrix
        (nested lists, 3x3 or 4x4 arrays are all accepted).

    Returns a length-4 array with the scalar part first, i.e. (w, x, y, z),
    with the sign chosen so that w >= 0.
    '''

    # np.asarray instead of np.array(..., copy=False): under NumPy 2 the latter
    # raises ValueError whenever a copy is unavoidable (list input, or a dtype
    # other than float64), while asarray copies only when needed.
    M = np.asarray(matrix, dtype=np.float64)[:4, :4]
    m00, m01, m02 = M[0, 0], M[0, 1], M[0, 2]
    m10, m11, m12 = M[1, 0], M[1, 1], M[1, 2]
    m20, m21, m22 = M[2, 0], M[2, 1], M[2, 2]

    # Only the lower triangle is filled in: np.linalg.eigh reads just that half
    # of the (symmetric) matrix.
    K = np.array([[m00 - m11 - m22, 0.0, 0.0, 0.0],
                  [m01 + m10, m11 - m00 - m22, 0.0, 0.0],
                  [m02 + m20, m12 + m21, m22 - m00 - m11, 0.0],
                  [m21 - m12, m02 - m20, m10 - m01, m00 + m11 + m22]])
    K /= 3.0

    # The quaternion is the eigenvector of K that corresponds to the largest eigenvalue.
    w, V = np.linalg.eigh(K)
    # Reorder the eigenvector so that the scalar component comes first.
    q = V[[3, 0, 1, 2], np.argmax(w)]

    # q and -q describe the same rotation; keep the one with non-negative scalar part.
    if q[0] < 0:
        q = -q

    return q


def ExtractSiftFeatures(image, detector, num_features):
    '''Detect and describe features on the grayscale version of an RGB image.

    Returns the first num_features keypoints and the matching descriptor rows,
    in the order the detector reported them.
    '''

    grayscale = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    keypoints, descriptors = detector.detectAndCompute(grayscale, None)
    keep = slice(None, num_features)
    return keypoints[keep], descriptors[keep]


def ComputeErrorForOneExample(q_gt, T_gt, q, T, scale):
    '''Compute the rotation and translation error for a single example.

    Returns (err_q, err_t): the rotation error converted from radians to
    degrees, and the translation error measured after scaling. ComputeMaa
    combines both at different thresholds into the mean Average Accuracy.
    '''

    # Translation: apply the scaling factor for this scene. The estimate is
    # rescaled to the ground-truth length first, and the sign ambiguity is
    # resolved by taking the smaller of the two distances.
    gt_translation = T_gt * scale
    est_translation = T * np.linalg.norm(T_gt) * scale / (np.linalg.norm(T) + eps)
    err_t = min(np.linalg.norm(gt_translation - est_translation),
                np.linalg.norm(gt_translation + est_translation))

    # Rotation: compare the normalized quaternions; the squared dot product
    # makes q and -q equivalent, eps guards the normalization and the arccos.
    unit_q_gt = q_gt / (np.linalg.norm(q_gt) + eps)
    unit_q = q / (np.linalg.norm(q) + eps)
    quaternion_dot = np.sum(unit_q * unit_q_gt)
    loss_q = np.maximum(eps, (1.0 - quaternion_dot**2))
    err_q = np.arccos(1 - 2 * loss_q)

    return err_q * 180 / np.pi, err_t


def ComputeMaa(err_q, err_t, thresholds_q, thresholds_t):
    '''Compute the mean Average Accuracy at different thresholds, for one scene.

    The thresholds are walked pairwise. For each pair the function records the
    fraction of examples below both thresholds, below the rotation threshold
    only, and below the translation threshold only.

    Returns (mean of the joint accuracies, joint accuracies, rotation
    accuracies, translation accuracies); the last three are arrays.
    '''

    assert len(err_q) == len(err_t)

    rotation_errors = np.array(err_q)
    translation_errors = np.array(err_t)

    acc, acc_q, acc_t = [], [], []
    for th_q, th_t in zip(thresholds_q, thresholds_t):
        within_q = rotation_errors < th_q
        within_t = translation_errors < th_t
        acc.append(np.bitwise_and(within_q, within_t).sum() / len(err_q))
        acc_q.append(within_q.sum() / len(err_q))
        acc_t.append(within_t.sum() / len(err_t))

    return np.mean(acc), np.array(acc), np.array(acc_q), np.array(acc_t)

# Load ground truth data.
# NOTE: from here on `src` points at the train directory, not at the scene.
src = "/kaggle/input/image-matching-challenge-2022/train"
scene = "grand_place_brussels"
calib_dict = LoadCalibration(f'{src}/{scene}/calibration.csv')

# Scaling factors: first column is the key, second column the factor.
with open(f'{src}/scaling_factors.csv') as f:
    reader = csv.reader(f, delimiter=',')
    next(reader, None)  # Skip header.
    scaling_dict = {row[0]: float(row[1]) for row in reader}

# Ground truth for the two cameras of the selected pair, and the scene scale.
calib_id1, calib_id2 = calib_dict[pair[0]], calib_dict[pair[1]]
scale = scaling_dict[scene]

def compute_evaluation_one_scene(F, calib_id1, calib_id2, src, scene, id1, id2, scale, num_features=5000, contrastThreshold=-10000):
    '''Print the rotation/translation error of a fundamental matrix F for one image pair.

    The function detects and matches its own SIFT features on the two images,
    keeps the RANSAC inliers, and uses those inliers together with the given F
    to recover the relative pose, which is compared with the ground truth.

    F: 3x3 fundamental matrix to evaluate.
    calib_id1, calib_id2: ground-truth entries exposing .K, .R and .T for the two cameras.
    src, scene, id1, id2: used to build the image paths '{src}/{scene}/images/{id}.jpg'.
    scale: scaling factor passed on to ComputeErrorForOneExample.
    num_features: passed to cv2.SIFT_create only; the extraction calls below
        truncate to a hard-coded 2000 features regardless of this value.
    contrastThreshold: passed to cv2.SIFT_create.

    Returns the inlier keypoint locations of image 1 and image 2. The errors
    themselves are only printed, not returned.
    '''
    # Instantiate the matcher.
    bf = cv2.BFMatcher(cv2.NORM_L2, crossCheck=True)
    # You may want to lower the detection threshold, as small images may not be able to reach the budget otherwise.
    # Note that you may actually get more than num_features features, as a feature for one point can have multiple orientations (this is rare).
    sift_detector = cv2.SIFT_create(num_features, contrastThreshold=contrastThreshold, edgeThreshold=-10000)
    
    # Compute descriptors between image pair
    image1 = cv2.cvtColor(cv2.imread(f'{src}/{scene}/images/{id1}.jpg'), cv2.COLOR_BGR2RGB)
    image2 = cv2.cvtColor(cv2.imread(f'{src}/{scene}/images/{id2}.jpg'), cv2.COLOR_BGR2RGB)
    # NOTE(review): 2000 is hard-coded here, so the num_features argument does not control the truncation.
    kp1, desc1 = ExtractSiftFeatures(image1, sift_detector, 2000)
    kp2, desc2 = ExtractSiftFeatures(image2, sift_detector, 2000)
    # The following bare string literal is disabled code; it has no effect at runtime.
    """
    for kp in kp2:
        kp.pt = (kp.pt[0] + x_offset, kp.pt[1])
    """
    
    # Compute matches by brute force.
    cv_matches = bf.match(desc1, desc2)
    # Each row holds (index into kp1, index into kp2).
    matches = np.array([[m.queryIdx, m.trainIdx] for m in cv_matches])
    cur_kp_1 = ArrayFromCvKps([kp1[m[0]] for m in matches])
    cur_kp_2 = ArrayFromCvKps([kp2[m[1]] for m in matches])
    
    # Filter matches with RANSAC.
    # Only the inlier mask is used; the matrix _F estimated here is discarded
    # because the F under evaluation is the one passed in by the caller.
    _F, inlier_mask = cv2.findFundamentalMat(cur_kp_1, cur_kp_2, cv2.USAC_MAGSAC, 0.25, 0.99999, 10000)
    inlier_mask = inlier_mask.astype(bool).flatten()

    matches_after_ransac = np.array([match for match, is_inlier in zip(matches, inlier_mask) if is_inlier])
    inlier_kp_1 = ArrayFromCvKps([kp1[m[0]] for m in matches_after_ransac])
    inlier_kp_2 = ArrayFromCvKps([kp2[m[1]] for m in matches_after_ransac])
    
    
    # Compute the essential matrix.
    E, R, T = ComputeEssentialMatrix(F, calib_id1.K, calib_id2.K, inlier_kp_1, inlier_kp_2)
    q = QuaternionFromMatrix(R)
    T = T.flatten()
    
    # Get the relative rotation and translation between these two cameras, given their R and T in the global reference frame.
    R1_gt, T1_gt = calib_id1.R, calib_id1.T.reshape((3, 1))
    R2_gt, T2_gt = calib_id2.R, calib_id2.T.reshape((3, 1))
    dR_gt = np.dot(R2_gt, R1_gt.T)
    dT_gt = (T2_gt - np.dot(dR_gt, T1_gt)).flatten()
    q_gt = QuaternionFromMatrix(dR_gt)
    q_gt = q_gt / (np.linalg.norm(q_gt) + eps)

    # Compute the error for this example.
    err_q, err_t = ComputeErrorForOneExample(q_gt, dT_gt, q, T, scale)
    print("Error rotation (degrees): {}".format(err_q))
    print("Error translation (meters): {}".format(err_t))
    return inlier_kp_1, inlier_kp_2
    

### Compare accuracy of ground truth fundamental matrix and the one estimated using OpenCV's function on the FLANN matched SIFT keypoints
As we can see, the ground truth has rotation and translation errors below 0.001, whereas the estimated matrix has significantly higher errors.

In [None]:
# Reference: score the ground-truth fundamental matrix loaded in the first cell.
# compute_evaluation_one_scene prints the errors; its return value (the inlier
# keypoints) is not needed here.
print("---- Ground Truth ----")
_a = compute_evaluation_one_scene(f_matrix, calib_id1, calib_id2, src, scene, pair[0], pair[1], scale)
print("---- OpenCV - SIFT + FLANN + MAGSAC")
# Estimate F from the ratio-test matches of the FLANN cell. The positional
# arguments after the method flag are the RANSAC reprojection threshold, the
# confidence and the maximum number of iterations.
F, inliers = cv2.findFundamentalMat(src_pts, dst_pts, cv2.USAC_MAGSAC, 0.5, 0.999, 100000)
_a = compute_evaluation_one_scene(F, calib_id1, calib_id2, src, scene, pair[0], pair[1], scale)

### Now we calculate the homography using a custom loss function that is relatively robust to outliers.
The custom loss function is the sum of the absolute coordinate differences between the homography-mapped points of image 1 and their matches in image 2; the homography is found by minimizing this loss. Least squares penalizes the squared distance and is therefore more susceptible to outliers, so the absolute-distance loss should be more robust to them. As we can see, the OpenCV fit is still better.

In [None]:
from scipy.optimize import minimize
from scipy.optimize import basinhopping

def homography_transform_points(ikp, m_arr):
    '''Apply a homography, given by its first 8 coefficients, to 2-D points.

    ikp: points to transform. Normally an (N, 2) array; a single point of
        shape (2,) and OpenCV-style (N, 1, 2) arrays are accepted as well.
    m_arr: the first 8 homography coefficients in row-major order; the ninth
        coefficient is fixed to 1.

    Returns an (N, 3) array of homogeneous coordinates. The result is NOT
    divided by its third column; callers do that themselves.
    '''
    M = np.ones(9)
    M[:-1] = m_arr
    M = M.reshape((3, 3))
    # Force an (N, 2) layout. Without this a single point of shape (2,) -- which
    # is what np.squeeze produces for one match -- would be silently broadcast
    # into two bogus points, and (N, 1, 2) input would fail to assign.
    points = np.asarray(ikp, dtype=float).reshape(-1, 2)
    # Homogeneous coordinates, one column per point: rows are x, y, 1.
    homogeneous = np.ones((3, points.shape[0]))
    homogeneous[:2, :] = points.T
    return np.matmul(M, homogeneous).T
    
def loss(m_arr, src_pts, dst_pts):
    '''Sum of absolute coordinate differences between mapped source points and targets.

    m_arr: the 8 homography coefficients being optimized.
    src_pts, dst_pts: matched point arrays; singleton dimensions are squeezed away.
    '''
    projected = homography_transform_points(np.squeeze(src_pts), m_arr)
    # Divide out the homogeneous coordinate to get back to pixel coordinates.
    w = projected[:, 2]
    coords = np.vstack((projected[:, 0]/w, projected[:, 1]/w)).T
    residuals = coords - np.squeeze(dst_pts)
    return np.sum(np.abs(residuals))

def calc_homography_matrix(m_arr):
    '''Build the 3x3 homography matrix from its first 8 row-major coefficients.

    The ninth (bottom-right) coefficient is fixed to 1.
    '''
    flat = np.empty(9)
    flat[:8] = m_arr
    flat[8] = 1.0
    return flat.reshape((3, 3))
    

# Fit the homography by minimizing the custom loss, starting from the identity.
identity_guess = [1, 0, 0, 0, 1, 0, 0, 0]
res = minimize(loss, identity_guess, args=(src_pts, dst_pts), tol=1e-6)
print(calc_homography_matrix(res.x))
# Not used below; presumably left over from an experiment with basinhopping.
minimizer_kwargs = {"method": "BFGS"}

# Remove the redundant dimension from the keypoints
src_pts = np.squeeze(src_pts)
dst_pts = np.squeeze(dst_pts)

# Map the keypoints of image 1 with the custom homography ...
ikp1_ho = homography_transform_points(src_pts, res.x)
# ... and with M, the OpenCV homography calculated in one of the previous cells
# (its last coefficient is dropped because the helper fixes it to 1).
ikp1_hom = homography_transform_points(src_pts, M.reshape(-1)[:-1])

legend_labels = ['Keypoints Image 2', 'Custom Homography Keypoints Image 1', 'OpenCV Homography Keypoints Image 1']
fig, ax = plt.subplots(ncols=2, figsize=(20, 10))
# Both panels show the same three point sets; only title and limits differ.
for axis in ax:
    axis.plot(dst_pts[:, 0], dst_pts[:, 1], '.')
    for projected in (ikp1_ho, ikp1_hom):
        axis.plot(projected[:, 0]/projected[:, 2], projected[:, 1]/projected[:, 2], '.')

ax[0].set_title('Scatterplot of the keypoints over the full image')
ax[0].legend(legend_labels)
ax[1].set_title('Scatterplot of the keypoints over a zoomed in portion of the image')
ax[1].set_xlim([400, 600])
ax[1].set_ylim([800, 1000])
ax[1].legend(legend_labels)


### Now we calculate the absolute delta between the points and keep the ones with a small absolute delta.
Note that in my submission I decided to keep the 20 points with the smallest absolute delta; if there were fewer than 20 points, all of them were kept.

In [None]:
# Pixel coordinates of the image-1 keypoints after the custom homography
# (homogeneous coordinate divided out).
key_ho = ikp1_ho[:, :2] / ikp1_ho[:, 2:]
dst_2d = np.squeeze(dst_pts)
# Per-point absolute delta: |dx| + |dy| between mapped point and its match.
abs_delta = np.abs(key_ho - dst_2d).sum(axis=1)

# Keep only the points whose absolute delta is below the threshold.
sel_bool = abs_delta < 20

fig, ax = plt.subplots(ncols=2, figsize=(20, 10))
hist_ax, scatter_ax = ax

hist_ax.set_title('Histogram of the absolute delta of the keypoints after custom homography')
hist_ax.hist(abs_delta, bins=100)

scatter_ax.set_title('Filtered points that have an absolute delta less than 20')
scatter_ax.plot(dst_2d[sel_bool, 0], dst_2d[sel_bool, 1], '.')
scatter_ax.plot(key_ho[sel_bool, 0], key_ho[sel_bool, 1], '.')

### Next the custom homography is calculated based on the filtered points, the fit afterwards is much better.

In [None]:
# Restrict both point sets to the matches that survived the delta filter.
src_pts_filtered = np.squeeze(src_pts)[sel_bool, :]
dst_pts_filtered = np.squeeze(dst_pts)[sel_bool, :]

# Refit the custom homography on the filtered matches, again from the identity.
identity_guess = [1, 0, 0, 0, 1, 0, 0, 0]
res = minimize(loss, identity_guess, args=(src_pts_filtered, dst_pts_filtered), tol=1e-6)

src_ho = homography_transform_points(np.squeeze(src_pts_filtered), res.x)

plt.figure(figsize=(10, 10))
# Targets in image 2 versus the mapped points of image 1.
target_xy = np.squeeze(dst_pts_filtered)
plt.plot(target_xy[:, 0], target_xy[:, 1], '.')
plt.plot(src_ho[:, 0]/src_ho[:, 2], src_ho[:, 1]/src_ho[:, 2], '.')
plt.title('Custom Homography on Filtered Points')

### Now the errors in translation and rotation are calculated again for this image pair
We can see that the fit for rotation worsens but the fit for translation improves considerably.

In [None]:
# Reference: score the ground-truth fundamental matrix loaded in the first cell.
print("---- Ground Truth ----")
_a = compute_evaluation_one_scene(f_matrix, calib_id1, calib_id2, src, scene, pair[0], pair[1], scale)
print("---- OpenCV - SIFT + FLANN + MAGSAC")
# Baseline: F estimated from all ratio-test matches.
F, inliers = cv2.findFundamentalMat(src_pts, dst_pts, cv2.USAC_MAGSAC, 0.5, 0.999, 100000)
_a = compute_evaluation_one_scene(F, calib_id1, calib_id2, src, scene, pair[0], pair[1], scale)
print('---- Custom Homography Filtering - SIFT + FLANN')
# Same estimator and settings, but fed only with the matches that passed the
# absolute-delta filter.
F_filtered, inliers = cv2.findFundamentalMat(src_pts_filtered, dst_pts_filtered, cv2.USAC_MAGSAC, 0.5, 0.999, 100000)
_a = compute_evaluation_one_scene(F_filtered, calib_id1, calib_id2, src, scene, pair[0], pair[1], scale)

### Let's plot the homography warped image of the final solution (we can see the homography is quite nice)

In [None]:
# Warp image 1 with the custom homography from the most recent fit (res.x);
# the output size is the size of image 2.
im_dst3 = cv2.warpPerspective(img, calc_homography_matrix(res.x), (img4.shape[1], img4.shape[0]))

fig, ax = plt.subplots(ncols = 2, figsize=(20, 10))
# img and img3 were read with cv2.imread (BGR); matplotlib expects RGB.
ax[0].imshow(cv2.cvtColor(im_dst3, cv2.COLOR_BGR2RGB))
ax[1].imshow(cv2.cvtColor(img3, cv2.COLOR_BGR2RGB))
# This figure shows the custom-loss homography, not the OpenCV one.
fig.suptitle('Image after homography transform (custom loss function)')