# Validation settings

We want the local validation (CV) score to correlate with the leaderboard (LB) score, so the image pairs used for validation need to be screened carefully.

In [None]:
# Scenes whose image pairs take part in the validation run.
scene_list = [
    "british_museum",
    "brandenburg_gate",
    "buckingham_palace",
    "colosseum_exterior",
    "grand_place_brussels",
    "lincoln_memorial_statue",
    "notre_dame_front_facade",
    "pantheon_exterior",
    "piazza_san_marco",
    "sacre_coeur",
    "sagrada_familia",
    "st_pauls_cathedral",
    "st_peters_square",
    "taj_mahal",
    "temple_nara_japan",
    "trevi_fountain",
]

# Pair-screening settings used when the per-scene pair tables are filtered.
cfg = dict(
    covisibility_thr_min=0.3,  # Drop pairs whose covisibility is too low
    covisibility_thr_max=0.7,  # Drop pairs whose covisibility is too high
    min_longest_edge=700,      # Drop pairs that involve small images
    dZ_thr=100.0,              # Drop camera pairs that are far apart along Z
    rotation_thr=0.5,          # Drop pairs that involve heavily tilted images
    max_num_pairs=500,         # Upper bound on the pairs sampled per scene
)

# Useful Functions

Thanks to the following Notebook.

https://www.kaggle.com/code/eduardtrulls/imc2022-training-set-eval-one-function/notebook

In [None]:
import csv
import numpy as np
import os
import pandas as pd

from collections import namedtuple

# Ground-truth record for one camera: K and R are stored as 3x3 arrays and T as a
# flat vector by LoadCalibration.
Gt = namedtuple('Gt', ['K', 'R', 'T'])
# Tiny constant added to norms used as denominators and used as the lower clamp of
# the rotation loss in ComputeErrorForOneExample.
eps = 1e-15

def LoadCalibration(filename):
    '''Load calibration data (ground truth) from the csv file.

    The first row is treated as a header and skipped. Every following non-empty
    row is read as: camera id, 9 whitespace-separated values for K, 9 for R and
    the values of T.

    Args:
        filename: path of the calibration csv file.

    Returns:
        A dict mapping each camera id to a Gt(K, R, T) tuple, where K and R are
        3x3 arrays and T is a 1-D array.
    '''

    calib_dict = {}
    # newline='' leaves line-ending handling to the csv module, as its documentation recommends.
    with open(filename, 'r', newline='') as f:
        reader = csv.reader(f, delimiter=',')
        # Skip header (the default keeps an empty file from raising StopIteration).
        next(reader, None)
        for row in reader:
            # Blank lines are yielded as empty rows; skip them instead of failing on row[0].
            if not row:
                continue

            camera_id = row[0]
            # split() without an argument tolerates repeated or trailing whitespace.
            K = np.array([float(v) for v in row[1].split()]).reshape([3, 3])
            R = np.array([float(v) for v in row[2].split()]).reshape([3, 3])
            T = np.array([float(v) for v in row[3].split()])
            calib_dict[camera_id] = Gt(K=K, R=R, T=T)

    return calib_dict

def DecomposeFundamentalMatrixWithIntrinsics(F, K1, K2):
    '''Split a fundamental matrix into two rotation hypotheses and a translation direction.

    Mirrors OpenCV's decomposeEssentialMat, which recoverPose relies on:
    https://github.com/opencv/opencv/blob/be38d4ea932bc3a0d06845ed1a2de84acc2a09de/modules/calib3d/src/five-point.cpp#L742
    https://github.com/opencv/opencv/blob/be38d4ea932bc3a0d06845ed1a2de84acc2a09de/modules/calib3d/src/five-point.cpp#L559
    No cheirality check with correspondences is done here; both rotation
    hypotheses are returned so the caller can evaluate them and keep the best.
    The metric ignores the sign of the translation vector, so a single T is enough.

    Args:
        F: 3x3 fundamental matrix.
        K1, K2: 3x3 intrinsics of the first and second camera.

    Returns:
        (R_a, R_b, T): the two candidate rotations and the last column of U.
    '''

    # Essential matrix obtained from F and the two intrinsics.
    essential = K2.T @ (F @ K1)

    left, _, right_h = np.linalg.svd(essential)
    # Negate a factor with negative determinant so both factors have positive determinant.
    if np.linalg.det(left) < 0:
        left = -left
    if np.linalg.det(right_h) < 0:
        right_h = -right_h

    W = np.array([[0, 1, 0], [-1, 0, 0], [0, 0, 1]])
    rot_a = left @ (W @ right_h)
    rot_b = left @ (W.T @ right_h)
    translation = left[:, -1]

    return rot_a, rot_b, translation

def QuaternionFromMatrix(matrix):
    '''Transform a rotation matrix into a quaternion.

    Args:
        matrix: array-like holding the rotation in its upper-left 3x3 block
            (nested lists and non-float64 arrays are accepted).

    Returns:
        A length-4 array ordered [w, x, y, z], with the sign chosen so that w >= 0.
    '''

    # np.asarray converts only when needed. np.array(..., copy=False) raises under
    # NumPy 2 whenever a copy is unavoidable (list input, dtype change).
    M = np.asarray(matrix, dtype=np.float64)[:4, :4]
    m00 = M[0, 0]
    m01 = M[0, 1]
    m02 = M[0, 2]
    m10 = M[1, 0]
    m11 = M[1, 1]
    m12 = M[1, 2]
    m20 = M[2, 0]
    m21 = M[2, 1]
    m22 = M[2, 2]

    # Only the lower triangle is filled in: np.linalg.eigh reads the lower
    # triangle by default and treats the matrix as symmetric.
    K = np.array([[m00 - m11 - m22, 0.0, 0.0, 0.0],
              [m01 + m10, m11 - m00 - m22, 0.0, 0.0],
              [m02 + m20, m12 + m21, m22 - m00 - m11, 0.0],
              [m21 - m12, m02 - m20, m10 - m01, m00 + m11 + m22]])
    K /= 3.0

    # The quaternion is the eigenvector of K that corresponds to the largest eigenvalue.
    w, V = np.linalg.eigh(K)
    # Reorder the eigenvector components from [x, y, z, w] to [w, x, y, z].
    q = V[[3, 0, 1, 2], np.argmax(w)]

    # q and -q encode the same rotation; keep the representative with non-negative w.
    if q[0] < 0:
        q = -q

    return q


def ComputeErrorForOneExample(q_gt, T_gt, q, T, scale):
    '''Return the (rotation, translation) error of one prediction.

    The two errors are later combined at several thresholds by ComputeMaa to
    obtain the mean Average Accuracy.

    Args:
        q_gt, q: ground-truth and predicted quaternions (normalised here).
        T_gt, T: ground-truth and predicted translation vectors.
        scale: scaling factor of the scene, applied to both translations.

    Returns:
        (err_q, err_t): the rotation error converted from radians to degrees,
        and the translation error, which ignores the sign of T.
    '''

    unit_gt = q_gt / (np.linalg.norm(q_gt) + eps)
    unit_pred = q / (np.linalg.norm(q) + eps)

    # Clamp from below so the arccos argument never exceeds 1.
    alignment = np.sum(unit_pred * unit_gt)
    loss_q = np.maximum(eps, (1.0 - alignment**2))
    angle = np.arccos(1 - 2 * loss_q)

    # The prediction has no scale: give it the ground-truth length, then apply
    # the scene scaling factor to both vectors.
    gt_scaled = T_gt * scale
    pred_scaled = T * np.linalg.norm(T_gt) * scale / (np.linalg.norm(T) + eps)

    # Try both signs of the predicted translation and keep the smaller distance.
    dist_same = np.linalg.norm(gt_scaled - pred_scaled)
    dist_flipped = np.linalg.norm(gt_scaled + pred_scaled)
    err_t = min(dist_same, dist_flipped)

    return angle * 180 / np.pi, err_t

def ComputeMaa(err_q, err_t, thresholds_q, thresholds_t):
    '''Compute the mean Average Accuracy at different thresholds, for one scene.

    Args:
        err_q, err_t: rotation and translation errors, one entry per pair.
        thresholds_q, thresholds_t: thresholds, consumed pairwise.

    Returns:
        (mean accuracy, accuracy per threshold pair, rotation-only accuracy per
        threshold, translation-only accuracy per threshold).
    '''

    assert len(err_q) == len(err_t)

    # Convert once; the comparisons below are repeated for every threshold pair.
    q_errors = np.array(err_q)
    t_errors = np.array(err_t)
    num_q = len(err_q)
    num_t = len(err_t)

    acc, acc_q, acc_t = [], [], []
    for th_q, th_t in zip(thresholds_q, thresholds_t):
        q_ok = q_errors < th_q
        t_ok = t_errors < th_t
        # A pair counts as accurate only when both errors are under their thresholds.
        acc.append((q_ok & t_ok).sum() / num_q)
        acc_q.append(q_ok.sum() / num_q)
        acc_t.append(t_ok.sum() / num_t)

    return np.mean(acc), np.array(acc), np.array(acc_q), np.array(acc_t)

def EvaluateSubmission(predictions, input_dir, scaling_dict, thresholds_q, thresholds_t, output_dir=None):
    '''Score predicted fundamental matrices against the ground truth.

    Only the pairs present in `predictions` are evaluated.

    Args:
        predictions: dict mapping "dataset;scene;image1-image2" keys to 3x3
            fundamental matrices.
        input_dir: directory with one sub-directory per scene, each holding a
            calibration.csv.
        scaling_dict: scaling factor per scene, applied to the translation error.
        thresholds_q, thresholds_t: thresholds forwarded to ComputeMaa.
        output_dir: when not None, a per-pair table (scene, pair, maa) is written
            to <output_dir>/validation_all.csv.

    Returns:
        (mean mAA over scenes, mAA per scene, rotation error per scene and pair,
        translation error per scene and pair).
    '''

    # Collect the scenes in order of first appearance. There is a single
    # dataset, so the dataset field of the key is not tracked.
    first_seen = {}
    for key in predictions:
        _, scene, _ = key.split(';')
        first_seen.setdefault(scene, None)
    scenes = list(first_seen)

    # Load the ground truth of every scene that occurs in the predictions.
    calib_dict = {
        scene: LoadCalibration(os.path.join(input_dir, scene, "calibration.csv"))
        for scene in scenes
    }

    errors_dict_q = {scene: {} for scene in scenes}
    errors_dict_t = {scene: {} for scene in scenes}
    for key, F_predicted in predictions.items():
        _, scene, pair = key.split(';')
        image_id_1, image_id_2 = pair.split('-')

        cam_1 = calib_dict[scene][image_id_1]
        cam_2 = calib_dict[scene][image_id_2]
        K1, R1_gt, T1_gt = cam_1.K, cam_1.R, cam_1.T.reshape((3, 1))
        K2, R2_gt, T2_gt = cam_2.K, cam_2.R, cam_2.T.reshape((3, 1))

        rot_a, rot_b, T_pred = DecomposeFundamentalMatrixWithIntrinsics(F_predicted, K1, K2)
        q_pred_a = QuaternionFromMatrix(rot_a)
        q_pred_b = QuaternionFromMatrix(rot_b)

        # Ground-truth relative pose from camera 1 to camera 2.
        dR_gt = R2_gt.dot(R1_gt.T)
        dT_gt = (T2_gt - dR_gt.dot(T1_gt)).flatten()
        q_gt = QuaternionFromMatrix(dR_gt)
        q_gt = q_gt / (np.linalg.norm(q_gt) + eps)

        # Instead of a cheirality check, score both rotation hypotheses and keep
        # the better one. The translation error is the same for both.
        err_q_a, err_t_a = ComputeErrorForOneExample(q_gt, dT_gt, q_pred_a, T_pred, scaling_dict[scene])
        err_q_b, err_t_b = ComputeErrorForOneExample(q_gt, dT_gt, q_pred_b, T_pred, scaling_dict[scene])
        assert err_t_a == err_t_b
        errors_dict_q[scene][pair] = min(err_q_a, err_q_b)
        errors_dict_t[scene][pair] = err_t_a

    # Final metric: mAA per scene, then the average across scenes.
    maa_per_scene = {
        scene: ComputeMaa(
            list(errors_dict_q[scene].values()),
            list(errors_dict_t[scene].values()),
            thresholds_q,
            thresholds_t,
        )[0]
        for scene in scenes
    }

    if output_dir is not None:
        # One row per prediction, with the mAA of that single pair.
        out_scenes, out_pairs, out_maa = [], [], []
        for key in predictions:
            _, scene, pair = key.split(';')
            pair_maa = ComputeMaa(
                [errors_dict_q[scene][pair]],
                [errors_dict_t[scene][pair]],
                thresholds_q,
                thresholds_t,
            )[0]
            out_scenes.append(scene)
            out_pairs.append(pair)
            out_maa.append(pair_maa)

        table = pd.DataFrame({"scene": out_scenes, "pair": out_pairs, "maa": out_maa})
        table.to_csv(os.path.join(output_dir, "validation_all.csv"), index=False)

    return np.mean(list(maa_per_scene.values())), maa_per_scene, errors_dict_q, errors_dict_t

def evaluate(input_dir, sample_id_list, fund_matrix_list):
    '''Compute the mean Average Accuracy of a list of predictions.

    Args:
        input_dir: directory containing scaling_factors.csv and the per-scene
            ground truth read by EvaluateSubmission.
        sample_id_list: prediction keys, one per pair.
        fund_matrix_list: fundamental matrices as strings of 9 space-separated
            numbers, aligned with sample_id_list.

    Returns:
        The mAA averaged over scenes, as returned by EvaluateSubmission.
    '''
    thresholds_q = np.linspace(1, 10, 10)
    thresholds_t = np.geomspace(0.2, 5, 10)

    # Per-scene scaling factors: first column is the scene, second the factor.
    with open(os.path.join(input_dir, "scaling_factors.csv")) as f:
        rows = csv.reader(f, delimiter=',')
        next(rows, None)  # Skip header.
        scaling_dict = {row[0]: float(row[1]) for row in rows}

    # Parse every flattened matrix back into a 3x3 array.
    predictions = {
        sample_id: np.array([float(v) for v in flat_matrix.split(' ')]).reshape([3, 3])
        for sample_id, flat_matrix in zip(sample_id_list, fund_matrix_list)
    }

    results = EvaluateSubmission(
        predictions,
        input_dir,
        scaling_dict,
        thresholds_q,
        thresholds_t)

    return results[0]

def FlattenMatrix(M, num_digits=8):
    '''Serialise a matrix as space-separated numbers in scientific notation.

    Args:
        M: array to serialise; its elements are emitted in flattened order.
        num_digits: number of digits after the decimal point.

    Returns:
        A single string, convenient for writing CSV files.
    '''

    spec = f'.{num_digits}e'
    return ' '.join(format(value, spec) for value in M.flatten())

# Model

Define here the model you want to validate.

## Feature matching (LoFTR model)

LoFTR: Detector-Free Local Feature Matching with Transformers

https://arxiv.org/abs/2104.00680

https://github.com/zju3dv/LoFTR

In [None]:
!pip install ../input/kornia-loftr/kornia-0.6.4-py2.py3-none-any.whl

In [None]:
import cv2
import kornia as K
import kornia.feature as KF
import os
import numpy as np
import pandas as pd
import time
import torch

import warnings
warnings.simplefilter('ignore', UserWarning)

def load_model_LoFTR(ckpt_path, device):
    '''Build a LoFTR matcher and load its weights from a checkpoint.

    Args:
        ckpt_path: path of a checkpoint whose "state_dict" entry holds the weights.
        device: device the model is moved to.

    Returns:
        The model on `device`, switched to eval mode.
    '''
    model = KF.LoFTR(pretrained=None)
    # Deserialize on the CPU so loading does not depend on the device the
    # checkpoint was saved from. The model is moved to `device` afterwards.
    checkpoint = torch.load(ckpt_path, map_location="cpu")
    model.load_state_dict(checkpoint["state_dict"])
    model = model.to(device).eval()
    return model

def _convert_image_loftr(img, img_size=-1, lrflip=False, ulflip=False):
    '''Resize, crop, optionally flip and convert an image to a tensor.

    Args:
        img: image array with height and width as its first two axes. It is
            treated as BGR (the caller passes the result of cv2.imread).
        img_size: target length of the longest edge; values <= 0 skip resizing.
        lrflip: flip horizontally.
        ulflip: flip vertically.

    Returns:
        (tensor, scale): the converted image divided by 255 with channels
        reordered to RGB, and the resize factor that was applied (1.0 when
        no resize was done).
    '''

    # Resize so that the longest edge becomes img_size.
    scale = 1.0
    if img_size > 0:
        src_h, src_w = img.shape[:2]
        scale = img_size / max(src_w, src_h)
        dst_size = (int(src_w * scale + 0.5), int(src_h * scale + 0.5))
        # Cubic interpolation when enlarging, area interpolation when shrinking.
        method = cv2.INTER_CUBIC if scale > 1.0 else cv2.INTER_AREA
        img = cv2.resize(img, dst_size, interpolation=method)

    # Crop so that both sides are multiples of 8
    # (presumably required by the matcher - TODO confirm).
    rows, cols = img.shape[:2]
    img = img[:rows - rows % 8, :cols - cols % 8]

    # Optional test-time-augmentation flips.
    if lrflip:
        img = cv2.flip(img, 1)
    if ulflip:
        img = cv2.flip(img, 0)

    # Convert to a float tensor and reorder the channels.
    tensor = K.image_to_tensor(img, False).float() / 255.
    tensor = K.color.bgr_to_rgb(tensor)

    return tensor, scale

def matching_LoFTR(input_image_1, input_image_2, param):
    '''Match two images with LoFTR, using flipped copies as test-time augmentation.

    Every flip variant of the pair is run through the matcher in one batch. The
    keypoints found on flipped images are mirrored back, all variants are
    concatenated, and the coordinates are divided by the resize factor.

    Args:
        input_image_1, input_image_2: images as accepted by _convert_image_loftr.
        param: dict with "img_size" (longest edge used for matching), "model"
            (the matcher) and "device".

    Returns:
        (mkpts1, mkpts2, confidence): matched keypoints of image 1 and image 2,
        and the confidence of every match.
    '''

    img_size = param["img_size"]
    matcher = param["model"]
    device = param["device"]

    # Flip variants as (lrflip, ulflip). The position in this list is the batch
    # index of the variant, so the same list drives both building the batch and
    # undoing the flips.
    flip_variants = [(False, False), (True, False)]

    image_1_list = []
    image_2_list = []
    for lrflip, ulflip in flip_variants:
        # The resize factor does not depend on the flip, so the last value is kept.
        _image_1, scale_1 = _convert_image_loftr(input_image_1, img_size=img_size, lrflip=lrflip, ulflip=ulflip)
        _image_2, scale_2 = _convert_image_loftr(input_image_2, img_size=img_size, lrflip=lrflip, ulflip=ulflip)
        image_1_list.append(_image_1)
        image_2_list.append(_image_2)

    image_1 = torch.cat(image_1_list, dim=0)
    image_2 = torch.cat(image_2_list, dim=0)

    input_dict = {"image0": K.color.rgb_to_grayscale(image_1).to(device),
                  "image1": K.color.rgb_to_grayscale(image_2).to(device)}

    with torch.no_grad():
        correspondences = matcher(input_dict)

    mkpts1 = correspondences["keypoints0"].cpu().numpy()
    mkpts2 = correspondences["keypoints1"].cpu().numpy()
    confidence = correspondences["confidence"].cpu().numpy()
    batch_indexes = correspondences["batch_indexes"].cpu().numpy()

    mkpts1_all = []
    mkpts2_all = []
    confidence_all = []
    for batch_id, (lrflip, ulflip) in enumerate(flip_variants):
        # Boolean-mask indexing returns copies, so the in-place edits below do
        # not touch the matcher output.
        idx = batch_indexes == batch_id
        _mkpts1 = mkpts1[idx]
        _mkpts2 = mkpts2[idx]
        _confidence = confidence[idx]

        # Mirror the coordinates found on flipped images back to the unflipped frame.
        # NOTE(review): this mirrors with `size - coord`. If keypoints are pixel
        # indices, the exact inverse of cv2.flip would be `size - 1 - coord`.
        # Confirm the intended convention before changing it.
        if lrflip:
            _mkpts1[:, 0] = image_1.shape[3] - _mkpts1[:, 0]
            _mkpts2[:, 0] = image_2.shape[3] - _mkpts2[:, 0]
        if ulflip:
            _mkpts1[:, 1] = image_1.shape[2] - _mkpts1[:, 1]
            _mkpts2[:, 1] = image_2.shape[2] - _mkpts2[:, 1]

        mkpts1_all.append(_mkpts1)
        mkpts2_all.append(_mkpts2)
        confidence_all.append(_confidence)

    mkpts1 = np.concatenate(mkpts1_all, axis=0)
    mkpts2 = np.concatenate(mkpts2_all, axis=0)
    confidence = np.concatenate(confidence_all, axis=0)

    # Undo the resize so the keypoints refer to the input images.
    mkpts1 /= scale_1
    mkpts2 /= scale_2

    return mkpts1, mkpts2, confidence

# Weighted random sampling
# https://github.com/Parskatt/DKM/blob/988ccbc1021459b807411c3eb683d0e3432b2a15/dkm/models/dkm.py#L629-L645
def points_sample(dense_matches_1, dense_matches_2, dense_confidence, num = 2000, relative_confidence_threshold = 0.0):
    '''Draw up to `num` matches without replacement, weighted by confidence.

    Args:
        dense_matches_1, dense_matches_2: arrays of matched points, aligned row by row.
        dense_confidence: 1-D array with the confidence of every match.
        num: maximum number of matches to return.
        relative_confidence_threshold: matches whose confidence divided by the
            maximum confidence is not above this value are discarded first.

    Returns:
        (matches_1, matches_2, confidence) of the sampled matches. All three are
        empty when there is nothing to sample from.
    '''
    matches_1 = dense_matches_1
    matches_2 = dense_matches_2
    confidence = dense_confidence

    # No matches at all, or no positive confidence: max() of an empty array and
    # the 0/0 normalisation would both fail, so return empty results instead.
    if len(confidence) == 0 or confidence.max() == 0:
        return matches_1[:0], matches_2[:0], confidence[:0]

    keep = confidence / confidence.max() > relative_confidence_threshold
    matches_1, matches_2, confidence = matches_1[keep], matches_2[keep], confidence[keep]
    # np.random.choice rejects an empty probability vector.
    if len(confidence) == 0:
        return matches_1, matches_2, confidence

    # Normalise in float64 so the probabilities sum to 1 as closely as possible.
    weights = confidence.astype(np.float64)
    good_samples = np.random.choice(
        np.arange(len(matches_1)),
        size=min(num, len(confidence)),
        replace=False,
        p=weights / weights.sum(),
    )
    return matches_1[good_samples], matches_2[good_samples], confidence[good_samples]

# Validation

In [None]:
import os
import pandas as pd
from tqdm.notebook import tqdm as tqdm

# Locations of the training data, the per-scene pair tables, the model weights
# and the working directory.
input_dir = "/kaggle/input/image-matching-challenge-2022/train"
csv_dir = "/kaggle/input/imc2022-validation-csv"
weight_dir = "/kaggle/input"
output_dir = "/kaggle/working"
output_csv = "submission.csv"

device = torch.device("cuda")

# Matcher settings: the model and the longest image edge used for matching.
LoFTR_param = {
    "device": device,
    "model": load_model_LoFTR(os.path.join(weight_dir, "kornia-loftr", "loftr_outdoor.ckpt"), device),
    "img_size": 840,
}
# Arguments forwarded to cv2.findFundamentalMat.
findFMat_param = {
    "ransacReprojThreshold": 0.15,
    "confidence": 0.9999,
    "maxIters": 10000,
}

total_time = 0
sample_id_list_all = []
fund_matrix_list_all = []
for scene in scene_list:

    image_dir = os.path.join(input_dir, scene, "images")

    # Load the pair table of this scene.
    df = pd.read_csv(os.path.join(csv_dir, f"{scene}.csv"))
    # Keep only pairs whose covisibility lies strictly between the two thresholds.
    df = df[(df["covisibility"] > cfg["covisibility_thr_min"]) & (df["covisibility"] < cfg["covisibility_thr_max"])]
    # Exclude small images
    df = df[df["min_longest_edge"] > cfg["min_longest_edge"]]
    # Exclude camera pairs that are far apart in the Z direction
    df = df[df["dZ"] < cfg["dZ_thr"]]
    # Exclude images that are heavily tilted
    df = df[(df["image_1_rotation"] < cfg["rotation_thr"]) & (df["image_2_rotation"] < cfg["rotation_thr"])]

    # Random sample with a fixed seed, so the same pairs are used on every run.
    df = df.sample(n=min(len(df), cfg["max_num_pairs"]), random_state=42)

    sample_ids = df["sample_id"].values
    image_1_ids = df["image_1_id"].values
    image_2_ids = df["image_2_id"].values

    sample_id_list = []
    fund_matrix_list = []
    for sample_id, image_1_id, image_2_id in tqdm(zip(sample_ids, image_1_ids, image_2_ids), desc=scene, total=len(sample_ids), dynamic_ncols=True):

        start = time.time()
        # 1. Preprocess(e.g. load image)
        input_image_1 = cv2.imread(os.path.join(image_dir, f"{image_1_id}.jpg"))
        input_image_2 = cv2.imread(os.path.join(image_dir, f"{image_2_id}.jpg"))

        # 2. Inference(e.g. matcher)
        mkpts1, mkpts2, confidence = matching_LoFTR(input_image_1, input_image_2, LoFTR_param)

        # 3. Postprocess(e.g. RANSAC)
        mkpts1, mkpts2, _ = points_sample(mkpts1, mkpts2, confidence)
        # Fall back to an all-zero matrix whenever no usable estimate is available.
        F = None
        if len(mkpts1) > 7:
            F, _ = cv2.findFundamentalMat(
                mkpts1, mkpts2, cv2.USAC_MAGSAC,
                findFMat_param["ransacReprojThreshold"],
                findFMat_param["confidence"],
                findFMat_param["maxIters"])
        # findFundamentalMat can return None when no model is found, and a
        # result that is not 3x3 is rejected as well.
        if F is None or F.shape != (3, 3):
            F = np.zeros((3, 3))

        sample_id_list.append(sample_id)
        fund_matrix_list.append(FlattenMatrix(F))

        end = time.time()
        total_time += end - start

    sample_id_list_all.extend(sample_id_list)
    fund_matrix_list_all.extend(fund_matrix_list)

# Evaluation
maa = evaluate(input_dir, sample_id_list_all, fund_matrix_list_all)
num_pairs = len(sample_id_list_all)
# Avoid a ZeroDivisionError in the report when every pair was filtered out.
sec_per_pair = total_time / num_pairs if num_pairs else float("nan")
print(f'mAA={maa:.05f} (n={num_pairs}), elapsed time: {total_time/60.0:.2f} min -> {sec_per_pair:.2f} sec/pair')