# Validation of labelling Methods
## Comparing two proposed methods: the Mahalanobis-based method and the KRR+Mahalanobis-based method

# ------------------------------------------------------------------------------------------

## Prepare

In [None]:
#import
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPool2D
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from keras.callbacks import EarlyStopping
from tensorflow.keras.models import save_model, load_model

from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Input, Lambda
from tensorflow.keras.utils import plot_model, to_categorical
from keras.callbacks import TensorBoard

import matplotlib.pyplot as plt
import plotly.express as px
import glob
import cv2
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import random
import time
from mpl_toolkits.mplot3d import Axes3D

import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.spatial import distance

from tensorflow.keras.applications import MobileNet
from tensorflow.keras.optimizers import SGD
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import seaborn as sns

import tensorflow.keras.backend as K
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from keras.models import Model
from gradcamutils import GradCam, GradCamPlusPlus, ScoreCam, GuidedBackPropagation, superimpose, read_and_preprocess_img, build_guided_model

from affine_a_method_det import set_deterministic, fit_affine_A_method_deterministic, inverse_transform_ridge, mahalanobis_sq

In [None]:
#function for making label
def create_image_labels(n_light, n_water, n_blackline, n_discoloration, n_dotsonline, n_adhesion, n_scratch):
    """
    Build the image-name / label table for the seven abnormal modes.

    Args:
        n_light (int): the number of light images
        n_water (int): the number of water images
        n_blackline (int): the number of black line images
        n_discoloration (int): the number of discoloration images
        n_dotsonline (int): the number of dots on line images
        n_adhesion (int): the number of adhesion images
        n_scratch (int): the number of surface scratch images

    Returns:
        pd.DataFrame: column "image" holds generated names such as "Light-1",
        column "label" holds the integer class id (0-6). Rows are ordered by
        class id, then by the 1-based index inside the class.
    """
    # (name prefix, count) listed in label order: the position is the class id.
    modes = [
        ("Light", n_light),
        ("Water", n_water),
        ("BlackLine", n_blackline),
        ("Discoloration", n_discoloration),
        ("DotsOnLine", n_dotsonline),
        ("Adhesion", n_adhesion),
        ("SurfaceScratch", n_scratch),
    ]

    names = []
    labels = []
    for label, (prefix, count) in enumerate(modes):
        names.extend(f"{prefix}-{i}" for i in range(1, count + 1))
        labels.extend([label] * count)

    # The original referenced undefined `copper_labels` / `spark_labels` here,
    # which raised NameError; names and labels are now built side by side.
    return pd.DataFrame({"image": names, "label": labels}, columns=["image", "label"])

def load_7mode_images(src_dirs):
    """
    Load the images of the seven abnormal modes and build their label table.

    src_dirs: list of seven glob patterns, one per mode, in class order
              ex: [
                   r"C:/path/mode0/*jpg",
                   r"C:/path/mode1/*jpg",
                   ...
                  ]

    return:
      all_images      : list of all images (as returned by cv2.imread), class 0 first
      images_by_class : list of per-class image lists [list0, list1, ..., list6]
      labels          : pd.DataFrame from create_image_labels ("image", "label"),
                        one row per successfully loaded image
      nums            : list with the number of loaded images per class [n0,n1,...,n6]
    """

    assert len(src_dirs) == 7, "Designate 7 folder pass"

    images_by_class = []
    nums = []

    # Read every file matched by each pattern; files cv2 cannot decode
    # (imread returns None) are skipped and therefore not counted.
    for i, path in enumerate(src_dirs):
        filepaths = glob.glob(path)
        print(f"Class {i}: {len(filepaths)} files")

        imgs = []
        for fp in filepaths:
            img = cv2.imread(fp)
            if img is not None:
                imgs.append(img)

        images_by_class.append(imgs)
        nums.append(len(imgs))

    # Flatten in class order so positions line up with the label table.
    all_images = [img for cls_imgs in images_by_class for img in cls_imgs]

    # create_image_labels takes seven separate counts; passing the list itself
    # (as the original did) raised TypeError, so unpack it.
    labels = create_image_labels(*nums)

    return all_images, images_by_class, labels, nums

# =========================
# split the data to 7 clusters and make X_clusters / Y_clusters 
# =========================
def prepare_clusters(source_features, target_feature, n_src=240, n_tgt=240, K=7):
    """
    Cut two class-ordered feature matrices into K consecutive per-class blocks.

    source_features: array-like with n_src*K rows (class 0 rows first, then class 1, ...)
    target_feature : array-like with n_tgt*K rows, ordered the same way
    n_src, n_tgt   : number of rows per class on the source / target side
    K              : number of classes

    return:
      X_clusters, Y_clusters: two lists of K float64 arrays (row slices of the inputs)

    raises:
      ValueError if either input does not have exactly n_src*K / n_tgt*K rows
    """
    X_all = np.asarray(source_features, dtype=np.float64)
    Y_all = np.asarray(target_feature, dtype=np.float64)

    rows_match = X_all.shape[0] == n_src * K and Y_all.shape[0] == n_tgt * K
    if not rows_match:
        raise ValueError(f"the number of sample is different: X_all={X_all.shape}, Y_all={Y_all.shape}, Ideal=({n_src*K},{n_tgt*K})")

    X_clusters = [X_all[start:start + n_src] for start in range(0, n_src * K, n_src)] if n_src > 0 else [X_all[0:0] for _ in range(K)]
    Y_clusters = [Y_all[start:start + n_tgt] for start in range(0, n_tgt * K, n_tgt)] if n_tgt > 0 else [Y_all[0:0] for _ in range(K)]
    return X_clusters, Y_clusters
    

In [None]:
# Paths to the image folders.
# Every list has seven entries, one per abnormal mode in class order (0-6).
# load_7mode_images hands each entry to glob.glob, so each entry is expected
# to be a glob pattern such as r"C:/path/mode0/*jpg".
# NOTE(review): the values below look like anonymised placeholders - replace
# them with the real patterns before running.

# raw wire images
# training data
source_dirs = [
    r"C:\Users\pass",
    r"C:\Users\pass",
    r"C:\Users\pass",
    r"C:\Users\pass",
    r"C:\Users\pass",
    r"C:\Users\pass",
    r"C:\Users\pass",
]
# test data
source_test_dirs = [
    r"C:\Users\pass",
    r"C:\Users\pass",
    r"C:\Users\pass",
    r"C:\Users\pass",
    r"C:\Users\pass",
    r"C:\Users\pass",
    r"C:\Users\pass",
]

# bright images (referred to as "target1" in the rest of the notebook)
# training data
target1_dirs = [
    r"C:\Users\pass",
    r"C:\Users\pass",
    r"C:\Users\pass",
    r"C:\Users\pass",
    r"C:\Users\pass",
    r"C:\Users\pass",
    r"C:\Users\pass",
]
# test data
test_target1_dirs = [
    r"C:\Users\pass",
    r"C:\Users\pass",
    r"C:\Users\pass",
    r"C:\Users\pass",
    r"C:\Users\pass",
    r"C:\Users\pass",
    r"C:\Users\pass",
]

# camera dust images (referred to as "target2" in the rest of the notebook)
# training data
target2_dirs = [
    r"C:\Users\pass",
    r"C:\Users\pass",
    r"C:\Users\pass",
    r"C:\Users\pass",
    r"C:\Users\pass",
    r"C:\Users\pass",
    r"C:\Users\pass",
]
# test data
test_target2_dirs = [
    r"C:\Users\pass",
    r"C:\Users\pass",
    r"C:\Users\pass",
    r"C:\Users\pass",
    r"C:\Users\pass",
    r"C:\Users\pass",
    r"C:\Users\pass",
]


In [None]:
# Load images and labels for every dataset.
# Each call returns, in this order: the flat image list, the per-class image
# lists, the label table (DataFrame with "image" / "label" columns) and the
# per-class image counts.
source_all_imgs, source_imgs_by_class, source_label, source_num = load_7mode_images(source_dirs)
source_test_all_imgs, source_test_imgs_by_class, source_test_label, source_test_num = load_7mode_images(source_test_dirs)
target1_all_imgs, target1_imgs_by_class, target1_label, target1_num = load_7mode_images(target1_dirs)
test_target1_all_imgs, test_target1_imgs_by_class, test_target1_label, test_target1_num = load_7mode_images(test_target1_dirs)
target2_all_imgs, target2_imgs_by_class, target2_label, target2_num = load_7mode_images(target2_dirs)
test_target2_all_imgs, test_target2_imgs_by_class, test_target2_label, test_target2_num = load_7mode_images(test_target2_dirs)

In [None]:
# Normalization of images, label encoding and conversion to numpy arrays.


def _scale_and_resize(images, dsize=(360, 270)):
    """Divide pixel values by 255 (as float) and resize each image with cv2.resize(img, dsize)."""
    return [cv2.resize(img.astype(float) / 255, dsize) for img in images]


# Scaled and resized images (still Python lists at this point).
sourcefile_list = _scale_and_resize(source_all_imgs)
sourcefile_test_list = _scale_and_resize(source_test_all_imgs)
targetfile1_list = _scale_and_resize(target1_all_imgs)
test_targetfile1_list = _scale_and_resize(test_target1_all_imgs)
targetfile2_list = _scale_and_resize(target2_all_imgs)
test_targetfile2_list = _scale_and_resize(test_target2_all_imgs)

# Integer class ids as numpy arrays. These must be taken before the
# *_label names are overwritten with their one-hot versions below.
original_source_label = np.array(source_label["label"])
original_source_test_label = np.array(source_test_label["label"])
original_target1_label = np.array(target1_label["label"])
original_test_target1_label = np.array(test_target1_label["label"])
original_target2_label = np.array(target2_label["label"])
original_test_target2_label = np.array(test_target2_label["label"])

# One-hot (dummy variable) encoding; replaces the label DataFrames.
source_label = to_categorical(source_label["label"])
source_test_label = to_categorical(source_test_label["label"])
target1_label = to_categorical(target1_label["label"])
test_target1_label = to_categorical(test_target1_label["label"])
target2_label = to_categorical(target2_label["label"])
test_target2_label = to_categorical(test_target2_label["label"])

# Keep references to the list form of the data ...
raw_sourcefile_list = sourcefile_list
raw_sourcefile_test_list = sourcefile_test_list
raw_targetfile1_list = targetfile1_list
raw_test_targetfile1_list = test_targetfile1_list
raw_targetfile2_list = targetfile2_list
raw_test_targetfile2_list = test_targetfile2_list

# ... and rebind the working names to stacked numpy arrays.
sourcefile_list = np.array(sourcefile_list)
sourcefile_test_list = np.array(sourcefile_test_list)
targetfile1_list = np.array(targetfile1_list)
test_targetfile1_list = np.array(test_targetfile1_list)
targetfile2_list = np.array(targetfile2_list)
test_targetfile2_list = np.array(test_targetfile2_list)

In [None]:
# Load the trained base model from disk.
base_model = load_model('BaseModel.h5')
# Take the output of the fifth layer from the end as the feature map.
# NOTE(review): presumably this is the last convolutional block before the
# classification head - confirm against base_model.summary().
feature_output = base_model.layers[-5].output

# GlobalAveragePooling2D is applied to the selected feature map to obtain
# the feature vector used downstream.
pooled_output = layers.GlobalAveragePooling2D()(feature_output)

# Feature extractor: same input as the base model, pooled features as output.
feature_extractor = Model(inputs=base_model.input, outputs=pooled_output)
feature_extractor.summary()

In [None]:
#extract features(feature vector) from each image
def extract_features(model, images, batch_size=32):
    """
    Run `model.predict` on `images` and return its result.

    model      : object exposing predict(images, batch_size=...)
    images     : input batch passed to predict unchanged
    batch_size : batch size forwarded to predict (default 32, the value that
                 was previously hard-coded)
    """
    return model.predict(images, batch_size=batch_size)

# Feature vectors of the training sets (raw wire, bright, camera dust).
source_features = extract_features(feature_extractor, sourcefile_list)
target1_features = extract_features(feature_extractor, targetfile1_list)
target2_features = extract_features(feature_extractor, targetfile2_list)

# Feature vectors of the corresponding test sets.
source_test_features = extract_features(feature_extractor, sourcefile_test_list)
test_target1_features = extract_features(feature_extractor, test_targetfile1_list)
test_target2_features = extract_features(feature_extractor, test_targetfile2_list)

In [None]:
#reduce the dimension from 1024 to 100
# The PCA basis is fitted on the raw-wire training features only; every other
# dataset is projected onto that same basis so all features share one space.
n_comp = 100
pca = PCA(n_components=n_comp, svd_solver='full', random_state=42)
features_pca = pca.fit_transform(source_features)
# mapping data to the pca space made above
target1_features_pca = pca.transform(target1_features)
target2_features_pca = pca.transform(target2_features)
test_features_pca = pca.transform(source_test_features)
test_target1_features_pca = pca.transform(test_target1_features)
test_target2_features_pca = pca.transform(test_target2_features)

# ------------------------------------------------------------------------------------------

## Mahalanobis-based labelling method

In [None]:
#semi-auto labelling to bright images
# ===== calculate the center of clusters =====
# For every class present in the labelled bright (target1) training set, keep
# the mean vector and the pseudo-inverse of the covariance of its PCA features.
cluster_stats = {}
for label in np.unique(original_target1_label):
    cluster_data = target1_features_pca[original_target1_label == label]
    center = np.mean(cluster_data, axis=0)
    cov_matrix = np.cov(cluster_data, rowvar=False)
    # pinv rather than inv so a singular covariance does not raise.
    cov_inv = np.linalg.pinv(cov_matrix)
    cluster_stats[label] = {
        "center": center,
        "cov_inv": cov_inv
    }

# ===== calculate distance between target1_features_pca and test_target1_features_pca =====
# NOTE(review): the three *_drift_scores lists are never filled in this cell;
# they are kept only so the notebook's global names stay unchanged.
raw_drift_scores = []
target1_drift_scores = []
target2_drift_scores = []
results = []
full_results = []

print("Semi-auto labelling to bright images")
for i, vec in enumerate(test_target1_features_pca):
    # Mahalanobis distance from this test vector to every class centre.
    distances = {}
    for label, stats in cluster_stats.items():
        center = stats["center"]
        cov_inv = stats["cov_inv"]
        dist = distance.mahalanobis(vec, center, cov_inv)
        distances[label] = dist

    # The predicted label is the class with the smallest distance.
    min_dist_label = min(distances, key=distances.get)
    min_dist = distances[min_dist_label]

    results.append({
        "index": i,
        "nearest_cluster": min_dist_label,
        "mahalanobis_distance": round(min_dist, 4)
    })
    # One "distance_from_<class>" column per class actually present; the
    # original hard-coded keys 0..6 and raised KeyError if a class was missing.
    full_row = {"index": i}
    full_row.update({f"distance_from_{lbl}": d for lbl, d in distances.items()})
    full_results.append(full_row)

df_labels = pd.DataFrame(results)[["nearest_cluster"]].rename(columns={"nearest_cluster": "pred_label"})

# NOTE(review): the other labelling cells write to the same file name, so
# each run overwrites the previous CSV - use a distinct path per experiment.
save_path = r"C:\Users\pass\example.csv"
df_labels.to_csv(save_path, index=False, encoding="utf-8-sig")
print("CSV saved:", save_path)

In [None]:
#semi-auto labelling to camera dust images
# ===== calculate the center of clusters =====
# For every class present in the labelled camera-dust (target2) training set,
# keep the mean vector and the pseudo-inverse of the covariance of its PCA
# features.
cluster_stats = {}
for label in np.unique(original_target2_label):
    cluster_data = target2_features_pca[original_target2_label == label]
    center = np.mean(cluster_data, axis=0)
    cov_matrix = np.cov(cluster_data, rowvar=False)
    # pinv rather than inv so a singular covariance does not raise.
    cov_inv = np.linalg.pinv(cov_matrix)
    cluster_stats[label] = {
        "center": center,
        "cov_inv": cov_inv
    }

# ===== calculate distance between target2_features_pca and test_target2_features_pca =====
# NOTE(review): the three *_drift_scores lists are never filled in this cell;
# they are kept only so the notebook's global names stay unchanged.
raw_drift_scores = []
target1_drift_scores = []
target2_drift_scores = []
results = []
full_results = []

print("Semi-auto labelling to camera dust images")
for i, vec in enumerate(test_target2_features_pca):
    # Mahalanobis distance from this test vector to every class centre.
    distances = {}
    for label, stats in cluster_stats.items():
        center = stats["center"]
        cov_inv = stats["cov_inv"]
        dist = distance.mahalanobis(vec, center, cov_inv)
        distances[label] = dist

    # The predicted label is the class with the smallest distance.
    min_dist_label = min(distances, key=distances.get)
    min_dist = distances[min_dist_label]

    results.append({
        "index": i,
        "nearest_cluster": min_dist_label,
        "mahalanobis_distance": round(min_dist, 4)
    })
    # One "distance_from_<class>" column per class actually present; the
    # original hard-coded keys 0..6 and raised KeyError if a class was missing.
    full_row = {"index": i}
    full_row.update({f"distance_from_{lbl}": d for lbl, d in distances.items()})
    full_results.append(full_row)

df_labels = pd.DataFrame(results)[["nearest_cluster"]].rename(columns={"nearest_cluster": "pred_label"})

# NOTE(review): the other labelling cells write to the same file name, so
# each run overwrites the previous CSV - use a distinct path per experiment.
save_path = r"C:\Users\pass\example.csv"
df_labels.to_csv(save_path, index=False, encoding="utf-8-sig")
print("CSV saved:", save_path)

# ------------------------------------------------------------------------------------------

## KRR+Mahalanobis-based labelling method

In [None]:
# =============================
# Sinkhorn OT 
# =============================

def _sqeuclidean_cost(X, Y):
    """
    Pairwise squared Euclidean distances.

    X: (Nx, d)
    Y: (Ny, d)
    return: C_ij = ||X_i - Y_j||^2  (Nx, Ny)
    """
    X = np.asarray(X, dtype=np.float64)
    Y = np.asarray(Y, dtype=np.float64)
    diff = X[:, None, :] - Y[None, :, :]
    return np.sum(diff**2, axis=2)


def sinkhorn_ot_barycentric(X, Y, eps=0.1, n_iter=200, tol=1e-9):
    """
    Entropic optimal transport between X and Y (uniform weights) followed by a
    barycentric projection of X onto the points of Y.

    X      : (Nx, d) source points
    Y      : (Ny, d) target points
    eps    : entropic regularisation strength (> 0); the Gibbs kernel is exp(-C / eps)
    n_iter : maximum number of Sinkhorn iterations
    tol    : stop early once the L1 error of the row marginals drops below this
             value; None or a non-positive value disables the check so that
             exactly n_iter iterations are run

    return:
      X_tilde : (Ny, d) for each Y_j, the average of the rows of X weighted by
                column j of the transport plan
      P       : (Nx, Ny) transport plan

    raises:
      ValueError if X or Y is not 2-D, is empty, or eps is not positive
    """
    X = np.asarray(X, dtype=np.float64)
    Y = np.asarray(Y, dtype=np.float64)
    if X.ndim != 2 or Y.ndim != 2:
        raise ValueError(f"X and Y must be 2-D arrays, got shapes {X.shape} and {Y.shape}")
    Nx = X.shape[0]
    Ny = Y.shape[0]
    if Nx == 0 or Ny == 0:
        raise ValueError("X and Y must each contain at least one sample")
    if eps <= 0:
        raise ValueError(f"eps must be positive, got {eps}")

    # Floor applied wherever a value is used as a divisor, to avoid dividing
    # by zero after exp() underflow.
    floor = 1e-300

    # Uniform marginals.
    a = np.ones(Nx, dtype=np.float64) / Nx
    b = np.ones(Ny, dtype=np.float64) / Ny

    C = _sqeuclidean_cost(X, Y)  # (Nx, Ny)
    gibbs = np.maximum(np.exp(-C / eps), floor)

    u = np.ones(Nx, dtype=np.float64)
    v = np.ones(Ny, dtype=np.float64)

    check_convergence = tol is not None and tol > 0
    K_v = gibbs @ v
    for _ in range(n_iter):
        u = a / np.maximum(K_v, floor)
        v = b / np.maximum(gibbs.T @ u, floor)

        # K_v is reused both for the convergence test and the next u update.
        K_v = gibbs @ v
        # Column marginals are matched by the v update, so only the row
        # marginals need checking. `tol` used to be accepted but ignored.
        if check_convergence and np.abs(u * K_v - a).sum() < tol:
            break

    P = (u[:, None] * gibbs) * v[None, :]

    col_sums = np.maximum(P.sum(axis=0), floor)

    X_tilde = (X.T @ P / col_sums[None, :]).T
    return X_tilde, P


# =============================
# KRR
# =============================

def _rbf_kernel(Y1, Y2, sigma):
    """
    Gaussian (RBF) kernel matrix between two point sets.

    K_ij = exp(-||Y1_i - Y2_j||^2 / (2 sigma^2))

    Y1: (N1, d), Y2: (N2, d)
    sigma: kernel width
    return: (N1, N2)
    """
    left = np.asarray(Y1, dtype=np.float64)
    right = np.asarray(Y2, dtype=np.float64)
    # Broadcast to (N1, N2, d) and reduce over the feature axis.
    sq_dist = ((left[:, None, :] - right[None, :, :]) ** 2).sum(axis=2)
    return np.exp(-sq_dist / (2.0 * sigma**2))

def _safe_cov_simple(X, eps=1e-3):
    """
    Sample covariance of the rows of X with eps added on the diagonal.

    X: (N, d)
    eps: ridge added to the diagonal
    return: covariance + eps * I; with one sample or fewer, just eps * I
    """
    samples = np.asarray(X, dtype=np.float64)
    if samples.shape[0] <= 1:
        # A covariance cannot be estimated from a single row.
        return eps * np.eye(samples.shape[1])

    cov = np.cov(samples, rowvar=False, bias=False)
    return cov + eps * np.eye(cov.shape[0])

def krr_train_one_cluster(Y_train, X_train, sigma, lam_krr, eps_cov=1e-3, X_for_stats=None):
    """
    Fit kernel ridge regression Y -> X for one cluster.

    Y_train: (Ny, d) regression inputs (new data)
    X_train: (Ny, d) regression targets
    sigma: width of the RBF kernel
    lam_krr: ridge term added to the kernel matrix diagonal
    eps_cov: ridge added to the covariance diagonal
    X_for_stats: optional samples used for the cluster mean / covariance;
                 X_train is used when omitted
    return: model_k (dict) with keys "Y_train", "Alpha", "sigma", "lam_krr",
            "mu_X", "Sigma_X"
    """
    inputs = np.asarray(Y_train, dtype=np.float64)
    targets = np.asarray(X_train, dtype=np.float64)
    n_train, _ = inputs.shape  # unpacking also rejects non-2-D input

    # Dual coefficients: solve (K + lam * I) Alpha = X_train.
    gram = _rbf_kernel(inputs, inputs, sigma=sigma)          # (Ny, Ny)
    dual_coef = np.linalg.solve(gram + lam_krr * np.eye(n_train), targets)  # (Ny, d)

    stats_samples = targets if X_for_stats is None else X_for_stats
    stats_samples = np.asarray(stats_samples, dtype=np.float64)

    return {
        "Y_train": inputs,
        "Alpha": dual_coef,
        "sigma": sigma,
        "lam_krr": lam_krr,
        "mu_X": stats_samples.mean(axis=0),
        "Sigma_X": _safe_cov_simple(stats_samples, eps=eps_cov),
    }

def krr_predict(model_k, Y_group):
    """
    Map Y_group -> X_hat with a trained KRR model.

    model_k: dict made by krr_train_one_cluster (uses "Y_train", "Alpha", "sigma")
    Y_group: (M, d)
    return: X_hat: (M, d)
    """
    queries = np.asarray(Y_group, dtype=np.float64)
    # Kernel between the queries and the stored training inputs: (M, Ny).
    cross_kernel = _rbf_kernel(queries, model_k["Y_train"], sigma=model_k["sigma"])
    return cross_kernel @ model_k["Alpha"]                  # (M, d)

# =============================
# learn Y->X for all 7 clusters by using OT+KRR
# =============================

def train_7_clusters_ot_krr(
    X_clusters,           # raw data cluster: list of (Nx_k, d)
    Y_clusters,           # new data cluster: list of (Ny_k, d)
    sigma=10.0,
    lam_krr=1e-3,
    eps_cov=1e-3,
    ot_eps=0.1,
    ot_n_iter=200,
):
    """
    Train one Y -> X model per cluster (seven clusters).

    For each cluster k:
      1. compute the Sinkhorn transport plan between X_k and Y_k
      2. take its barycentric projection, giving one target X~ per row of Y_k
      3. fit f_k: Y -> X~ by KRR; the cluster mean / covariance come from X_k

    return:
      list of 7 model dicts; each is the krr_train_one_cluster result extended
      with the keys "k", "ot_eps" and "ot_n_iter"
    """
    assert len(X_clusters) == 7 and len(Y_clusters) == 7

    trained = []
    for k in range(7):
        source_k = np.asarray(X_clusters[k], dtype=np.float64)
        target_k = np.asarray(Y_clusters[k], dtype=np.float64)

        # Only the projected targets are needed; the plan itself is discarded.
        projected_k, _ = sinkhorn_ot_barycentric(source_k, target_k, eps=ot_eps, n_iter=ot_n_iter)

        model_k = krr_train_one_cluster(
            Y_train=target_k,
            X_train=projected_k,
            sigma=sigma,
            lam_krr=lam_krr,
            eps_cov=eps_cov,
            X_for_stats=source_k,
        )
        model_k.update(k=k, ot_eps=ot_eps, ot_n_iter=ot_n_iter)
        trained.append(model_k)

    return trained

# =============================
# Evaluation Function
# =============================

def evaluate_ot_krr_models_blockwise(
    models_ot_krr,
    X_clusters,
    Y_clusters,
    use_common_sigma=False,
    hybrid_alpha=None,
):
    """
    Evaluate models trained by OT+KRR on the data they were trained on.

    For every cluster k, Y_clusters[k] is mapped with model k and scored
    against the centre of every cluster.

    models_ot_krr   : list of model dicts (uses "mu_X" and "Sigma_X")
    X_clusters      : not used by this function
    Y_clusters      : list of (Ny_k, d) arrays, one per model
    use_common_sigma: score with the average of all cluster covariances
                      instead of each cluster's own covariance
    hybrid_alpha    : if not None, the score is
                      hybrid_alpha * maha + (1 - hybrid_alpha) * squared Euclidean

    return: dict of averages over the clusters
      - self_maha_mean   : mean mahalanobis_sq value to the own centre
      - self_euclid_mean : mean Euclidean distance to the own centre
      - self_is_min_rate : fraction of samples whose lowest score is their own cluster
    """
    n_models = len(models_ot_krr)
    centers = [m["mu_X"] for m in models_ot_krr]
    covs = [m["Sigma_X"] for m in models_ot_krr]
    shared_cov = sum(covs) / len(covs) if use_common_sigma else None

    def _maha_to(points, j):
        # mahalanobis_sq values from each point to centre j.
        cov = shared_cov if use_common_sigma else covs[j]
        return np.array([mahalanobis_sq(p, centers[j], cov) for p in points])

    def _sq_euclid_to(points, j):
        # Squared Euclidean distance from each point to centre j.
        return np.sum((points - centers[j][None, :])**2, axis=1)

    own_maha_means = []
    own_euclid_means = []
    own_is_min_rates = []

    for k in range(n_models):
        mapped = krr_predict(models_ot_krr[k], np.asarray(Y_clusters[k], dtype=np.float64))

        own_maha = _maha_to(mapped, k)
        own_sq_euclid = _sq_euclid_to(mapped, k)

        score_columns = []
        for j in range(n_models):
            score = _maha_to(mapped, j)
            if hybrid_alpha is not None:
                score = hybrid_alpha * score + (1.0 - hybrid_alpha) * _sq_euclid_to(mapped, j)
            score_columns.append(score)

        winners = np.argmin(np.stack(score_columns, axis=1), axis=1)

        own_maha_means.append(own_maha.mean())
        own_euclid_means.append(np.sqrt(own_sq_euclid).mean())
        own_is_min_rates.append((winners == k).mean())

    return {
        "self_maha_mean": float(np.mean(own_maha_means)),
        "self_euclid_mean": float(np.mean(own_euclid_means)),
        "self_is_min_rate": float(np.mean(own_is_min_rates)),
    }


import json
try:
    import pandas as pd
    _HAS_PANDAS = True
except Exception:
    _HAS_PANDAS = False

def run_ot_krr_experiments(
    X_clusters,
    Y_clusters,
    sigma_list=(10.0, 20.0),
    lam_krr_list=(1e-3, 1e-2),
    eps_cov_list=(1e-3,),
    ot_eps_list=(0.1, 0.2),
    ot_n_iter_list=(100,),
    use_common_sigma=False,
    hybrid_alpha=None,
):
    """
    Grid search over the OT+KRR hyper-parameters.

    Every combination of the five value lists is trained with
    train_7_clusters_ot_krr and scored with evaluate_ot_krr_models_blockwise.
    Combinations with a non-finite metric are skipped.

    return:
      logs     : list of dicts (parameters + metrics), one per valid combination
      best_out : parameters + metrics of the combination with the highest
                 "self_is_min_rate" (the first one wins ties), or None when no
                 combination was valid
    """
    total = (len(sigma_list) * len(lam_krr_list) *
             len(eps_cov_list) * len(ot_eps_list) *
             len(ot_n_iter_list))
    print(f"[OT+KRR] Total combinations: {total}")

    # Same visiting order as five nested loops, sigma outermost.
    grid = [
        (sigma, lam_krr, eps_cov, ot_eps, ot_it)
        for sigma in sigma_list
        for lam_krr in lam_krr_list
        for eps_cov in eps_cov_list
        for ot_eps in ot_eps_list
        for ot_it in ot_n_iter_list
    ]
    metric_names = ("self_maha_mean", "self_euclid_mean", "self_is_min_rate")

    logs = []
    best = None  # (metrics, parameters) of the best combination so far

    for cnt, (sigma, lam_krr, eps_cov, ot_eps, ot_it) in enumerate(grid, start=1):
        print(f"  [{cnt}/{total}] sigma={sigma}, lam_krr={lam_krr}, "
              f"eps_cov={eps_cov}, ot_eps={ot_eps}, ot_n_iter={ot_it}")

        params = {
            "sigma": sigma,
            "lam_krr": lam_krr,
            "eps_cov": eps_cov,
            "ot_eps": ot_eps,
            "ot_n_iter": ot_it,
        }

        models = train_7_clusters_ot_krr(X_clusters, Y_clusters, **params)
        eval_res = evaluate_ot_krr_models_blockwise(
            models,
            X_clusters,
            Y_clusters,
            use_common_sigma=use_common_sigma,
            hybrid_alpha=hybrid_alpha,
        )

        if not all(np.isfinite(eval_res.get(name, np.nan)) for name in metric_names):
            print("    → NaN / inf detected in eval_res, skip this combination.")

            continue

        logs.append({**params, **eval_res})

        if best is None or eval_res["self_is_min_rate"] > best[0]["self_is_min_rate"]:
            best = (eval_res, params)

    if best is None:
        print("there is no valid result")
        return logs, None

    best_eval, best_params = best
    return logs, {**best_params, **best_eval}

def ot_krr_logs_to_df(logs):
    """Return `logs` as a pandas DataFrame when pandas was importable, otherwise return `logs` unchanged."""
    return pd.DataFrame(logs) if _HAS_PANDAS else logs

# =============================
# semi-auto labelling to Y_new
# =============================

def semi_auto_label_with_ot_krr(
    Y_new,            # (M, d)
    models_ot_krr,    # 7models trained by train_7_clusters_ot_krr
    use_common_sigma=False,
    hybrid_alpha=None,
):
    """
    Label new samples with the per-cluster OT+KRR models.

    Each model k maps Y_new to X_hat_k = f_k(Y_new); X_hat_k is scored against
    the centre of cluster k. Each sample receives the label of the cluster
    with the lowest score.

    use_common_sigma: score with the average of all cluster covariances
                      instead of each cluster's own covariance
    hybrid_alpha    : if not None, the score is
                      hybrid_alpha * maha + (1 - hybrid_alpha) * squared Euclidean

    return:
      pred_labels: (M,)    index of the lowest-scoring model
      min_scores:  (M,)    score of that model
      all_scores:  (M, K)  scores for all K models
    """
    Y_new = np.asarray(Y_new, dtype=np.float64)
    _n_samples, _n_dims = Y_new.shape  # unpacking also rejects non-2-D input

    centers = [m["mu_X"] for m in models_ot_krr]
    covs = [m["Sigma_X"] for m in models_ot_krr]
    shared_cov = sum(covs) / len(covs) if use_common_sigma else None

    score_columns = []
    for model, center, own_cov in zip(models_ot_krr, centers, covs):
        mapped = krr_predict(model, Y_new)  # (M, d)
        cov = shared_cov if use_common_sigma else own_cov

        score = np.array([mahalanobis_sq(x, center, cov) for x in mapped])
        if hybrid_alpha is not None:
            sq_euclid = np.sum((mapped - center[None, :])**2, axis=1)
            score = hybrid_alpha * score + (1.0 - hybrid_alpha) * sq_euclid

        score_columns.append(score)

    all_scores = np.stack(score_columns, axis=1)  # (M, K)
    return np.argmin(all_scores, axis=1), np.min(all_scores, axis=1), all_scores

In [None]:
#semi-auto labelling to bright images

# 1. prepare clusters
#    X side: features_pca
#    Y side: target1_features_pca
X_clusters1, Y_clusters1 = prepare_clusters(
    features_pca,
    target1_features_pca,
    n_src=240,  # the number of X/mode
    n_tgt=48,   # the number of Y/mode
    K=7
)

# 2. parameter sweep about OT+KRR
ot_krr_logs1, ot_krr_best1 = run_ot_krr_experiments(
    X_clusters1, Y_clusters1,
    sigma_list=(10.0, 20.0),
    lam_krr_list=(1e-3, 1e-2),
    eps_cov_list=(1e-3,),
    ot_eps_list=(1.0, 2.0, 5.0),
    ot_n_iter_list=(100,),
    use_common_sigma=False,
    hybrid_alpha=None,
)
print("OT+KRR Best:", ot_krr_best1)
df_otkrr1 = ot_krr_logs_to_df(ot_krr_logs1)

# 3. train final model by using the most proper parameter
# run_ot_krr_experiments returns None as best result when every combination
# was skipped; fail with a clear message instead of a TypeError on indexing.
if ot_krr_best1 is None:
    raise RuntimeError("OT+KRR sweep for bright images produced no valid parameter combination")
bp1 = ot_krr_best1
models_ot_krr_best1 = train_7_clusters_ot_krr(
    X_clusters1,
    Y_clusters1,
    sigma=bp1["sigma"],
    lam_krr=bp1["lam_krr"],
    eps_cov=bp1["eps_cov"],
    ot_eps=bp1["ot_eps"],
    ot_n_iter=bp1["ot_n_iter"],
)

# 4. semi-auto labelling to Y_new_features_pca
#    Y_new_features_pca: (M, 100) without label
Y_new_features_pca1 = test_target1_features_pca
pred_labels, min_scores, all_scores = semi_auto_label_with_ot_krr(
    Y_new_features_pca1,
    models_ot_krr_best1,
    use_common_sigma=False,
    hybrid_alpha=None,
)

# Square root of the minimum score, computed once and reused below.
min_maha_dist = np.sqrt(min_scores)  # (M,)

df_result = pd.DataFrame({
    "pred_label": pred_labels,
    "min_maha_sq": min_scores,
    "min_maha": min_maha_dist,
})
# CSV output
# NOTE(review): the other labelling cells write to the same file name, so
# each run overwrites the previous CSV - use a distinct path per experiment.
save_path = r"C:\Users\pass\example.csv"
df_result.to_csv(save_path, index=False, encoding="utf-8")
print("CSV saved:", save_path)

df_result.head()

In [None]:
#semi-auto labelling to camera dust images

# 1. prepare clusters
#    X side: features_pca
#    Y side: target2_features_pca
X_clusters2, Y_clusters2 = prepare_clusters(
    features_pca,
    target2_features_pca,
    n_src=240,  # the number of X/mode
    n_tgt=48,   # the number of Y/mode
    K=7
)

# 2. parameter sweep about OT+KRR
ot_krr_logs2, ot_krr_best2 = run_ot_krr_experiments(
    X_clusters2, Y_clusters2,
    sigma_list=(10.0, 20.0),
    lam_krr_list=(1e-3, 1e-2),
    eps_cov_list=(1e-3,),
    ot_eps_list=(1.0, 2.0, 5.0),
    ot_n_iter_list=(100,),
    use_common_sigma=False,
    hybrid_alpha=None,
)
print("OT+KRR Best:", ot_krr_best2)
df_otkrr2 = ot_krr_logs_to_df(ot_krr_logs2)

# 3. train final model by using the most proper parameter
# run_ot_krr_experiments returns None as best result when every combination
# was skipped; fail with a clear message instead of a TypeError on indexing.
if ot_krr_best2 is None:
    raise RuntimeError("OT+KRR sweep for camera dust images produced no valid parameter combination")
bp2 = ot_krr_best2
models_ot_krr_best2 = train_7_clusters_ot_krr(
    X_clusters2,
    Y_clusters2,
    sigma=bp2["sigma"],
    lam_krr=bp2["lam_krr"],
    eps_cov=bp2["eps_cov"],
    ot_eps=bp2["ot_eps"],
    ot_n_iter=bp2["ot_n_iter"],
)

# 4. semi-auto labelling to Y_new_features_pca
#    Y_new_features_pca: (M, 100) without label
Y_new_features_pca2 = test_target2_features_pca
pred_labels, min_scores, all_scores = semi_auto_label_with_ot_krr(
    Y_new_features_pca2,
    models_ot_krr_best2,
    use_common_sigma=False,
    hybrid_alpha=None,
)

# Square root of the minimum score, computed once and reused below.
min_maha_dist = np.sqrt(min_scores)  # (M,)

df_result = pd.DataFrame({
    "pred_label": pred_labels,
    "min_maha_sq": min_scores,
    "min_maha": min_maha_dist,
})
# CSV output
# NOTE(review): the other labelling cells write to the same file name, so
# each run overwrites the previous CSV - use a distinct path per experiment.
save_path = r"C:\Users\pass\example.csv"
df_result.to_csv(save_path, index=False, encoding="utf-8")
print("CSV saved:", save_path)

df_result.head()