# CipherFace

In [1]:
# built-in dependencies
import os
import time
from typing import List
import pickle
import base64
import warnings
warnings.filterwarnings('ignore')

# 3rd party dependencies
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_lfw_pairs
from deepface import DeepFace
import tenseal as ts

2024-12-16 17:19:51.929023: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-16 17:19:52.030094: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-12-16 17:19:52.030126: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Record the DeepFace release in the notebook output so the numbers below can be
# tied to a specific library version.
print(f"Experiments done in DeepFace {DeepFace.__version__}")

Experiments done in DeepFace 0.0.94


In [3]:
# Experiment settings read by the cells below.
detector_backend = "mtcnn" # face detector handed to DeepFace.represent
model_name = "Facenet512" # Set to "Facenet", "Facenet512" or "VGG-Face"
distance_metric = "cosine" # Set to "euclidean" or "cosine"

In [4]:
# tenseal cryptosystem configurations - these are all offering 128-bit security
# Each entry is (poly_modulus_degree, coeff_mod_bit_sizes, scale exponent); the
# context-creation cell unpacks it in that order and sets global_scale = 2**scale.
configs = [
    (8192, [60, 40, 40, 60], 40),
    (16384, [31, 60, 60, 60, 60, 60, 60, 31], 60),
]

# set your cryptosystem's configuration here
cs_config = configs[1]

In [None]:
# Decision thresholds, keyed by model name and then by distance metric. A pair is
# predicted "same person" when its distance falls under the selected threshold.
# NOTE(review): `config` is easy to confuse with `configs` (the cryptosystem
# settings defined above) — consider a more specific name.
# NOTE(review): the cosine values are presumably tuned for the min-max normalised
# embeddings used further down — confirm before reusing them elsewhere.
config = {
    "Facenet512": {
        "euclidean": 24.35,
        # alternative cosine threshold kept for reference:
        # "cosine": 0.026798495128309553,
        "cosine": 0.02232566879533769,

    },
    "Facenet": {
        "euclidean": 12.28,
        "cosine": 0.02431508799003538,
    },
    "VGG-Face": {
        "euclidean": 1.17,
        "cosine": 0.663973458216446,
    }
}

# threshold used by both the plain and the encrypted verification below
threshold = config[model_name][distance_metric]

# Prepare LFW Dataset

In [6]:
# Locations used by the dataset cells (all relative to the notebook's folder).
target_path = "../lfwe/test" # folder the pair images are exported to as <i>_1.jpg / <i>_2.jpg
lfw_input_file = "../dataset/test_lfw.npy" # cached image pairs
lfw_label_file = "../dataset/test_labels.npy" # cached same/different labels

In [7]:
# Download the LFW "test" pairs once and cache them as .npy files; later runs
# reload the cache instead of pulling the dataset again.
if not (os.path.exists(lfw_input_file) and os.path.exists(lfw_label_file)):
    print("pulling LFW dataset")

    # np.save does not create missing folders
    for cache_file in (lfw_input_file, lfw_label_file):
        os.makedirs(os.path.dirname(cache_file) or ".", exist_ok=True)

    # Keep the result under its own name: rebinding `fetch_lfw_pairs` would shadow
    # the imported sklearn function and break a re-run of this cell.
    lfw_dataset = fetch_lfw_pairs(
        subset="test",
        color=True,
        resize=2,
        funneled=False,
        slice_=None,
    )

    pairs = lfw_dataset.pairs
    labels = lfw_dataset.target

    np.save(lfw_input_file, pairs)
    np.save(lfw_label_file, labels)
else:
    print("LFW dataset is already pulled")
    # The export cell below reads `pairs` whenever an image file is missing, so it
    # must be defined on this branch too. Memory-map it so the array is only read
    # from disk if an image really has to be re-exported.
    pairs = np.load(lfw_input_file, mmap_mode="r")
    labels = np.load(lfw_label_file)

LFW dataset is already pulled


In [8]:
# Export every pair as <i>_1.jpg / <i>_2.jpg under target_path, skipping files
# that already exist so the cell is cheap to re-run.
os.makedirs(target_path, exist_ok=True)  # plt.imsave does not create the folder

for i in tqdm(range(len(labels))):
    for position in (0, 1):
        img_target = f"{target_path}/{i}_{position + 1}.jpg"
        if os.path.exists(img_target):
            continue

        img = np.asarray(pairs[i][position])
        # plt.imsave expects float RGB values in [0, 1]. Depending on the
        # environment the pixels arrive in [0, 255] or already in [0, 1]
        # (previously handled by commenting lines in and out per machine),
        # so rescale only when needed.
        if img.max() > 1:
            img = img / 255
        plt.imsave(img_target, img)

100%|██████████| 1000/1000 [00:00<00:00, 245942.54it/s]


# Find Embeddings

In [9]:
def find_embeddings(img_path: str) -> List[list]:
    """
    Compute one embedding per face that DeepFace reports for an image.

    Uses the notebook-level `model_name` and `detector_backend` settings and
    calls DeepFace.represent with enforce_detection=False.

    Args:
        img_path (str): path of the image file to process
    Returns:
        results (list): the "embedding" entry of every object returned by
            DeepFace.represent, in the order returned
    Raises:
        ValueError: if img_path does not exist
    """
    if not os.path.exists(img_path):
        raise ValueError(f"{img_path} not found")

    face_objs = DeepFace.represent(
        img_path=img_path,
        model_name=model_name,
        detector_backend=detector_backend,
        enforce_detection=False,
    )

    return [face_obj["embedding"] for face_obj in face_objs]

In [10]:
# every file found anywhere below target_path, as "<folder>/<file name>"
img_paths = [
    f"{folder}/{file_name}"
    for folder, _, file_names in os.walk(target_path)
    for file_name in file_names
]

In [11]:
# Sanity check: the export cell writes two images per pair, so the count should
# be twice the number of pairs.
print(f"there are {len(img_paths)} images available")

there are 2000 images available


In [12]:
# Embedding extraction is cached per (model, detector) combination.
embeddings_file = f"embeddings_{model_name}_{detector_backend}.pkl"

if os.path.exists(embeddings_file):
    print("embeddings calculated already")
    # NOTE: pickle.load executes arbitrary code - only load cache files that
    # were produced by this notebook.
    with open(embeddings_file, 'rb') as file:
        instances = pickle.load(file)
else:
    # one (img_path, embedding) tuple per face, so an image may appear several times
    instances = [
        (img_path, embedding)
        for img_path in tqdm(img_paths)
        for embedding in find_embeddings(img_path)
    ]

    with open(embeddings_file, 'wb') as file:
        pickle.dump(instances, file)

embeddings calculated already


In [13]:
# For euclidean distance the differences are squared, so negative embedding values
# are not a problem. For cosine, negative values confuse the homomorphic
# computation, so all embeddings are min-max scaled into [0, 1] using the global
# minimum and maximum over every embedding value.
if distance_metric == "cosine":
    assert len(instances) > 0, "no embeddings available to normalise"

    all_values = np.concatenate(
        [np.asarray(embedding, dtype=float) for _, embedding in instances]
    )
    min_val = float(all_values.min())
    max_val = float(all_values.max())

    print(f"min: {min_val}, max: {max_val}")

    value_range = max_val - min_val
    # a zero range would turn every value into nan/inf through division by zero
    assert value_range > 0, f"cannot min-max normalise, {min_val=} equals {max_val=}"

    # Build new tuples instead of overwriting the loaded lists element by element;
    # embeddings stay plain Python lists as before.
    instances = [
        (img_path, ((np.asarray(embedding, dtype=float) - min_val) / value_range).tolist())
        for img_path, embedding in instances
    ]

min: -4.779830455780029, max: 4.792800426483154


In [14]:
# Cosine only: scale every embedding to unit L2 length up front, so that the
# encrypted side can get the cosine similarity from a plain dot product.
norm_instances = []
if distance_metric == "cosine":
    norm_instances = [
        (img_path, embedding / np.linalg.norm(embedding))
        for img_path, embedding in instances
    ]
    instances = norm_instances

# Performance On Plain Embeddings

In [15]:
# One row per (image, face embedding) tuple collected in `instances`.
columns = ["img_path", "embedding"]

df = pd.DataFrame(instances, columns = columns)

In [16]:
# Preview the first rows. The same img_path can occupy several rows, because
# find_embeddings returns one embedding per face it gets back for an image.
df.head()

Unnamed: 0,img_path,embedding
0,../lfwe/test/253_2.jpg,"[0.06216024060562548, 0.04315538045304618, 0.0..."
1,../lfwe/test/858_1.jpg,"[0.03829266215076155, 0.06136259995523283, 0.0..."
2,../lfwe/test/214_1.jpg,"[0.04578124460758277, 0.05695021381085548, 0.0..."
3,../lfwe/test/104_2.jpg,"[0.04828243252411915, 0.05915359048645701, 0.0..."
4,../lfwe/test/104_2.jpg,"[0.03571250223695061, 0.03882295279330867, 0.0..."


In [17]:
def find_euclidean_distance(
    source_representation: list, test_representation: list
) -> np.float64:
    """
    Euclidean (L2) distance between two representations.

    Args:
        source_representation (list): 1st vector.
        test_representation (list): 2nd vector, same shape as the first.

    Returns:
        np.float64: square root of the sum of squared element-wise differences.
            The sum runs over every element, so the result is a single scalar
            even when multi-dimensional inputs are passed.
    """
    source_vector = np.asarray(source_representation)
    test_vector = np.asarray(test_representation)

    delta = source_vector - test_vector
    squared_sum = np.sum(np.multiply(delta, delta))

    return np.sqrt(squared_sum)

def find_cosine_distance(
    source_representation: list, test_representation: list
) -> np.float64:
    """
    Cosine distance between two vectors, i.e. 1 - cosine similarity.

    Args:
        source_representation (list): 1st vector.
        test_representation (list): 2nd vector.

    Returns:
        np.float64: 1 - dot(a, b) / (||a|| * ||b||)
    """
    a = np.array(source_representation)
    b = np.array(test_representation)

    # cosine similarity = dot product divided by the product of the L2 norms
    similarity = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

    return 1 - similarity

def norm(x: list) -> np.floating:
    """
    Return the L2 norm (length) of the input vector.

    Note that this returns the norm itself, not the normalized vector; divide
    x by the result to obtain a unit-length vector.

    Args:
        x (np.ndarray or list): given vector
    Returns:
        y (np.floating): value of np.linalg.norm(x)
    """
    # An earlier variant returned the normalized vector instead:
    #   x = np.array(x)
    #   return x / np.sqrt(np.sum(np.multiply(x, x)))
    return np.linalg.norm(x)

In [18]:
# Plain (unencrypted) baseline: for every pair keep the smallest distance over all
# face combinations of its two images.
distance_fn = (
    find_euclidean_distance if distance_metric == "euclidean" else find_cosine_distance
)

pivot_distances = []
for i in tqdm(range(len(labels))):
    # same naming scheme and folder the export cell used
    img1_target = f"{target_path}/{i}_1.jpg"
    img2_target = f"{target_path}/{i}_2.jpg"

    alphas = df.loc[df["img_path"] == img1_target, "embedding"].values
    betas = df.loc[df["img_path"] == img2_target, "embedding"].values

    distances = [distance_fn(alpha, beta) for alpha in alphas for beta in betas]
    if not distances:
        # would otherwise surface as an opaque "min() arg is an empty sequence"
        raise ValueError(f"no embeddings found for {img1_target} and/or {img2_target}")

    pivot_distances.append(min(distances))

100%|██████████| 1000/1000 [00:00<00:00, 1803.14it/s]


In [19]:
# Plain verification: compare every pair's distance with the threshold.
pivot = pd.DataFrame(pivot_distances, columns = ["distance"])
pivot["actual"] = labels

pivot["prediction"] = 0
# tic/toc time only the threshold comparison itself
tic = time.time()
# `<=` so that the decision rule is the same one the encrypted pipeline applies
is_same_person = pivot["distance"] <= threshold
pivot.loc[is_same_person, "prediction"] = 1
toc = time.time()

In [20]:
# Preview: plain distance, ground-truth label and thresholded prediction per pair.
pivot.head()

Unnamed: 0,distance,actual,prediction
0,0.005686,1,1
1,0.016665,1,1
2,0.017489,1,1
3,0.00501,1,1
4,0.012855,1,1


In [21]:
# to find the best threshold
# Disabled by default (`if False`); change the condition to run it. It relabels the
# plain distances as "Same Person" / "Different Persons" in a "Decision" column and
# passes that frame to Chefboost.fit.
# NOTE(review): presumably the threshold is then read off the fitted tree — confirm.
# chefboost is imported here rather than in the import cell, so it is only required
# when this block is enabled.
if False:
    tmp_df = pivot[["distance", "actual"]].copy()
    idx = tmp_df[tmp_df["actual"] == 1].index
    tmp_df["Decision"] = "Different Persons"
    tmp_df.loc[idx, "Decision"] = "Same Person"
    tmp_df = tmp_df.drop(columns = ["actual"])

    from chefboost import Chefboost
    Chefboost.fit(tmp_df)

In [22]:
# tic/toc were taken around the threshold comparison on the plain distances, so
# this is the cost of that comparison only.
print(f"verification requires an additional {toc - tic} seconds")

verification requires an additional 0.003953218460083008 seconds


In [23]:
# Share of pairs (in percent) whose prediction matches the label; divide by the
# real number of pairs instead of a hard-coded 1000.
accuracy = 100 * len(pivot[pivot["actual"] == pivot["prediction"]]) / len(pivot)

In [24]:
# pivot[pivot.actual == 1].distance.plot.kde()
# pivot[pivot.actual == 0].distance.plot.kde()

In [25]:
# Accuracy on plain embeddings, in percent; the encrypted run is compared against it.
print(f"{accuracy=}")

accuracy=94.9


# Initialize Homomorphic Encryption Object

In [26]:
def write_data(file_name, data):
    """
    Base64-encode binary data and write it to a file.

    This is the counterpart of read_data, which always base64-decodes what it
    reads, so every accepted input is encoded before it is written.

    Args:
        file_name (str): path of the file to (over)write
        data (bytes, bytearray or memoryview): raw binary payload
    Raises:
        TypeError: if data is not bytes-like; raised before the file is opened,
            so an existing file is not truncated by a failed call
    """
    if not isinstance(data, (bytes, bytearray, memoryview)):
        raise TypeError(
            f"data must be bytes-like, got {type(data).__name__}"
        )

    # bytes to base64
    encoded = base64.b64encode(data)

    with open(file_name, 'wb') as f:
        f.write(encoded)

def read_data(file_name):
    """
    Read a base64-encoded file and return the decoded bytes.

    Args:
        file_name (str): path of the file to read
    Returns:
        bytes: base64-decoded content of the file
    """
    with open(file_name, "rb") as handle:
        encoded = handle.read()

    # base64 to bytes
    decoded = base64.b64decode(encoded)
    return decoded

In [27]:
# Build the CKKS context from the selected configuration and store two copies of
# it on disk: one including the secret key and one without it.
mod, coeff, scale = cs_config
context = ts.context(
            ts.SCHEME_TYPE.CKKS,
            poly_modulus_degree = mod,
            coeff_mod_bit_sizes = coeff
          )
# NOTE(review): galois keys are presumably needed by the .dot() calls in the
# homomorphic section — confirm against the TenSEAL documentation.
context.generate_galois_keys()
context.global_scale = 2**scale

# Serialize WITH the secret key first; this has to happen before
# make_context_public() below removes the key from the context.
# NOTE: secret.txt holds the private key (base64-encoded, not otherwise
# protected) in the working directory — do not commit or share it.
secret_context = context.serialize(save_secret_key = True)
write_data("secret.txt", secret_context)

context.make_context_public() # drop the secret key from the context
public_context = context.serialize()
write_data("public.txt", public_context)

# remove the in-memory objects; later sections reload what they need from disk
del context, secret_context, public_context

# Encryption

In [31]:
# Encrypt with the context that was saved together with the secret key.
context = ts.context_from(read_data("secret.txt"))
# context = ts.context_from(read_data("public.txt")) # encryption can be done with public key, too

In [32]:
# Encrypt every embedding as a CKKS vector and keep only its serialized bytes,
# in the same row order as df.
encrypted_embeddings = []
for embedding in tqdm(df["embedding"], total=df.shape[0]):
    enc_v1 = ts.ckks_vector(context, embedding)
    encrypted_embedding = enc_v1.serialize()
    encrypted_embeddings.append(encrypted_embedding)

100%|██████████| 2473/2473 [00:58<00:00, 42.37it/s]


In [33]:
# Same rows as df, with the plain embedding replaced by its encrypted, serialized form.
encrypted_df = df.drop(columns=["embedding"]).assign(
    encrypted_embedding=encrypted_embeddings
)

In [34]:
# Preview: the embeddings are now opaque serialized ciphertext bytes.
encrypted_df.head()

Unnamed: 0,img_path,encrypted_embedding
0,../lfwe/test/253_2.jpg,b'\n\x02\x80\x04\x12\xbf\xdeh^\xa1\x10\x04\x01...
1,../lfwe/test/858_1.jpg,b'\n\x02\x80\x04\x12\xf5\xddh^\xa1\x10\x04\x01...
2,../lfwe/test/214_1.jpg,b'\n\x02\x80\x04\x12\xfd\xddh^\xa1\x10\x04\x01...
3,../lfwe/test/104_2.jpg,b'\n\x02\x80\x04\x12\xb3\xddh^\xa1\x10\x04\x01...
4,../lfwe/test/104_2.jpg,b'\n\x02\x80\x04\x12\xc3\xddh^\xa1\x10\x04\x01...


In [35]:
# Drop the secret-key context and the loop leftovers before the next section
# loads the public context.
del context, enc_v1, encrypted_embedding

# Homomorphic Calculations

In [36]:
# Work with the public context only: it was serialized after the secret key had
# been dropped, so nothing in this section can decrypt.
context = ts.context_from(read_data("public.txt"))

In [37]:
# Encrypted constant 1, used below to turn the encrypted dot product into a
# cosine distance (one - alpha.dot(beta)).
one = ts.ckks_vector(context, [1])
# NOTE(review): `one` was just created from this context, so the explicit link
# looks redundant — confirm.
one.link_context(context)

In [38]:
# For every pair compute the encrypted distance of each face combination using the
# public context only, and keep the serialized results for later decryption.
calculations = []
for i in tqdm(range(len(labels))):
    # same naming scheme and folder the export cell used
    img1_target = f"{target_path}/{i}_1.jpg"
    img2_target = f"{target_path}/{i}_2.jpg"

    alphas_idxs = encrypted_df[encrypted_df["img_path"] == img1_target].index
    betas_idxs = encrypted_df[encrypted_df["img_path"] == img2_target].index

    encrypted_distances = []
    for alphas_idx in alphas_idxs:
        alpha_proto = encrypted_df.loc[alphas_idx, "encrypted_embedding"]
        alpha = ts.lazy_ckks_vector_from(alpha_proto)
        alpha.link_context(context)

        for betas_idx in betas_idxs:
            beta_proto = encrypted_df.loc[betas_idx, "encrypted_embedding"]
            beta = ts.lazy_ckks_vector_from(beta_proto)
            beta.link_context(context)

            if distance_metric == "euclidean":
                # squared euclidean distance: no square root is taken here, which
                # is why the threshold is squared when results are evaluated
                difference = alpha - beta
                encrypted_distance = difference.dot(difference)
            else:
                # embeddings were L2-normalized in advance, so the dot product
                # already is the cosine similarity
                encrypted_distance = one - alpha.dot(beta)

            # if you try to decrypt it here, you will get an exception because this
            # context does not hold the private key
            # encrypted_distance.decrypt()

            encrypted_distances.append(encrypted_distance.serialize())

    calculations.append((img1_target, img2_target, encrypted_distances))

100%|██████████| 1000/1000 [04:32<00:00,  3.67it/s]


In [39]:
# Drop the public context and the loop leftovers; only the serialized results in
# `calculations` are carried into the decryption section.
del context, alpha_proto, beta_proto, alpha, beta

# Decryption

In [40]:
# Decryption needs the context that was saved together with the secret key.
context = ts.context_from(read_data("secret.txt"))

In [41]:
# Decrypt every encrypted distance and keep the smallest one per pair, mirroring
# the min over face combinations used for the plain embeddings.
distances = []
for _, _, encrypted_distances in tqdm(calculations):
    current_distances = []

    for proto in encrypted_distances:
        encrypted_vector = ts.lazy_ckks_vector_from(proto)
        encrypted_vector.link_context(context)

        # the result vector carries the distance in its first slot
        current_distances.append(encrypted_vector.decrypt()[0])

    distances.append(min(current_distances))

100%|██████████| 1000/1000 [00:11<00:00, 83.57it/s]


In [42]:
# Verification on the decrypted distances.
results_df = pd.DataFrame(labels, columns = ["actual"])
results_df["distances"] = distances

# the encrypted euclidean result is a squared distance, so compare it with the
# squared threshold; cosine distances use the threshold as is
results_df["threshold"] = (
    threshold * threshold if distance_metric == "euclidean" else threshold
)

results_df["prediction"] = 0
tic = time.time()
within_threshold = results_df["distances"] <= results_df["threshold"]
results_df.loc[within_threshold, "prediction"] = 1
toc = time.time()

In [43]:
# print(f"verification requires an additional {toc - tic} seconds")

In [44]:
# Preview: label, decrypted distance, applied threshold and prediction per pair.
results_df.head()

Unnamed: 0,actual,distances,threshold,prediction
0,1,0.004778,0.022326,1
1,1,0.015753,0.022326,1
2,1,0.01679,0.022326,1
3,1,0.004142,0.022326,1
4,1,0.011512,0.022326,1


In [45]:
# Inspect a single pair by position. In the recorded output, pair 22 is a
# same-person pair (actual = 1) whose distance lies above the threshold, i.e. a miss.
results_df.iloc[22]

actual        1.000000
distances     0.029641
threshold     0.022326
prediction    0.000000
Name: 22, dtype: float64

In [46]:
# Accuracy of the encrypted pipeline in percent; divide by the real number of
# pairs instead of a hard-coded 1000.
homomorphic_accuracy = (
    100 * len(results_df[results_df["actual"] == results_df["prediction"]]) / len(results_df)
)
print(f"accuracy was {accuracy} in plain verification whereas it is {homomorphic_accuracy}")

accuracy was 94.9 in plain verification whereas it is 95.0


In [47]:
# Expect (nearly) the same accuracy as the plain embedding comparison: the check
# tolerates a difference of less than one percentage point and fails otherwise,
# showing the observed error in the assertion message.
error = abs(accuracy - homomorphic_accuracy)
assert error < 1, f"{error=}"

In [48]:
# absolute accuracy difference between plain and encrypted verification, in percentage points
error

0.09999999999999432