In [1]:
import os
import time

import faiss
import numpy as np
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

In [2]:
# Local directory holding the pretrained CLIP files. It is defined once so the
# model and its processor are always loaded from the same location.
MODEL_DIR = "/data/similarities/model/openaiclipvitbasepatch32"

model = CLIPModel.from_pretrained(MODEL_DIR)
processor = CLIPProcessor.from_pretrained(MODEL_DIR)

2023-11-16 15:10:18.146211: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-16 15:10:18.275458: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-16 15:10:18.309716: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-16 15:10:18.859771: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; 

In [3]:
# Function: extract a feature vector from an image file on disk.
def extract_features(image_path, model, processor):
    """Return the image embedding of the picture stored at ``image_path``.

    Args:
        image_path: Path of an image file that PIL can open.
        model: Model exposing ``get_image_features`` (a ``CLIPModel`` in this
            notebook).
        processor: Preprocessor that converts a PIL image into the keyword
            arguments expected by ``model.get_image_features``.

    Returns:
        NumPy array with the features of the single input image (first row of
        the model output). The vector is returned exactly as the model
        produced it; no L2 normalisation is applied here.

    Raises:
        OSError: If the file cannot be opened or decoded by PIL.
    """
    # The context manager releases the file handle as soon as the pixels are
    # read; convert() returns a fully loaded copy that stays valid afterwards.
    with Image.open(image_path) as image_file:
        image = image_file.convert("RGB")
    inputs = processor(images=image, return_tensors="pt")
    # Inference only: skip autograd bookkeeping to save time and memory.
    with torch.no_grad():
        outputs = model.get_image_features(**inputs)
    return outputs[0].detach().numpy()

In [4]:
# Build the Faiss index (flat index, exact L2 distance search).
# The vector dimension is taken from the loaded model's configuration instead
# of being hard-coded, so the index always matches the embeddings it stores.
index = faiss.IndexFlatL2(model.config.projection_dim)

# Parallel lists: position k in both lists corresponds to id k in the index.
image_names = []
image_features = []

In [5]:
start_time = time.time()
# Extract features and build the index.
pics_dir = "/data/similarities/data/pics/"

# Start from a clean state so that re-running this cell cannot add the same
# vectors twice (which would break the id <-> image_names correspondence).
index.reset()
image_names.clear()
image_features.clear()

# sorted() makes the id assignment deterministic; os.listdir() order is arbitrary.
for img_name in sorted(os.listdir(pics_dir)):
    img_path = os.path.join(pics_dir, img_name)
    if not os.path.isfile(img_path):
        # Sub-directories and other non-file entries cannot be opened as images.
        continue
    try:
        features = extract_features(img_path, model, processor)
    except OSError as exc:
        # One unreadable or corrupt file should not abort the whole run.
        print(f"skipping {img_name}: {exc}")
        continue
    image_names.append(img_name)
    image_features.append(features)

# Add all vectors in one call; ids follow the row order, i.e. the list order.
if image_features:
    index.add(np.stack(image_features).astype(np.float32))
end_time = time.time()
execution_time = end_time - start_time

In [6]:
# Report how long feature extraction and indexing took (seconds).
print(f"execution_time: {execution_time}")

execution_time: 594.3618865013123


In [7]:
def save_faiss_index(index, file_path):
    """Write ``index`` to ``file_path`` with ``faiss.write_index``.

    The parent directory of ``file_path`` is created first when it does not
    exist yet, so a missing output folder does not make the write fail.

    Args:
        index: Faiss index to serialise.
        file_path: Destination file path.
    """
    parent_dir = os.path.dirname(file_path)
    if parent_dir:  # empty when file_path has no directory component
        os.makedirs(parent_dir, exist_ok=True)
    faiss.write_index(index, file_path)


faiss_index_file = "/data/similarities/index/faiss_index_cpu_6374.index"
save_faiss_index(index, faiss_index_file)

In [8]:
# Find duplicate images.
# Each entry is (kept_name, duplicate_name); the second name is the redundant copy.
duplicates = []
threshold = 0.8  # cosine-similarity threshold, adjust as needed

if image_features:
    feature_matrix = np.stack(image_features).astype(np.float32)

    # The stored embeddings are not normalised, so a raw dot product is not a
    # cosine similarity. Scale every row to unit length before comparing
    # against the threshold.
    norms = np.linalg.norm(feature_matrix, axis=1, keepdims=True)
    unit_features = feature_matrix / np.maximum(norms, 1e-12)

    # Two nearest entries per image: normally the image itself plus its
    # closest other image.
    _, indices = index.search(feature_matrix, 2)

    dropped = set()  # positions already recorded as the redundant copy
    for i, candidates in enumerate(indices):
        if i in dropped:
            continue
        # Take the first hit that is a valid position other than i itself.
        # With exact duplicates the self-match may be returned second, and
        # Faiss pads missing results with -1.
        neighbour = next(
            (int(c) for c in candidates if 0 <= c < len(image_names) and c != i),
            None,
        )
        if neighbour is None or neighbour in dropped:
            continue
        if float(np.dot(unit_features[i], unit_features[neighbour])) <= threshold:
            continue
        # Record each pair once and always keep the earlier image, so that
        # mutual nearest neighbours (A->B and B->A) never both end up in the
        # "duplicate" slot.
        keep, drop = (i, neighbour) if i < neighbour else (neighbour, i)
        dropped.add(drop)
        duplicates.append((image_names[keep], image_names[drop]))


In [9]:
# Record and delete the duplicate images.
pics_dir = "/data/similarities/data/pics/"
with open('/data/similarities/index/duplicates_cpu_0.8_del.txt', 'w', encoding='utf-8') as f:
    for dup1, dup2 in duplicates:
        kept_path = os.path.join(pics_dir, dup1)
        dup_path = os.path.join(pics_dir, dup2)
        # Only delete dup2 while its counterpart dup1 is still on disk;
        # otherwise a pair listed in both orders would lose both copies.
        if not os.path.exists(kept_path) or not os.path.exists(dup_path):
            continue
        # os.remove() returns None, so it must not be used as a condition:
        # delete first, then log the removed file name.
        os.remove(dup_path)  # delete one image of the duplicate pair
        f.write(f'{dup2}\n')
            