In [20]:
import torch
import clip
from PIL import Image
import pandas as pd
import faiss
import numpy as np

In [23]:
# Load the pre-trained CLIP model and its preprocessing pipeline
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

In [None]:
# process each of the images and then extract the embeddings from it
def process_images(image_paths):
    embeddings = []
    for path in image_paths:
        image = Image.open(path)

        # preprocesses the image, adds another
        image_input = preprocess(image).unsqueeze(0).to(device)

        # we don't calcuate gradients when doing inference
        with torch.no_grad():
            embedding = model.encode_image(image_input)
        embeddings.append(embedding)

    img_embedding = torch.mean(torch.stack(embeddings), dim=0)
    return img_embedding.cpu().numpy()

In [25]:
# load the feature descriptions from data.csv
def load_descriptions():
    df = pd.read_csv('data.csv')
    descriptions = df['description'].tolist()
    return descriptions

In [None]:
def embed_descriptions(descriptions):
    text_embeddings = []
    for description in descriptions:
        # tokenizes text and converts it into a tensor suitable for the model
        text_input = clip.tokenize(description).to(device)
        with torch.no_grad():
            # generate the text embedding
            text_embedding = model.encode_text(text_input)

        # converts the tensor to a numpy array for ease of use
        text_embeddings.append(text_embedding.cpu().numpy())

    # convert text embeddings to a numpy array
    text_embeddings = np.vstack(text_embeddings)
    return text_embeddings 

In [None]:
# finds the nearest neighbor of the image embedding in the text embeddings
def find_and_search_nn(img_embedding, text_embeddings):

    # index uses L2 (euclidean distance) with the dimension of the text embeddings
    index = faiss.IndexFlatL2(text_embeddings.shape[1])
    index.add(text_embeddings)

    # Search for the nearest neighbor
    # D is 2D Numpy Array of the distance of the nearest neighbors
    # I is 2D Numpy Array containing indices of the K nearest neighbors for the N query image embeddings
    D, I = index.search(img_embedding, 1)
    return I

In [None]:
# returns the description of the closest image(s)
def get_description(I, descriptions):
    closest_descriptions = []
    for i in range(len(I[0])):
        closest_descriptions.append(descriptions[I[0][i]])

    return closest_descriptions

In [None]:
def main():
    #image_paths = ["images/button_head_screw_isometric.jpg"]
    image_paths = ["images/mounting_bracket_isometric.jpg"]
    img_embedding = process_images(image_paths)
    descriptions = load_descriptions()
    text_embeddings = embed_descriptions(descriptions)
    I = find_and_search_nn(img_embedding, text_embeddings)
    closest_descriptions = get_description(I, descriptions)
    print('The closest description(s) are:')
    for descriptions in closest_descriptions:
        print(descriptions)

if __name__ == '__main__':
    main()

FileNotFoundError: [Errno 2] No such file or directory: 'i'