In [1]:
# pip install --upgrade pip
# pip install torch
# pip install torchvision
# pip install pycocotools
# pip faiss-cpu


In [2]:
import torch
import torchvision
from torchvision import transforms, datasets
from IPython.display import Image

In [3]:
print(torch.__version__)
print(torchvision.__version__)
print(torchvision.datasets)

2.1.1+cpu
0.16.1+cpu
<module 'torchvision.datasets' from 'c:\\Users\\sobata\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\torchvision\\datasets\\__init__.py'>


In [4]:
base_folder = '../data/'
image_folder = base_folder + "val2014/val2014/"
ann_file = base_folder + "annotations/captions_val2014.json"

transform = transforms.Compose([transforms.ToTensor()])
val_dataset = torchvision.datasets.CocoDetection(
    root=image_folder,
    annFile=ann_file,
    transform=transform
)

loading annotations into memory...
Done (t=0.63s)
creating index...
index created!


In [5]:
_, label = val_dataset[1]
for l in label:
    print(l)
image_file_name = "COCO_val2014_" + str(label[0]["image_id"]).zfill(12) + ".jpg"
Image(image_folder + image_file_name)

{'image_id': 73, 'id': 593422, 'caption': 'A motorcycle parked in a parking space next to another motorcycle.'}
{'image_id': 73, 'id': 746071, 'caption': 'An old motorcycle parked beside other motorcycles with a brown leather seat.'}
{'image_id': 73, 'id': 746170, 'caption': 'Motorcycle parked in the parking lot of asphalt.'}
{'image_id': 73, 'id': 746914, 'caption': 'A close up view of a motorized bicycle, sitting in a rack. '}
{'image_id': 73, 'id': 748185, 'caption': 'The back tire of an old style motorcycle is resting in a metal stand. '}


<IPython.core.display.Image object>

In [6]:
import os
from dotenv import load_dotenv

load_dotenv()

VISION_ENDPOINT = os.getenv('VISION_ENDPOINT')
VISION_API_KEY = os.getenv('VISION_API_KEY')

In [7]:
import json
import requests
import faiss
import numpy as np

images = []
labels = []
vectors = []
num = 20  # 今回はデータセットのうち画像1,000枚をベクトル化
for i in range(num):
    _, label = val_dataset[i]
    labels.append(label)
    image_file_name = "COCO_val2014_" + str(label[0]["image_id"]).zfill(12) + ".jpg"
    images.append(image_folder + image_file_name)  # 画像ファイルのパス

endpoint = os.getenv("VISION_ENDPOINT") + "/computervision/retrieval:vectorizeImage?api-version=2023-02-01-preview&modelVersion=latest"
headers = {
    "Content-Type": "application/octet-stream",  # リクエストボディは画像のバイナリデータ
    "Ocp-Apim-Subscription-Key": os.getenv("VISION_API_KEY")
}

for idx, image in enumerate(images):
    with open(image, mode="rb") as f:
        image_bin = f.read()
    # Vectorize Image API を使って画像をベクトル化
    response = requests.post(endpoint, headers=headers, data=image_bin)
    # print(response.json())
    image_vec = np.array(response.json()["vector"], dtype="float32").reshape(1, -1)
    vectors.append(image_vec)

In [17]:
print(vectors)

[array([[ 1.6416016 , -3.5449219 , -1.6220703 , ...,  0.86035156,
         0.11499023,  1.0517578 ]], dtype=float32), array([[-4.2890625, -1.9609375, -2.4140625, ...,  2.6738281, -2.2148438,
        -1.4960938]], dtype=float32), array([[ 1.7246094 ,  0.73095703, -0.99853516, ...,  1.4404297 ,
         2.625     , -1.0585938 ]], dtype=float32), array([[ 3.2246094 , -0.64404297, -2.0878906 , ...,  0.07855225,
        -1.2451172 , -1.6542969 ]], dtype=float32), array([[-5.21875  ,  1.4873047, -1.75     , ..., -1.4873047, -1.4482422,
         2.1074219]], dtype=float32), array([[-0.01748657, -0.4230957 , -0.7373047 , ...,  0.79296875,
        -0.9926758 ,  2.2695312 ]], dtype=float32), array([[-1.7597656 , -3.9023438 , -2.6152344 , ...,  0.17504883,
        -2.7773438 ,  1.5       ]], dtype=float32), array([[ 3.2890625 ,  0.96240234, -1.7109375 , ..., -1.4931641 ,
         2.828125  , -0.65625   ]], dtype=float32), array([[ 0.5239258 ,  3.0332031 ,  1.4375    , ..., -0.36010742,
         1

In [8]:
import pickle

with open("images.pkl", "wb") as f:
    pickle.dump(images, f)

with open("labels.pkl", "wb") as f:
    pickle.dump(labels, f)

with open("vectors.pkl", "wb") as f:
    pickle.dump(vectors, f)

In [9]:
dimension = 1024
index_flat_l2 = faiss.IndexFlatL2(dimension)

In [10]:
for vector in vectors:
    index_flat_l2.add(vector)

In [11]:
print(index_flat_l2.ntotal)

20


In [12]:
def search_faiss_by_text(query_text, n=3):
    endpoint = os.getenv("VISION_ENDPOINT") + "/computervision/retrieval:vectorizeText?api-version=2023-02-01-preview&modelVersion=latest"
    headers = {
        "Content-Type": "application/json",
        "Ocp-Apim-Subscription-Key": os.getenv("VISION_API_KEY")
    }
    data = {
        "text": query_text
    }
    # Vectorize Text API を使ってクエリをベクトル化
    response = requests.post(endpoint, headers=headers, data=json.dumps(data))
    query_vector = np.array(response.json()["vector"], dtype="float32").reshape(1, -1)
    # Faiss 検索
    D, I = index_flat_l2.search(query_vector, n)
    return D, I

In [27]:
n = 6
D, I = search_faiss_by_text("dog", n)
print(D)
print(I)

[[8431.551 8448.016 8450.77  8451.719 8452.975 8453.904]]
[[ 2  0 16 14  1 10]]


In [28]:
Image(images[I[0][0]])


<IPython.core.display.Image object>

In [29]:
Image(images[I[0][1]])

<IPython.core.display.Image object>

In [30]:
Image(images[I[0][2]])

<IPython.core.display.Image object>