In [None]:
!pip install torch torchvision faiss-cpu Pillow numpy pandas scikit-learn


Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m58.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from t

In [None]:
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
import torchvision.transforms as transforms
import torchvision.models as models
import torch
import numpy as np
import pickle
from sklearn.decomposition import PCA
import faiss
from tqdm import tqdm

# Load ResNet-50 with ImageNet weights. The `weights=` argument replaces the
# `pretrained=True` flag that torchvision deprecated in 0.13; IMAGENET1K_V1 is
# the checkpoint `pretrained=True` used to select, so the weights are unchanged.
# NOTE(review): the network is used as-is (its final layer is not replaced), so
# extract_features() returns the output of the full forward pass rather than
# pooled convolutional features -- confirm that this is intended.
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
# Run on the GPU when one is present; get_image() places its tensors the same way.
if torch.cuda.is_available():
    model = model.to('cuda')
# Switch layers such as batch normalisation to their inference behaviour.
model.eval()

# Preprocessing applied to every image before it is given to the model:
# resize, take the central 224x224 crop, convert to a tensor, then normalise
# each channel with the mean / standard deviation below.
_CHANNEL_MEAN = [0.485, 0.456, 0.406]
_CHANNEL_STD = [0.229, 0.224, 0.225]

_preprocessing_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=_CHANNEL_MEAN, std=_CHANNEL_STD),
]
transform = transforms.Compose(_preprocessing_steps)

def get_image(url, timeout=30):
    """Download an image and convert it into a one-image batch for the model.

    Args:
        url: HTTP(S) address of the image file itself. A web page that merely
            embeds the image is not decodable and makes PIL raise.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        The output of ``transform`` for the RGB-converted image with a leading
        batch axis added, moved to CUDA when CUDA is available.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: The request failed or timed out.
        PIL.UnidentifiedImageError: The response body is not a decodable image.
    """
    response = requests.get(url, timeout=timeout)
    # Surface HTTP errors here; otherwise the error page is handed to PIL and
    # shows up only as an opaque "cannot identify image file".
    response.raise_for_status()
    image = Image.open(BytesIO(response.content)).convert('RGB')
    image_tensor = transform(image).unsqueeze(0)
    if torch.cuda.is_available():
        image_tensor = image_tensor.to('cuda')
    return image_tensor

def extract_features(image_tensor):
    """Run ``model`` on ``image_tensor`` and return its output as a flat array.

    The forward pass runs under ``torch.no_grad()`` so no gradient information
    is recorded. The result is copied to the CPU, converted to NumPy and
    flattened to one dimension.
    """
    with torch.no_grad():
        model_output = model(image_tensor)
    host_array = model_output.cpu().numpy()
    return host_array.flatten()

def save_data(filename, data):
    """Pickle ``data`` into ``filename``, replacing any existing file."""
    with open(filename, mode='wb') as sink:
        pickle.dump(data, sink)

def load_data(filename):
    """Read ``filename`` and return the object that was pickled into it.

    SECURITY: unpickling can execute arbitrary code, so only load files that
    this notebook (or another trusted source) wrote.
    """
    with open(filename, mode='rb') as source:
        restored = pickle.load(source)
    return restored

def process_images(csv_file, limit=10000):
    """Download the images listed in a CSV file and extract their features.

    Args:
        csv_file: Path of a CSV file with an ``iiifthumburl`` column (image
            URL) and an ``objectid`` column (identifier of the image).
        limit: Maximum number of leading rows to process; ``None`` processes
            every row. Defaults to 10000.

    Returns:
        A ``(features, ids)`` tuple: ``features`` is ``np.array`` of the
        per-image feature vectors and ``ids`` is the list of the matching
        ``objectid`` values, in the same order. Rows whose image could not be
        fetched or processed are reported on stdout and left out of both.

    Raises:
        KeyError: If one of the two required columns is missing.
    """
    df = pd.read_csv(csv_file)
    if limit is not None:
        df = df.iloc[:limit]
    features = []
    ids = []

    # Walk only the two columns that are needed. This fails fast when a column
    # is missing and avoids building a Series for every row as iterrows() does.
    rows = zip(df['iiifthumburl'], df['objectid'])
    for url, object_id in tqdm(rows, total=len(df), desc="Processing images"):
        try:
            image_features = extract_features(get_image(url))
        except Exception as e:
            # Best effort: one broken image must not abort the whole run.
            print(f"Failed to process image {url}: {e}")
            continue
        # Appended together so features[i] always belongs to ids[i].
        features.append(image_features)
        ids.append(object_id)

    return np.array(features), ids

def build_index(features, n_components=50,
                pca_path='pca_model.pkl', index_path='faiss_index.index'):
    """Fit a PCA on ``features`` and index the reduced vectors with FAISS.

    Args:
        features: 2-D array with one feature vector per row.
        n_components: Passed to ``PCA``; defaults to 50.
        pca_path: File the fitted PCA is pickled to.
        index_path: File the FAISS index is written to.

    Returns:
        A ``(pca, index)`` tuple with the fitted PCA and the populated
        ``faiss.IndexFlatL2``. Both are also written to disk.
    """
    pca = PCA(n_components=n_components)
    # FAISS expects C-contiguous float32 input.
    reduced_features = np.ascontiguousarray(pca.fit_transform(features), dtype='float32')

    # Take the index dimension from the reduced data instead of repeating the
    # PCA setting, so the two can never disagree.
    d = reduced_features.shape[1]
    index = faiss.IndexFlatL2(d)
    index.add(reduced_features)

    save_data(pca_path, pca)
    faiss.write_index(index, index_path)
    return pca, index

def query_image(url, pca, index, ids, k=3):
    """Return the ids of the indexed images most similar to the image at ``url``.

    Args:
        url: Location of the query image, passed to ``get_image``.
        pca: Fitted PCA used to project the query features.
        index: FAISS index holding the projected features.
        ids: Identifiers aligned with the vectors in ``index`` (position ``i``
            in the index corresponds to ``ids[i]``).
        k: Number of neighbours to request.

    Returns:
        A list of at most ``k`` entries of ``ids``, in the order returned by
        the index search. Fewer than ``k`` entries are returned when the index
        holds fewer than ``k`` vectors.
    """
    image_tensor = get_image(url)
    image_features = extract_features(image_tensor)
    image_pca = pca.transform([image_features])
    _, neighbours = index.search(image_pca.astype('float32'), k)
    # FAISS pads the result with -1 when it finds fewer than k neighbours;
    # ids[-1] would silently report the last id as a match, so drop those.
    return [ids[i] for i in neighbours[0] if i >= 0]

if __name__ == "__main__":
    import os

    PCA_PATH = 'pca_model.pkl'
    INDEX_PATH = 'faiss_index.index'
    IDS_PATH = 'ids.pkl'

    # Feature extraction downloads every image and can take hours, so reuse
    # the saved artifacts when all of them exist. Delete the files to force a
    # rebuild (for example after the CSV changed).
    if all(os.path.exists(p) for p in (PCA_PATH, INDEX_PATH, IDS_PATH)):
        pca = load_data(PCA_PATH)
        index = faiss.read_index(INDEX_PATH)
        ids = load_data(IDS_PATH)
    else:
        features, ids = process_images('/content/drive/MyDrive/merged_df.csv')
        # build_index() writes the PCA model and the FAISS index to disk itself.
        pca, index = build_index(features)
        # The index stores positions only; without the matching id list the
        # saved index cannot be interpreted in a later session.
        save_data(IDS_PATH, ids)

    # The query must point at the image file itself. The previous
    # en.wikipedia.org/wiki/...#/media/File:... address returns an HTML page,
    # which PIL rejects with UnidentifiedImageError. Special:FilePath redirects
    # to the actual file.
    # NOTE(review): Wikimedia may reject clients without a descriptive
    # User-Agent -- confirm that this download succeeds from the notebook host.
    query_url = 'https://commons.wikimedia.org/wiki/Special:FilePath/Francesco_del_Cossa_014.jpg'
    similar_ids = query_image(query_url, pca, index, ids, k=3)
    print(similar_ids)


Processing images:   2%|▏         | 189/10000 [00:26<34:55,  4.68it/s]

Failed to process image https://api.nga.gov/iiif/9572178c-0a80-4e74-88bd-4bf3547e8eeb/full/!200,200/0/default.jpg: cannot identify image file <_io.BytesIO object at 0x79f88159de40>


Processing images:   3%|▎         | 297/10000 [00:42<35:30,  4.55it/s]

Failed to process image https://api.nga.gov/iiif/abde3cf1-6fd5-4c52-8d07-5ed9b5b05a84/full/!200,200/0/default.jpg: cannot identify image file <_io.BytesIO object at 0x79f8815a04a0>


Processing images:  54%|█████▍    | 5406/10000 [56:05<55:24,  1.38it/s]  

Failed to process image https://api.nga.gov/iiif/0925ddd1-f707-4b75-bbb7-82bb04d44453/full/!200,200/0/default.jpg: cannot identify image file <_io.BytesIO object at 0x79f8815a2e80>


Processing images:  55%|█████▌    | 5501/10000 [57:31<56:40,  1.32it/s]  

Failed to process image https://api.nga.gov/iiif/f96646f2-62e4-441e-bcca-b5249745f89b/full/!200,200/0/default.jpg: cannot identify image file <_io.BytesIO object at 0x79f88159e250>


Processing images:  65%|██████▍   | 6468/10000 [1:11:12<40:38,  1.45it/s]

Failed to process image https://api.nga.gov/iiif/5d53e5d5-91bf-4324-b71f-86bad484a620/full/!200,200/0/default.jpg: cannot identify image file <_io.BytesIO object at 0x79f8815a27a0>


Processing images:  66%|██████▌   | 6563/10000 [1:12:30<34:23,  1.67it/s]

Failed to process image https://api.nga.gov/iiif/e953a62f-2c34-465d-be88-cc25268b4e1f/full/!200,200/0/default.jpg: cannot identify image file <_io.BytesIO object at 0x79f881538220>


Processing images: 100%|██████████| 10000/10000 [2:01:54<00:00,  1.37it/s]


UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x79f8b430c540>

In [None]:
def query_image(path, pca, index, ids, k=3):
    """Return the ids of the indexed images most similar to the image at ``path``.

    Args:
        path: Location of the query image, passed to ``get_image``.
        pca: Fitted PCA used to project the query features.
        index: FAISS index holding the projected features.
        ids: Identifiers aligned with the vectors in ``index`` (position ``i``
            in the index corresponds to ``ids[i]``).
        k: Number of neighbours to request.

    Returns:
        A list of at most ``k`` entries of ``ids``, in the order returned by
        the index search. Fewer than ``k`` entries are returned when the index
        holds fewer than ``k`` vectors.
    """
    image_tensor = get_image(path)
    image_features = extract_features(image_tensor)
    image_pca = pca.transform([image_features])
    _, neighbours = index.search(image_pca.astype('float32'), k)
    # FAISS pads the result with -1 when it finds fewer than k neighbours;
    # ids[-1] would silently report the last id as a match, so drop those.
    return [ids[i] for i in neighbours[0] if i >= 0]
def get_image(path, timeout=30):
    """Load an image from a local file or a URL as a one-image batch.

    This definition replaces the URL-only ``get_image`` from the earlier cell,
    so it keeps accepting URLs in addition to local paths.

    Args:
        path: Local file path (or anything ``PIL.Image.open`` accepts), or an
            ``http://`` / ``https://`` address of an image file.
        timeout: Seconds to wait for the server when ``path`` is a URL.

    Returns:
        The output of ``transform`` for the RGB-converted image with a leading
        batch axis added, moved to CUDA when CUDA is available.

    Raises:
        requests.HTTPError: ``path`` is a URL and the server answered 4xx/5xx.
        PIL.UnidentifiedImageError: The data is not a decodable image.
    """
    if str(path).startswith(('http://', 'https://')):
        response = requests.get(path, timeout=timeout)
        # Surface HTTP errors instead of handing an error page to PIL.
        response.raise_for_status()
        path = BytesIO(response.content)
    # convert() returns an independent, fully loaded image, so the source can
    # be closed as soon as the conversion is done.
    with Image.open(path) as raw_image:
        image = raw_image.convert('RGB')
    image_tensor = transform(image).unsqueeze(0)
    if torch.cuda.is_available():
        image_tensor = image_tensor.to('cuda')
    return image_tensor

def extract_features(image_tensor):
    """Return the output of ``model`` for ``image_tensor`` as a 1-D NumPy array.

    Gradient tracking is switched off for the forward pass; the output is then
    moved to the CPU, converted to NumPy and flattened.
    """
    with torch.no_grad():
        prediction = model(image_tensor)
    as_numpy = prediction.cpu().numpy()
    return as_numpy.flatten()

# Look up the three nearest neighbours of a local test image, reusing the
# pca / index / ids objects created by the previous cell.
query_path = '/content/check1.jpg'
similar_ids = query_image(query_path, pca, index, ids, k=3)
print(similar_ids)


[368, 89, 46587]
