In [23]:
import os
import numpy as np
import cv2
import tqdm
from pprint import pprint
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model

In [24]:
from dotenv import load_dotenv

load_dotenv()
folder_path = os.getenv('file_path_Munch')

In [25]:
files = os.listdir(folder_path)
target_files = []
for f in files:
    if ('.jpg' in f) or ('.jpeg' in f):
        target_files.append(os.path.join(folder_path, f))

print('Количество картин в датасете:', len(target_files))

Количество картин в датасете: 223


## ResNet50

In [26]:
base_model = ResNet50(weights='imagenet')

In [27]:
model_resnet = Model(inputs=base_model.input, outputs=base_model.get_layer('avg_pool').output)

In [28]:
features_dict = {}

In [8]:
for file in files:
    if file.lower().endswith(('.png', '.jpg', '.jpeg')):
        image_path = os.path.join(folder_path, file)
        img = cv2.imread(image_path)
        img = cv2.resize(img, (224, 224))
        img - cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img_array = image.img_to_array(img)
        img_array = np.expand_dims(img, axis=0)
        img_array = preprocess_input(img_array)
        
        features = model_resnet.predict(img_array)
        features = features.flatten()
        # print(features)
        features_dict[file] = features.tolist()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 113ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 121ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m

In [29]:
# for i in features_dict:
#     pprint(i)
#     pprint(features_dict[i])
#     break

In [10]:
import json

with open('features_resnet50.json', 'w', encoding='utf-8') as json_file:
    json.dump(features_dict, json_file, ensure_ascii=False)


## SigLib

In [52]:
import torch
from transformers import AutoImageProcessor, SiglipVisionModel

processor = AutoImageProcessor.from_pretrained("google/siglip-base-patch16-224")
model = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224")

model.safetensors:  79%|#######8  | 640M/813M [00:00<?, ?B/s]

In [85]:
features_dict_siglib = {}

In [86]:
# from tqdm import tqdm

for file in files:
    if file.lower().endswith(('.png', '.jpg', '.jpeg')):
        image_path = os.path.join(folder_path, file)
        img = cv2.imread(image_path)
        img = cv2.resize(img, (224, 224))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        inputs = processor(images=img, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings = outputs.last_hidden_state
        # print(embeddings)
        embeddings_numpy = embeddings.squeeze().cpu().numpy()
        # print(len(embeddings_numpy))
        # print(embeddings_numpy.shape)
        features_dict_siglib[file] = embeddings_numpy.tolist()

In [87]:
print(len(features_dict_siglib))

223


In [66]:
for i in features_dict_siglib:
    print(i)
    print(features_dict_siglib[i])
    break

1938_mm_dd_a_Автопортрет_с_бутылками.jpg
[[[-1.325994610786438, 0.8511273860931396, 2.1383631229400635, 1.3205817937850952, -0.05886836349964142, -1.248146653175354, -1.364378571510315, 2.1084978580474854, 0.6136986017227173, 0.2785596549510956, 0.6837567090988159, -0.733473539352417, -1.2019062042236328, 2.111262798309326, -0.9637737274169922, 0.2967715859413147, 2.0350759029388428, 3.6139211654663086, 0.5501099228858948, 4.850362777709961, 1.4034675359725952, 2.249666213989258, -1.814753770828247, -1.5580817461013794, 0.7605096101760864, -2.19254469871521, 1.4547792673110962, 0.8581454753875732, 0.30296650528907776, 0.026041902601718903, -0.7662124037742615, -0.9965450763702393, -2.303663492202759, -0.5940077304840088, -0.6853287816047668, 0.6865057945251465, -2.2116758823394775, -1.1660113334655762, 1.8554821014404297, 1.3465328216552734, 2.198723316192627, 0.3287179172039032, 1.6035691499710083, -1.6118154525756836, 0.5074584484100342, -3.451517105102539, 2.119843006134033, -1.1546

In [72]:
with open('features_siglib.json', 'w', encoding='utf-8') as json_file:
    json.dump(features_dict_siglib, json_file, ensure_ascii=False)


## CLIP

In [88]:
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

In [89]:
model_name = "openai/clip-vit-base-patch32"

In [91]:
model = CLIPModel.from_pretrained(model_name)

In [93]:
processor = CLIPProcessor.from_pretrained(model_name)

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [94]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPSdpaAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e

In [95]:
features_dict_clip = {}

In [111]:
for file in files:
    if file.lower().endswith(('.png', '.jpg', '.jpeg')):
        image_path = os.path.join(folder_path, file)
        img = cv2.imread(image_path)
        img = cv2.resize(img, (224, 224))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        
        inputs = processor(images=img, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        with torch.no_grad():
            image_features = model.get_image_features(**inputs)
        
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        
        features_dict_clip[file] = image_features.cpu().squeeze().tolist() #image_features.cpu().numpy()

In [112]:
print(len(features_dict_clip))

223


In [113]:
for i in features_dict_clip:
    print(type(i))
    print(type(features_dict_clip[i]))
    break

<class 'str'>
<class 'list'>


In [None]:
import json

with open('features_clip.json', 'w', encoding='utf-8') as json_file:
    json.dump(features_dict_clip, json_file, ensure_ascii=False)

## ViT

In [3]:
from transformers import AutoImageProcessor, AutoModel
processor = AutoImageProcessor.from_pretrained('google/vit-base-patch16-224')
model = AutoModel.from_pretrained('google/vit-base-patch16-224')

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


model.safetensors:  67%|######6   | 231M/346M [00:00<?, ?B/s]

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

ViTModel(
  (embeddings): ViTEmbeddings(
    (patch_embeddings): ViTPatchEmbeddings(
      (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): ViTEncoder(
    (layer): ModuleList(
      (0-11): 12 x ViTLayer(
        (attention): ViTSdpaAttention(
          (attention): ViTSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): ViTSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): ViTIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUAct

In [18]:
features_dict_vit = {}

In [20]:
for file in files:
    if file.lower().endswith(('.png', '.jpg', '.jpeg')):
        image_path = os.path.join(folder_path, file)
        img = cv2.imread(image_path)
        img = cv2.resize(img, (224, 224))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            
        inputs = processor(images=img, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
            
        with torch.no_grad():
            outputs = model(**inputs)
            image_embedding = outputs.last_hidden_state[:, 0, :]
            
        # image_embedding = image_embedding / image_embedding.norm(dim=-1, keepdim=True)
        
        features_dict_vit[file] = image_embedding.squeeze().cpu().tolist()

In [21]:
len(features_dict_vit)

223

In [22]:
import json

with open('features_vit.json', 'w', encoding='utf-8') as json_file:
    json.dump(features_dict_vit, json_file, ensure_ascii=False)