In [1]:
import json
import os

import numpy as np
import tqdm
from PIL import Image
from pprint import pprint

import cv2
from dotenv import load_dotenv

from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import image
from transformers import AutoImageProcessor, AutoModel, CLIPModel, CLIPProcessor, SiglipVisionModel

import torch
import torchvision.models as models
import torchvision.transforms as transforms

In [2]:
load_dotenv()
folder_path = os.getenv('path_vrubel_pics')

In [3]:
files = os.listdir(folder_path)
target_files = []
for f in files:
    if ('.jpg' in f) or ('.jpeg' in f):
        target_files.append(os.path.join(folder_path, f))

print('Количество картин в датасете:', len(target_files))

Количество картин в датасете: 133


## ResNet50

In [4]:
base_model = ResNet50(weights='imagenet')

In [5]:
model_resnet = Model(inputs=base_model.input, outputs=base_model.get_layer('avg_pool').output)

In [6]:
features_dict_resnet50 = {}

In [7]:
for file in files:
    if file.lower().endswith(('.png', '.jpg', '.jpeg')):
        image_path = os.path.join(folder_path, file)
        img = cv2.imread(image_path)
        img = cv2.resize(img, (224, 224))
        img - cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img_array = image.img_to_array(img)
        img_array = np.expand_dims(img, axis=0)
        img_array = preprocess_input(img_array)
        
        features = model_resnet.predict(img_array)
        features = features.flatten()

        features_dict_resnet50[file] = features.tolist()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 741ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63

In [8]:
with open('features_resnet50_vrubel.json', 'w', encoding='utf-8') as json_file:
    json.dump(features_dict_resnet50, json_file, ensure_ascii=False)

## SigLip

In [9]:
processor = AutoImageProcessor.from_pretrained("google/siglip-base-patch16-224")
model = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [10]:
features_dict_siglip = {}

In [11]:
for file in files:
    if file.lower().endswith(('.png', '.jpg', '.jpeg')):
        image_path = os.path.join(folder_path, file)
        img = cv2.imread(image_path)
        img = cv2.resize(img, (224, 224))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        inputs = processor(images=img, return_tensors="pt")
        
        with torch.no_grad():
            outputs = model(**inputs)
            
        embeddings = outputs.last_hidden_state
        embeddings_numpy = embeddings.squeeze().cpu().numpy()
        
        features_dict_siglip[file] = embeddings_numpy.tolist()



In [12]:
with open('features_siglip_vrubel.json', 'w', encoding='utf-8') as json_file:
    json.dump(features_dict_siglip, json_file, ensure_ascii=False)

## CLIP

In [13]:
model_name = "openai/clip-vit-base-patch32"

In [14]:
model = CLIPModel.from_pretrained(model_name)

In [15]:
processor = CLIPProcessor.from_pretrained(model_name)

In [16]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPSdpaAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e

In [17]:
features_dict_clip = {}

In [18]:
for file in files:
    if file.lower().endswith(('.png', '.jpg', '.jpeg')):
        image_path = os.path.join(folder_path, file)
        img = cv2.imread(image_path)
        img = cv2.resize(img, (224, 224))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        
        inputs = processor(images=img, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        with torch.no_grad():
            image_features = model.get_image_features(**inputs)
        
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        
        features_dict_clip[file] = image_features.cpu().squeeze().tolist() #image_features.cpu().numpy()



In [19]:
with open('features_clip_goya.json', 'w', encoding='utf-8') as json_file:
    json.dump(features_dict_clip, json_file, ensure_ascii=False)

## ViT

In [20]:
processor = AutoImageProcessor.from_pretrained('google/vit-base-patch16-224')
model = AutoModel.from_pretrained('google/vit-base-patch16-224')

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.
Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

ViTModel(
  (embeddings): ViTEmbeddings(
    (patch_embeddings): ViTPatchEmbeddings(
      (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): ViTEncoder(
    (layer): ModuleList(
      (0-11): 12 x ViTLayer(
        (attention): ViTSdpaAttention(
          (attention): ViTSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): ViTSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): ViTIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUAct

In [22]:
features_dict_vit = {}

In [23]:
for file in files:
    if file.lower().endswith(('.png', '.jpg', '.jpeg')):
        image_path = os.path.join(folder_path, file)
        img = cv2.imread(image_path)
        img = cv2.resize(img, (224, 224))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            
        inputs = processor(images=img, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
            
        with torch.no_grad():
            outputs = model(**inputs)
            image_embedding = outputs.last_hidden_state[:, 0, :]
            
        # image_embedding = image_embedding / image_embedding.norm(dim=-1, keepdim=True)
        
        features_dict_vit[file] = image_embedding.squeeze().cpu().tolist()



In [24]:
with open('features_vit_goya.json', 'w', encoding='utf-8') as json_file:
    json.dump(features_dict_vit, json_file, ensure_ascii=False)

## VGG-19

In [25]:
vgg19 = models.vgg19(pretrained=True).features.eval()
selected_layers = torch.nn.Sequential(*list(vgg19.children())[:22])



In [26]:
features_dict_vgg19 = {}

In [27]:
for file in files:
    if file.lower().endswith(('.png', '.jpg', '.jpeg')):
        image_path = os.path.join(folder_path, file)
        img = cv2.imread(image_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, (224, 224))
        img = img.astype(np.float32) / 255.0
        
        mean = np.array([0.485, 0.456, 0.406])
        std  = np.array([0.229, 0.224, 0.225])
        img = (img - mean) / std
        
        img = np.transpose(img, (2, 0, 1))
        img = torch.tensor(img, dtype=torch.float32).unsqueeze(0).to(device)
        
        with torch.no_grad():
            features = selected_layers(img)
            vector = torch.flatten(features, 1)
            features_dict_vgg19[file] = vector.squeeze(0).numpy().tolist()



In [28]:
with open('features_vgg19_goya.json', 'w', encoding='utf-8') as json_file:
    json.dump(features_dict_vgg19, json_file, ensure_ascii=False)