# 目的
画像からembeddingを作成する

# Setting

In [60]:
# Directory that holds the input data; images are read from its "image" subfolder.
DATA_PATH = "../../sample_data"
# Directory the embedding CSV is written to.
OUTPUT_PATH = "../../sample_output"
# NOTE(review): this model identifier is not referenced in the visible cells —
# confirm whether it is still needed here.
MODEL_PATH = "BAAI/bge-small-en-v1.5" # alternative: BAAI/bge-large-en-v1.5

# Import

In [36]:
import glob

import polars as pl

import torch
import torch.nn as nn
from torchvision import models, transforms
from PIL import Image
from sklearn.decomposition import PCA
import numpy as np

# Data Load

In [56]:
# Build a ResNet50 feature extractor that yields 2048-dimensional embeddings.
# ``pretrained=True`` is deprecated since torchvision 0.13; IMAGENET1K_V1 is
# the checkpoint that ``pretrained=True`` used to load, so requesting it
# explicitly keeps the embeddings unchanged.
base_model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
# Replace the classification head with a pass-through so the forward pass
# returns the pooled features instead of class logits.
base_model.fc = nn.Identity()
# Evaluation mode: batch-norm layers use their stored running statistics.
base_model.eval()

# Preprocessing: resize to the 224x224 network input size, convert to a tensor,
# and normalize each channel with the ImageNet mean / standard deviation.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

def get_2048dim_embedding(image_path):
    """Return the ResNet50 embedding of a single image file.

    Args:
        image_path: Path of an image file that PIL can open.

    Returns:
        A NumPy array of shape ``(2048,)`` holding the output of
        ``base_model`` for the preprocessed image.
    """
    # ``Image.open`` is lazy and keeps the file open; the context manager
    # releases the handle deterministically. ``convert`` returns a loaded
    # copy that stays valid after the source image is closed.
    with Image.open(image_path) as img:
        rgb_img = img.convert("RGB")
    img_tensor = transform(rgb_img).unsqueeze(0)  # add the batch dimension
    with torch.no_grad():  # inference only, no autograd bookkeeping
        embedding_2048 = base_model(img_tensor)
    return embedding_2048.squeeze(0).cpu().numpy()  # (2048,)

# Collect the input images (e.g. sample_1.png, sample_2.png, ...).
# ``glob`` returns paths in arbitrary order, so sort them: the embedding rows
# carry no file name, and a stable order is the only way to map a row back to
# its image reproducibly.
image_paths = sorted(glob.glob(f"{DATA_PATH}/image/*.png"))
if len(image_paths) < 2:
    # With 0 or 1 images the requested number of PCA components would be
    # below 1; fail early with an actionable message.
    raise ValueError(
        f"Need at least 2 PNG images under {DATA_PATH}/image to fit PCA, "
        f"found {len(image_paths)}"
    )

# One 2048-dimensional vector per image, stacked row-wise.
embeddings_2048 = np.stack(
    [get_2048dim_embedding(p) for p in image_paths], axis=0
)  # (N, 2048)

# Fit PCA and compress 2048 -> 128 dimensions. With fewer than 129 images the
# component count is capped at N - 1, because mean-centered data from N
# samples spans at most N - 1 directions.
num = min(128, len(image_paths) - 1)
pca = PCA(n_components=num)
embeddings_128 = pca.fit_transform(embeddings_2048)  # (N, num)



In [59]:
# Echo the configured output directory to check it before writing.
# NOTE(review): the recorded output of this cell ('../../output') differs from
# the value assigned in the Setting cell ('../../sample_output'); the output
# looks stale — re-run the notebook top to bottom to confirm.
OUTPUT_PATH

'../../output'

In [62]:
# Persist the PCA-reduced embeddings as CSV: one row per image and one
# ``vec_<i>`` column per retained principal component.
import os

# Take the column count from the array itself so this cell does not depend on
# a separately tracked component count staying in sync.
n_components = embeddings_128.shape[1]
emb_128_df = pl.DataFrame(
    embeddings_128,
    schema=[f"vec_{i}" for i in range(n_components)],
    orient="row",  # rows are images; do not rely on polars inferring this
)
# Create the output directory if it is missing so the write cannot fail on a
# fresh checkout.
os.makedirs(OUTPUT_PATH, exist_ok=True)
emb_128_df.write_csv(f"{OUTPUT_PATH}/image_embeddings_128.csv")