In [65]:
# imports
import torch
import clip
import open_clip
from PIL import Image

import pandas as pd
import numpy as np

import os

from tqdm import tqdm

In [66]:
# see what pretrained models are available
print(clip.available_models())

['RN50', 'RN101', 'RN50x4', 'RN50x16', 'RN50x64', 'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px']


In [67]:
# see what pretrained models are available
available_models = open_clip.list_pretrained()
for i in range(len(available_models)):
    print(available_models[i:i+5])

[('RN50', 'openai'), ('RN50', 'yfcc15m'), ('RN50', 'cc12m'), ('RN50-quickgelu', 'openai'), ('RN50-quickgelu', 'yfcc15m')]
[('RN50', 'yfcc15m'), ('RN50', 'cc12m'), ('RN50-quickgelu', 'openai'), ('RN50-quickgelu', 'yfcc15m'), ('RN50-quickgelu', 'cc12m')]
[('RN50', 'cc12m'), ('RN50-quickgelu', 'openai'), ('RN50-quickgelu', 'yfcc15m'), ('RN50-quickgelu', 'cc12m'), ('RN101', 'openai')]
[('RN50-quickgelu', 'openai'), ('RN50-quickgelu', 'yfcc15m'), ('RN50-quickgelu', 'cc12m'), ('RN101', 'openai'), ('RN101', 'yfcc15m')]
[('RN50-quickgelu', 'yfcc15m'), ('RN50-quickgelu', 'cc12m'), ('RN101', 'openai'), ('RN101', 'yfcc15m'), ('RN101-quickgelu', 'openai')]
[('RN50-quickgelu', 'cc12m'), ('RN101', 'openai'), ('RN101', 'yfcc15m'), ('RN101-quickgelu', 'openai'), ('RN101-quickgelu', 'yfcc15m')]
[('RN101', 'openai'), ('RN101', 'yfcc15m'), ('RN101-quickgelu', 'openai'), ('RN101-quickgelu', 'yfcc15m'), ('RN50x4', 'openai')]
[('RN101', 'yfcc15m'), ('RN101-quickgelu', 'openai'), ('RN101-quickgelu', 'yfcc15m

In [68]:
model_names = [pair[0] for pair in available_models]
model_names = sorted(list(set(model_names)))
model_names

['EVA01-g-14',
 'EVA01-g-14-plus',
 'EVA02-B-16',
 'EVA02-E-14',
 'EVA02-E-14-plus',
 'EVA02-L-14',
 'EVA02-L-14-336',
 'RN101',
 'RN101-quickgelu',
 'RN50',
 'RN50-quickgelu',
 'RN50x16',
 'RN50x4',
 'RN50x64',
 'ViT-B-16',
 'ViT-B-16-SigLIP',
 'ViT-B-16-SigLIP-256',
 'ViT-B-16-SigLIP-384',
 'ViT-B-16-SigLIP-512',
 'ViT-B-16-SigLIP-i18n-256',
 'ViT-B-16-plus-240',
 'ViT-B-16-quickgelu',
 'ViT-B-32',
 'ViT-B-32-256',
 'ViT-B-32-quickgelu',
 'ViT-H-14',
 'ViT-H-14-CLIPA',
 'ViT-H-14-CLIPA-336',
 'ViT-H-14-quickgelu',
 'ViT-L-14',
 'ViT-L-14-336',
 'ViT-L-14-CLIPA',
 'ViT-L-14-CLIPA-336',
 'ViT-L-14-quickgelu',
 'ViT-L-16-SigLIP-256',
 'ViT-L-16-SigLIP-384',
 'ViT-SO400M-14-SigLIP',
 'ViT-SO400M-14-SigLIP-384',
 'ViT-bigG-14',
 'ViT-bigG-14-CLIPA',
 'ViT-bigG-14-CLIPA-336',
 'ViT-g-14',
 'coca_ViT-B-32',
 'coca_ViT-L-14',
 'convnext_base',
 'convnext_base_w',
 'convnext_base_w_320',
 'convnext_large_d',
 'convnext_large_d_320',
 'convnext_xxlarge',
 'nllb-clip-base',
 'nllb-clip-large',


In [69]:
model = [pair for pair in available_models if pair[0] == 'ViT-L-14-336']
print(model)

[('ViT-L-14-336', 'openai')]


In [70]:
# importing model, preprocess, tokenizer, combined_df
model, _, preprocess = open_clip.create_model_and_transforms('ViT-L-14-336', pretrained='openai')
tokenizer = open_clip.get_tokenizer('ViT-L-14-336')

combined_df = pd.read_csv('combined_df.csv')

In [72]:
# Generate embeddings for training data

row = combined_df.iloc[0]

embeddings = []
labels = []

image = preprocess(Image.open(row['img_path'])).unsqueeze(0)
text = tokenizer(row['Product_Text'])

with torch.no_grad(), torch.cuda.amp.autocast():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    combined_features = torch.cat((image_features, text_features), dim=1)

print(combined_features)
print(combined_features.shape, image_features.shape, text_features.shape)


tensor([[ 0.0334,  0.9108,  0.0570,  ...,  0.1423, -0.1029, -0.4040]])
torch.Size([1, 1536]) torch.Size([1, 768]) torch.Size([1, 768])


In [76]:
# img 랑 text 각각 tensor로 바꾼 후 저장
def process_img_text(df, tokenizer, model):
    # Container for all concatenated outputs
    all_tensors = []
    total_length = len(df)
    
    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        # Print progress every 100 rows
        if (total_length - index) % 100 == 0:
            print(f'Processing row {index}...')
        # Process text
        try:
            text = row['Product_Text']
            if isinstance(text, str) and text.strip():
                text_input = tokenizer(row['Product_Text'])
                with torch.no_grad():
                    text_features = model.encode_text(text_input)

        except:
            text_features = torch.zeros(1, 768)
            
        try:
            # Process image
            image_path = row['img_path']
            image = preprocess(Image.open(image_path)).unsqueeze(0)
            image_features = model.encode_image(image)
        except Exception as e:
            print(f'Error processing image at row {index}: {e}')
            image_features = torch.zeros(1, 768)

        # Concatenate and append
        combined_features = torch.cat((image_features, text_features), dim=1)
        all_tensors.append(combined_features)

    return torch.cat(all_tensors, dim=0)



In [77]:
all_combined_features = process_img_text(combined_df, tokenizer, model)
torch.save(all_combined_features, 'CLIP_img_text.pt')

  0%|          | 9/11687 [00:05<2:08:20,  1.52it/s]


KeyboardInterrupt: 