In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch import load
import mlflow

In [2]:
from transformers import ViTModel, AutoModelForMaskedLM, AutoTokenizer, ViTImageProcessor, DistilBertModel
from pinecone import Pinecone
from dotenv import load_dotenv
import torch


load_dotenv('../.env')
pc = Pinecone()
index = pc.Index("clipmodel")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from io import BytesIO
import base64
from PIL import Image

import sys

sys.path.append('../src')

from model import CLIPChemistryModel, TextEncoderHead, ImageEncoderHead


ENCODER_BASE = DistilBertModel.from_pretrained("distilbert-base-uncased")
IMAGE_BASE = ViTModel.from_pretrained("google/vit-base-patch16-224")
text_encoder = TextEncoderHead(model=ENCODER_BASE)
image_encoder = ImageEncoderHead(model=IMAGE_BASE)

clip_model = CLIPChemistryModel(text_encoder=text_encoder, image_encoder=image_encoder)

clip_model.load_state_dict(torch.load('/Users/sebastianalejandrosarastizambonino/Documents/projects/CLIP_Pytorch/src/best_model_fashion.pth', map_location=torch.device('cpu')))

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  clip_model.load_state_dict(torch.load('/Users/sebastianalejandrosarastizambonino/Documents/projects/CLIP_Pytorch/src/best_model_fashion.pth', map_location=torch.device('cpu')))


<All keys matched successfully>

In [5]:
te_final = clip_model.text_encoder
ie_final = clip_model.image_encoder

In [19]:
def process_text_for_encoder(text, model):
    # tokenizer = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    encoded_input = tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=256)
    input_ids = encoded_input['input_ids']
    attention_mask = encoded_input['attention_mask']
    output = model(input_ids=input_ids, attention_mask=attention_mask)
    return output.detach().numpy().tolist()[0]

def process_image_for_encoder(image, model):
    image = Image.open(BytesIO(image))
    # print(type(image))
    image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
    image_tensor = image_processor(image, 
            return_tensors="pt", 
            do_resize=True
            )['pixel_values']
    output =  model(pixel_values=image_tensor)
    return output.detach().numpy().tolist()[0]

In [7]:
fashion_dataset = "hf://datasets/rajuptvs/ecommerce_products_clip/data/train-00000-of-00001-1f042f20fd269c32.parquet"
df = pd.read_parquet(fashion_dataset)

Generate the text embeddings

In [16]:
text_embeddings = []
for row in df.iterrows():
    output = process_text_for_encoder(row[1]['Clipinfo'], te_final)
    text_embeddings.append(output)

Generate the image embeddings

In [20]:
image_embeddings = []
for row in df.iterrows():
    output = process_image_for_encoder(row[1]['image']['bytes'], ie_final)
    image_embeddings.append(output)

Generate the records

In [21]:
import base64

def bytes_to_str(bytes_data):
    return base64.b64encode(bytes_data).decode('utf-8')

def str_to_bytes(str_data):
    return base64.b64decode(str_data)

In [71]:
import io

def compress_image(image_bytes, quality=5):
    img = Image.open(io.BytesIO(image_bytes))
    buffer = io.BytesIO()
    img.save(buffer, format='JPEG', quality=quality)
    return buffer.getvalue()

def push_embeddings_to_pine_cone(index, embeddings, df, mode, length, batch_size=50):
    """
    Push embeddings to Pinecone in batches to avoid message size limits
    
    Args:
        index: Pinecone index
        embeddings: array of embeddings
        df: dataframe with data
        mode: 'text' or 'image'
        length: total number of records
        batch_size: size of each batch
    """
    for start_idx in range(0, length, batch_size):
        end_idx = min(start_idx + batch_size, length)
        batch_records = []
        
        for i in range(start_idx, end_idx):
            if mode == 'text':
                batch_records.append({
                    "id": str(mode) + str(i),
                    "values": embeddings[i],
                    "metadata": {str(mode): df["Clipinfo"].iloc[i]}
                })
            elif mode == 'image':
                # Comprimir la imagen y convertir a string
                compressed_img = compress_image(df[mode].iloc[i]['bytes'], quality=5)
                batch_records.append({
                    "id": str(mode) + str(i),
                    "values": embeddings[i],
                    "metadata": {str(mode): bytes_to_str(compressed_img)}
                })
            else:
                raise ValueError("mode must be either 'text' or 'image'")
        
        # Subir el batch actual
        index.upsert(
            vectors=batch_records,
            namespace="space-" + mode + "-fashion"
        )
        
        print(f"Processed batch {start_idx//batch_size + 1} of {(length + batch_size - 1)//batch_size}")

In [36]:
push_embeddings_to_pine_cone(
    index=index, 
    embeddings=text_embeddings, 
    df=df, 
    mode='text', 
    length=len(text_embeddings))

In [72]:
push_embeddings_to_pine_cone(
    index=index, 
    embeddings=image_embeddings, 
    df=df, 
    mode='image', 
    batch_size=25,
    length=len(image_embeddings))

Processed batch 1 of 77
Processed batch 2 of 77
Processed batch 3 of 77
Processed batch 4 of 77
Processed batch 5 of 77
Processed batch 6 of 77
Processed batch 7 of 77
Processed batch 8 of 77
Processed batch 9 of 77
Processed batch 10 of 77
Processed batch 11 of 77
Processed batch 12 of 77
Processed batch 13 of 77
Processed batch 14 of 77
Processed batch 15 of 77
Processed batch 16 of 77
Processed batch 17 of 77
Processed batch 18 of 77
Processed batch 19 of 77
Processed batch 20 of 77
Processed batch 21 of 77
Processed batch 22 of 77
Processed batch 23 of 77
Processed batch 24 of 77
Processed batch 25 of 77
Processed batch 26 of 77
Processed batch 27 of 77
Processed batch 28 of 77
Processed batch 29 of 77
Processed batch 30 of 77
Processed batch 31 of 77
Processed batch 32 of 77
Processed batch 33 of 77
Processed batch 34 of 77
Processed batch 35 of 77
Processed batch 36 of 77
Processed batch 37 of 77
Processed batch 38 of 77
Processed batch 39 of 77
Processed batch 40 of 77
Processed