In [6]:
from transformers import AutoTokenizer
# Fetch the tokenizer that belongs to the CLIP checkpoint and keep a local
# copy in the "db" directory so later cells can load it from disk.
clip_checkpoint = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
tokenizer_dir = "db"

vision_tokenizer = AutoTokenizer.from_pretrained(clip_checkpoint)
# Last expression of the cell: the written file paths are shown as output.
vision_tokenizer.save_pretrained(tokenizer_dir)

('db\\tokenizer_config.json',
 'db\\special_tokens_map.json',
 'db\\vocab.json',
 'db\\merges.txt',
 'db\\added_tokens.json',
 'db\\tokenizer.json')

In [7]:
# Reload the tokenizer from the local "db" directory (the location the
# previous cell saved it to) to confirm the saved copy can be loaded.
tokenizer = AutoTokenizer.from_pretrained("db")

In [12]:
from transformers import AutoTokenizer, AutoModel
import torch
import chromadb
from chromadb.api.types import Documents, EmbeddingFunction

class LLaVAEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function that embeds text with a CLIP text encoder.

    Despite the class name, the model loaded here is the
    "laion/CLIP-ViT-B-32-laion2B-s34B-b79K" CLIP checkpoint, not LLaVA.
    The tokenizer is read from the local "db" directory, so that directory
    must already contain a saved tokenizer before this class is instantiated.
    """

    # NOTE(review): newer chromadb releases appear to validate that
    # ``__call__`` takes a parameter named ``input`` rather than ``texts``.
    # The name is left unchanged here to keep the interface stable; confirm
    # against the installed chromadb version.

    def __init__(self):
        """Load the tokenizer and model and move the model to GPU if available."""
        self.tokenizer = AutoTokenizer.from_pretrained("db")  # Your saved tokenizer
        self.model = AutoModel.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        # Inference only: make evaluation mode explicit.
        self.model.eval()

    def __call__(self, texts: Documents) -> list:
        """Embed a batch of documents with the CLIP text tower.

        Args:
            texts: The documents (strings) to embed.

        Returns:
            One embedding per document, as a list of lists of floats.
            An empty input yields an empty list.
        """
        # Nothing to tokenize; avoid handing an empty batch to the tokenizer.
        if not texts:
            return []

        # Tokenize texts
        encoded_input = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            return_tensors='pt'
        ).to(self.device)

        # Generate embeddings
        with torch.no_grad():
            # The checkpoint is a dual-tower CLIP model: its forward pass also
            # expects image inputs and has no top-level ``last_hidden_state``.
            # ``get_text_features`` runs only the text tower and returns the
            # projected sentence embeddings (CLIP pools at the end-of-text
            # token, not at position 0).
            text_features = self.model.get_text_features(**encoded_input)

        # NOTE(review): some transformers releases may return a model-output
        # object instead of a plain tensor here; fall back to its pooled
        # output in that case. A tensor has no such attribute and is used
        # as-is. TODO confirm against the installed transformers version.
        text_features = getattr(text_features, "pooler_output", text_features)

        return text_features.cpu().numpy().tolist()

In [None]:

# Persistent Chroma client created with its default settings.
client = chromadb.PersistentClient()

# Build the custom embedder once, then open the collection (creating it on
# first use) so that documents added to it are embedded by this function.
embedder = LLaVAEmbeddingFunction()
collection = client.get_or_create_collection(
    name="text_collection",
    embedding_function=embedder,
)

# Insert a single placeholder document under a fixed id.
sample_ids = ["unique_id"]
sample_documents = ["your text here"]
collection.add(documents=sample_documents, ids=sample_ids)