In [1]:
from faiss_search import Embedder
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F
import pandas as pd
import numpy as np


  if client is None and model_name is 'text-embedding-3-small':
  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Build an Embedder around the "openbmb/MiniCPM-Embedding" checkpoint.
# NOTE(review): use_api=False / to_cuda=True presumably mean "load the model
# locally and move it to the GPU" — confirm against faiss_search.Embedder.
# NOTE(review): attn_implementation="flash_attention_2" presumably needs the
# flash-attn package and a supported GPU — confirm before running elsewhere.
CPM_embedder = Embedder(model_name="openbmb/MiniCPM-Embedding",
                    use_api=False, 
                    attn_implementation="flash_attention_2",
                    to_cuda=True)

# Smoke test: embed one short string. The recorded output below is a float32
# NumPy array.
CPM_embedder.get_embedding("Hello, world!")

Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.05s/it]


array([ 0.01193028, -0.02105716, -0.0044258 , ..., -0.01286218,
       -0.02372779,  0.01793973], dtype=float32)

In [15]:
# Build a second Embedder, this time around a chat LLM checkpoint
# ("shenzhi-wang/Llama3.1-8B-Chinese-Chat") rather than a dedicated embedding model.
# The recorded progress bar shows 4 checkpoint shards taking about a minute to load.
# NOTE(review): how Embedder derives a vector from a causal LM (pooling strategy)
# is not visible here — check faiss_search.Embedder.
llama_embedder = Embedder(model_name="shenzhi-wang/Llama3.1-8B-Chinese-Chat",
                    use_api=False, 
                    to_cuda=True)

Loading checkpoint shards: 100%|██████████| 4/4 [01:00<00:00, 15.20s/it]


In [16]:
# Smoke test with a Chinese input string; the recorded output is a float32 array.
llama_embedder.get_embedding("你好，世界！")

array([-0.00159896, -0.0107032 ,  0.00395211, ...,  0.00627876,
       -0.005287  ,  0.0006096 ], dtype=float32)

## Jina embedding API (Web)
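* **Requires a Jina API token** — store it in the `JINAAI_API_KEY` environment variable (e.g. via a `.env` file); never paste the token into the notebook itself.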

In [2]:
import dotenv
import requests
import os

# Load variables from a local .env file into os.environ.
# Only the `dotenv` module is imported above, so the call must be qualified;
# a bare `load_dotenv()` raises NameError on a fresh kernel.
dotenv.load_dotenv()

url = 'https://api.jina.ai/v1/embeddings'

# Read the API token from the environment instead of hard-coding it.
# SECURITY: an earlier revision of this cell contained a literal bearer token.
# Treat that token as leaked and revoke/rotate it.
jina_api_key = os.environ.get("JINAAI_API_KEY")
if not jina_api_key:
    raise ValueError(
        "JINAAI_API_KEY is not set; export it or add it to a .env file before running this cell"
    )

headers = {
    'Content-Type': 'application/json',
    'Authorization': f'Bearer {jina_api_key}'
}
# Request payload for the Jina embeddings endpoint.
# "input" holds five sample marketing texts (English, German, Spanish, Chinese,
# Japanese); the cell below only displays the embedding of the first one.
data = {
    "model": "jina-embeddings-v3",
    "task": "text-matching",
    "dimensions": 1024,
    "late_chunking": False,
    "embedding_type": "float",
    "input": [
        "Organic skincare for sensitive skin with aloe vera and chamomile: Imagine the soothing embrace of nature with our organic skincare range, crafted specifically for sensitive skin. Infused with the calming properties of aloe vera and chamomile, each product provides gentle nourishment and protection. Say goodbye to irritation and hello to a glowing, healthy complexion.",
        "Bio-Hautpflege für empfindliche Haut mit Aloe Vera und Kamille: Erleben Sie die wohltuende Wirkung unserer Bio-Hautpflege, speziell für empfindliche Haut entwickelt. Mit den beruhigenden Eigenschaften von Aloe Vera und Kamille pflegen und schützen unsere Produkte Ihre Haut auf natürliche Weise. Verabschieden Sie sich von Hautirritationen und genießen Sie einen strahlenden Teint.",
        "Cuidado de la piel orgánico para piel sensible con aloe vera y manzanilla: Descubre el poder de la naturaleza con nuestra línea de cuidado de la piel orgánico, diseñada especialmente para pieles sensibles. Enriquecidos con aloe vera y manzanilla, estos productos ofrecen una hidratación y protección suave. Despídete de las irritaciones y saluda a una piel radiante y saludable.",
        "针对敏感肌专门设计的天然有机护肤产品：体验由芦荟和洋甘菊提取物带来的自然呵护。我们的护肤产品特别为敏感肌设计，温和滋润，保护您的肌肤不受刺激。让您的肌肤告别不适，迎来健康光彩。",
        "新しいメイクのトレンドは鮮やかな色と革新的な技術に焦点を当てています: 今シーズンのメイクアップトレンドは、大胆な色彩と革新的な技術に注目しています。ネオンアイライナーからホログラフィックハイライターまで、クリエイティビティを解き放ち、毎回ユニークなルックを演出しましょう。"
    ]
}

# Send the embedding request. An explicit timeout keeps the cell from hanging
# forever if the service is unreachable (requests has no default timeout).
response = requests.post(url, headers=headers, json=data, timeout=30)

# Surface HTTP errors (bad token, quota, malformed payload) directly instead
# of failing later with an opaque KeyError on the 'data' field.
response.raise_for_status()

# Display the embedding of the first input text as a float32 vector.
np.array(response.json()['data'][0]['embedding'], dtype="f")

array([ 0.00868749,  0.10442207,  0.04934141, ...,  0.00463035,
       -0.01745086, -0.01694834], dtype=float32)

In [4]:
# write the client class required by Embedder class
class client:
    """Thin Jina API client exposing an ``embeddings`` attribute.

    The token is taken from ``api_key`` when given, otherwise from the
    ``JINAAI_API_KEY`` environment variable. A ``ValueError`` is raised when
    neither source provides one.
    """

    def __init__(self, api_key=None):
        # Explicit argument wins; fall back to the environment only for None.
        resolved_key = api_key if api_key is not None else os.environ.get("JINAAI_API_KEY")
        if resolved_key is None:
            raise ValueError(
                "The api_key client option must be set either by passing api_key to the client or by setting the environment variable"
            )

        self.api_key = resolved_key
        self.model_name = "jina-embeddings-v3"
        self.url = 'https://api.jina.ai/v1/embeddings'

        # Headers shared with the embeddings helper (same dict object).
        self.headers = {
            'Content-Type': 'application/json',
            'Authorization': f'Bearer {resolved_key}',
        }

        self.embeddings = embeddings(self.headers, self.model_name, self.url)

    
# Mirrors the structure of the OpenAI client (`client.embeddings.create(...)`).
class embeddings:
    """Embedding endpoint wrapper exposing an OpenAI-style ``create`` method.

    Args:
        headers: HTTP headers sent with every request (including authorization).
        model_name: Model used when ``create`` is called without ``model``.
        url: Endpoint that embedding requests are POSTed to.
        timeout: Seconds to wait for the HTTP call before giving up.
    """

    def __init__(self, headers, model_name, url, timeout=30):
        self.headers = headers
        self.model_name = model_name
        self.url = url
        self.timeout = timeout

    def create(self, input, model = None):
        """Request embeddings for ``input`` and return the decoded JSON body.

        Args:
            input: One item to embed, or a list/tuple of items. A list or
                tuple is sent as-is (previously it was wrapped again and
                sent as a nested list); anything else is wrapped in a
                one-element list.
            model: Optional model name overriding the instance default.

        Returns:
            The parsed JSON response of the endpoint.

        Raises:
            requests.HTTPError: If the endpoint answers with an error status.
        """
        texts = list(input) if isinstance(input, (list, tuple)) else [input]
        data = {
            "model": self.model_name if model is None else model,
            "task": "text-matching",
            "dimensions": 1024,
            "late_chunking": False,
            "embedding_type": "float",
            "input": texts
        }
        # `requests` has no default timeout; pass one so a stalled connection
        # cannot block the caller indefinitely.
        response = requests.post(self.url, headers=self.headers, json=data, timeout=self.timeout)
        # Raise on 4xx/5xx so callers see the real failure rather than a
        # KeyError when they index into an error payload.
        response.raise_for_status()
        return response.json()


In [5]:
# Load variables from a .env file into the process environment so that
# `client()` can pick up JINAAI_API_KEY via os.environ (recorded output: True).
dotenv.load_dotenv()

True

In [6]:
# No api_key is passed, so `client` falls back to the JINAAI_API_KEY
# environment variable and raises ValueError if it is unset.
JINA_client = client()
# Hand the custom client to Embedder in place of a locally loaded model.
# NOTE(review): presumably Embedder calls client.embeddings.create(...) — confirm in faiss_search.
JINA_embedder = Embedder(model_name="jina-embeddings-v3", client=JINA_client)
# Smoke test; the recorded output is a float32 array.
JINA_embedder.get_embedding("Hello, world!")

array([ 0.0981229 , -0.06144965,  0.10672536, ...,  0.01664312,
       -0.02229654,  0.00822767], dtype=float32)

## Jina embedding model (local)

In [None]:
# The Jina `requests` decorator is imported under an alias on purpose: a bare
# `from jina import requests` rebinds the notebook-global name `requests`,
# which the Jina API cells above (`requests.post(...)` inside
# `embeddings.create`) expect to be the HTTP library. After such a rebind
# those cells break when re-run in the same kernel.
from jina import Deployment, Executor, requests as jina_requests
from docarray import DocList, BaseDoc

class TestDoc(BaseDoc):
    """Document schema exchanged with the executor: a single text field."""
    text: str = None

class MyExecutor(Executor):
    """Demo executor that prints each received document and echoes the batch."""

    @jina_requests(on='/bar')
    def foo(self, docs: DocList[TestDoc], **kwargs) -> DocList[TestDoc]:
        """Handle requests on the '/bar' endpoint.

        Prints the ``text`` of every document and returns ``docs`` unchanged.
        """
        print("Received docs in executor:")
        for doc in docs:
            print(f"text: {doc.text}")  # Print the specific text attribute of TestDoc
        return docs
    

dep = Deployment(port=12345, name='myexec1', uses=MyExecutor)

# NOTE(review): dep.block() presumably keeps this cell (and the kernel) busy
# until the deployment is interrupted, so the client cell has to be run from a
# different kernel/process — confirm.
with dep:
    dep.block()


In [None]:
from jina import Client
from docarray import DocList, BaseDoc


# Same document schema as the one declared in the server cell.
# NOTE(review): it is redeclared here presumably because this cell is meant to
# run in a separate kernel/process while the server cell is serving — confirm.
class TestDoc(BaseDoc):
    text: str = None


# Connect to the deployment started on localhost:12345 and send one document
# to the '/bar' endpoint; `on_done=print` prints the response when it arrives.
c = Client(host='localhost', port=12345)
c.post(
    on='/bar', 
    inputs=DocList([TestDoc(text="I'm a bear!")]), 
    return_type=DocList[TestDoc],
    on_done=print
)