# Using MLC-LLM for Text Embedding in Python

In [2]:
from transformers import AutoModel, AutoTokenizer

# Hugging Face reference pipeline: load the tokenizer and the encoder, then
# tokenize the example queries and documents into padded PyTorch batches.
tokenizer = AutoTokenizer.from_pretrained("Snowflake/snowflake-arctic-embed-m")
model = AutoModel.from_pretrained(
    "Snowflake/snowflake-arctic-embed-m",
    add_pooling_layer=False,
)
model.eval()  # inference mode (disables dropout)

# Queries get an instruction prefix; documents are tokenized as-is.
query_prefix = "Represent this sentence for searching relevant passages: "
queries = ["what is snowflake?", "Where can I get the best tacos?"]
queries_with_prefix = [query_prefix + query_text for query_text in queries]
documents = ["The Data Cloud!", "Mexico City of Course!"]

# Both batches share the same tokenizer options.
tokenize_options = dict(padding=True, truncation=True, return_tensors="pt", max_length=512)
query_tokens = tokenizer(queries_with_prefix, **tokenize_options)
document_tokens = tokenizer(documents, **tokenize_options)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from mlc_llm.embeddings.embeddings import MLCEmbeddings

# Build the MLC-LLM embedding runtime from the compiled model directory and
# the compiled Metal library, then tokenize the prefixed queries with it.
# NOTE(review): both paths are absolute paths on the author's machine; they
# must be adjusted before this cell can run anywhere else.
mlc_embeddings = MLCEmbeddings(
    "/Users/cfruan/Documents/mlc-llm-repos/mlc-llm-head/dist/snowflake-arctic-embed-m-q0f32-MLC",
    "/Users/cfruan/Documents/mlc-llm-repos/mlc-llm-head/dist/libs/snowflake-arctic-embed-m-q0f32-metal.so",
    device="metal:0",
    # debug_dir="/Users/cfruan/Documents/mlc-llm-repos/mlc-llm-head/debug",
)

# The [CLS]/[SEP] markers are written into the text explicitly for the MLC path.
mlc_queries = [f"[CLS] {prefixed_query} [SEP]" for prefixed_query in queries_with_prefix]
mlc_documents = [f"[CLS] {document_text} [SEP]" for document_text in documents]

# NOTE(review): `_tokenize_queries` is a private helper of MLCEmbeddings; it is
# called here only so the next cell can compare its output with the HF tokenizer.
mlc_tokens = mlc_embeddings._tokenize_queries(mlc_queries)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[2024-04-30 18:47:09] INFO auto_device.py:79: [92mFound[0m device: metal:0
[2024-04-30 18:47:09] INFO chat_module.py:379: Using model folder: /Users/cfruan/Documents/mlc-llm-repos/mlc-llm-head/dist/snowflake-arctic-embed-m-q0f32-MLC
[2024-04-30 18:47:09] INFO chat_module.py:380: Using mlc chat config: /Users/cfruan/Documents/mlc-llm-repos/mlc-llm-head/dist/snowflake-arctic-embed-m-q0f32-MLC/mlc-chat-config.json


In [4]:
import numpy as np

# Sanity check: every row of the first element returned by the MLC tokenizer
# must equal the corresponding row of Hugging Face `input_ids`.
mlc_input_ids = mlc_tokens[0]
for row, mlc_row in enumerate(mlc_input_ids):
    hf_row = query_tokens["input_ids"][row].numpy()
    np.testing.assert_array_equal(mlc_row, hf_row)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [5]:
# Run the Hugging Face encoder on the query batch and display the per-token
# hidden states. Gradients are still tracked here, hence the `grad_fn` in the
# output and the `.detach()` needed by later comparisons.
query_embeddings = model(**query_tokens)
# Element 0 of the model output is its `last_hidden_state` field.
query_embeddings.last_hidden_state

tensor([[[ 0.3311,  0.8075,  0.1499,  ...,  0.6162, -0.0541,  0.2450],
         [ 0.3530,  1.0479,  0.0503,  ...,  0.9943, -0.1670,  0.3151],
         [ 0.2639,  1.1741, -0.1657,  ...,  1.0260,  0.0698,  0.0333],
         ...,
         [ 0.1866,  0.8087,  0.0610,  ...,  0.6060, -0.0143,  0.5403],
         [ 0.2869,  0.8392,  0.1528,  ...,  0.6220, -0.0822,  0.4867],
         [ 0.1527,  0.8309,  0.1612,  ...,  0.5274, -0.0802,  0.5454]],

        [[-0.1421, -0.0361,  0.6161,  ...,  0.2524,  0.0108,  0.5810],
         [ 0.1238,  0.0119,  0.4495,  ...,  0.2463, -0.1507,  0.9454],
         [-0.0113, -0.0466,  0.2424,  ...,  0.5182, -0.1246,  1.1071],
         ...,
         [-0.1490,  0.3295,  0.2802,  ...,  0.2021, -0.1088,  1.0623],
         [-0.2999, -0.1606,  0.3321,  ...,  0.7638, -0.0594,  1.0833],
         [ 0.0363,  0.0367,  0.9930,  ...,  0.2296, -0.0787,  0.8901]]],
       grad_fn=<NativeLayerNormBackward0>)

In [6]:
# Compare MLC per-token embeddings against the Hugging Face reference.
mlc_query_embeds = mlc_embeddings.embed(mlc_queries).numpy()
print(mlc_query_embeds)

hf_query_embeds = query_embeddings[0].detach().numpy()

# The previous check passed `decimal=0.01` to `assert_array_almost_equal`.
# `decimal` is meant to be a whole number of decimal places; 0.01 yields a
# tolerance of roughly 1.5, so the assertion could not fail on these values.
#
# Only positions holding real tokens are compared: the printed outputs show
# that rows at the padded tail of the shorter query differ between the two
# backends, and those rows are not used by the similarity cells (which read
# position 0 only).
real_token_mask = query_tokens["attention_mask"].numpy().astype(bool)
np.testing.assert_allclose(
    mlc_query_embeds[real_token_mask],
    hf_query_embeds[real_token_mask],
    rtol=0,
    atol=1e-3,  # NOTE(review): float32 cross-backend tolerance; tighten if it holds
)

[[[ 3.3113351e-01  8.0746931e-01  1.4985467e-01 ...  6.1624938e-01
   -5.4088268e-02  2.4495500e-01]
  [ 3.5296029e-01  1.0478674e+00  5.0326731e-02 ...  9.9431241e-01
   -1.6697663e-01  3.1511512e-01]
  [ 2.6390970e-01  1.1740766e+00 -1.6567884e-01 ...  1.0260067e+00
    6.9798030e-02  3.3341456e-02]
  ...
  [ 2.4550232e-01  8.0163407e-01  1.1892214e-03 ...  5.2851880e-01
    2.9782993e-01  3.5837966e-01]
  [ 2.5918993e-01  8.0476511e-01 -4.7530915e-04 ...  5.3083724e-01
    2.9855818e-01  3.5750693e-01]
  [ 2.5926834e-01  8.0297321e-01 -1.8946523e-03 ...  5.3179914e-01
    3.0155331e-01  3.5566795e-01]]

 [[-1.4205964e-01 -3.6145214e-02  6.1605424e-01 ...  2.5238845e-01
    1.0826047e-02  5.8102679e-01]
  [ 1.2380671e-01  1.1915212e-02  4.4948661e-01 ...  2.4630266e-01
   -1.5065147e-01  9.4537169e-01]
  [-1.1339599e-02 -4.6636645e-02  2.4238448e-01 ...  5.1824296e-01
   -1.2462155e-01  1.1071267e+00]
  ...
  [-1.4902829e-01  3.2945088e-01  2.8020367e-01 ...  2.0212011e-01
   -1.0883

In [7]:
# Rank documents for each query using MLC embeddings.
# Keep only position 0 of every sequence (the slot of the leading "[CLS]"
# marker written into `mlc_queries` / `mlc_documents`).
mlc_query_embeds = mlc_embeddings.embed(mlc_queries).numpy()[:, 0]
mlc_document_embeds = mlc_embeddings.embed(mlc_documents).numpy()[:, 0]

# L2-normalise each row so the dot product below is a cosine similarity.
query_norms = np.linalg.norm(mlc_query_embeds, axis=1, keepdims=True)
document_norms = np.linalg.norm(mlc_document_embeds, axis=1, keepdims=True)
mlc_query_embeds = mlc_query_embeds / query_norms
mlc_document_embeds = mlc_document_embeds / document_norms

# scores[q, d] is the similarity between query q and document d.
scores = mlc_query_embeds.dot(mlc_document_embeds.T)

for query, query_scores in zip(queries, scores):
    ranked = sorted(zip(documents, query_scores), key=lambda pair: pair[1], reverse=True)
    print(f"Query: {query}")
    for doc, score in ranked:
        print(f"{score}: {doc}")

Query: what is snowflake?
0.2747487425804138: The Data Cloud!
0.19997990131378174: Mexico City of Course!
Query: Where can I get the best tacos?
0.29974812269210815: Mexico City of Course!
0.2344069629907608: The Data Cloud!


In [8]:
import torch

# Same ranking as the MLC cell, computed with the Hugging Face reference model.
with torch.no_grad():
    query_hidden = model(**query_tokens)[0]
    document_hidden = model(**document_tokens)[0]

# Take position 0 of each sequence and L2-normalise along the feature axis.
# `doument_embeddings` (sic) keeps its original spelling so any other cell
# that reads this name keeps working.
query_embeddings = torch.nn.functional.normalize(query_hidden[:, 0], p=2, dim=1)
doument_embeddings = torch.nn.functional.normalize(document_hidden[:, 0], p=2, dim=1)

# scores[q, d] is the similarity between query q and document d.
scores = torch.mm(query_embeddings, doument_embeddings.t())

for query, query_scores in zip(queries, scores):
    ranked = sorted(zip(documents, query_scores), key=lambda pair: pair[1], reverse=True)
    # Output passages & scores, best match first
    print("Query:", query)
    for document, score in ranked:
        print(score, document)

Query: what is snowflake?
tensor(0.2747) The Data Cloud!
tensor(0.2000) Mexico City of Course!
Query: Where can I get the best tacos?
tensor(0.2997) Mexico City of Course!
tensor(0.2344) The Data Cloud!


In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.embeddings import Embeddings
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import GPT4AllEmbeddings

from typing import List

In [10]:
class ArcticEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter around the notebook-level ``mlc_embeddings``.

    Each text is wrapped as ``"[CLS] <text> [SEP]"`` (the same format used for
    ``mlc_queries`` / ``mlc_documents``, which was checked against the Hugging
    Face tokenizer earlier in the notebook), embedded with MLC, reduced to the
    vector at position 0, and L2-normalised.

    NOTE(review): ``embed_query`` does not prepend the ``query_prefix`` that the
    notebook applies to its example queries; confirm whether retrieval queries
    should carry it.
    """

    def _embed(self, texts: List[str]) -> np.ndarray:
        """Return one L2-normalised row per input text."""
        wrapped_texts = ["[CLS] " + text + " [SEP]" for text in texts]
        # assumes embed() returns (batch, seq, dim), as the outputs printed
        # earlier in this notebook suggest; [:, 0] keeps the first position.
        cls_vectors = mlc_embeddings.embed(wrapped_texts).numpy()[:, 0]
        # Normalise by the norms of *these* vectors. The previous code divided
        # by the norms of the global `mlc_query_embeds`, which left results
        # unnormalised and failed to broadcast for most batch sizes.
        return cls_vectors / np.linalg.norm(cls_vectors, axis=1, keepdims=True)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a batch of documents.

        Args:
            texts: Documents to embed.

        Returns:
            One unit-length embedding (list of floats) per document, in input
            order; an empty list when ``texts`` is empty.
        """
        if not texts:
            return []
        return self._embed(texts).tolist()

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string.

        Args:
            text: Query to embed.

        Returns:
            The unit-length embedding of ``text`` as a list of floats.
        """
        return self._embed([text])[0].tolist()

In [13]:
# Build a persistent Chroma collection from three blog posts and expose it as
# a retriever.
# NOTE(review): the collection is embedded with GPT4AllEmbeddings, not with the
# ArcticEmbeddings class defined above.
# NOTE(review): the persist directory is an absolute path on the author's
# machine; re-running this cell presumably adds the same chunks to the
# persisted collection again - confirm before re-executing.
chroma_client = Chroma(
    collection_name="mlc_rag",
    embedding_function=GPT4AllEmbeddings(),
    persist_directory="/Users/cfruan/Documents/mlc-llm-repos/mlc-llm-head/rag",
)

urls = [
    "https://lilianweng.github.io/posts/2023-06-23-agent/",
    "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
    "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
]

# One loader call per URL; each call returns a list of documents.
docs = []
for url in urls:
    docs.append(WebBaseLoader(url).load())

# Flatten the per-URL lists into a single list of documents.
docs_list = []
for loaded_docs in docs:
    docs_list.extend(loaded_docs)

# Split into chunks of 250 tokens (tiktoken-based length) with no overlap.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=250, chunk_overlap=0
)
doc_splits = text_splitter.split_documents(docs_list)

# Insert the chunks in batches of `chunk_size`. The upper bound printed for
# the last batch may exceed the number of chunks; slicing clamps it.
chunk_size = 20
for start in range(0, len(doc_splits), chunk_size):
    stop = start + chunk_size
    print("Adding documents {} to {}".format(start, stop))
    chroma_client.add_documents(doc_splits[start:stop])

retriever = chroma_client.as_retriever()

[2024-04-30 18:47:58] INFO posthog.py:20: Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
[2024-04-30 18:47:58] INFO web_base.py:105: fake_useragent not found, using default user agent.To get a realistic header for requests, `pip install fake_useragent`.
[2024-04-30 18:47:58] INFO web_base.py:105: fake_useragent not found, using default user agent.To get a realistic header for requests, `pip install fake_useragent`.
[2024-04-30 18:47:58] INFO web_base.py:105: fake_useragent not found, using default user agent.To get a realistic header for requests, `pip install fake_useragent`.


Adding documents 0 to 20
Adding documents 20 to 40
Adding documents 40 to 60
Adding documents 60 to 80
Adding documents 80 to 100
Adding documents 100 to 120
Adding documents 120 to 140
Adding documents 140 to 160
Adding documents 160 to 180
Adding documents 180 to 200


In [14]:
# Query the retriever and display the text of the second returned document.
question = "agent memory"
docs = retriever.invoke(question)
second_hit = docs[1]  # index 1: the second result, not the top-ranked one
doc_txt = second_hit.page_content
doc_txt

'They also discussed the risks, especially with illicit drugs and bioweapons. They developed a test set containing a list of known chemical weapon agents and asked the agent to synthesize them. 4 out of 11 requests (36%) were accepted to obtain a synthesis solution and the agent attempted to consult documentation to execute the procedure. 7 out of 11 were rejected and among these 7 rejected cases, 5 happened after a Web search while 2 were rejected based on prompt only.\nGenerative Agents Simulation#\nGenerative Agents (Park, et al. 2023) is super fun experiment where 25 virtual characters, each controlled by a LLM-powered agent, are living and interacting in a sandbox environment, inspired by The Sims. Generative agents create believable simulacra of human behavior for interactive applications.\nThe design of generative agents combines LLM with memory, planning and reflection mechanisms to enable agents to behave conditioned on past experience, as well as to interact with other agents