In [1]:
from datasets import load_dataset


DATASET = "squad"  # Name of dataset from HuggingFace Datasets
INSERT_RATIO = 0.9999  # Ratio of example dataset to be inserted

data = load_dataset(DATASET, split="validation")
# Generates a fixed subset. To generate a random subset, remove the seed.
data = data.train_test_split(test_size=INSERT_RATIO, seed=42)["test"]
# Clean up the data structure in the dataset.
data = data.map(
    lambda val: {"answer": val["answers"]["text"][0]},
    remove_columns=["id", "answers", "context"],
)

# View summary of example data
print(data)

Dataset({
    features: ['title', 'question', 'answer'],
    num_rows: 10569
})


In [27]:
import pickle
from docuverse.utils import open_stream
cache_file="/home/raduf/.local/share/elastic_ingestion/benchmark__beir_dev__quora____en__corpus.small.jsonl_512_100_True_all_gte-small.pickle.xz"
import json
data=pickle.load(open_stream(cache_file))

In [28]:
data[0]

{'id': '117-0-139',
 'title': '',
 'text': "I was suddenly logged off Gmail. I can't remember my Gmail password and just realized the recovery email is no longer alive. What can I do?",
 'tlen': 35}

from transformers import AutoTokenizer, AutoModel
import torch

# MODEL = (
#     "sentence-transformers/all-MiniLM-L6-v2"  # Name of model from HuggingFace Models
# )
MODEL = ("../models/slate.30m.english.rtrvr-20240719T181101")
INFERENCE_BATCH_SIZE = 64  # Batch size of model inference

# Load tokenizer & model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModel.from_pretrained(MODEL)

def encode_text(batch, element):
    # Tokenize sentences
    encoded_input = tokenizer(
        batch[element], padding=True, truncation=True, return_tensors="pt"
    )

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling
    token_embeddings = model_output[0]
    attention_mask = encoded_input["attention_mask"]
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )
    sentence_embeddings = torch.sum(
        token_embeddings * input_mask_expanded, 1
    ) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    # Normalize embeddings
    batch["qembedding"] = torch.nn.functional.normalize(
        sentence_embeddings, p=2, dim=1
    )
    return batch


# data = data.map(encode_text, batched=True, batch_size=INFERENCE_BATCH_SIZE)

In [29]:
# data_list = data.to_list()
MODEL = ("../models/slate.30m.english.rtrvr-20240719T181101")
data_list = []
from docuverse.utils.embeddings.dense_embedding_function import DenseEmbeddingFunction
model = DenseEmbeddingFunction(MODEL)

=== done initializing model


In [63]:
embeddings = model.encode([d['text'] for d in data], show_progress_bar=True)
keys_to_keep = {"text"}
data_list = [{**{k:v for k, v in d.items() if k in keys_to_keep}, '_id': d['id'], 'qembedding':embeddings[i]} for i,d in enumerate(data)]

Batches:   0%|          | 0/301 [00:00<?, ?it/s]

In [64]:
data_list[0]

{'text': "I was suddenly logged off Gmail. I can't remember my Gmail password and just realized the recovery email is no longer alive. What can I do?",
 '_id': '117-0-139',
 'qembedding': [0.01239310298115015,
  0.08252162486314774,
  -0.007941470481455326,
  -0.00688091991469264,
  -0.04727655649185181,
  -0.02066742815077305,
  -0.012648255564272404,
  -0.025829849764704704,
  0.017632635310292244,
  -0.02430420182645321,
  -0.015020041726529598,
  -0.008922950364649296,
  0.05736778676509857,
  -0.024723688140511513,
  -0.018712686374783516,
  0.01332109048962593,
  -0.08252348750829697,
  0.06152497977018356,
  -0.07216667383909225,
  0.06205274164676666,
  -0.002090111607685685,
  -0.09694285690784454,
  -0.018821170553565025,
  0.01744072698056698,
  0.02464458718895912,
  -0.010289010591804981,
  -0.023978393524885178,
  -0.025935254991054535,
  -0.033752065151929855,
  -0.16178223490715027,
  -0.05182208865880966,
  0.010323116555809975,
  -0.024663103744387627,
  0.06469075381

In [65]:
from pymilvus import MilvusClient
from pymilvus import FieldSchema, CollectionSchema, DataType, Collection, utility
from pymilvus import Index

MILVUS_URI = "http://localhost:19530"  # Connection URI
COLLECTION_NAME = "huggingface_test2"  # Collection name
DIMENSION = 384  # Embedding dimension depending on model

milvus_client = MilvusClient(MILVUS_URI)
if milvus_client.has_collection(collection_name=COLLECTION_NAME):
    print(f"Dropping collection{COLLECTION_NAME}")
    milvus_client.drop_collection(collection_name=COLLECTION_NAME)
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, description="ID", auto_id=True),
    FieldSchema(name="_id", dtype=DataType.VARCHAR, max_length=1000),    
    # FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
    FieldSchema(name="qembedding", dtype=DataType.FLOAT_VECTOR, dim=DIMENSION),
    # FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=1000),
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=1000),
    # FieldSchema(name="tlen", dtype=DataType.INT64)
]
schema = CollectionSchema(fields, description="crap")

milvus_client.create_collection(
    collection_name=COLLECTION_NAME,
    dimension=DIMENSION,
    schema=schema,
    auto_id=True,  # Enable auto id
    # enable_dynamic_field=True,  # Enable dynamic fields
    vector_field_name="qembedding",  # Map vector field name and embedding column in dataset
    #consistency_level="Strong",  # To enable search with latest data
)

Dropping collectionhuggingface_test2


In [66]:
index_params = milvus_client.prepare_index_params("qembedding")
print(f"Index params before: {index_params}")
_index_params = {
    "metric_type": "IP",
    "index_type": "FLAT",
    "params": {
        "M": 16,
        "efConstruction": 16,
        "nlist": 1024
    }
}
index_params.add_index(field_name="qembedding", **_index_params)
print(f"Index params after: {index_params}")

milvus_client.create_index(collection_name=COLLECTION_NAME, index_params=index_params)


Index params before: [{'field_name': 'qembedding', 'index_name': ''}]
Index params after: [{'field_name': 'qembedding', 'index_type': 'FLAT', 'index_name': '', 'metric_type': 'IP', 'params': {'M': 16, 'efConstruction': 16, 'nlist': 1024}}]


In [67]:
questions = [
    data_list[0]['text'],
    "How can I get free gems in Clash of Clans?",
    "How can I get free gems Clash of Clans?",
    "How do you feel when someone upvotes your answer on Quora?",
    "What are the best thriller movie in Hollywood?",
    "What should someone do to overcome anxiety?"
]
milvus_client.load_collection(collection_name=COLLECTION_NAME)
qembs = model.encode(questions, show_progress_bar=True)
search_params = {
    "metric_type": "IP",
    "params": {
        "efSearch": 10,
        "ef": 10,
    }
}
print(f"Read {len(qembs)} embeddings.")
search_results = milvus_client.search(
    collection_name=COLLECTION_NAME,
    search_params=search_params,
    data=qembs,
    limit=100,  # How many search results to output
    output_fields=["text", "qembedding", 'id'],  # Include these fields in search results
)
for q, res in zip(questions, search_results):
    print(f"Question: {q}")
    if len(res) == 0:
        print("  ** No results found. **")
    else:
        for r in res:
            print({'answer': r['entity']['text'], 'score': r['distance']})
        print("\n")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Read 6 embeddings.
Question: I was suddenly logged off Gmail. I can't remember my Gmail password and just realized the recovery email is no longer alive. What can I do?
  ** No results found. **
Question: How can I get free gems in Clash of Clans?
  ** No results found. **
Question: How can I get free gems Clash of Clans?
  ** No results found. **
Question: How do you feel when someone upvotes your answer on Quora?
  ** No results found. **
Question: What are the best thriller movie in Hollywood?
  ** No results found. **
Question: What should someone do to overcome anxiety?
  ** No results found. **


In [55]:
import random
rnd = [random.random() for _ in range(384)]
milvus_client.search(collection_name=COLLECTION_NAME, data=[rnd], limit=3, search_params=search_params)

data: ['[]'] 

_index_params = {
    "metric_type": "L2",
    "index_type": "HNSW",
    "params": {
        "M": 16,
        "efConstruction": 200
    }
}
milvus_client.insert(collection_name=COLLECTION_NAME, data=data_list)
# milvus_client.create_index(collection_name=COLLECTION_NAME, index_params=index_params)

questions = {
    "question": ["How can I get free gems in Clash of Clans?",
                 "How do you feel when someone upvotes your answer on Quora?",
                 "What are the best thriller movie in Hollywood?",
                 "What should someone do to overcome anxiety?"
    ]
}

# Generate question embeddings
qembeddings = [v.tolist() for v in encode_text(questions)["qembedding"]]

# Search across Milvus
search_results = milvus_client.search(
    collection_name=COLLECTION_NAME,
    data=qembeddings,
    limit=3,  # How many search results to output
    output_fields=["text", "qembedding"],  # Include these fields in search results
)

# Print out results
for q, res in zip(questions["question"], search_results):
    print("Question:", q)
    for r in res:
        print(
            {
                "answer": r["entity"]["text"],
                "score": r["distance"],
                #"original question": r["entity"]["question"],
            }
        )
    print("\n")
    

In [101]:
import numpy as np
search_results[0][0]['entity']['id']

4207

In [46]:
def get_vector(text):
    return np.array(model.encode([text])[0])

In [58]:
import numpy as np
v1=get_vector(questions[0])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [103]:
v2=get_vector("What were the causes of the Cold War? Outline the process of Cold War divisions.")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [116]:
# np.inner(b,v1)
np.linalg.norm(b-v3, ord=2)

0.9778001881132582

In [61]:
v3=get_vector("How can I get free gems Clash of Clans?")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [62]:
np.inner(v1,v3)

0.9866615843616061

In [106]:
b-v2

array([-2.08473839e-02,  3.14693376e-02,  8.73036012e-02, -2.19272450e-02,
       -2.49180133e-02,  3.84930773e-02, -1.80336763e-02,  4.56284331e-02,
        2.37510139e-02,  5.22989240e-02,  6.43738220e-03,  4.93863598e-02,
        1.18108396e-02,  3.76401795e-02, -4.23989976e-02, -3.48769426e-02,
       -9.49832052e-03,  4.58914973e-03, -3.87049131e-02, -5.01294993e-02,
        3.85297933e-02,  1.08092640e-01, -2.56039761e-03,  4.25339229e-02,
       -1.73569135e-02, -4.95975977e-03, -1.25451721e-02, -8.25382231e-02,
        3.09320949e-02, -6.28986955e-03, -4.87477072e-02,  1.39967036e-02,
       -1.89549654e-02, -1.00677531e-01,  1.92073852e-01,  5.15142977e-02,
        1.78274238e-02, -4.71235178e-02, -1.06536224e-03,  2.12056749e-02,
       -8.50700215e-03,  8.99869055e-02,  5.84303886e-02,  6.79340046e-02,
        5.60854040e-02, -2.51674784e-02,  2.21352875e-02,  1.11801649e-01,
       -8.45035538e-03,  1.82531839e-02,  3.12677808e-02, -7.24360719e-03,
        9.27256793e-03,  

In [107]:
data_list[4207]

{'id': 4207,
 'title': '',
 'text': 'What were the causes of the Cold War? Outline the process of Cold War divisions.',
 'tlen': 19,
 'question_embedding': [-0.06335372477769852,
  0.06741213798522949,
  0.03997817263007164,
  0.025978438556194305,
  -0.031151721253991127,
  0.013791236095130444,
  -0.024804795160889626,
  0.013206233270466328,
  0.034603457897901535,
  0.027347618713974953,
  0.0021884581074118614,
  0.03830800950527191,
  0.018169675022363663,
  0.02940387651324272,
  0.0057934923097491264,
  0.04977790266275406,
  -0.08704179525375366,
  0.02684231661260128,
  -0.05285944044589996,
  0.027862507849931717,
  0.031160207465291023,
  0.03377993777394295,
  -0.023404331877827644,
  -0.03693718835711479,
  -0.06874148547649384,
  -0.009978036396205425,
  -0.0441642664372921,
  0.0033890248741954565,
  -0.04585142806172371,
  -0.29012593626976013,
  -0.008036337792873383,
  -0.005080545321106911,
  -0.001093275030143559,
  -0.1359795480966568,
  0.07473485916852951,
  -0.

In [108]:
c=np.array(data_list[4207]['qembedding'])
np.inner(b, c)

1.000000044775322

In [58]:
from pymilvus import (
    MilvusClient,
    DataType
)


def test_search(vector_for_db, vector_for_query, metric="IP"):
    truncate_dim = 4
    collection_name = "test"
    vector_field_name = "embeddings"

    client = MilvusClient("test.db")
    schema = client.create_schema(auto_id=True, enable_dynamic_field=True, primary_field="id")
    
    schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
    schema.add_field(field_name="source", datatype=DataType.VARCHAR, max_length=50000)
    schema.add_field(field_name=vector_field_name, datatype=DataType.FLOAT_VECTOR, dim=truncate_dim)
    
    index_params = client.prepare_index_params()
    index_params.add_index(
        field_name=vector_field_name,
        index_type="FLAT",
        metric_type=metric,
        params={"nlist": 1024}
    )
    
    if client.has_collection(collection_name=collection_name):
        client.drop_collection(collection_name=collection_name)
    client.create_collection(
        collection_name=collection_name, schema=schema, index_params=index_params
    )
    entities = [
        {"source": "hello", vector_field_name: vector_for_db}
    ]
    insert_result = client.insert(collection_name=collection_name, data=entities)
    
    return client.search(
        collection_name=collection_name,
        data=[vector_for_query],
        search_params={"metric_type": metric, "params": {"nprobe": 10}},
        anns_field=vector_field_name,
        limit=10,
        output_fields=["source"],
    )


In [59]:
print(test_search([1, 2, 3, 4], [1e10, 1e10, 1e10, 1e10]))

data: ["[{'id': 452889328847683586, 'distance': 100000006144.0, 'entity': {'source': 'hello'}}]"] 
