# Granite Sparse Demo

### Ingest and Retriece using PyMilvus

`pip install pymilvus[model]`

In [6]:
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [7]:
from pymilvus import model
from pymilvus import MilvusClient, DataType

client = MilvusClient("./milvus_demo.db")

client.drop_collection(collection_name="my_sparse_collection")

schema = client.create_schema(
    auto_id=True,
    enable_dynamic_fields=True,
)

schema.add_field(field_name="pk", datatype=DataType.VARCHAR, is_primary=True, max_length=100)
schema.add_field(field_name="id", datatype=DataType.VARCHAR, is_primary=False, max_length=100)
schema.add_field(field_name="embeddings", datatype=DataType.SPARSE_FLOAT_VECTOR)

index_params = client.prepare_index_params()

index_params.add_index(field_name="embeddings",
                               index_name="sparse_inverted_index",
                               index_type="SPARSE_INVERTED_INDEX",
                               metric_type="IP",
                               params={"drop_ratio_build": 0.2})
client.create_collection(
    collection_name="my_sparse_collection",
    schema=schema,
    index_params=index_params
)

embeddings_model = model.sparse.SpladeEmbeddingFunction(
    model_name="ibm-granite/granite-embedding-30m-sparse", 
    device="cpu",
    batch_size=2,
    k_tokens_query=50,
    k_tokens_document=192
)

# Prepare documents to be ingested
docs = [
    "Artificial intelligence was founded as an academic discipline in 1956.",
    "Alan Turing was the first person to conduct substantial research in AI.",
    "Born in Maida Vale, London, Turing was raised in southern England.",
]

# SpladeEmbeddingFunction.encode_documents returns sparse matrix or sparse array depending
# on the milvus-model version. reshape(1,-1) ensures the format is correct for ingestion.
doc_vector = [{"embeddings": doc_emb.reshape(1,-1), "id": f"item_{i}"} for i, doc_emb in enumerate(embeddings_model.encode_documents(docs))]


client.insert(
    collection_name="my_sparse_collection",
    data=doc_vector
)

# Prepare search parameters
search_params = {
    "params": {"drop_ratio_search": 0.2},  # Additional optional search parameters
}

# Prepare the query vector

queries = [
      "When was artificial intelligence founded", 
      "Where was Turing born?"
]
query_vector = embeddings_model.encode_documents(queries)

res = client.search(
    collection_name="my_sparse_collection",
    data=query_vector,
    limit=1, #top k documents to return
    output_fields=["id"],
    search_params=search_params,
)

for r in res:
    print(r)


[{'id': '456894915525738502', 'distance': 12.364130020141602, 'entity': {'id': 'item_0'}}]
[{'id': '456894915525738504', 'distance': 17.1358699798584, 'entity': {'id': 'item_2'}}]


### Get Embeddings using HF Transformers

In [10]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch

In [None]:
class SparseSentenceTransformer:
    def __init__(self, model_name_or_path, device:str= 'cpu'):
        self.model = AutoModelForMaskedLM.from_pretrained(model_name_or_path)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        self.device = device
        self.model.to(device)
        if device == "cuda":
            self.model = self.model.cuda()
            self.model = self.model.bfloat16()

    @torch.no_grad()
    def encode(self, sentences, max_tokens=20):        
        if type(sentences) == str:
            sentences = [sentences]
        
        input_dict = self.tokenizer(sentences, max_length=512, padding=True, return_tensors='pt', truncation=True)
        attention_mask = input_dict['attention_mask']  # (bs,seqlen)

        if self.device == "cuda":
            input_dict['input_ids'] = input_dict['input_ids'].cuda()
            input_dict['attention_mask'] = input_dict['attention_mask'].cuda()
            if 'token_type_ids' in input_dict:
                input_dict['token_type_ids'] = input_dict['token_type_ids'].cuda()
        
        hidden_state = self.model(**input_dict)[0]

        maxarg = torch.log(1.0 + torch.relu(hidden_state))

        input_mask_expanded = attention_mask.unsqueeze(-1).to(maxarg.device) # bs * seqlen * voc
        maxdim1 = torch.max(maxarg * input_mask_expanded, dim=1).values  # bs * voc
        
        # get topk high weights
        topk, indices = torch.topk(maxdim1, k=max_tokens) # (weight - (bs * max_terms), index - (bs * max_terms))
        print (topk.shape)

        expansions = [[(self.tokenizer.decode(int(indices[sidx][tidx])), float(topk[sidx][tidx])) for tidx in range(topk.shape[1])] for sidx in range(topk.shape[0]) ]  

        return expansions

In [73]:
sparse_model = SparseSentenceTransformer("ibm-granite/granite-embedding-30m-sparse")

In [None]:
#change max_tokens to produce more or less expansions for the sentences
sparse_model.encode(["Artificial intelligence was founded as an academic discipline in 1956."], max_tokens=20)

torch.Size([1, 20])


[[(' AI', 1.667151689529419),
  (' intelligence', 1.4905368089675903),
  (' artificial', 1.250130534172058),
  (' discipline', 1.2192906141281128),
  (' founded', 1.0603735446929932),
  (' 1956', 1.035099983215332),
  (' invention', 0.9785783290863037),
  ('56', 0.7224238514900208),
  (' learning', 0.6999132037162781),
  (' scientific', 0.6892694234848022),
  (' computer', 0.6566571593284607),
  (' academic', 0.6217383146286011),
  (' university', 0.5886250138282776),
  (' robot', 0.5613625049591064),
  (' establishment', 0.550841748714447),
  (' philosophy', 0.5431854128837585),
  ('A', 0.5025951862335205),
  (' brain', 0.476378858089447),
  (' machine', 0.4488101005554199),
  ('1960', 0.44649428129196167)]]