In [None]:
## ENSURE THAT THE ETCD, MINIO, AND MILVUS SERVICES IN docker-compose.yml
### ARE UNCOMMENTED AND RUNNING IF YOU WANT TO USE THE CONTAINERIZED INSTANCES

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Milvus
from langchain.document_loaders import TextLoader
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA

from textwrap import dedent

In [2]:
# MILVUS_HOST = "localhost" # When running notebook outside of compoose jupyter container
MILVUS_HOST = "milvus" # When running notebook inside of compose jupyter container
MILVUS_PORT = 19530
# MILVUS_HTTPS_URL= f"https://{MILVUS_HOST}:{MILVUS_PORT}"
MILVUS_USER = "root"
MILVUS_PASSWORD = "milvus"

In [3]:
embeddings_model_name = "sentence-transformers/all-MiniLM-L6-v2"

In [4]:
embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)

In [5]:

loader = TextLoader("../data/state_of_the_union.txt")
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=75)
docs = text_splitter.split_documents(documents)

In [6]:
docsearch = Milvus.from_documents(
    docs,
    embeddings,
    connection_args={"host":MILVUS_HOST, "port":MILVUS_PORT}
)

In [7]:
query = "How much does the president want to cut the cancer death rate?"
docs = docsearch.similarity_search(query, k=10)

In [8]:
docs[0].page_content

'Cancer is the #2 cause of death in America–second only to heart disease. \n\nLast month, I announced our plan to supercharge  \nthe Cancer Moonshot that President Obama asked me to lead six years ago. \n\nOur goal is to cut the cancer death rate by at least 50% over the next 25 years, turn more cancers from death sentences into treatable diseases.  \n\nMore support for patients and families. \n\nTo get there, I call on Congress to fund ARPA-H, the Advanced Research Projects Agency for Health.'

In [9]:
## IMPORTANT NOTE If you want to use the Mistral 7B Model
## As of 10/7/2023, need to run the pip install below, as Mistral is not included in main transformers library yet
# !pip install git+https://github.com/huggingface/transformers.git

## RESTART THE KERNEL AFTER INSTALLING

In [10]:
# model_name = "PY007/TinyLlama-1.1B-Chat-v0.3"
# model_name = "TheBloke/CollectiveCognition-v1.1-Mistral-7B-GPTQ"
model_name = "TheBloke/Llama-2-7B-chat-GPTQ"
model = AutoModelForCausalLM.from_pretrained(model_name, 
                                             device_map="auto",
                                             )
# model=model.to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          device_map="auto"
                                          )
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
llm = HuggingFacePipeline(pipeline=pipe)

## Using the Model without any additional context

In [11]:
query = "How much does the president want to cut the cancer death rate?"

In [12]:
inputs = tokenizer.encode(query, return_tensors="pt").to("cuda")

In [13]:
output_tensor = model.generate(inputs, min_new_tokens=25, max_new_tokens=100, repetition_penalty=1.3)



In [14]:
# These are the token ID's for the generated response
output_tensor[0]

tensor([    1,  1128,  1568,   947,   278,  6673,   864,   304,  5700,   278,
        23900,  4892,  6554, 29973,    13,    13,  1576,  7178, 29915, 29879,
         1815,  2265,  4546,   787,  8711, 14511,  1230,   263,  9893,   304,
        10032,   278,  1353,   310, 23900,  4892, 29879,   491, 29871, 29945,
        29900, 29995,   975,   278,  2446,   316,  6332, 29889,   910,   626,
         2966,  2738,  7306,   674,  1996,  7282, 13258,  1860,   297,  5925,
        29892,   848, 19383, 29892,   322,   716,  7539,  1860, 29892,   408,
         1532,   408,  3620,   297,  9045, 18020, 24833,   322, 23274, 29889,
          450,  8037,  5619,   756,  7972,   395, 29896, 24464,   297,  5220,
          292,   363,   278,  1824, 29892,   607,   338,  3806,   304,   367,
        19228,   491,  2024,  1016,   943,   322,   916,  8974,   310, 29199,
        29889,     2], device='cuda:0')

In [15]:
# We decode the tokens like this (instead of decoding the entire output_tensor)
## so that we don't include the propmt itself in the output
generated_text = tokenizer.batch_decode(output_tensor[:, inputs.shape[1]:])[0]

In [16]:
print(generated_text)



The President's Cancer Moonshot initiative aims to reduce the number of cancer deaths by 50% over the next decade. This ambitious goal will require significant investments in research, data sharing, and new treatments, as well as changes in healthcare policies and practices. The White House has proposed $1 billion in funding for the program, which is expected to be matched by private donors and other sources of funds.</s>


## Using the model with retrieval

In [17]:
def run_retrieval_qa(query, k=4, min_new_tokens=1, max_new_tokens=20, return_sources=True, repetition_penalty=1.0, remove_tokens=("<s>","</s>")):
    docs = docsearch.similarity_search(query, k=k)

    context = "\n".join([doc.page_content for doc in docs])

    template = PromptTemplate(template=dedent(f""""
    <SYS>You are a question-and-answer assistant that only uses information from the provided context when responding.</SYS>
    
    <s>[INST] Use the context to answer the question. Be detailed and specific in your answers, but do not include anything besides the answer to the question in your response.
    
    Context: {context}

    Question: {{query}}[/INST]

    Answer: 
    """), input_variables=["query"])

    prompt = template.format(query=query)

    input_tensor = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
    output_tensor = model.generate(input_tensor, min_new_tokens=min_new_tokens, max_new_tokens=max_new_tokens, repetition_penalty=repetition_penalty)

    ## The batch_decode call below removes the input tokens
    generated_text = tokenizer.batch_decode(output_tensor[:, input_tensor.shape[1]:])[0]
    
    for token in remove_tokens:
        generated_text = generated_text.replace(token,"")
    generated_text = generated_text.strip('" \n')
    
    output = {
        "text": generated_text
    }
    if return_sources: output["sources"] = docs
    return output

In [18]:
res = run_retrieval_qa(query, min_new_tokens=150, max_new_tokens=300, repetition_penalty=1.1)

In [19]:
print(res["text"])

The president wants to cut the cancer death rate by at least 50% over the next 25 years. This means reducing the number of cancer deaths by half within the next quarter-century. The president's plan is to achieve this through increased funding for research, improved access to healthcare, and greater support for patients and families affected by cancer. Specifically, the president has proposed increasing funding for the Advanced Research Projects Agency for Health (ARPA-H) to accelerate the development of new cancer treatments and diagnostic tools. Additionally, the president has called on Congress to provide greater support for patients and families affected by cancer, including those who are struggling with medical bills and other financial burdens. By taking these steps, the president hopes to make significant progress towards his goal of cutting the cancer death rate in half over the next 25 years.


## Indexing Multiple Vector Fields

### Create a "summary" field for the documents we want to index, so we will have both a "page_content" field and a "summary" field to embed and store

In [20]:
milvus_docs = [doc.dict() for doc in docs]

In [21]:
def summarize_text(text:str, min_new_tokens=1, max_new_tokens=200, repetition_penalty=1.1):
    
    summarization_template_string = """[SYS]You are an assistant with a great ability for summarizing text content.[/SYS]
    
    <s>[INST]Summarize the following information. Capture the important information, but be as concise as possible.

    Information: {document}[/INST]

    Summary: """
    summarization_template = PromptTemplate(template=summarization_template_string, input_variables=["document"])
    
    summarization_prompt = summarization_template.format(document=text)

    input_tensor = tokenizer.encode(summarization_prompt, return_tensors="pt").to("cuda")
    output_tensor = model.generate(
                                   input_tensor, 
                                   min_new_tokens=min_new_tokens, 
                                   max_new_tokens=max_new_tokens, 
                                   repetition_penalty=repetition_penalty
                                  )

    generated_text = tokenizer.batch_decode(output_tensor[:, input_tensor.shape[1]:])[0]
    

    try:
        generated_text = generated_text.split("Summary: ")[1]
    except IndexError:
        pass
    

    replace_tokens = ("<s>","</s>")
    for token in replace_tokens:
        generated_text = generated_text.replace(token,"")
    generated_text = generated_text.strip()
    
    return generated_text

In [22]:
for i, doc in enumerate(milvus_docs):
    doc["id"] = i+1
    text = doc["page_content"]
    summary = summarize_text(text)
    doc["summary"] = summary

## Create a Milvus DB

In [23]:
from pymilvus import connections, db, MilvusException, utility

conn = connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)

In [24]:
db_name = "mydb"
try:
    database = db.create_database(db_name)
except MilvusException as e:
    if e.code == 1:
        print(f"Database {db_name} already exists. Skipping creation...")
        db.using_database(db_name)
    else:
        raise e

RPC error: [create_database], <MilvusException: (code=1, message=database already exist: mydb)>, <Time:{'RPC start': '2023-10-13 02:48:20.474212', 'RPC error': '2023-10-13 02:48:20.475708'}>


Database mydb already exists. Skipping creation...


In [25]:
db.list_database()

['default', 'mydb']

### Create a Schema and Collection using the Schema

In [26]:
from pymilvus import CollectionSchema, FieldSchema, DataType, Collection
doc_id = FieldSchema(
  name="id",
  dtype=DataType.INT64,
  is_primary=True,
)
page_content = FieldSchema(
  name="page_content",
  dtype=DataType.VARCHAR,
  max_length=1000,
  # The default value will be used if this field is left empty during data inserts or upserts.
  # The data type of `default_value` must be the same as that specified in `dtype`.
  default_value="NONE"
)
page_content_vector = FieldSchema(
  name="page_content_vector",
  dtype=DataType.FLOAT_VECTOR,
  dim=embeddings.client.get_sentence_embedding_dimension()
)
summary = FieldSchema(
  name="summary",
  dtype=DataType.VARCHAR,
  max_length=1000,
  # The default value will be used if this field is left empty during data inserts or upserts.
  # The data type of `default_value` must be the same as that specified in `dtype`.
  default_value="NONE"
)
summary_vector = FieldSchema(
  name="summary_vector",
  dtype=DataType.FLOAT_VECTOR,
  dim=embeddings.client.get_sentence_embedding_dimension()
)
metadata = FieldSchema(
    name="metadata",
    dtype=DataType.JSON
)

## HAVE TO CREATE MULTIPLE COLLECTIONS BECAUSE MILVUS MULTI-VECTOR NOT SUPPORTED YET (as of 10/12/2023)
page_content_schema = CollectionSchema(
  fields=[doc_id, page_content, page_content_vector, summary, metadata],
  description="SOTU Search with page content vector",
  enable_dynamic_field=True
)
summary_schema = CollectionSchema(
  fields=[doc_id, page_content, summary, summary_vector, metadata],
  description="SOTU Search with summary vector",
  enable_dynamic_field=True
)
page_content_collection_name = "sotu_text"
summary_collection_name = "sotu_summary"

In [27]:
DROP_COLLECTION_IF_EXISTS=True
if DROP_COLLECTION_IF_EXISTS:
    for collection_name in [page_content_collection_name, summary_collection_name]:
        utility.drop_collection(collection_name)

In [28]:
page_content_collection = Collection(
    name=page_content_collection_name,
    schema=page_content_schema,
    using='default',
    shards_num=1
)
summary_collection = Collection(
    name=summary_collection_name,
    schema=summary_schema,
    using='default',
    shards_num=1
)

## TO GET EXISTING COLLECTIONS
# page_content_collection = Collection(page_content_collection_name)
# summary_collection = Collection(summary_collection_name)

### Create embeddings for the "page_content" and "summary" fields

In [29]:
def generate_embeddings(text:str, embedding_model=embeddings):
    return embedding_model.embed_documents(text)[0]

In [30]:
for doc in milvus_docs:
    for field in ["page_content", "summary"]:
        doc[f"{field}_vector"] = generate_embeddings(doc[field])    

### Write Documents to Milvus Collections

In [31]:
page_content_collection.upsert(milvus_docs)

(insert count: 10, delete count: 10, upsert count: 10, timestamp: 444901700774658051, success count: 10, err count: 0)

In [32]:
summary_collection.upsert(milvus_docs)

(insert count: 10, delete count: 10, upsert count: 10, timestamp: 444901700774658059, success count: 10, err count: 0)

### Index Vectors in Milvus

In [33]:
index_params = {
  "metric_type":"L2",
  "index_type":"IVF_FLAT",
  "params":{"nlist":1024}
}

In [34]:
page_content_collection.create_index(
  field_name="page_content_vector", 
  index_params=index_params
)

summary_collection.create_index(
  field_name="summary_vector", 
  index_params=index_params
)

Status(code=0, message=)

In [35]:
utility.index_building_progress("sotu_text")
# Output: {'total_rows': 0, 'indexed_rows': 0}

{'total_rows': 0, 'indexed_rows': 0, 'pending_index_rows': 0}

## Search Milvus

In [36]:
page_content_collection.load()

In [37]:
search_params = {
    "metric_type": "L2", 
    # "offset": 5, 
    "ignore_growing": False, 
    "params": {"nprobe": 10}
}

In [38]:
search_vector = embeddings.embed_documents("Cancer Moonshot initiative")

In [39]:
results = page_content_collection.search(
    data=search_vector, 
    anns_field="page_content_vector", 
    # the sum of `offset` in `param` and `limit` 
    # should be less than 16384.
    param=search_params,
    limit=2,
    expr=None,
    # set the names of the fields you want to 
    # retrieve from the search result.
    output_fields=['page_content'],
    consistency_level="Strong"
)

In [40]:
results[0][0]

id: 2, distance: 2.0121756908256239e-13, entity: {'page_content': 'Cancer is the #2 cause of death in America–second only to heart disease. \n\nLast month, I announced our plan to supercharge  \nthe Cancer Moonshot that President Obama asked me to lead six years ago. \n\nOur goal is to cut the cancer death rate by at least 50% over the next 25 years, turn more cancers from death sentences into treatable diseases.  \n\nMore support for patients and families. \n\nTo get there, I call on Congress to fund ARPA-H, the Advanced Research Projects Agency for Health.'}

## BELOW STILL IN PROGRESS

## Retrieval with Reranking (Boosting with Text Search)

In [None]:
from opensearchpy import OpenSearch

In [None]:
client = OpenSearch(
    hosts=OPENSEARCH_HTTPS_URL,
    http_auth=(OPENSEARCH_USER, OPENSEARCH_PASSWORD),
    use_ssl=True,
    verify_certs=False,
    ssl_assety_hostname=False,
    ssl_show_warn=False
    )

In [None]:
os_query = {
  'size': 5,
  'query': {
    'multi_match': {
      'query': query,
    #   'fields': ['title^2', 'director']
    }
  }
}

In [None]:
os_res = client.search(os_query)

In [None]:
os_res["hits"]["hits"]