In [1]:
## ENSURE THAT THE ETCD, MINIO, AND MILVUS SERVICES IN docker-compose.yml
### ARE UNCOMMENTED AND RUNNING IF YOU WANT TO USE THE CONTAINERIZED INSTANCES

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Milvus
from langchain.document_loaders import TextLoader
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA

from textwrap import dedent

In [3]:
# MILVUS_HOST = "localhost" # When running notebook outside of compoose jupyter container
MILVUS_HOST = "milvus" # When running notebook inside of compose jupyter container
MILVUS_PORT = 19530
# MILVUS_HTTPS_URL= f"https://{MILVUS_HOST}:{MILVUS_PORT}"
MILVUS_USER = "root"
MILVUS_PASSWORD = "milvus"

In [4]:
embeddings_model_name = "sentence-transformers/all-MiniLM-L6-v2"

In [5]:
embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)

In [6]:

loader = TextLoader("../../../data/state_of_the_union.txt")
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=75)
docs = text_splitter.split_documents(documents)

In [7]:
docsearch = Milvus.from_documents(
    docs,
    embeddings,
    connection_args={"host":MILVUS_HOST, "port":MILVUS_PORT}
)

In [8]:
query = "How much does the president want to cut the cancer death rate?"
docs = docsearch.similarity_search(query, k=10)

In [9]:
docs[0].page_content

'Cancer is the #2 cause of death in America–second only to heart disease. \n\nLast month, I announced our plan to supercharge  \nthe Cancer Moonshot that President Obama asked me to lead six years ago. \n\nOur goal is to cut the cancer death rate by at least 50% over the next 25 years, turn more cancers from death sentences into treatable diseases.  \n\nMore support for patients and families. \n\nTo get there, I call on Congress to fund ARPA-H, the Advanced Research Projects Agency for Health.'

In [10]:
## IMPORTANT NOTE If you want to use the Mistral 7B Model
## As of 10/7/2023, need to run the pip install below, as Mistral is not included in main transformers library yet
# !pip install git+https://github.com/huggingface/transformers.git

## RESTART THE KERNEL AFTER INSTALLING

In [11]:
# model_name = "PY007/TinyLlama-1.1B-Chat-v0.3"
# model_name = "TheBloke/CollectiveCognition-v1.1-Mistral-7B-GPTQ"
model_name = "TheBloke/Llama-2-13B-chat-GPTQ"
model = AutoModelForCausalLM.from_pretrained(model_name, 
                                             device_map="auto",
                                             )
# model=model.to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          device_map="auto"
                                          )
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
llm = HuggingFacePipeline(pipeline=pipe)

## Using the Model without any additional context

In [12]:
query = "How much does the president want to cut the cancer death rate?"

In [13]:
inputs = tokenizer.encode(query, return_tensors="pt").to("cuda")

In [14]:
output_tensor = model.generate(inputs, min_new_tokens=25, max_new_tokens=100, repetition_penalty=1.3)



In [15]:
# These are the token ID's for the generated response
output_tensor[0]

tensor([    1,  1128,  1568,   947,   278,  6673,   864,   304,  5700,   278,
        23900,  4892,  6554, 29973,    13,  1576,  7178, 30010, 29879,  7038,
          657,   363,   383,   275,  1052,  8905, 29871, 29906, 29900, 29896,
        29947,  7805,   263,  2009,   310,   395, 29946, 29889, 29955, 24464,
          363,   278,  3086,  1815,  2265,  8907,   313, 29940,  8426,   511,
          607,   338,   385,  7910,   310,   395, 29941, 29945, 29953,  7284,
          975,   278,  1857,  3233, 29889,   910, 11524,   263, 29871, 29929,
        10151,  7910,  2038,   278, 16436,  1052,  1629, 29871, 29906, 29900,
        29896, 29955,  7128,   362, 29892,   322,   723,  6963,  3001,   405,
        29902, 29950,  5220,   292,   304,   395, 29941, 29896, 29889, 29947,
        24464, 29889,   450, 23562,   884,  9551,   267,   304, 10127,   263,
          716,  1346,  6028,  2265], device='cuda:0')

In [16]:
# We decode the tokens like this (instead of decoding the entire output_tensor)
## so that we don't include the propmt itself in the output
generated_text = tokenizer.batch_decode(output_tensor[:, inputs.shape[1]:])[0]

In [17]:
print(generated_text)


The President’s Budget for Fiscal Year 2018 includes a request of $4.7 billion for the National Cancer Institute (NCI), which is an increase of $356 million over the current level. This represents a 9 percent increase above the fiscal year 2017 appropriation, and would bring total NIH funding to $31.8 billion. The budget also proposes to establish a new “Cancer


## Using the model with retrieval

In [18]:
def run_retrieval_qa(query, k=4, min_new_tokens=1, max_new_tokens=20, return_sources=True, repetition_penalty=1.0, remove_tokens=("<s>","</s>")):
    docs = docsearch.similarity_search(query, k=k)

    context = "\n".join([doc.page_content for doc in docs])

    template = PromptTemplate(template=dedent(f""""
    <SYS>You are a question-and-answer assistant that only uses information from the provided context when responding.</SYS>
    
    <s>[INST] Use the context to answer the question. Be detailed and specific in your answers, but do not include anything besides the answer to the question in your response.
    
    Context: {context}

    Question: {{query}}[/INST]

    Answer: 
    """), input_variables=["query"])

    prompt = template.format(query=query)

    input_tensor = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
    output_tensor = model.generate(input_tensor, min_new_tokens=min_new_tokens, max_new_tokens=max_new_tokens, repetition_penalty=repetition_penalty)

    generated_tensor = output_tensor[:, input_tensor.shape[1]:]

    ## The batch_decode call below removes the input tokens
    generated_text = tokenizer.batch_decode(generated_tensor)[0]
    
    for token in remove_tokens:
        generated_text = generated_text.replace(token,"")
    generated_text = generated_text.strip('" \n')
    
    output = {
        "text": generated_text,
        "output_token_length":len(generated_tensor[0])
    }
    if return_sources: output["sources"] = docs
    return output

In [19]:
res = run_retrieval_qa(query, min_new_tokens=150, max_new_tokens=300, repetition_penalty=1.1)

In [20]:
res

{'text': 'The President wants to cut the cancer death rate by at least 50% over the next 25 years. This is part of the Cancer Moonshot initiative that was launched six years ago by President Obama, and the goal is to turn more cancers from death sentences into treatable diseases, with more support for patients and families. To achieve this ambitious goal, the President is calling on Congress to fund ARPA-H, the Advanced Research Projects Agency for Health. The focus is on making significant progress in the fight against cancer, which is currently the second leading cause of death in America, after heart disease. The aim is to save countless lives and improve the quality of life for those affected by cancer.',
 'output_token_length': 154,
 'sources': [Document(page_content='Cancer is the #2 cause of death in America–second only to heart disease. \n\nLast month, I announced our plan to supercharge  \nthe Cancer Moonshot that President Obama asked me to lead six years ago. \n\nOur goal is

## Indexing Multiple Vector Fields

### Create a "summary" field for the documents we want to index, so we will have both a "page_content" field and a "summary" field to embed and store

In [21]:
milvus_docs = [doc.dict() for doc in docs]

In [22]:
def summarize_text(text:str, min_new_tokens=1, max_new_tokens=200, repetition_penalty=1.1):
    
    summarization_template_string = """[SYS]You are an assistant with a great ability for summarizing text content.[/SYS]
    
    <s>[INST]Summarize the following information. Capture the important information, but be as concise as possible.

    Information: {document}[/INST]

    Summary: """
    summarization_template = PromptTemplate(template=summarization_template_string, input_variables=["document"])
    
    summarization_prompt = summarization_template.format(document=text)

    input_tensor = tokenizer.encode(summarization_prompt, return_tensors="pt").to("cuda")
    output_tensor = model.generate(
                                   input_tensor, 
                                   min_new_tokens=min_new_tokens, 
                                   max_new_tokens=max_new_tokens, 
                                   repetition_penalty=repetition_penalty
                                  )

    generated_text = tokenizer.batch_decode(output_tensor[:, input_tensor.shape[1]:])[0]
    

    try:
        generated_text = generated_text.split("Summary: ")[1]
    except IndexError:
        pass
    

    replace_tokens = ("<s>","</s>")
    for token in replace_tokens:
        generated_text = generated_text.replace(token,"")
    generated_text = generated_text.strip()
    
    return generated_text

In [23]:
for i, doc in enumerate(milvus_docs):
    doc["id"] = i+1
    text = doc["page_content"]
    summary = summarize_text(text)
    doc["summary"] = summary

## Create a Milvus DB

In [24]:
from pymilvus import connections, db, MilvusException, utility

conn = connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)

In [25]:
db_name = "mydb"
try:
    database = db.create_database(db_name)
except MilvusException as e:
    if e.code == 1:
        print(f"Database {db_name} already exists. Skipping creation...")
        db.using_database(db_name)
    else:
        raise e

RPC error: [create_database], <MilvusException: (code=1, message=database already exist: mydb)>, <Time:{'RPC start': '2023-10-15 16:01:29.626586', 'RPC error': '2023-10-15 16:01:29.628562'}>


Database mydb already exists. Skipping creation...


In [26]:
db.list_database()

['default', 'mydb']

### Create a Schema and Collection using the Schema

In [27]:
from pymilvus import CollectionSchema, FieldSchema, DataType, Collection
doc_id = FieldSchema(
  name="id",
  dtype=DataType.INT64,
  is_primary=True,
)
page_content = FieldSchema(
  name="page_content",
  dtype=DataType.VARCHAR,
  max_length=1000,
  # The default value will be used if this field is left empty during data inserts or upserts.
  # The data type of `default_value` must be the same as that specified in `dtype`.
  default_value="NONE"
)
page_content_vector = FieldSchema(
  name="page_content_vector",
  dtype=DataType.FLOAT_VECTOR,
  dim=embeddings.client.get_sentence_embedding_dimension()
)
summary = FieldSchema(
  name="summary",
  dtype=DataType.VARCHAR,
  max_length=1000,
  # The default value will be used if this field is left empty during data inserts or upserts.
  # The data type of `default_value` must be the same as that specified in `dtype`.
  default_value="NONE"
)
summary_vector = FieldSchema(
  name="summary_vector",
  dtype=DataType.FLOAT_VECTOR,
  dim=embeddings.client.get_sentence_embedding_dimension()
)
metadata = FieldSchema(
    name="metadata",
    dtype=DataType.JSON
)

## HAVE TO CREATE MULTIPLE COLLECTIONS BECAUSE MILVUS MULTI-VECTOR NOT SUPPORTED YET (as of 10/12/2023)
page_content_schema = CollectionSchema(
  fields=[doc_id, page_content, page_content_vector, summary, metadata],
  description="SOTU Search with page content vector",
  enable_dynamic_field=True
)
summary_schema = CollectionSchema(
  fields=[doc_id, page_content, summary, summary_vector, metadata],
  description="SOTU Search with summary vector",
  enable_dynamic_field=True
)
page_content_collection_name = "sotu_text"
summary_collection_name = "sotu_summary"

In [28]:
DROP_COLLECTION_IF_EXISTS=True
if DROP_COLLECTION_IF_EXISTS:
    for collection_name in [page_content_collection_name, summary_collection_name]:
        utility.drop_collection(collection_name)

In [29]:
page_content_collection = Collection(
    name=page_content_collection_name,
    schema=page_content_schema,
    using='default',
    shards_num=1
)
summary_collection = Collection(
    name=summary_collection_name,
    schema=summary_schema,
    using='default',
    shards_num=1
)

## TO GET EXISTING COLLECTIONS
# page_content_collection = Collection(page_content_collection_name)
# summary_collection = Collection(summary_collection_name)

### Create embeddings for the "page_content" and "summary" fields

In [30]:
def generate_embeddings(text:str, embedding_model=embeddings):
    return embedding_model.embed_documents(text)[0]

In [31]:
for doc in milvus_docs:
    for field in ["page_content", "summary"]:
        doc[f"{field}_vector"] = generate_embeddings(doc[field])    

### Write Documents to Milvus Collections

In [32]:
page_content_collection.upsert(milvus_docs)

(insert count: 10, delete count: 10, upsert count: 10, timestamp: 444959474457772035, success count: 10, err count: 0)

In [33]:
summary_collection.upsert(milvus_docs)

(insert count: 10, delete count: 10, upsert count: 10, timestamp: 444959474457772044, success count: 10, err count: 0)

### Index Vectors in Milvus

In [34]:
index_params = {
  "metric_type":"L2",
  "index_type":"IVF_FLAT",
  "params":{"nlist":1024}
}

In [35]:
page_content_collection.create_index(
  field_name="page_content_vector", 
  index_params=index_params
)

summary_collection.create_index(
  field_name="summary_vector", 
  index_params=index_params
)

Status(code=0, message=)

In [36]:
utility.index_building_progress("sotu_text")
# Output: {'total_rows': 0, 'indexed_rows': 0}

{'total_rows': 0, 'indexed_rows': 0, 'pending_index_rows': 0}

## Search Milvus

In [37]:
page_content_collection.load()

In [38]:
search_params = {
    "metric_type": "L2", 
    # "offset": 5, 
    "ignore_growing": False, 
    "params": {"nprobe": 10}
}

In [39]:
search_vector = embeddings.embed_documents("Cancer Moonshot initiative")

In [40]:
results = page_content_collection.search(
    data=search_vector, 
    anns_field="page_content_vector", 
    # the sum of `offset` in `param` and `limit` 
    # should be less than 16384.
    param=search_params,
    limit=2,
    expr=None,
    # set the names of the fields you want to 
    # retrieve from the search result.
    output_fields=['page_content'],
    consistency_level="Strong"
)

In [41]:
results[0][0]

id: 2, distance: 2.0121756908256239e-13, entity: {'page_content': 'Cancer is the #2 cause of death in America–second only to heart disease. \n\nLast month, I announced our plan to supercharge  \nthe Cancer Moonshot that President Obama asked me to lead six years ago. \n\nOur goal is to cut the cancer death rate by at least 50% over the next 25 years, turn more cancers from death sentences into treatable diseases.  \n\nMore support for patients and families. \n\nTo get there, I call on Congress to fund ARPA-H, the Advanced Research Projects Agency for Health.'}

## BELOW STILL IN PROGRESS

## Retrieval with Reranking (Boosting with Text Search)