In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import OpenSearchVectorSearch
from langchain.document_loaders import TextLoader
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA

from textwrap import dedent

In [2]:
# OPENSEARCH_HOST = "localhost" # When running notebook outside of compoose jupyter container
OPENSEARCH_HOST = "opensearch" # When running notebook inside of compose jupyter container
OPENSEARCH_PORT = 9200
OPENSEARCH_HTTPS_URL= f"https://{OPENSEARCH_HOST}:{OPENSEARCH_PORT}"
OPENSEARCH_USER = "admin"
OPENSEARCH_PASSWORD = "admin"

In [3]:
embeddings_model_name = "sentence-transformers/all-MiniLM-L6-v2"

In [4]:
embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)

In [5]:

loader = TextLoader("../data/state_of_the_union.txt")
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

In [6]:
docsearch = OpenSearchVectorSearch.from_documents(
    docs,
    embeddings,
    engine="faiss",
    space_type="innerproduct",
    ef_construction=256,
    m=48,
    opensearch_url=OPENSEARCH_HTTPS_URL,
    http_auth=(OPENSEARCH_USER, OPENSEARCH_PASSWORD),
    use_ssl = False,
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False,
)

In [7]:
query = "How much does the president want to cut the cancer death rate?"
docs = docsearch.similarity_search(query, k=10)

In [8]:
docs[0].page_content

'Last month, I announced our plan to supercharge  \nthe Cancer Moonshot that President Obama asked me to lead six years ago. \n\nOur goal is to cut the cancer death rate by at least 50% over the next 25 years, turn more cancers from death sentences into treatable diseases.  \n\nMore support for patients and families. \n\nTo get there, I call on Congress to fund ARPA-H, the Advanced Research Projects Agency for Health. \n\nIt’s based on DARPA—the Defense Department project that led to the Internet, GPS, and so much more.  \n\nARPA-H will have a singular purpose—to drive breakthroughs in cancer, Alzheimer’s, diabetes, and more. \n\nA unity agenda for the nation. \n\nWe can do this. \n\nMy fellow Americans—tonight , we have gathered in a sacred space—the citadel of our democracy. \n\nIn this Capitol, generation after generation, Americans have debated great questions amid great strife, and have done great things. \n\nWe have fought for freedom, expanded liberty, defeated totalitarianism a

In [9]:
## IMPORTANT NOTE If you want to use the Mistral 7B Model
## As of 10/7/2023, need to run the pip install below, as Mistral is not included in main transformers library yet
# !pip install git+https://github.com/huggingface/transformers.git

## RESTART THE KERNEL AFTER INSTALLING

In [10]:
# model_name = "PY007/TinyLlama-1.1B-Chat-v0.3"
# model_name = "TheBloke/CollectiveCognition-v1.1-Mistral-7B-GPTQ"
model_name = "TheBloke/Llama-2-7B-chat-GPTQ"
model = AutoModelForCausalLM.from_pretrained(model_name, 
                                             device_map="auto",
                                             )
# model=model.to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          device_map="auto"
                                          )
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
llm = HuggingFacePipeline(pipeline=pipe)

## Using the Model without any additional context

In [11]:
query = "How much does the president want to cut the cancer death rate?"

In [12]:
inputs = tokenizer.encode(query, return_tensors="pt").to("cuda")

In [13]:
output_tensor = model.generate(inputs, min_new_tokens=25, max_new_tokens=100, repetition_penalty=1.3)



In [14]:
# These are the token ID's for the generated response
output_tensor[0]

tensor([    1,  1128,  1568,   947,   278,  6673,   864,   304,  5700,   278,
        23900,  4892,  6554, 29973,    13,    13,  1576,  7178, 29915, 29879,
         1815,  2265,  4546,   787,  8711, 14511,  1230,   263,  9893,   304,
        10032,   278,  1353,   310, 23900,  4892, 29879,   491, 29871, 29945,
        29900, 29995,   975,   278,  2446,   316,  6332, 29889,   910,   626,
         2966,  2738,  7306,   674,  1996,  7282, 13258,  1860,   297,  5925,
        29892,   848, 19383, 29892,   322,   716,  7539,  1860, 29892,   408,
         1532,   408,  3620,   297,  9045, 18020, 24833,   322, 23274, 29889,
          450,  8037,  5619,   756,  7972,   395, 29896, 24464,   297,  5220,
          292,   363,   278,  1824, 29892,   607,   338,  3806,   304,   367,
        19228,   491,  2024,  1016,   943,   322,   916,  8974,   310, 29199,
        29889,     2], device='cuda:0')

In [15]:
# We decode the tokens like this (instead of decoding the entire output_tensor)
## so that we don't include the propmt itself in the output
generated_text = tokenizer.batch_decode(output_tensor[:, inputs.shape[1]:])[0]

In [16]:
print(generated_text)



The President's Cancer Moonshot initiative aims to reduce the number of cancer deaths by 50% over the next decade. This ambitious goal will require significant investments in research, data sharing, and new treatments, as well as changes in healthcare policies and practices. The White House has proposed $1 billion in funding for the program, which is expected to be matched by private donors and other sources of funds.</s>


## Using the model with retrieval

In [17]:
def run_retrieval_qa(query, k=4, min_new_tokens=1, max_new_tokens=20, return_sources=True, repetition_penalty=1.0, remove_tokens=("<s>","</s>")):
    docs = docsearch.similarity_search(query, k=k)

    context = "\n".join([doc.page_content for doc in docs])

    template = PromptTemplate(template=dedent(f""""
    <SYS>You are a question-and-answer assistant that only uses information from the provided context when responding.</SYS>
    
    <s>[INST] Use the context to answer the question. Be detailed and specific in your answers, but do not include anything besides the answer to the question in your response.
    
    Context: {context}

    Question: {{query}}[/INST]

    Answer: 
    """), input_variables=["query"])

    prompt = template.format(query=query)

    input_tensor = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
    output_tensor = model.generate(input_tensor, min_new_tokens=min_new_tokens, max_new_tokens=max_new_tokens, repetition_penalty=repetition_penalty)

    ## The batch_decode call below removes the input tokens
    generated_text = tokenizer.batch_decode(output_tensor[:, input_tensor.shape[1]:])[0]
    
    for token in remove_tokens:
        generated_text = generated_text.replace(token,"")
    generated_text = generated_text.strip('" \n')
    
    output = {
        "text": generated_text
    }
    if return_sources: output["sources"] = docs
    return output

In [18]:
res = run_retrieval_qa(query, min_new_tokens=150, max_new_tokens=300, repetition_penalty=1.1)

In [19]:
print(res["text"])

The president wants to cut the cancer death rate by at least 50% over the next 25 years. This is a key component of the Cancer Moonshot initiative that he launched six years ago. The goal is to turn more cancers from death sentences into treatable diseases and provide more support for patients and families affected by cancer. To achieve this goal, the president has called on Congress to fund ARPA-H, an agency dedicated to driving breakthroughs in cancer research. By investing in research and development, the president believes that the United States can make significant progress in reducing cancer mortality rates and improving the lives of millions of Americans.


## Indexing Multiple Vector Fields

### Create a "summary" field for the documents we want to index, so we will have both a "page_content" field and a "summary" field to embed and store

In [20]:
opensearch_docs = [doc.dict() for doc in docs]

In [21]:
def summarize_text(text:str, min_new_tokens=1, max_new_tokens=200, repetition_penalty=1.1):
    
    summarization_template_string = """[SYS]You are an assistant with a great ability for summarizing text content.[/SYS]
    
    <s>[INST]Summarize the following information. Capture the important information, but be as concise as possible.

    Information: {document}[/INST]

    Summary: """
    summarization_template = PromptTemplate(template=summarization_template_string, input_variables=["document"])
    
    summarization_prompt = summarization_template.format(document=text)

    input_tensor = tokenizer.encode(summarization_prompt, return_tensors="pt").to("cuda")
    output_tensor = model.generate(
                                   input_tensor, 
                                   min_new_tokens=min_new_tokens, 
                                   max_new_tokens=max_new_tokens, 
                                   repetition_penalty=repetition_penalty
                                  )

    generated_text = tokenizer.batch_decode(output_tensor[:, input_tensor.shape[1]:])[0]
    

    try:
        generated_text = generated_text.split("Summary: ")[1]
    except IndexError:
        pass
    

    replace_tokens = ("<s>","</s>")
    for token in replace_tokens:
        generated_text = generated_text.replace(token,"")
    generated_text = generated_text.strip()
    
    return generated_text

In [22]:
for doc in opensearch_docs:
    text = doc["page_content"]
    summary = summarize_text(text)
    doc["summary"] = summary

### Create embeddings for the "page_content" and "summary" fields

In [23]:
def generate_embeddings(text:str, embedding_model=embeddings):
    return embedding_model.embed_documents(text)[0]

In [24]:
for doc in opensearch_docs:
    for field in ["page_content", "summary"]:
        doc[f"{field}_vector"] = generate_embeddings(doc[field])    

### Write Documents to Opensearch Index

In [25]:
index_name = "my-multi-vector-index"

In [26]:
from opensearchpy import OpenSearch
from hashlib import sha1
import json

In [27]:
client = OpenSearch(
    hosts=OPENSEARCH_HTTPS_URL,
    http_auth=(OPENSEARCH_USER, OPENSEARCH_PASSWORD),
    use_ssl=True,
    verify_certs=False,
    ssl_assety_hostname=False,
    ssl_show_warn=False
    )

In [28]:
def create_index(index, mappings=None, opensearch_client=client, replace_existing=False, number_of_shards=1):
    
    if replace_existing:
        opensearch_client.indices.delete(index, ignore_unavailable=True)
        print(f"Deleted existing index: {index}")

    index_body = {
        'settings': {
            'index': {
                'knn': True,
                "knn.algo_param.ef_search": 256,
                'number_of_shards':number_of_shards
            }
        },
        "mappings" : mappings
    }
    response = opensearch_client.indices.create(index=index_name,body=index_body)
    return response


In [29]:
mappings = {
    "properties" :  {
        "page_content_vector" : {
            "type" : "knn_vector",
            "dimension": embeddings.client.get_sentence_embedding_dimension(),
            "method": {
            "name": "hnsw"
          }
        },
        "summary_vector" : {
            "type" : "knn_vector",
            "dimension": embeddings.client.get_sentence_embedding_dimension(),
            "method": {
            "name": "hnsw"
          }
        }
    }
}


create_index(index=index_name, mappings=mappings, replace_existing=True)

Deleted existing index: my-multi-vector-index


{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'my-multi-vector-index'}

In [30]:
bulk_actions_string = "\n".join([
f"""{json.dumps({"index": {
                "_index":index_name,
                 "_id":sha1(doc["page_content"].encode()).hexdigest()
                 }})}
{json.dumps(doc)}
""" for doc in opensearch_docs])

In [31]:
client.bulk(bulk_actions_string)

{'took': 106,
 'errors': False,
 'items': [{'index': {'_index': 'my-multi-vector-index',
    '_id': 'ab8352845164f737ca53ec4512fecdee74c2b8bf',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no': 0,
    '_primary_term': 1,
    'status': 201}},
  {'index': {'_index': 'my-multi-vector-index',
    '_id': 'fd86971a87cc9f5715742fc51db79c48a902b144',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no': 1,
    '_primary_term': 1,
    'status': 201}},
  {'index': {'_index': 'my-multi-vector-index',
    '_id': '4e989efd47a337b4e12ba7acbfe0a68902939441',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no': 2,
    '_primary_term': 1,
    'status': 201}},
  {'index': {'_index': 'my-multi-vector-index',
    '_id': '4ca995a45bcdbea2499d687bfe71c00dc6a575c7',
    '_version': 1,
    'result': 'created',
    '

In [32]:
def vector_field_search(query:str, field_name:str, k=3, size=3, index=index_name, opensearch_client=client, embedding_model=embeddings):
    """k is the number of neighbors the search of each graph will return. You must also include the size option, which indicates how many results the query actually returns. 
    The plugin returns k amount of results for each shard (and each segment) and size amount of results for the entire query. The plugin supports a maximum k value of 10,000.
    """
    query_body = {
        "size": size,
        "query": {
            "knn": {
                field_name : {
                    "vector": generate_embeddings(text=query, embedding_model=embedding_model),
                    "k": k
                }
            }
        }
    }

    # query_body = {
    #     "size": size,
    #     "query": {
    #     "script_score": {
    #         "query": {
    #         "match_all": {}
    #         },
    #         "script": {
    #         "source": "knn_score",
    #         "lang": "knn",
    #         "params": {
    #             "field": field_name,
    #             "query_value": generate_embeddings(text=query, embedding_model=embedding_model),
    #             "space_type": "cosinesimil"
    #         }
    #         }
    #     }
    #     }
    #     }

    return opensearch_client.search(index=index, body=query_body)
    

In [33]:
results = vector_field_search(query="International supply chain", field_name="page_content_vector")

In [34]:
{k:v for k,v in results["hits"]["hits"][0]["_source"].items() if k in ["page_content","summary"]}

{'page_content': 'I have a better plan to fight inflation. \n\nLower your costs, not your wages. \n\nMake more cars and semiconductors in America. \n\nMore infrastructure and innovation in America. \n\nMore goods moving faster and cheaper in America. \n\nMore jobs where you can earn a good living in America. \n\nAnd instead of relying on foreign supply chains, let’s make it in America. \n\nEconomists call it “increasing the productive capacity of our economy.” \n\nI call it building a better America. \n\nMy plan to fight inflation will lower your costs and lower the deficit. \n\n17 Nobel laureates in economics say my plan will ease long-term inflationary pressures. Top business leaders and most Americans support my plan. And here’s the plan: \n\nFirst – cut the cost of prescription drugs. Just look at insulin. One in ten Americans has diabetes. In Virginia, I met a 13-year-old boy named Joshua Davis.',
 'summary': 'The author proposes a plan to combat inflation by increasing domestic p

## BELOW STILL IN PROGRESS

## Retrieval with Reranking (Boosting with Text Search)

In [None]:
from opensearchpy import OpenSearch

In [None]:
client = OpenSearch(
    hosts=OPENSEARCH_HTTPS_URL,
    http_auth=(OPENSEARCH_USER, OPENSEARCH_PASSWORD),
    use_ssl=True,
    verify_certs=False,
    ssl_assety_hostname=False,
    ssl_show_warn=False
    )

In [None]:
os_query = {
  'size': 5,
  'query': {
    'multi_match': {
      'query': query,
    #   'fields': ['title^2', 'director']
    }
  }
}

In [None]:
os_res = client.search(os_query)

In [None]:
os_res["hits"]["hits"]