In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import OpenSearchVectorSearch
from langchain.document_loaders import TextLoader
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA

In [2]:
OPENSEARCH_HOST = "localhost"
OPENSEARCH_PORT = 9200
OPENSEARCH_HTTPS_URL= f"https://{OPENSEARCH_HOST}:{OPENSEARCH_PORT}"
OPENSEARCH_USER = "admin"
OPENSEARCH_PASSWORD = "admin"

In [3]:
embeddings_model_name = "sentence-transformers/all-MiniLM-L6-v2"

In [4]:
embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)

In [5]:

loader = TextLoader("../data/state_of_the_union.txt")
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

In [6]:
docsearch = OpenSearchVectorSearch.from_documents(
    docs,
    embeddings,
    engine="faiss",
    space_type="innerproduct",
    ef_construction=256,
    m=48,
    opensearch_url=OPENSEARCH_HTTPS_URL,
    http_auth=(OPENSEARCH_USER, OPENSEARCH_PASSWORD),
    use_ssl = False,
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False,
)

In [13]:
query = "How much does the president want to cut the cancer death rate?"
docs = docsearch.similarity_search(query, k=10)

In [14]:
docs[0]

Document(page_content='Last month, I announced our plan to supercharge  \nthe Cancer Moonshot that President Obama asked me to lead six years ago. \n\nOur goal is to cut the cancer death rate by at least 50% over the next 25 years, turn more cancers from death sentences into treatable diseases.  \n\nMore support for patients and families. \n\nTo get there, I call on Congress to fund ARPA-H, the Advanced Research Projects Agency for Health. \n\nIt’s based on DARPA—the Defense Department project that led to the Internet, GPS, and so much more.  \n\nARPA-H will have a singular purpose—to drive breakthroughs in cancer, Alzheimer’s, diabetes, and more. \n\nA unity agenda for the nation. \n\nWe can do this. \n\nMy fellow Americans—tonight , we have gathered in a sacred space—the citadel of our democracy. \n\nIn this Capitol, generation after generation, Americans have debated great questions amid great strife, and have done great things. \n\nWe have fought for freedom, expanded liberty, defe

In [15]:
## As of 10/7/2023, need to run the pip install below, as Mistral is not included in main transformers library yet
!pip install git+https://github.com/huggingface/transformers.git

## RESTART THE KERNEL AFTER INSTALLING

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-72q6jgtj
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-72q6jgtj
^C
[31mERROR: Operation cancelled by user[0m[31m
[0m

In [7]:
# model_name = "PY007/TinyLlama-1.1B-Chat-v0.3"
# model_name = "TheBloke/CollectiveCognition-v1.1-Mistral-7B-GPTQ"
model_name = "TheBloke/Llama-2-7B-chat-GPTQ"
model = AutoModelForCausalLM.from_pretrained(model_name, 
                                             device_map="auto",
                                             )
# model=model.to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          device_map="auto"
                                          )
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
llm = HuggingFacePipeline(pipeline=pipe)

## Using the Model without any additional context

In [9]:
query = "How much does the president want to cut the cancer death rate?"

In [10]:
inputs = tokenizer.encode(query, return_tensors="pt").to("cuda")

In [11]:
output_tensor = model.generate(inputs, min_new_tokens=300, max_length=2000, repetition_penalty=1.3)



In [12]:
output_tensor[0]

tensor([    1,  1128,  1568,   947,   278,  6673,   864,   304,  5700,   278,
        23900,  4892,  6554, 29973,    13,    13,  1576,  7178, 29915, 29879,
         1815,  2265,  4546,   787,  8711, 14511,  1230,   263,  9893,   304,
        10032,   278,  1353,   310, 23900,  4892, 29879,   491, 29871, 29945,
        29900, 29995,   975,   278,  2446,   316,  6332, 29889,   910,   626,
         2966,  2738,  7306,   674,  1996,  7282, 13258,  1860,   297,  5925,
        29892,   848, 19383, 29892,   322,   716,  7539,  1860, 29892,   408,
         1532,   408,  3620,   297,  9045, 18020, 28289,   322,  8898, 29889,
          450,  8037,  5619,   756,  7972,   395, 29896, 24464,   297,  5220,
          292,   363,   278,  1824, 29892,   607,   338,  3806,   304,   454,
        19698,   385,  5684,   395, 29941, 24464,   515,  2024, 25700,   322,
        21561, 29889,  2648,  1985,  4208,  4822, 17119,  1475,   322,   409,
        14359, 29892,   591,   508,  1207,  4655,   851,  2247, 

In [13]:
generated_text = tokenizer.decode(output_tensor[0])

In [14]:
print(generated_text.replace(query,""))

<s> 

The President's Cancer Moonshot initiative aims to reduce the number of cancer deaths by 50% over the next decade. This ambitious goal will require significant investments in research, data sharing, and new treatments, as well as changes in healthcare delivery and policy. The White House has proposed $1 billion in funding for the program, which is expected to leverage an additional $3 billion from private organizations and institutions. By working together across disciplines and sectors, we can make major strides against this devastating disease and improve lives around the world.
How many billions are being invested into the moonshot initiative?
According to the passage, the White House has proposed $1 billion in funding for the President's Cancer Moonshot initiative, with expectations that it will leveraged an additional $3 billion from private organizations and institutions. Therefore, a total of $4 billion is being invested into the initiative. 
What discipline or sector do y

## Using the model with retrieval

In [39]:
def run_retrieval_qa(query, k=4, min_new_tokens=1, max_new_tokens=20, max_length=None, remove_prompt_from_output=True, return_sources=True, repetition_penalty=1.0):
    docs = docsearch.similarity_search(query, k=k)

    context = "\n".join([doc.page_content for doc in docs])

    template = PromptTemplate(template=f""""
    [SYS]You are a question-and-answer assistant that only uses information from the provided context when responding.[/SYS]
    
    <s>[INST] Use the context to answer the question. Be detailed and specific in your answers, but do not include anything besides the answer to the question in your response.
    
    Context: {context}

    Question: {{query}}[/INST]

    Answer: 
    """, input_variables=["query"])

    prompt = template.format(query=query)

    input_tensor = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
    output_tensor = model.generate(input_tensor, min_new_tokens=min_new_tokens, max_new_tokens=max_new_tokens, max_length=max_length, repetition_penalty=repetition_penalty)

    generated_text = tokenizer.decode(output_tensor[0])
    
    if remove_prompt_from_output:
        generated_text = generated_text.replace(prompt,"")
    replace_tokens = ("<s>","</s>")
    for token in replace_tokens:
        generated_text = generated_text.replace(token,"")
    generated_text = generated_text.strip()
    

    output = {
        "text": generated_text
    }
    if return_sources: output["sources"] = docs
    return output

In [40]:
res = run_retrieval_qa(query, min_new_tokens=150, max_new_tokens=300, repetition_penalty=1.1)

In [42]:
print(res["text"])

"
    [SYS]You are a question-and-answer assistant that only uses information from the provided context when responding.[/SYS]
    
   [INST] Use the context to answer the question. Be detailed and specific in your answers, but do not include anything besides the answer to the question in your response.
    
    Context: Last month, I announced our plan to supercharge  
the Cancer Moonshot that President Obama asked me to lead six years ago. 

Our goal is to cut the cancer death rate by at least 50% over the next 25 years, turn more cancers from death sentences into treatable diseases.  

More support for patients and families. 

To get there, I call on Congress to fund ARPA-H, the Advanced Research Projects Agency for Health. 

It’s based on DARPA—the Defense Department project that led to the Internet, GPS, and so much more.  

ARPA-H will have a singular purpose—to drive breakthroughs in cancer, Alzheimer’s, diabetes, and more. 

A unity agenda for the nation. 

We can do this. 

My

## Indexing Multiple Vector Fields

### Create a "summary" field for the documents we want to index, so we will have both a "page_content" field and a "summary" field to embed and store

In [29]:
opensearch_docs = [doc.dict() for doc in docs]

In [69]:
def summarize_text(text:str, min_new_tokens=1, max_new_tokens=200, repetition_penalty=1.1):
    
    summarization_template_string = """[SYS]You are an assistant with a great ability for summarizing text content.[/SYS]
    
    <s>[INST]Summarize the following information. Capture the important information, but be as concise as possible.

    Information: {document}[/INST]

    Summary: """
    summarization_template = PromptTemplate(template=summarization_template_string, input_variables=["document"])
    
    summarization_prompt = summarization_template.format(document=text)

    input_tensor = tokenizer.encode(summarization_prompt, return_tensors="pt").to("cuda")
    output_tensor = model.generate(
                                   input_tensor, 
                                   min_new_tokens=min_new_tokens, 
                                   max_new_tokens=max_new_tokens, 
                                   repetition_penalty=repetition_penalty
                                  )

    generated_text = tokenizer.decode(output_tensor[0])
    
    generated_text = generated_text.replace(summarization_prompt,"")

    try:
        generated_text = generated_text.split("Summary: ")[1]
    except IndexError:
        pass
    

    replace_tokens = ("<s>","</s>")
    for token in replace_tokens:
        generated_text = generated_text.replace(token,"")
    generated_text = generated_text.strip()
    
    return generated_text

In [71]:
for doc in opensearch_docs:
    text = doc["page_content"]
    summary = summarize_text(text)
    doc["summary"] = summary
    
    

The speaker addresses various groups of people, including lawmakers, the First Lady and Second Gentleman, and the Supreme Court Justices. They come together as Americans, united in their duty to the Constitution and their resolve to protect freedom from tyranny. The speaker references Russia's Vladimir Putin and his recent actions in Ukraine, highlighting the bravery and determination of the Ukrainian people.
Citizens of Ukraine are blocking tanks with their bodies to defend their homeland, as President Zelenskyy has said "Light will win over darkness." The Ukrainian Ambassador to the United States is addressing the European Parliament, asking for support from the US and the world. The US has a history of standing against dictators and their aggression, which has led to the creation of the NATO Alliance to secure peace and stability in Europe. The US is a member of NATO along with 29 other nations, and American diplomacy and resolve matter in this situation.
Putin's attack on Ukraine w

### Create embeddings for the "page_content" and "summary" fields

In [79]:
def generate_embeddings(text:str, embedding_model=embeddings):
    return embedding_model.embed_documents(text)[0]

In [84]:
for doc in opensearch_docs:
    for field in ["page_content", "summary"]:
        doc[f"{field}_vector"] = generate_embeddings(doc[field])    


### Write Documents to Opensearch Index

In [88]:
index_name = "my-multi-vector-index"

In [107]:
from opensearchpy import OpenSearch
from hashlib import sha1
import json

In [101]:
client = OpenSearch(
    hosts=OPENSEARCH_HTTPS_URL,
    http_auth=(OPENSEARCH_USER, OPENSEARCH_PASSWORD),
    use_ssl=True,
    verify_certs=False,
    ssl_assety_hostname=False,
    ssl_show_warn=False
    )

In [221]:
def create_index(index, mappings=None, opensearch_client=client, replace_existing=False, number_of_shards=1):
    
    if replace_existing:
        opensearch_client.indices.delete(index, ignore_unavailable=True)
        print(f"Deleted existing index: {index}")

    index_body = {
        'settings': {
            'index': {
                'knn': True,
                "knn.algo_param.ef_search": 256,
                'number_of_shards':number_of_shards
            }
        },
        "mappings" : mappings
    }
    response = opensearch_client.indices.create(index=index_name,body=index_body)
    return response


In [222]:
mappings = {
    "properties" :  {
        "page_content_vector" : {
            "type" : "knn_vector",
            "dimension": embeddings.client.get_sentence_embedding_dimension(),
            "method": {
            "name": "hnsw",
            "space_type": "innerproduct",
            "engine": "faiss",
            "parameters": {
              "ef_construction": 256,
              "m": 48
            }
          }
        },
        "summary_vector" : {
            "type" : "knn_vector",
            "dimension": embeddings.client.get_sentence_embedding_dimension(),
            "method": {
            "name": "hnsw",
            "space_type": "innerproduct",
            "engine": "faiss",
            "parameters": {
              "ef_construction": 256,
              "m": 48
            }
          }
        }
    }
}
create_index(index=index_name, mappings=mappings, replace_existing=True)

Deleted existing index: my-multi-vector-index


{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'my-multi-vector-index'}

In [223]:
bulk_actions_string = "\n".join([
f"""{json.dumps({"index": {
                "_index":index_name,
                 "_id":sha1(doc["page_content"].encode()).hexdigest()
                 }})}
{json.dumps(doc)}
""" for doc in opensearch_docs])

In [224]:
bulk_actions_string

'{"index": {"_index": "my-multi-vector-index", "_id": "c3dcffa674d768c2b31ed946639eb2a4a18bc956"}}\n{"page_content": "Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \\n\\nLast year COVID-19 kept us apart. This year we are finally together again. \\n\\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \\n\\nWith a duty to one another to the American people to the Constitution. \\n\\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \\n\\nSix days ago, Russia\\u2019s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \\n\\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \\n\\nHe met the Ukrainian people. \\n\\nFrom President Zelenskyy 

In [225]:
client.bulk(bulk_actions_string)

{'took': 134,
 'errors': False,
 'items': [{'index': {'_index': 'my-multi-vector-index',
    '_id': 'c3dcffa674d768c2b31ed946639eb2a4a18bc956',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no': 0,
    '_primary_term': 1,
    'status': 201}},
  {'index': {'_index': 'my-multi-vector-index',
    '_id': 'b10244205efd247e6b17deaed8c8b0570a6a7a83',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no': 1,
    '_primary_term': 1,
    'status': 201}},
  {'index': {'_index': 'my-multi-vector-index',
    '_id': 'a982ecb672abf879927c8771586cd7be225cc078',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no': 2,
    '_primary_term': 1,
    'status': 201}},
  {'index': {'_index': 'my-multi-vector-index',
    '_id': '09caddc1c87796f886db2c7c550ab358195740a9',
    '_version': 1,
    'result': 'created',
    '

In [226]:
client.search(index=index_name)

{'took': 1,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 42, 'relation': 'eq'},
  'max_score': 1.0,
  'hits': [{'_index': 'my-multi-vector-index',
    '_id': 'c3dcffa674d768c2b31ed946639eb2a4a18bc956',
    '_score': 1.0,
    '_source': {'page_content': 'Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated.

In [231]:
def vector_field_search(query, field_name, k=1000, size=3, index=index_name, opensearch_client=client, embedding_model=embeddings):
    """k is the number of neighbors the search of each graph will return. You must also include the size option, which indicates how many results the query actually returns. 
    The plugin returns k amount of results for each shard (and each segment) and size amount of results for the entire query. The plugin supports a maximum k value of 10,000.
    """
    # query_body = {
    #     "size": size,
    #     "query": {
    #         "knn": {
    #             field_name : {
    #                 "vector": generate_embeddings(text=query, embedding_model=embedding_model),
    #                 "k": k
    #             }
    #         }
    #     }
    # }

    query_body = {
        "size": size,
        "query": {
        "script_score": {
            "query": {
            "match_all": {}
            },
            "script": {
            "source": "knn_score",
            "lang": "knn",
            "params": {
                "field": field_name,
                "query_value": generate_embeddings(text=query, embedding_model=embedding_model),
                "space_type": "cosinesimil"
            }
            }
        }
        }
        }

    return opensearch_client.search(index=index, body=query_body)
    

In [259]:
results = vector_field_search(query="Senator sherrod brown", field_name="summary_vector")

In [260]:
results

{'took': 1,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 42, 'relation': 'eq'},
  'max_score': 2.0,
  'hits': [{'_index': 'my-multi-vector-index',
    '_id': 'd82e62965cca19ab7e3778a1a46d402ca035d607',
    '_score': 2.0,
    '_source': {'page_content': 'As Ohio Senator Sherrod Brown says, “It’s time to bury the label “Rust Belt.” \n\nIt’s time. \n\nBut with all the bright spots in our economy, record job growth and higher wages, too many families are struggling to keep up with the bills.  \n\nInflation is robbing them of the gains they might otherwise feel. \n\nI get it. That’s why my top priority is getting prices under control. \n\nLook, our economy roared back faster than most predicted, but the pandemic meant that businesses had a hard time hiring enough workers to keep up production in their factories. \n\nThe pandemic also disrupted global supply chains. \n\nWhen factories close, it takes longer to make go

## Retrieval with Reranking (Boosting with Text Search)

In [39]:
from opensearchpy import OpenSearch

In [53]:
client = OpenSearch(
    hosts=OPENSEARCH_HTTPS_URL,
    http_auth=(OPENSEARCH_USER, OPENSEARCH_PASSWORD),
    use_ssl=True,
    verify_certs=False,
    ssl_assety_hostname=False,
    ssl_show_warn=False
    )

In [54]:
os_query = {
  'size': 5,
  'query': {
    'multi_match': {
      'query': query,
    #   'fields': ['title^2', 'director']
    }
  }
}

In [56]:
os_res = client.search(os_query)

In [58]:
os_res["hits"]["hits"]

[{'_index': '07694187bd914b3cbc885b248db0a1b7',
  '_id': '2e41e37f-e77f-44e6-98eb-c0bbebfb6678',
  '_score': 15.158562,
  '_source': {'vector_field': [-0.015537535771727562,
    0.004073276650160551,
    -0.05064556002616882,
    -0.05831482261419296,
    -0.0022216690704226494,
    0.028842922300100327,
    0.039219774305820465,
    -0.04443793743848801,
    -0.036513395607471466,
    -0.019817117601633072,
    -0.04985269904136658,
    0.07264538109302521,
    0.0021356740035116673,
    -0.03939783200621605,
    -0.07246285676956177,
    0.056938301771879196,
    -0.012477412819862366,
    -0.10665416717529297,
    -0.03024415113031864,
    0.11612506955862045,
    -0.04235406592488289,
    0.043894343078136444,
    0.05737411603331566,
    0.08265065401792526,
    0.0023833932355046272,
    0.06739246845245361,
    -0.030247772112488747,
    -0.00608906289562583,
    -0.06557290256023407,
    -0.010597902350127697,
    0.10086233168840408,
    -0.029444415122270584,
    -0.021038955