In [1]:
import os
import getpass

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import OpenSearchVectorSearch
from langchain.document_loaders import TextLoader
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA

In [2]:
embeddings_model_name = "sentence-transformers/all-MiniLM-L6-v2"

In [3]:
embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)

In [4]:

loader = TextLoader("../data/state_of_the_union.txt")
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)


In [5]:
docsearch = OpenSearchVectorSearch.from_documents(
    docs,
    embeddings,
    engine="faiss",
    space_type="innerproduct",
    ef_construction=256,
    m=48,
    opensearch_url="https://localhost:9200",
    http_auth=("admin", "admin"),
    use_ssl = False,
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False,
)

In [6]:
query = "How much does the president want to cut the cancer death rate?"
docs = docsearch.similarity_search(query, k=10)

In [7]:
## As of 10/7/2023, need to run the pip install below, as Mistral is not included in main transformers library yet
# pip install git+https://github.com/huggingface/transformers.git

In [8]:
# model_name = "PY007/TinyLlama-1.1B-Chat-v0.3"
model_name = "TheBloke/CollectiveCognition-v1.1-Mistral-7B-GPTQ"
model = AutoModelForCausalLM.from_pretrained(model_name, 
                                             device_map="auto",
                                             )
# model=model.to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          device_map="auto"
                                          )
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
llm = HuggingFacePipeline(pipeline=pipe)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
inputs = tokenizer.encode(query, return_tensors="pt").to("cuda")

In [None]:
output_tensor = model.generate(inputs, min_new_tokens=100, max_length=2000)

In [None]:
output_tensor[0]

tensor([   1, 1602, 1188,  ..., 2627,  272, 5541], device='cuda:0')

In [None]:
generated_text = tokenizer.decode(output_tensor[0])

In [None]:
generated_text

'<s> How much does the president want to cut the cancer death rate?\n\nThat’s the question at the heart of a new report from the White House Council on Cancer. The report, released Tuesday, sets a goal of reducing the cancer death rate by at least 25 percent over the next 25 years.\n\nThe report says the goal is “ambitious but achievable.” It’s based on a review of the latest scientific evidence and input from experts in cancer prevention, treatment and research.\n\nThe report says the goal is “ambitious but achievable.” It’s based on a review of the latest scientific evidence and input from experts in cancer prevention, treatment and research.\n\nThe report says the goal is “ambitious but achievable.” It’s based on a review of the latest scientific evidence and input from experts in cancer prevention, treatment and research.\n\nThe report says the goal is “ambitious but achievable.” It’s based on a review of the latest scientific evidence and input from experts in cancer prevention, t

In [62]:
def run_retrieval_qa(query, k=4, min_new_tokens=1, max_length=2000, return_sources=True):
    docs = docsearch.similarity_search(query, k=k)

    context = "\n".join([doc.page_content for doc in docs])

    template = PromptTemplate(template=f""""
    Context - {context}

    Question: {{query}}

    ASSISTANT: 
    """, input_variables=["query"])

    prompt = template.format(query=query)

    input_tensor = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
    output_tensor = model.generate(input_tensor, min_new_tokens=min_new_tokens, max_length=max_length)

    generated_text = tokenizer.decode(output_tensor[0]).replace(prompt,"")
    replace_tokens = ("<s>",)
    for token in replace_tokens:
        generated_text = generated_text.replace(token,"")
    generated_text = generated_text.strip()
    

    output = {
        "text": generated_text
    }
    if return_sources: output["sources"] = docs
    return output

In [63]:
run_retrieval_qa(query)

{'text': "The President wants to cut the cancer death rate by at least 50% over the next 25 years. This is a bold and ambitious goal, but one that is achievable with the right investments and strategies.\n\n    The goal is to turn more cancers from death sentences into treatable diseases. This means developing new treatments, improving early detection and screening methods, and finding ways to prevent cancer in the first place.\n\n    The President's Cancer Moonshot, which he launched in 2016, is a national effort to accelerate progress in cancer research and care. The Moonshot aims to double the rate of progress in cancer research over the next decade, and to make more treatments available to more patients, particularly those from underserved communities.\n\n    The Moonshot is just one part of the President's plan to supercharge cancer research and care. The ARPA-H (Advanced Research Projects Agency for Health) is another key component. ARPA-H will be a new federal agency modeled aft

In [65]:
docsearch.similarity_search("cancer moonshot 2016")

[Document(page_content='But cancer from prolonged exposure to burn pits ravaged Heath’s lungs and body. \n\nDanielle says Heath was a fighter to the very end. \n\nHe didn’t know how to stop fighting, and neither did she. \n\nThrough her pain she found purpose to demand we do better. \n\nTonight, Danielle—we are. \n\nThe VA is pioneering new ways of linking toxic exposures to diseases, already helping more veterans get benefits. \n\nAnd tonight, I’m announcing we’re expanding eligibility to veterans suffering from nine respiratory cancers. \n\nI’m also calling on Congress: pass a law to make sure veterans devastated by toxic exposures in Iraq and Afghanistan finally get the benefits and comprehensive health care they deserve. \n\nAnd fourth, let’s end cancer as we know it. \n\nThis is personal to me and Jill, to Kamala, and to so many of you. \n\nCancer is the #2 cause of death in America–second only to heart disease.', metadata={'source': '../data/state_of_the_union.txt'}),
 Document(p