# Vector Similarity
1. Manhattan distance (L1 norm)
2. Euclidean distance (L2 norm)
3. Cosine distance
4. Dot product (inner product) similarity

In [1]:
import os
import openai
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
# Load settings from the local .env file (located with find_dotenv).
_ = load_dotenv(find_dotenv()) # read local .env file
# Publish a browser-like user agent through the USER_AGENT environment
# variable; load_web() sets the same string on its HTTP session.
os.environ['USER_AGENT'] = 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'

# Subscript access: raises KeyError when OPENAI_API_KEY is not set.
openai.api_key  = os.environ['OPENAI_API_KEY']

In [18]:
import sagemaker
from langchain_community.llms.sagemaker_endpoint import LLMContentHandler
from sagemaker.predictor import retrieve_default
from llama_index.embeddings.sagemaker_endpoint import SageMakerEmbedding
from langchain_community.embeddings import SagemakerEndpointEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from llama_index.embeddings.langchain import LangchainEmbedding
from langchain_community.llms import SagemakerEndpoint
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA,  ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
import json
from typing import Dict
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_community.embeddings.sagemaker_endpoint import EmbeddingsContentHandler
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.vectorstores import Chroma
import bs4
from langchain_community.document_loaders import WebBaseLoader
import requests
from collections import deque
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pathlib
import os
import string
from langchain.chains import create_extraction_chain

# SageMaker session; its region is read later when the LLM endpoint client is built.
sess = sagemaker.session.Session() 

In [3]:
class NIAIDEmbeddingsContentHandler(EmbeddingsContentHandler):
    """JSON request/response adapter for an embeddings endpoint.

    Requests are sent as ``application/json`` and responses are expected in
    the same format.
    """

    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, text_inputs: list[str], model_kwargs: dict) -> bytes:
        """Serialize the texts and extra model arguments into a UTF-8 JSON body.

        Args:
            text_inputs: Texts to embed; sent under the ``"text_inputs"`` key.
            model_kwargs: Extra key/value pairs merged into the top level of
                the JSON object.

        Returns:
            The UTF-8 encoded JSON payload.
        """
        payload = {"text_inputs": text_inputs, **model_kwargs}
        return json.dumps(payload).encode("utf-8")

    def transform_output(self, output: bytes) -> list[list[float]]:
        """Decode the endpoint response and return the parsed JSON unchanged.

        ``output`` is consumed through ``.read()``, so despite the ``bytes``
        annotation it has to be a file-like object.
        """
        raw_body = output.read()
        response_json = json.loads(raw_body.decode("utf-8"))
        # Echo the parsed response to stdout.
        print(response_json)
        return response_json

In [89]:
# Name of the deployed SageMaker endpoint; reused below for the LLM client.
endpoint_name = "jumpstart-dft-llama-3-1-8b-instruct-20240820-143856"
# Predictor for that endpoint, obtained via retrieve_default.
predictor = retrieve_default(endpoint_name)
# LlamaIndex embedding client pointed at the same endpoint.
# NOTE(review): the endpoint name suggests an instruct (text-generation)
# model — confirm that it can actually serve embeddings.
sm_embedding = SageMakerEmbedding(
  endpoint_name=predictor.endpoint_name
)

# HuggingFace sentence-transformers embedding; this is the object that
# create_vector_store() hands to Chroma.
hf_embedding = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)
# LlamaIndex wrapper around the LangChain embedding above.
embedding = LangchainEmbedding(hf_embedding)
# Disabled alternative: LangChain embedding client for the SageMaker endpoint,
# using NIAIDEmbeddingsContentHandler for (de)serialization.
# sagemaker_embeddings = SagemakerEndpointEmbeddings(
#     endpoint_name=endpoint_name,
#     region_name=sess._region_name,
#     model_kwargs={"mode": "embedding"},
#     content_handler=NIAIDEmbeddingsContentHandler(),
# )


In [90]:
# Generation settings sent with every request under the "parameters" key.
parameters = dict(
    do_sample=True,
    top_p=0.95,
    temperature=0.1,
    max_new_tokens=256,
    num_return_sequences=4,
)


class ContentHandler(LLMContentHandler):
    """JSON request/response adapter for the text-generation endpoint."""

    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: str, model_kwargs: Dict) -> bytes:
        """Build the UTF-8 JSON request body for ``prompt``.

        The body carries the prompt under ``"inputs"`` and the module-level
        ``parameters`` dict under ``"parameters"``.

        NOTE(review): ``model_kwargs`` entries are placed at the top level of
        the body, next to ``"parameters"``, not merged into it — confirm this
        is the schema the endpoint expects.
        """
        payload = {"inputs": prompt, "parameters": parameters, **model_kwargs}
        return json.dumps(payload).encode('utf-8')

    def transform_output(self, output: bytes) -> str:
        """Return the ``"generated_text"`` field of the JSON response.

        ``output`` is consumed through ``.read()``, so despite the ``bytes``
        annotation it has to be a file-like object.
        """
        body_text = output.read().decode("utf-8")
        return json.loads(body_text)['generated_text']


In [91]:
# LangChain LLM client for the SageMaker endpoint named in `endpoint_name`.
llm = SagemakerEndpoint(
     endpoint_name=endpoint_name,
     # NOTE(review): `_region_name` is a private attribute of the session object.
     region_name=sess._region_name,
     # Passed to ContentHandler.transform_input as `model_kwargs`.
     model_kwargs={"temperature": 1e-10},
     content_handler=ContentHandler()
 )

# Show the concrete class of the client.
print(type(llm))

# from sagemaker.jumpstart.model import JumpStartModel
# my_model = JumpStartModel(
#     model_id="meta-textgeneration-llama-2-7b-f"
# )
# JumpStartModel.

<class 'langchain_community.llms.sagemaker_endpoint.SagemakerEndpoint'>


In [74]:
# Shared splitter used by load_pdf() and load_web(): chunks of up to 1000
# units with a 150-unit overlap, where size is measured with len()
# (i.e. characters for str), splitting on blank lines first and then on
# single newlines.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=150,
    length_function=len,
    separators=["\n\n", "\n"]
)

In [170]:
# Build the module-level QA prompt. The template has two placeholders:
# {context} and {question}.
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [86]:
# Single-turn retrieval QA chain that answers with QA_CHAIN_PROMPT and does
# not return source documents.
# Requires `llm`, `vectorstore` and `QA_CHAIN_PROMPT` to exist already; they
# are defined in other cells, so run those first.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectorstore.as_retriever(),
    return_source_documents=False,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

NameError: name 'QA_CHAIN_PROMPT' is not defined

In [1]:
def create_vector_store(db_path: str):
    """Return a Chroma store for the ``langchain_store`` collection.

    Args:
        db_path: Directory passed to Chroma as ``persist_directory``.

    Returns:
        A ``Chroma`` instance that embeds with the module-level
        ``hf_embedding``.
    """
    return Chroma("langchain_store", hf_embedding, persist_directory=db_path)


In [81]:
def create_qa_chain(llm, vectorstore:Chroma, chain_type, prompt, k):
    """Build a conversational retrieval chain on top of ``vectorstore``.

    The chain keeps no memory of its own; callers supply the chat history
    with every request.

    Args:
        llm: Language model passed to the chain.
        vectorstore: Store whose similarity retriever supplies the context.
        chain_type: Document-combining strategy forwarded to ``from_llm``.
        prompt: Prompt handed to the combine-documents step.
        k: Number of documents the retriever is asked for per query.

    Returns:
        The configured ``ConversationalRetrievalChain``; it also returns the
        source documents and the generated question, and runs verbosely.
    """
    chain_options = {
        "llm": llm,
        "chain_type": chain_type,
        "retriever": vectorstore.as_retriever(
            search_type="similarity",
            search_kwargs={"k": k},
        ),
        "return_source_documents": True,
        "return_generated_question": True,
        "combine_docs_chain_kwargs": {"prompt": prompt},
        "verbose": True,
    }
    return ConversationalRetrievalChain.from_llm(**chain_options)

In [41]:
def load_pdf(file, vectorstore):
    """Load a PDF, split it into chunks and add the chunks to ``vectorstore``.

    Args:
        file: Path of the PDF, passed to ``PyPDFLoader``.
        vectorstore: Store that receives the chunks via ``add_documents``.
    """
    pages = PyPDFLoader(file).load()
    # Chunking uses the module-level text_splitter.
    chunks = text_splitter.split_documents(pages)
    vectorstore.add_documents(chunks)
    

In [42]:
# Extensions of links the crawler must not follow. Every entry carries its
# leading dot because it is compared with the suffix returned by
# os.path.splitext (e.g. '.jpg'); an entry without the dot can never match.
files_extension_to_exclude = ['.pdf', '.doc', '.docx', '.xls', '.xlsx',
                              '.jpg', '.jpeg', '.png', '.gif', '.ico',
                              '.svg', '.webp']
def load_web(vectorstore, url, limit = 5):
    """Crawl a site's internal pages depth-first and add them to ``vectorstore``.

    Starting at the site root (``url + '/'``), each page is fetched with
    ``WebBaseLoader`` restricted to ``<h1>`` and ``<article>`` elements,
    split with the module-level ``text_splitter`` and added to the store.
    Site-relative links found in the parsed part of the page are pushed on
    a stack and crawled next.

    Args:
        vectorstore: Store that receives the chunks via ``add_documents``.
        url: Base URL without a trailing slash; link paths are appended to it.
        limit: Maximum number of pages to fetch; ``-1`` removes the cap.
    """
    user_agent = 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'

    root = '/'
    nodes_list = deque([root])  # used as a stack: append() / pop() on the right
    # Every path ever queued (visited or still pending); a set gives an O(1)
    # duplicate check instead of scanning the deque.
    seen = {root}

    # title and content only
    bs4_strainer = bs4.SoupStrainer(['h1', "article"])

    counter = 0
    while nodes_list and (limit == -1 or counter < limit):
        counter += 1
        node = nodes_list.pop()
        full_node_url = url + node
        print('Getting content from', full_node_url)
        loader = WebBaseLoader(
            web_paths=(full_node_url,),
            bs_kwargs={"parse_only": bs4_strainer},
        )
        loader.session.headers['User-Agent'] = user_agent

        # load and split into chunks
        docs = loader.load_and_split(text_splitter)
        # parsed page, used to discover further links
        scraped_doc = loader.scrape()
        # embed the docs
        vectorstore.add_documents(docs)

        # queue the internal links found on this page
        for a in scraped_doc.find_all('a', href=True):
            href = a['href']
            # Internal links are site-relative paths. '//host/...' is a
            # protocol-relative URL pointing at another host, and appending
            # it to `url` would produce a bogus address.
            if not href.startswith('/') or href.startswith('//'):
                continue
            parsed = urlparse(href)
            # exclude files; lower-case so '.PDF' is treated like '.pdf'
            file_extension = os.path.splitext(parsed.path)[1].lower()
            if file_extension in files_extension_to_exclude:
                continue
            # Drop the fragment: '/page#a' and '/page#b' are the same document.
            target = parsed._replace(fragment='').geturl()
            if target not in seen:
                seen.add(target)
                nodes_list.append(target)

In [43]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

def create_prompt():
    """Build the question-answering prompt template.

    Returns:
        A ``PromptTemplate`` with ``{context}`` and ``{question}``
        placeholders.
    """
    # Assembled from one literal per line so that this function's own
    # indentation does not leak into the prompt text sent to the model.
    template = (
        "Use the following pieces of context to answer the question at the end. "
        "If you don't know the answer, just say that you don't know, "
        "don't try to make up an answer. Use three sentences maximum. "
        "Keep the answer as concise as possible. "
        "Always say \"thanks for asking!\" at the end of the answer. \n"
        "{context}\n"
        "Question: {question}\n"
        "Helpful Answer:"
    )
    return PromptTemplate.from_template(template)

In [77]:
# Chroma store persisted under db/chroma.
vectorstore = create_vector_store('db/chroma')
# Crawl the NIAID site from its root and embed at most 10 pages.
load_web(vectorstore, 'https://www.niaid.nih.gov', 10)

# Write the store to disk.
# NOTE(review): newer Chroma releases may persist automatically and no longer
# provide persist() — confirm against the installed version.
vectorstore.persist()

NameError: name 'embedding_model' is not defined

In [94]:
import gradio as gr
import random
import time
from itertools import chain
# Number of entries in the underlying Chroma collection (read through the
# private `_collection` attribute).
print(vectorstore._collection.count())
# create a prompt
prompt = create_prompt()
# create a question-answer chain: "stuff" chain type, 3 retrieved documents per query
qa_chain = create_qa_chain(llm, vectorstore, "stuff", prompt, 3)
# One-off check of the chain with an empty chat history before the UI starts.
question='who is Jeanne Marrazzo'
bot_message = qa_chain({"question": question, "chat_history": []})
print(bot_message)
# Minimal Gradio chat UI around the chain.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.ClearButton([msg, chatbot])
    # Turns handed to the chain as "chat_history"; kept separately from the
    # Chatbot component's own state.
    # NOTE(review): this list is not among the components given to
    # ClearButton, so it presumably keeps earlier turns after a clear — confirm.
    history = []
    def respond(message, chat_history):
        """Answer `message` with the chain and append the turn to both histories.

        Returns an empty string (new value for the textbox) and the updated
        chatbot history.
        """
        bot_message = qa_chain({"question": message, "chat_history": history})
        print(bot_message)
        history.extend([(message, bot_message['answer'])])
        chat_history.append((message, bot_message['answer']))
        # Fixed 2-second pause before the reply is returned.
        time.sleep(2)
        return "", chat_history

    # Submitting the textbox calls respond() and updates textbox and chatbot.
    msg.submit(respond, [msg, chatbot], [msg, chatbot])

demo.launch(inline=True)

40


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mUse the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
    Volunteer for Mpox Clinical Study
                     
STOMP is an NIAID-funded clinical trial to evaluate the efficacy of the antiviral tecovirimat, also know as TPOXX, for the treatment of mpox. If you think you might have mpox, find out more and volunteer.

 















Director

                    Jeanne Marrazzo, M.D., M.P.H.
                



                    Dr. Marrazzo began her tenure as the sixth NIAID Director in the fall of 2023.  She oversees a $6.6 billion budget that supports research to advance the understanding, diagnosis, treatment





[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mUse the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
    Grantee will provide NIAID Contractor with access to the relevant administrative staff and records.
At the end of the site visit, grantee will receive a copy of the preliminary FOS Review report and the attendees will have an opportunity to review and discuss the report with the NIAID Contractor onsite.

After the Site Visit

NIAID staff will send the grantee an official final FOS Review report. Grantee will carefully review the report and respond as directed. 

Note: Grantee must address or provide a plan to address the compliance issues within 60 calendar days o