In [1]:
### LLM
# Hugging Face Hub repository ids for the candidate chat models.
# A later cell picks one of these as `model_id`.

# Smaller models
llama3_8B = "meta-llama/Meta-Llama-3-8B-Instruct"
phi = "microsoft/Phi-3-mini-128k-instruct"

# Larger models
llama3_70B = "nvidia/Llama3-ChatQA-1.5-70B"
qwen = "Qwen/Qwen1.5-72B-Chat"

In [2]:
from langchain.prompts import PromptTemplate
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Choose which checkpoint to load; swap this for another id defined above.
model_id = llama3_8B

# Tokenizer and weights are fetched from the Hugging Face Hub (or local cache).
tokenizer = AutoTokenizer.from_pretrained(model_id)
# NOTE(review): trust_remote_code=True lets code shipped in the model repo run
# locally — confirm it is actually required for the selected model_id.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    trust_remote_code=True,
)

# Text-generation pipeline capped at 1000 newly generated tokens per call.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=1000,
)

# LangChain wrapper so the pipeline can be composed into chains with `|`.
llama_hf = HuggingFacePipeline(pipeline=pipe)

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
from chromadb import Documents, EmbeddingFunction, Embeddings
from langchain_community.embeddings import GPT4AllEmbeddings
class MyEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function that delegates to LangChain's GPT4AllEmbeddings.

    Chroma calls the instance with a batch of documents and expects one
    embedding vector per document, in the same order.
    """

    def __call__(self, input: Documents) -> Embeddings:
        """Embed every document in ``input``.

        Args:
            input: The batch of document strings handed over by Chroma.

        Returns:
            One embedding per entry of ``input``, in input order.
        """
        # Build the embedder on first use and keep it on the instance so the
        # underlying model is not re-created for every batch.
        embedder = getattr(self, "_embedder", None)
        if embedder is None:
            embedder = GPT4AllEmbeddings()
            self._embedder = embedder
        # The original called `GPT4AllEmbeddings.embed_query()` on the class
        # with no text, which raises TypeError and never looked at `input`.
        return embedder.embed_documents(list(input))


In [None]:
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
import chromadb
# Connect to a Chroma server over HTTP on the local machine.
chroma_client = chromadb.HttpClient(
    host="localhost",
    port=8000,
)

# Cell output: the collections the server reports.
chroma_client.list_collections()
# collection = chroma_client.get_collection(name="rare-hack-vd-SC-GPT4ALL-256-10")

In [None]:
### Generate

from langchain.prompts import PromptTemplate
from langchain import hub
from langchain_core.output_parsers import StrOutputParser

# Prompt
# RAG answer prompt with exactly two placeholders: {question} and {context}.
# NOTE(review): the <s>[INST]<<SYS>> markers look like the Llama-2 chat format,
# while `model_id` selects a Llama-3 checkpoint — confirm the intended template.
# NOTE(review): "Hypophosphotasia" is probably a misspelling of
# "Hypophosphatasia"; the prompt text is left untouched here.
prompt = PromptTemplate(
    template="""<s>[INST]<<SYS>>You are an assistant for answering questions. Only respond to queries about Ehlers-Danlos Syndrome and Hypophosphotasia. 
    If the question is about anything else, please say that it is out of your capabilities. 
    Use the following pieces of retrieved context from medical research papers and verified text to answer the question. 
    You will also be provided with retrieval source 
    metadata along with the context. Show the URL at the end of your response as a source for your output. 
    Use three sentences maximum. Do not mention the context in the response.<</SYS>>
    Question: {question} 
    Context: {context}[/INST]""",
    #Metadata: {metadata}[/INST]""",
    # The original list was ["question" "context"]: without the comma Python
    # concatenates the adjacent literals into the single name "questioncontext".
    input_variables=["question", "context"],  # , "metadata"],
)

# Post-processing
def format_docs(docs):
    """Concatenate the ``page_content`` of each item in ``docs``.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        A single string with the contents separated by one blank line
        (``"\\n\\n"``); an empty string when ``docs`` is empty.
    """
    contents = [entry.page_content for entry in docs]
    separator = "\n\n"
    return separator.join(contents)

# Chain: fill the prompt, run it through the LLM, return the output as a string.
rag_chain = (
    prompt
    | llama_hf
    | StrOutputParser()
)

# Run
question = "What is Ehlers-Danlos Syndrome?"
# retrieval_results = retriever.invoke(question, k = 3)

# `vd_collection` was never bound in the notebook, so this cell raised
# NameError on a fresh kernel. Fetch it from the Chroma server instead.
# NOTE(review): the collection name is taken from the commented-out
# get_collection call in the client cell — confirm it is the intended one.
vd_collection = chroma_client.get_collection(name="rare-hack-vd-SC-GPT4ALL-256-10")

# Embed the question once and ask Chroma for the three nearest entries.
question_embedding = GPT4AllEmbeddings().embed_query(question)
retrieval_results = vd_collection.query(
    query_embeddings=[question_embedding],
    n_results=3,
)
retrieval_results
# NOTE(review): `format_docs` reads `.page_content` from each item, so the
# raw query result below would need adapting before it can be passed in.
# docs =  retrieval_results["documents"][0]
# metadata = retrieval_results["metadatas"][0]
# generation = rag_chain.invoke({"context": format_docs(retrieval_results), "question": question})#, "metadata": metadata})
# generation

In [None]:
def retrieval_grader(query, retrieval):
    """Placeholder for a retrieval grader; performs no work.

    Both arguments are accepted and ignored. Note that a later cell rebinds
    the name ``retrieval_grader`` to a prompt | LLM | parser chain, which
    replaces this function.

    Returns:
        None.
    """
    return None
    

In [None]:
#Retrieval Grader
from langchain_core.output_parsers import StrOutputParser

# Grading prompt: asks for a yes/no relevance verdict, returned as JSON with
# the single key 'score'. Note that this rebinds the notebook-level `prompt`.
prompt = PromptTemplate(
    input_variables=["question", "document"],
    template="""<s>[INST]<<SYS>> You are a grader assessing relevance  of a retrieved document to a user question. 
    If the document contains keywords related to the user question, 
    grade it as relevant. It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question. \n
    Provide the binary score as a JSON with a single key 'score'. Do not explain anything else!<</SYS>>
    Here is the retrieved document: \n\n {document} \n\n
    Here is the user question: {question} \n
    JSON: [/INST]""",
)

# Grader chain: prompt -> LLM -> plain string (the JSON text is not parsed).
retrieval_grader = (
    prompt
    | llama_hf
    | StrOutputParser()
)

In [None]:
question = "What is adams oliver syndrome?"
# NOTE(review): `vectorstore` is not defined in any visible cell — it must be
# created before this cell can run on a fresh kernel.
# The result count is set through `search_kwargs` rather than as an extra
# keyword to `invoke`, whose handling of `k` depends on the LangChain version.
docs = vectorstore.as_retriever(search_kwargs={"k": 3}).invoke(question)

# The grader prompt assesses a single document, so grade each hit on its own
# text. The original passed the whole list of Document objects, which put the
# list's repr into the prompt.
for doc in docs:
    print(retrieval_grader.invoke({"question": question, "document": doc.page_content}))

### Graph Build

In [None]:
# Build graph
# NOTE(review): `workflow`, `END`, and the routing callables used below
# (`route_question`, `decide_to_generate`,
# `grade_generation_v_documents_and_question`) are not defined in any visible
# cell — they must exist before this cell runs.

# Entry point: `route_question` picks the first node; the dict maps its
# "websearch" / "vectorstore" results to the "websearch" / "retrieve" nodes.
workflow.set_conditional_entry_point(
    route_question,
    {
        "websearch": "websearch",
        "vectorstore": "retrieve",
    },
)

# Retrieved documents always go to grading next.
workflow.add_edge("retrieve", "grade_documents")
# After grading, `decide_to_generate` selects either a web search or generation.
workflow.add_conditional_edges(
    "grade_documents",
    decide_to_generate,
    {
        "websearch": "websearch",
        "generate": "generate",
    },
)
# Web search always feeds into generation.
workflow.add_edge("websearch", "generate")
# After generation: "not supported" loops back to generate, "useful" ends the
# run, and "not useful" falls back to a web search.
workflow.add_conditional_edges(
    "generate",
    grade_generation_v_documents_and_question,
    {
        "not supported": "generate",
        "useful": END,
        "not useful": "websearch",
    },
)

In [None]:
# Compile the graph into a runnable app.
app = workflow.compile()

# Test: stream the run, announcing each entry as it arrives.
from pprint import pprint

inputs = {"question": "What are the types of agent memory?"}
for output in app.stream(inputs):
    # Each streamed item is a mapping (presumably node name -> state update).
    for key in output:
        value = output[key]
        pprint(f"Finished running: {key}:")
# `value` still holds the last entry seen by the loop above.
pprint(value["generation"])

Trace: 

https://smith.langchain.com/public/8d449b67-6bc4-4ecf-9153-759cd21df24f/r

In [None]:
# Compile the graph into a runnable app.
app = workflow.compile()

# Test: stream the run, announcing each entry as it arrives.
from pprint import pprint

inputs = {"question": "Who are the Bears expected to draft first in the NFL draft?"}
for output in app.stream(inputs):
    # Each streamed item is a mapping (presumably node name -> state update).
    for key in output:
        value = output[key]
        pprint(f"Finished running: {key}:")
# `value` still holds the last entry seen by the loop above.
pprint(value["generation"])

Trace: 

https://smith.langchain.com/public/c785f9c0-f519-4a38-ad5a-febb59a2139c/r