# Test

In [1]:
import os
import dotenv
from scholarly import scholarly
from langchain_community.tools.google_scholar import GoogleScholarQueryRun
from langchain_community.utilities.google_scholar import GoogleScholarAPIWrapper

### Nemo Guardrails

In [None]:
from nemoguardrails import RailsConfig
from nemoguardrails.integrations.langchain.runnable_rails import RunnableRails
# Nemo Guardrails
# ... initialize `some_chain`
config = RailsConfig.from_path("path/to/config")

# Using LCEL, you first create a RunnableRails instance, and "apply" it using the "|" operator
guardrails = RunnableRails(config)
chain_with_guardrails = guardrails | some_chain


### Basic

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langserve import add_routes

# 1. Create prompt template
system_template = "Translate the following into {language}:"
prompt_template = ChatPromptTemplate.from_messages([
    ('system', system_template),
    ('user', '{text}')
])

# 2. Create model
# model = ChatOpenAI()

# 3. Create parser
parser = StrOutputParser()

# 4. Create chain
chain = prompt_template | llm | parser


# 4. App definition
app = FastAPI(
  title="LangChain Server",
  version="1.0",
  description="A simple API server using LangChain's Runnable interfaces",
)

# 5. Adding chain route

add_routes(
    app,
    chain,
    path="/chain",
)

if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="localhost", port=8000)

### Benchmark

In [None]:
# Vector db #1
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.document_loaders import UnstructuredHTMLLoader, BSHTMLLoader
from langchain.vectorstores import Chroma
from langchain.embeddings import GPT4AllEmbeddings
from langchain.embeddings import OllamaEmbeddings  

# DATA_PATH="/mnt/c/Users/beene/Downloads/papers/"
DATA_PATH="/mnt/c/Users/beene/Downloads/tests/"
DB_PATH = "./vectorstores/db/"

#load the LLM
def load_llm():
    llm = Ollama(
        model="llama3",
        base_url="http://localhost:11434",
        verbose=True,
        callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
    )
    return llm

def create_vector_db():
    loader = PyPDFDirectoryLoader(DATA_PATH)
    documents = loader.load()
    print(f"Processed {len(documents)} pdf files")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
    texts=text_splitter.split_documents(documents)
    vectorstore = Chroma.from_documents(documents=texts, embedding=OllamaEmbeddings(), persist_directory=DB_PATH)

def load_vector_db():
    return Chroma(persist_directory=DB_PATH, embedding_function=OllamaEmbeddings())

if __name__=="__main__":
    llm = load_llm()
    # create_vector_db()
    vectorstore = load_vector_db()

In [None]:
from fastapi import FastAPI

from langserve import add_routes

app = FastAPI(
    title="LangChain Server",
    version="1.0",
    description="Spin up a simple api server using Langchain's Runnable interfaces",
)

add_routes(
    app,
    llm,
    path="/llm",
)
# add_routes(
#     app,
#     ChatAnthropic(model="claude-3-haiku-20240307"),
#     path="/anthropic",
# )

async def main():
    import uvicorn

    uvicorn.run(app, host="localhost", port=8000)

if __name__ == "__main__":
    await main()

In [None]:
# Vector db #2
import chromadb
from langchain.vectorstores import Chroma
from langchain.embeddings import OllamaEmbeddings  

persistent_client = chromadb.PersistentClient()
collection = persistent_client.create_collection("default")
collection.add(ids=["1", "2", "3"], documents=["a", "b", "c"])

# langchain_chroma = Chroma(
#     client=persistent_client,
#     collection_name="default",
#     embedding_function=OllamaEmbeddings(),
# )

In [None]:
langchain_chroma.get()

In [None]:
test = chromadb.PersistentClient(
    path = "./vectorstores/db/"
)

In [None]:
test2 = Chroma(
    client = test,
    collection_name = "default",
    embedding_function = GPT4AllEmbeddings()
)

In [None]:
test2.get()

In [None]:
# Contextual compression retriever
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import FlashrankRerank

compressor = FlashrankRerank(top_n = 1)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

compressed_docs = compression_retriever.invoke(
    "What did the president say about Ketanji Jackson Brown"
)
compressed_docs

In [None]:
from langchain.retrievers.multi_query import MultiQueryRetriever

multiquery_retriever = MultiQueryRetriever.from_llm( 
    retriever=retriever,
    llm=llm,
)
compression_retriever.invoke("What did the president say about Ketanji Jackson Brown")

In [None]:
#import required dependencies
from langchain import hub
from langchain.embeddings import GPT4AllEmbeddings
from langchain.vectorstores import Chroma
import chainlit as cl
from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain
# Set up RetrievelQA model
QA_CHAIN_PROMPT = hub.pull("rlm/rag-prompt-mistral")


def retrieval_qa_chain(llm,vectorstore):
    qa_chain = RetrievalQA.from_chain_type(
        llm,
        retriever=vectorstore.as_retriever(),
        chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
        return_source_documents=True,
    )
    return qa_chain


def qa_bot(): 
    llm=load_llm() 
    vectorstore = load_vector_db()

    qa = retrieval_qa_chain(llm,vectorstore)
    return qa 

@cl.on_chat_start
async def start():
    chain=qa_bot()
    msg=cl.Message(content="Firing up the research info bot...")
    await msg.send()
    msg.content= "Hi, welcome to research info bot. What is your query?"
    await msg.update()
    cl.user_session.set("chain",chain)

@cl.on_message
async def main(message):
    chain=cl.user_session.get("chain")
    cb = cl.AsyncLangchainCallbackHandler(
    stream_final_answer=True,
    answer_prefix_tokens=["FINAL", "ANSWER"]
    )
    cb.answer_reached=True
    # res=await chain.acall(message, callbacks=[cb])
    res=await chain.acall(message.content, callbacks=[cb])
    print(f"response: {res}")
    answer=res["result"]
    answer=answer.replace(".",".\n")
    sources=res["source_documents"]

    if sources:
        answer+=f"\nSources: "+str(str(sources))
    else:
        answer+=f"\nNo Sources found"

    await cl.Message(content=answer).send() 

# Main

### LLM

In [2]:
from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

#load the LLM
def load_llm():
    llm = Ollama(
        model = "llama3",
        base_url = "http://localhost:11434",
        temperature = 0,
        verbose = True,
        callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]),
    )
    return llm

### Prompt

In [56]:
from langchain import hub
from langchain_core.prompts import ChatPromptTemplate

# Set up RetrievelQA model
prompt = hub.pull("rlm/rag-prompt-llama3")

# Create a ChatPromptTemplate same with the above llama3 prompt
prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        "You are a research assistant specializing in question-answering tasks. "
        "Use the provided context to answer the question concisely. "
        "If you don't know the answer, simply state that you don't know. "
        "Keep your response to a maximum of three sentences. "
        "Summarize the main points in bullet points. This is the highest priority. "
        "Provide a detailed description with sufficient detail for understanding, without unnecessary elaboration. "
        "Include any additional relevant information, if available, but keep it brief and optional. "
    ),
    (
        "user", 
        "Question: {question}\n" 
        "Context: {context}\n"
        # "Requests:\n"
        # "0. Start your response directly without the first introductory phrases.\n"
        # "0. Start your response with a cute and unique arrow emoji.\n"
        # "1. Summarize the main points in bullet points. This is the highest priority.\n"
        # "2. Provide a detailed description with sufficient detail for understanding, without unnecessary elaboration.\n"
        # "3. Include any additional relevant information, if available, but keep it brief and optional.\n"
        "Answer:",
    ),
    (
        "assistant", 
        "**Main Points:**\n"
        "**Detailed Description:**\n"
        "**Additional Information:**\n"
    )
])    

### Chroma db

In [131]:
import uuid
import chromadb
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.embeddings import GPT4AllEmbeddings
# from gpt4all import GPT4AllEmbedding

# DATA_PATH="/mnt/c/Users/beene/Downloads/papers/"
DATA_PATH="/mnt/c/Users/beene/Downloads/tests/"
DB_PATH = "./chroma"

def create_documents():
    loader = PyPDFDirectoryLoader(DATA_PATH)
    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)
    return texts

def create_embeddings(documents):
    '''Create embeddings for the documents in the DATA_PATH directory'''
    embedder = GPT4AllEmbeddings()
    return [embedder.embed_query(doc.page_content) for doc in documents]

def clear_db(client):
    list_collections = client.list_collections()
    if len(list_collections):
        for i in list_collections:
            client.delete_collection(i.dict()["name"])

def create_vector_db(client):
    '''Create a vector db from the documents in the DATA_PATH directory'''
    # Make original paper db and duplicated db for search purposes
    # , which are saved to the DB_PATH directory
    _ = client.create_collection(
        name = "Original",
    )
    _ = client.create_collection(
        name = "Search",
    )

def add_documents_to_db(client):
    # Load the vector db client
    collection = client.get_collection("Original")

    # Create documents and embeddings
    documents = create_documents()
    embeddings = create_embeddings(documents)

    # Add the documents and embeddings to the collection
    collection.add(
        ids = [str(uuid.uuid4()) for _ in documents],
        documents = [doc.page_content for doc in documents],
        metadatas = [doc.metadata for doc in documents],
        embeddings = embeddings
    )

def duplicate_db(client):
    '''Duplicate the existing vector db for iterative search purposes'''
    # Clear the search db
    client.delete_collection("Search")

    # Load the vector db client
    collection = client.get_collection("Original")
    collection2 = client.create_collection("Search")
    
    # Dupliacte a collection
    total_documents_count = collection.count()
    batch_size = 10
    for i in range(0, total_documents_count, batch_size):
        batch = collection.get(
            include=["metadatas", "documents", "embeddings"],
            limit=batch_size,
            offset=i
        )
        collection2.add(
            ids=batch["ids"],
            documents=batch["documents"],
            metadatas=batch["metadatas"],
            embeddings=batch["embeddings"]
        )    

def implementation_db(client):
    '''Implementation of the above functions'''
    clear_db(client) # Clean up the existing db
    create_vector_db(client)
    add_documents_to_db(client)    

def load_vector_db(collection_name):
    '''Load the vector db for chaining purposes'''
    return Chroma(
        persist_directory = DB_PATH,
        collection_name = collection_name,
        embedding_function = GPT4AllEmbeddings()
    )

def extract_source_document(response):
    '''Extract the source document from the response'''
    source_document = response.get("source_documents")[0].dict()
    return source_document["metadata"]["source"]

def delete_searched_document(client, response):
    '''Delete the searched document from the db'''
    collection = client.get_collection("Search")
    collections = collection.get() # get a dict db

    # find the id
    source_document = extract_source_document(response)
    ids = collections["ids"]
    ids_to_delete = []
    for i in range(len(ids)):
        if collections["metadatas"][i]["source"] == source_document:
            ids_to_delete.append(ids[i])

    collection.delete(ids = ids_to_delete)    

In [132]:
if __name__=="__main__":
    llm = load_llm()
    client = chromadb.PersistentClient(path = DB_PATH)
    implementation_db(client)
    duplicate_db(client)

In [137]:
delete_searched_document(client, test)

### Retriever

In [47]:
# Contextual compression retriever
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import FlashrankRerank
from langchain.chains import RetrievalQA

compressor = FlashrankRerank(top_n = 1)
chroma_db = load_vector_db("Search")
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, 
    base_retriever=chroma_db.as_retriever()
)

def retrieval_qa_chain(llm,vectorstore):
    qa_chain = RetrievalQA.from_chain_type(
        llm,
        retriever=compression_retriever,
        chain_type_kwargs={"prompt": prompt},
        return_source_documents=True,
    )
    return qa_chain

### Application

In [57]:
def qa_bot(): 
    llm=load_llm() 
    chroma_db = load_vector_db("ResearchPapers")

    qa = retrieval_qa_chain(llm, chroma_db)
    return qa 



chain=qa_bot()

INFO:chromadb.api.segment:Collection ResearchPapers is not created.


In [105]:
test = chain.invoke("What is PLV")

System: Based on the provided context, I found that PLV stands for Phase-Locked Value.

**Main Points:**

* PLV is an abbreviation for Phase-Locked Value.
* It is a measure used in neuroscience to quantify the synchronization of neural activity between different brain regions.

**Detailed Description:**
PLV is a statistical method used to analyze the phase-locking behavior of neural oscillations. It measures the degree to which two or more brain regions are synchronized in terms of their neural activity, particularly in the frequency domain.

**Additional Information:**
None available in this context.

In [106]:
test.get("source_documents")[0].dict()["metadata"]["source"]

'/mnt/c/Users/beene/Downloads/tests/Schizophrenia ref1.pdf'

In [None]:
test_list = []
for i in range(len(test["ids"])):
    test_list.append(test["documents"][i]) if test["metadatas"][i]["page"] == 0 else None