### Install requirements and import all necessary packages

In [1]:
#!pip install -r requirements.txt

In [2]:
import pandas as pd
import os
import lancedb
from torch import cuda
import urllib.request
import gradio as gr

from langchain.retrievers import EnsembleRetriever
from langchain.schema import Document
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

from langchain_community.vectorstores.lancedb import LanceDB
from langchain_community.retrievers import BM25Retriever
from langchain_community.llms import LlamaCpp

from langchain_core.documents.base import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnablePick
from langchain_core.prompts import ChatPromptTemplate

In [3]:
# Remove db folder if you want to recreate LanceDB database
# !rm -rf db

### Settings to run the solution

In [4]:
# --- Input data ---
# Swap the two assignments below to run on the smaller `first_100` file instead.
# path_to_data_csv = 'data/preprocessed_data/master_without_embeddings_first_100.csv'
path_to_data_csv = 'data/preprocessed_data/master_without_embeddings_all.csv'

# Location passed to lancedb.connect()
path_to_database = 'db'

# Model name handed to HuggingFaceEmbeddings
embedding_model = 'sentence-transformers/all-MiniLM-L6-v2'

# --- Hugging Face / LLM ---
# HF_AUTH is None when the variable is unset (presumably an auth token; it is
# not referenced in the visible cells). HF_HOME falls back to 'models' and is
# also the directory the GGUF model file is downloaded into.
HF_AUTH = os.environ.get('HF_AUTH')
os.environ.setdefault('HF_HOME', 'models')
model_id = 'llama-2-7b-chat.Q2_K.gguf'

# --- Chunking ---
chunk_size = 400
chunk_overlap = 50

# --- Retrieval and generation ---
retrieve_top_k_docs_bm25 = 1    # documents returned by the BM25 retriever
retrieve_top_k_docs_vector = 1  # documents returned by the LanceDB retriever
# One chunk per retrieved document plus 200 for the rest of the prompt.
# Must not be larger than 2048.
context_length_for_llm = 200 + chunk_size * (retrieve_top_k_docs_bm25 + retrieve_top_k_docs_vector)
retrievers_weights_bm25 = 0.4  # ensemble weight of BM25; the vector retriever gets 1 - this value
llama_temperature = 0.75  # randomness parameter

### Load the data into type Document

In [5]:
# Read the pre-chunked corpus: every CSV row holds one text chunk plus the
# metadata of the paper it belongs to.
df = pd.read_csv(path_to_data_csv)

# Build the Documents from plain per-row dicts instead of DataFrame.iterrows():
# iterrows() constructs a pandas Series for every row, which is needlessly
# slow for a corpus of this size. A missing column still raises KeyError.
metadata_columns = ['id', 'title', 'authors', 'sources']
documents = [
    Document(
        page_content=record['chunk'],
        metadata={column: record[column] for column in metadata_columns},
    )
    for record in df[['chunk', *metadata_columns]].to_dict('records')
]

print(f'---\n--- Read {len(documents)} documents from {path_to_data_csv}')

---
--- Read 214381 documents from data/preprocessed_data/master_without_embeddings_all.csv


### Create BM25- and LanceDB retrievers

In [6]:
print(f'---\n--- Creating retrievers...')

# Keyword retriever built over all documents; k is the number of hits it returns
bm25_retriever = BM25Retriever.from_documents(documents)
bm25_retriever.k = retrieve_top_k_docs_bm25

# Use the GPU when one is available
device = 'cuda' if cuda.is_available() else 'cpu'

# Create embedding
embed_model = HuggingFaceEmbeddings(
    model_name=embedding_model,
    model_kwargs={'device': device},
    encode_kwargs={'device': device, 'batch_size': 256}
)

# Check whether the LanceDB table exists: if yes, use it; if not, create a new one
try:
    print("--- Trying to connect to LanceDB")
    db = lancedb.connect(path_to_database)
    table = db.open_table("chatmaja_test")
    docsearch = LanceDB(connection=table, embedding=embed_model)
    print("--- LanceDB found, connected successfully")
except Exception as exc:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit are not swallowed, and the actual reason is reported
    # before the (expensive) rebuild starts.
    print("--- Error connecting to LanceDB, creating new one")
    print(f"--- Reason: {exc!r}")
    db = lancedb.connect(path_to_database)
    # Create the table from a single placeholder row (presumably so the table
    # gets its columns), then delete that row before loading the real data.
    table = db.create_table("chatmaja_test", data=[
            {"vector": embed_model.embed_query("Hello World"), "text": "Hello World", "id": "1", "authors": "authoors", "sources": "sourcees", "title": "tiitle"}
        ], mode="overwrite")
    print("--- LanceDB created and connected successfully")
    table.delete('authors = "authoors"')
    docsearch = LanceDB.from_documents(documents, embed_model, connection=table)
    print("--- Finished loading documents to LanceDB")

retriever_lancedb = docsearch.as_retriever(search_kwargs={"k": retrieve_top_k_docs_vector})

# Create ensemble retriever: BM25 and vector results combined with the configured weights
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, retriever_lancedb],
                                       weights=[retrievers_weights_bm25, 1-retrievers_weights_bm25])

print("---\n--- Created BM25 and vector search retrievers")

---
--- Creating retrievers...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  return self.fget.__get__(instance, owner)()


--- Trying to connect to LanceDB
--- Error connecting to LanceDB, creating new one
--- LanceDB created and connected successfully
--- Finished loading documents to LanceDB
---
--- Created BM25 and vector search retrievers


### Get model

In [7]:
# Create directory if it does not exist
os.makedirs(os.getenv('HF_HOME'), exist_ok=True)

# Download model if not exists
path_to_model = os.path.join(os.getenv('HF_HOME'), model_id)
link_to_model = f"https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/{model_id}"

if not os.path.isfile(path_to_model):
    print(f"--- Downloading {model_id}...")
    # Download under a temporary name and rename only after success. Writing
    # straight to `path_to_model` would leave a truncated file behind if the
    # download is interrupted, and the isfile() check above would then treat
    # that broken file as "already downloaded" on the next run.
    path_to_partial_download = path_to_model + ".part"
    try:
        urllib.request.urlretrieve(link_to_model, path_to_partial_download)
        os.replace(path_to_partial_download, path_to_model)
    finally:
        # Only still present when the download or the rename failed
        if os.path.exists(path_to_partial_download):
            os.remove(path_to_partial_download)
    print(f"--- Downloaded {model_id} successfully.")
else:
    print(f"--- Model {model_id} already downloaded.")


# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Make sure the model path is correct for your system!
# -1 GPU layers when running on CUDA, 0 (CPU only) otherwise
n_gpu_layers = -1 if device == 'cuda' else 0
llm = LlamaCpp(
    model_path=path_to_model,
    temperature=llama_temperature,
    # NOTE(review): max_tokens can be larger than n_ctx below - confirm this is intended
    max_tokens=min(context_length_for_llm*2, 4096),
    n_gpu_layers=n_gpu_layers,
    n_ctx=min(context_length_for_llm, 2048),  # context window capped at 2048
    top_p=1,
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

--- Downloading llama-2-7b-chat.Q2_K.gguf...


llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from models/llama-2-7b-chat.Q2_K.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_count u32           

--- Downloaded llama-2-7b-chat.Q2_K.gguf successfully.


llm_load_print_meta: n_gqa            = 1
llm_load_print_meta: n_embd_k_gqa     = 4096
llm_load_print_meta: n_embd_v_gqa     = 4096
llm_load_print_meta: f_norm_eps       = 0.0e+00
llm_load_print_meta: f_norm_rms_eps   = 1.0e-06
llm_load_print_meta: f_clamp_kqv      = 0.0e+00
llm_load_print_meta: f_max_alibi_bias = 0.0e+00
llm_load_print_meta: n_ff             = 11008
llm_load_print_meta: n_expert         = 0
llm_load_print_meta: n_expert_used    = 0
llm_load_print_meta: pooling type     = 0
llm_load_print_meta: rope type        = 0
llm_load_print_meta: rope scaling     = linear
llm_load_print_meta: freq_base_train  = 10000.0
llm_load_print_meta: freq_scale_train = 1
llm_load_print_meta: n_yarn_orig_ctx  = 4096
llm_load_print_meta: rope_finetuned   = unknown
llm_load_print_meta: model type       = 7B
llm_load_print_meta: model ftype      = Q2_K - Medium
llm_load_print_meta: model params     = 6.74 B
llm_load_print_meta: model size       = 2.63 GiB (3.35 BPW) 
llm_load_print_meta: genera

### Create pipeline of the solution

In [8]:
def format_docs(docs):
    """Join the text of the given documents into one context string.

    Args:
        docs: iterable of objects exposing a `page_content` string.

    Returns:
        str: the `page_content` values in order, separated by a blank line.
    """
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)

# Prompt: a single human message using the [INST] / <<SYS>> markers, with
# {question} and {context} placeholders that are filled when the chain runs.
# The text inside the literal is sent to the model verbatim, so words must not
# be broken across lines ("question-answering" used to be split in two).
rag_prompt_llama = ChatPromptTemplate.from_messages([
    ("human", """[INST]<<SYS>> You are an assistant for question-answering tasks.
    Use the following pieces of retrieved context to answer the question.
    If you don't know the answer, just say that you don't know.
    Use three sentences maximum and keep the answer concise.<</SYS>> \nQuestion: {question} \nContext: {context} \nAnswer: [/INST]"""),
])

# Chain
# Input is a dict with "context" (retrieved documents) and "question".
# Steps, in order:
#   1. take the documents under "context" and turn them into one string with format_docs
#   2. fill the prompt template
#   3. run the LLM
#   4. parse the LLM output into a plain string
chain = (
    RunnablePassthrough.assign(context=RunnablePick("context") | format_docs)
    | rag_prompt_llama
    | llm
    | StrOutputParser()
)

def _format_sources(docs):
    """
    Build the "Sources:" footer that is appended to every answer.

    Each document contributes one bullet of the form "title, authors, sources".
    Metadata values that are missing (absent key, None or NaN, e.g. an empty
    CSV field) are rendered as an empty string instead of raising TypeError
    during string concatenation.

    Args:
        docs (list): retrieved documents exposing a `metadata` dict.

    Returns:
        str: the formatted footer; "Sources: none found" when `docs` is empty.
    """
    def _as_text(value):
        # NaN is the only float that is not equal to itself
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    if not docs:
        return "Sources: none found"

    bullets = [
        ", ".join(_as_text(d.metadata.get(key)) for key in ("title", "authors", "sources"))
        for d in docs
    ]
    return "Sources:\n - " + "\n - ".join(bullets)


def answer_query(question):
    """
    Get answer for provided question.

    Retrieves documents with the ensemble retriever, runs the chain on them and
    waits for the complete answer.

    Args:
        question (str): question from the user.

    Returns:
        str: the generated answer, a blank line, then the sources footer.
    """
    docs = ensemble_retriever.get_relevant_documents(question)
    answer = chain.invoke({"context": docs, "question": question})
    return answer + '\n\n' + _format_sources(docs)


def answer_query_streaming(message: str, history: list):
    """
    Get answer for provided question for streaming.

    Args:
        message (str): question from the user.
        history (list): list of pairs of strings. Not used by this function.

    Yields:
        str: the answer text accumulated so far after every streamed chunk,
        and finally the complete answer followed by the sources footer.
    """
    docs = ensemble_retriever.get_relevant_documents(message)
    # Built before streaming starts so the footer is ready for the last yield
    sources = _format_sources(docs)

    printed_so_far = ''
    for chunk in chain.stream({"context": docs, "question": message}):
        printed_so_far += chunk
        yield printed_so_far

    yield printed_so_far + '\n\n' + sources

### Sample usage

`answer_query` blocks until the whole answer has been generated. In Jupyter the tokens are still streamed to the cell output, but in a UI (as in our Flask version) it does not feel responsive at all. See below for a better solution.

In [9]:
# Ask a single question through the blocking helper; the returned string is the
# generated answer followed by the "Sources:" list.
query = "What is used in brain cancer imaging?"
answer_with_sources = answer_query(query)

  Great! I'd be happy to help answer your question about brain cancer imaging. Based on the context provided, here are my answers to your question:
1. What is used in brain cancer imaging?
Brain cancer imaging typically involves the use of various imaging modalities, such as magnetic resonance imaging (MRI), computed tomography (CT) scans, and positron emission tomography (PET) scans. These imaging modalities help doctors visualize and detect brain tumors, as well as monitor their progression over time. In addition, newer techniques such as functional MRI (fMRI) and diffusion tensor imaging (DTI) may be used to provide further insights into brain function and tumor progression.
2. How well do ML systems perform in liver CT imaging?
While machine learning (ML) has shown promise in various medical imaging applications, its performance in liver CT imaging can vary depending on the specific technique used and the quality of the images. In general, ML algorithms can perform well in liver CT


llama_print_timings:        load time =     259.59 ms
llama_print_timings:      sample time =     256.41 ms /   450 runs   (    0.57 ms per token,  1755.00 tokens per second)
llama_print_timings: prompt eval time =    3260.62 ms /   294 tokens (   11.09 ms per token,    90.17 tokens per second)
llama_print_timings:        eval time =   15562.72 ms /   449 runs   (   34.66 ms per token,    28.85 tokens per second)
llama_print_timings:       total time =   21631.97 ms /   743 tokens


In [10]:
# Print the string returned by answer_query in the previous cell
print(answer_with_sources)

  Great! I'd be happy to help answer your question about brain cancer imaging. Based on the context provided, here are my answers to your question:
1. What is used in brain cancer imaging?
Brain cancer imaging typically involves the use of various imaging modalities, such as magnetic resonance imaging (MRI), computed tomography (CT) scans, and positron emission tomography (PET) scans. These imaging modalities help doctors visualize and detect brain tumors, as well as monitor their progression over time. In addition, newer techniques such as functional MRI (fMRI) and diffusion tensor imaging (DTI) may be used to provide further insights into brain function and tumor progression.
2. How well do ML systems perform in liver CT imaging?
While machine learning (ML) has shown promise in various medical imaging applications, its performance in liver CT imaging can vary depending on the specific technique used and the quality of the images. In general, ML algorithms can perform well in liver CT

# Gradio UI

Run the cell below and read the logs to access Gradio either directly here or at the external address in your web browser.

The cool part about this approach is that the tokens are streamed, which means UI feels responsive as new words appear on the screen as the model generates them.

In [11]:
# Chat UI backed by the generator `answer_query_streaming`, which yields the
# answer accumulated so far after every chunk and finally the answer with sources.
gradio_app = gr.ChatInterface(
    answer_query_streaming,
    title="Chat Maja - Your PubMed expert",
    description='''Retrieval-Augmented Question Answering chatbot, based on quite a few abstracts from PubMed. <br>
           ''',
)

# share=True: the recorded output of this cell shows the app running on a
# public *.gradio.live URL, so anyone with the link can reach it.
gradio_app.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://58e1e04b2b25edeb99.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


