## Installing Necessary Dependencies

In [4]:
# Chroma vector store backend, used below through langchain's `Chroma` wrapper.
!pip install -q chromadb


[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
!pip install -q pandas langchain openai transformers
!pip install langchain_community
!pip install -q langchainhub
!pip install -q sentence-transformers
!pip install -q sentence_transformers
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install -q faiss-cpu

In [15]:
# SPARQLWrapper is required by the DBpedia lookup in the context-enhancement section.
!pip install -q SPARQLWrapper


[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


## Importing Libraries

In [1]:
import os,shutil
from glob import glob
from pathlib import Path
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA, ConversationalRetrievalChain,LLMChain
from langchain.llms import HuggingFaceHub
from langchain.prompts import PromptTemplate
import pandas as pd

## Loading, Splitting and Indexing Data in Chroma DB

This step only needs to be done once. All the data in the `processed_articles.csv` file is loaded, split into chunks, and indexed in a Chroma DB stored locally.

The next time we need the vector DB, we can load the persisted database from disk instead of re-embedding and re-indexing the same data, which saves computation.

In [3]:
# Directory name for source data; not referenced by any other code shown in this notebook.
DATA_PATH = "data"
# Directory where the Chroma index is persisted; save_to_chroma deletes and rebuilds it.
CHROMA_PATH = "chroma"

def load_documents_from_csv(csv_path):
    """Load one ``Document`` per row of a CSV file.

    Args:
        csv_path: Path to a CSV file with a ``content`` column holding the text.

    Returns:
        A list with one ``Document`` per non-empty ``content`` cell, in file order.

    Raises:
        KeyError: If the CSV has no ``content`` column.
    """
    data = pd.read_csv(csv_path)
    if "content" not in data.columns:
        raise KeyError(f"'content' column not found in {csv_path}")
    # Empty cells are read as NaN (a float), which is not valid page content; skip them
    # and make sure everything that remains is a string.
    contents = data["content"].dropna().astype(str)
    # Pass `page_content` by keyword: Document is a pydantic model and older
    # langchain releases reject positional arguments.
    return [Document(page_content=content) for content in contents]

def split_text(documents, chunk_size=400, chunk_overlap=100):
    """Split documents into overlapping character chunks.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length in characters (default 400).
        chunk_overlap: Number of characters shared by consecutive chunks (default 100).

    Returns:
        The list of chunks produced by ``RecursiveCharacterTextSplitter``; because
        ``add_start_index=True`` is set, each chunk's metadata records its start index.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)

    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # Print one chunk as a sanity check. Indexing chunks[10] unconditionally raises
    # IndexError on small inputs, so fall back to the last chunk and skip the
    # preview entirely when nothing was produced.
    if chunks:
        document = chunks[min(10, len(chunks) - 1)]
        print(document.page_content)
        print(document.metadata)

    return chunks

def save_to_chroma(chunks: list[Document]):
    """Embed ``chunks`` and store them in a fresh Chroma index at ``CHROMA_PATH``.

    Any existing directory at ``CHROMA_PATH`` is removed first, so every call
    rebuilds the index from scratch.

    Args:
        chunks: Document chunks to index.

    Returns:
        The Chroma vector store built from ``chunks``.
    """
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
    )

    # Start from a clean directory so stale vectors never mix with the new ones.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    store = Chroma.from_documents(
        chunks,
        embedding=embedder,
        persist_directory=CHROMA_PATH,
    )
    store.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}")
    return store

def generate_data_store(csv_path="processed_articles.csv"):
    """Build the Chroma index from a CSV of articles.

    Loads the CSV, splits its documents into chunks, and stores the chunks in Chroma.

    Args:
        csv_path: CSV file to index; defaults to ``processed_articles.csv``.

    Returns:
        The vector store returned by ``save_to_chroma``.
    """
    documents = load_documents_from_csv(csv_path)
    chunks = split_text(documents)
    return save_to_chroma(chunks)

In [None]:
# One-time indexing step: uncomment and run to (re)build the Chroma index from the CSV.
# vectordb = generate_data_store()
# retriever = vectordb.as_retriever(search_type = "similarity", search_kwargs={"k": 5})

## Loading Vector Database stored locally

In [3]:
# Same values as in the indexing cell; presumably repeated so the persisted index
# can be loaded without running that cell first.
DATA_PATH = "data"
# Directory the Chroma index is loaded from in the next cell.
CHROMA_PATH = "chroma"

In [4]:
# Embedding settings; these are the same values save_to_chroma uses when building the index.
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
embeddings = HuggingFaceEmbeddings(
        model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
    )
# Open the index stored under CHROMA_PATH rather than rebuilding it from the CSV.
vectordb = Chroma(persist_directory=CHROMA_PATH, embedding_function=embeddings)



## Question Answering Function

In [None]:
def main(query):
    """Answer ``query`` with a RetrievalQA chain over the module-level ``vectordb``.

    The five most similar chunks are retrieved and stuffed into a prompt for the
    hosted Mistral-7B-Instruct model.

    Args:
        query: The user's question.

    Returns:
        The chain's result mapping; ``result["result"]`` holds the answer text.
        Source documents are requested via ``return_source_documents=True``.

    Raises:
        RuntimeError: If the ``HUGGINGFACEHUB_API_TOKEN`` environment variable is not set.
    """
    # Read the token from the environment; API tokens must never be committed to a
    # notebook. A token that was ever committed should be revoked.
    api_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
    if not api_token:
        raise RuntimeError(
            "Set the HUGGINGFACEHUB_API_TOKEN environment variable before calling main()."
        )

    llm = HuggingFaceHub(
        repo_id="mistralai/Mistral-7B-Instruct-v0.1",
        model_kwargs={"temperature": 0.2, "max_length": 512},
        huggingfacehub_api_token=api_token,
    )

    print('In Search Mode')
    rqa_prompt_template = """Use the following pieces of context to answer the questions at the end.
                        Answer only from the context. If you don't know the answer, say you do not know.
                    {context}
                    Explain in detail.
                    Question: {question}
                    """
    RQA_PROMPT = PromptTemplate(
        template=rqa_prompt_template, input_variables=["context", "question"]
    )
    rqa_chain_type_kwargs = {"prompt": RQA_PROMPT}

    retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": 5})

    qa = RetrievalQA.from_chain_type(
        llm,
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs=rqa_chain_type_kwargs,
        return_source_documents=True,
        verbose=False,
    )
    result = qa({"query": query})
    return result

# Inside a notebook kernel __name__ is "__main__", so this prompt runs every time the cell executes.
if __name__ == "__main__":
    query = input("Ask a question: ")
    print("looking for results")
    result = main(query)
    # Only the answer text is printed; the rest of the result mapping is not shown.
    print(result['result'])

In [17]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Retriever over the loaded Chroma index, using the vector store's default search settings.
retriever = vectordb.as_retriever()
# RAG prompt template pulled from the LangChain hub.
prompt = hub.pull("rlm/rag-prompt")

# The API token is read from the environment; tokens must never be hard-coded in a
# notebook. A token that was ever committed should be revoked.
llm = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-Instruct-v0.1",
    model_kwargs={"temperature": 0.2, "max_length": 2000},
    huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
)

def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

def format_output(output):
    """Return the model output starting at the first ``"Answer"`` marker.

    Args:
        output: Raw text returned by the LLM.

    Returns:
        The text from the first occurrence of ``"Answer"`` onward, or ``output``
        unchanged when the marker is absent.
    """
    marker_index = output.find("Answer")
    # str.find returns -1 when the marker is missing; slicing from -1 would keep
    # only the last character, so return the full text in that case.
    if marker_index == -1:
        return output
    return output[marker_index:]

# Pipeline: the input question goes to the retriever (whose documents are joined by
# format_docs) and is also passed through unchanged; both values fill the prompt, the
# prompt goes to the LLM, and format_output trims the returned text.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | format_output
)

ans = rag_chain.invoke("What happened at the Al-Shifa Hospital?")

In [18]:
# Display the answer produced by rag_chain.invoke in the previous cell.
ans

'Answer: The Al-Shifa Hospital in Gaza was attacked by the Israeli military in November 2020. The hospital was used as a command center by Hamas, according to the military, but the World Health Organization (WHO) described the conditions in the hospital as a bloodbath, with new patients arriving every minute and the injured being sutured with little to no anesthesia. The hospital was once the largest in Gaza but has been closed for weeks.'

## Enhancing the Context with More Information from Online Sources

In [16]:
from SPARQLWrapper import SPARQLWrapper, JSON

def fetch_dbpedia_data(query):
    """Fetch the English DBpedia abstract of the resource whose label equals ``query``.

    Args:
        query: Text matched exactly against the English ``rdfs:label`` of a resource.

    Returns:
        The abstract of the first match, or the string
        ``"No relevant information found on DBpedia."`` when nothing matches.
    """
    # Escape characters that would terminate or corrupt the quoted SPARQL literal.
    # Interpolating the raw text lets a query containing `"` or `\` produce a
    # malformed (or injected) SPARQL request.
    literal = (
        str(query)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )

    sparql = SPARQLWrapper("https://dbpedia.org/sparql")
    sparql.setQuery(f"""
    SELECT ?abstract WHERE {{
        ?article dbo:abstract ?abstract .
        ?article rdfs:label "{literal}"@en .
        FILTER (lang(?abstract) = 'en')
    }}
    LIMIT 1
    """)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    bindings = results["results"]["bindings"]
    if bindings:
        return bindings[0]["abstract"]["value"]
    return "No relevant information found on DBpedia."

In [21]:
# NOTE(review): the label passed here ends with "?", and fetch_dbpedia_data matches
# rdfs:label exactly; that is presumably why the recorded output is the not-found
# message. Confirm whether "Al-Shifa Hospital" was intended.
data = fetch_dbpedia_data("Al-Shifa Hospital?")
data

'No relevant information found on DBpedia.'

In [33]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableParallel

# Retriever over the loaded Chroma index, using the vector store's default search settings.
retriever = vectordb.as_retriever()
# RAG prompt template pulled from the LangChain hub.
prompt = hub.pull("rlm/rag-prompt")

# The API token is read from the environment; tokens must never be hard-coded in a
# notebook. A token that was ever committed should be revoked.
llm = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-Instruct-v0.1",
    model_kwargs={"temperature": 0.2, "max_length": 2000},
    huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
)

def format_docs(docs):
    """Join the text of the given documents into one string, one blank line between them."""
    texts = []
    for doc in docs:
        texts.append(doc.page_content)
    return "\n\n".join(texts)

def format_output(output):
    """Return the model output starting at the first ``"Answer"`` marker.

    Args:
        output: Raw text returned by the LLM.

    Returns:
        The text from the first occurrence of ``"Answer"`` onward, or ``output``
        unchanged when the marker is absent.
    """
    marker_index = output.find("Answer")
    # str.find returns -1 when the marker is missing; slicing from -1 would keep
    # only the last character, so return the full text in that case.
    if marker_index == -1:
        return output
    return output[marker_index:]


def augment_context_with_dbpedia(question):
    """Look up ``question`` on DBpedia and return whatever ``fetch_dbpedia_data`` returns."""
    return fetch_dbpedia_data(question)

def _merge_contexts(parts):
    """Join the per-source context strings into one prompt-ready string."""
    return "\n\n".join(text for text in parts.values() if text)


# Both context sources receive the raw question in parallel. Their results are merged
# into a single string before reaching the prompt; without the merge, the prompt's
# {context} slot would be filled with the printed form of a Python dict.
# NOTE(review): the full question is sent to DBpedia as an exact rdfs:label, which is
# unlikely to match any resource; consider passing an entity name instead.
rag_chain = (
  {
    "context": RunnableParallel({"context 1":retriever | format_docs, "context 2": augment_context_with_dbpedia})
    | _merge_contexts,
    "question": RunnablePassthrough(),
  }
  | prompt
  | llm
  | format_output
)

ans = rag_chain.invoke("What happened at the Al-Shifa Hospital?")
print(ans)

Answer: The Al-Shifa Hospital in Gaza was attacked by Israeli forces in November 2020. The hospital was used as a command center by Hamas, according to the Israeli military, but the World Health Organization (WHO) disputed this claim. The WHO described the conditions in the hospital as a "bloodbath" and reported that several patients had fled on foot because ambulances could not reach the facility. The Israeli military detained dozens of suspected militants
