# Retrieval Augment Generation (RAG) Approaches 

## Common Imports

In [1]:
import os
import json
import uuid
from pathlib import Path
from dotenv import load_dotenv
import pandas as pd

import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger()
logger.addHandler(logging.StreamHandler(stream=sys.stdout))


### Dataset Preprocessing

In [None]:
# from PyPDF2 import PdfWriter, PdfReader
# input_pdf = PdfReader("../datasets/presentations/innovation-onepagers-ibm.pdf")

# index = 1
# for page in input_pdf.pages:
#     if index > 0:
#         output = PdfWriter()
#         output.add_page(page)
#         with open('../datasets/presentations/splits/slide'+str(index) +'.pdf', "wb") as output_stream:
#             output.write(output_stream)
#     index = index + 1

## RAG With VectorDB

In [2]:
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings

# from langchain.chains import load_qa_chain
from langchain.chains.question_answering import load_qa_chain
# from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.prompts import PromptTemplate
from langchain.docstore.document import Document

try:
    from sentence_transformers import SentenceTransformer
except ImportError:
    raise ImportError("Could not import sentence_transformers: Please install sentence-transformers package.")

try:
    import chromadb
    from chromadb.api.types import EmbeddingFunction
except ImportError:
    raise ImportError("Could not import chromdb: Please install chromadb package.")


  from .autonotebook import tqdm as notebook_tqdm


In [3]:

class MiniLML6V2EmbeddingFunction(EmbeddingFunction):
    MODEL = SentenceTransformer('all-MiniLM-L6-v2')
    def __call__(self, texts):
        return MiniLML6V2EmbeddingFunction.MODEL.encode(texts).tolist()


INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /var/folders/c7/kjprf9q109x1lhfjfm_h6jfr0000gn/T/tmph6j3sal1
Created a temporary directory at /var/folders/c7/kjprf9q109x1lhfjfm_h6jfr0000gn/T/tmph6j3sal1
INFO:torch.distributed.nn.jit.instantiator:Writing /var/folders/c7/kjprf9q109x1lhfjfm_h6jfr0000gn/T/tmph6j3sal1/_remote_module_non_scriptable.py
Writing /var/folders/c7/kjprf9q109x1lhfjfm_h6jfr0000gn/T/tmph6j3sal1/_remote_module_non_scriptable.py
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cpu
Use pytorch device: cpu


In [4]:
embedding_function = MiniLML6V2EmbeddingFunction()

chroma_client = chromadb.PersistentClient(path="../vectors")

chroma_collection = chroma_client.create_collection(
    name="search", 
    get_or_create=True,
    embedding_function=embedding_function
    )

INFO:chromadb.telemetry.posthog:Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.


In [5]:
def load_documents(DATASET_DIR):
    # DATASET_DIR = "../datasets/presentations"
    loader = DirectoryLoader(DATASET_DIR, glob='*.pdf')
    documents = loader.load()
    logger.info(f'You have {len(documents)} document(s) in your data')
    logger.info(f'There are {len(documents[0].page_content)} characters in first page')
    # logger.info(f'First Page Content: \n\n{documents[0].page_content} \n\n')
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    docs = text_splitter.split_documents(documents)
    logger.info(f'We have total documents after split: {len(docs)}')

    ids = []
    metadatas = []
    documents = []
    for doc in docs:
        ids.append(str(uuid.uuid1()))
        documents.append(doc.page_content)
        file_name = os.path.basename(doc.metadata['source'])
        metadatas.append({'source': file_name})   
        # self._collection.add(ids=[str(uuid.uuid1())], metadatas=doc.metadata, documents=doc.page_content)
    
    chroma_collection.add(ids=ids, metadatas=metadatas, documents=documents)   
    try:
        chroma_client.persist() 
        # del chroma_client
        # del chroma_collection
    except Exception as e:
        print(e)

In [6]:
load_documents("../datasets/presentations/splits")

INFO:root:You have 58 document(s) in your data
You have 58 document(s) in your data
INFO:root:There are 1780 characters in first page
There are 1780 characters in first page
INFO:root:We have total documents after split: 109
We have total documents after split: 109


Batches: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.83it/s]

'SegmentAPI' object has no attribute 'persist'





In [7]:
def fetchContextUsingVectorDB(query):
    docs = []
    result = chroma_collection.query(query_texts=query, n_results=2, include=["documents","metadatas"])    
    # print(json.dumps(result, indent=2))
    for c in result["documents"][0]:    
        pageContent = ''.join(c)
        # print(pageContent)
        doc = Document(page_content = pageContent);
        docs.append(doc)
    
    return docs

### Test Results from VectorDB

In [8]:
query = "Can you give me summary of the AI Learning Helper case?"
result = fetchContextUsingVectorDB(query)
for c in result:    
    print(f"{c.page_content} \n\n ******************* \n\n")

Batches: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 46.67it/s]

Provide support to counselors and assist them during conversation with children

Counselors/supporters might not be specialists in all areas, often only in 2- 3 areas. Topics outside their scope of comfort are difficult. An AI can help them on topics where they lack knowledge

A generic “Ethical Advisory Agent” which uses the Federate Question Answer (Bot of Bots) approach and ethical AI principles to help counsellors and case workers help their clients in real time – fast and fair. Based on the asset created for “Children's Welfare” in 2019 – in combination with eVA and FQA.

AI platform with opportunity for federated learning across related domains

Help tool for guidance material and advice suggestion

AI as a service: Plugin to 3rd party chat/support tools 

 ******************* 


Ethical Advisor – Helping children with AI

itelligence uses AI technology to assist child counselors in their interaction with children. The goal of the AI is to make the counselors more efficient by au




## RAG With IBM Watson Discovery

In [9]:
from ibm_watson import DiscoveryV2
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
import text_extensions_for_pandas as tp

In [16]:

env_path = Path('../.env')
load_dotenv(dotenv_path=env_path)
discovery = None
WD_VERSION = '2023-03-31'
WD_PROJECT_ID = os.getenv("WD_PROJECT_ID", None)
WD_COLLECTION_ID = os.getenv("WD_COLLECTION_ID", "0952b45a-15b2-3b48-0000-018ac070b2b4")

WD_THRESHOLD = 0.06
SHOW_WD_RESULTS = False

In [18]:
def init_wd():
    WD_API_KEY = os.getenv("WD_API_KEY", None)
    WD_ENDPOINT = os.getenv("WD_ENDPOINT", None)
    if WD_API_KEY is None or WD_ENDPOINT is None:
        print("Either api_key or api_url is None. Please make sure your credentials are correct.")
    
    authenticator = IAMAuthenticator(WD_API_KEY)
    global discovery
    discovery = DiscoveryV2(
        version=WD_VERSION,
        authenticator=authenticator
    )
    return discovery

In [19]:
init_wd()
collections = discovery.list_collections(
  project_id=WD_PROJECT_ID
).get_result()

print(collections)

{'collections': [{'name': 'cos-search', 'collection_id': '0952b45a-15b2-3b48-0000-018ac070b2b4'}, {'name': 'CarManuals', 'collection_id': '8a630ea5-f1ea-fcb5-0000-018aa8a3e4f2'}, {'name': 'TechSupportFAQ', 'collection_id': 'e1d38500-4eea-126f-0000-01899be30019'}, {'name': 'BusinessSlides', 'collection_id': '80abebd4-c560-8257-0000-018ac03e6553'}]}


In [20]:
WD_COLLECTION_ID = "0952b45a-15b2-3b48-0000-018ac070b2b4"

In [21]:
def query_wd(payload: any):
    global discovery
    if(discovery is None):
        discovery = init_wd()
    discovery_results = discovery.query(
        project_id = payload['project_id'],
        collection_ids = payload['collection_ids'],
        natural_language_query= payload['query'],
        count = 3,
        return_ = ["metadata.source.SourceUrl","document_id","extracted_metadata.filename", "title", "text", "BusinessNeeds", "Solutions", "Outcomes"]
    ).get_result()

    # print(json.dumps(discovery_results, indent=2))
    
    return discovery_results

In [22]:
def showAsList(textArr, joinText=False):
    result = ""
    # print(f"Input textArr: {textArr}")
    if textArr is None or len(textArr) == 0:
        # print(f"RETURNING EMPTY: >> {textArr}")
        return ""

    if len(textArr) == 1:
        return textArr[0]

    if joinText is True:
        result = ' '.join(textArr) + "\n"
        # print(f"joinText: {joinText}, so RETURN RESULT: {result}")
        return result
    
    index = 0
    for text in textArr:
        temp = text.encode().decode("utf-8").replace(u"\u2022", "")
        temp = temp.strip()
        if len(temp) > 5:
            if index > 0:
                result =  f"{result} {index}: {temp} \n"
            else:
                result =  f"\n{result} {temp} \n"
            index = index + 1        

    # print(f"RETURNING RESULT: >> {result}")
    return result

## Using the Re-Ranker (Currently not implemented)

### Good Article for Re-Ranking the results

[Improving RAG (Retrieval Augmented Generation) Answer Quality with Re-ranker](https://medium.com/towards-generative-ai/improving-rag-retrieval-augmented-generation-answer-quality-with-re-ranker-55a19931325)

In [None]:
# from primeqa.components.reranker.colbert_reranker import ColBERTReranker

In [30]:
def fetch_context_from_wd(query):
    payload = {
            "query": query,
            "project_id": WD_PROJECT_ID,
            "collection_ids": [WD_COLLECTION_ID]
        }
    discovery_results = query_wd(payload)    
    docs = []

    if SHOW_WD_RESULTS:
        print(json.dumps(discovery_results, indent=2))

    for c in discovery_results['results']:

        if c['result_metadata']['confidence'] <= WD_THRESHOLD:
            print(f"Skipping Document: {c['document_id'][0]}")
            continue

        if 'title' in c:
            pageContent = "Title: " +showAsList(c['title'], joinText=True)
        if 'text' in c:
            pageContent += showAsList(c['text'], joinText=True)             
        if 'Solutions' in c:
            pageContent += showAsList(c['Solutions'])
        if 'Outcomes' in c:
            pageContent += showAsList(c['Outcomes'])
        if 'BusinessNeeds' in c:
            pageContent += showAsList(c['BusinessNeeds'])
        pageContent += "\n\n\n"
      
        doc = Document(page_content = pageContent);
        docs.append(doc)
    return docs  

### Test Results from IBM Watson Discovery

In [24]:
query = "Can you give me summary of the AI Learning Helper case ?"
result = fetch_context_from_wd(query)
for c in result:    
    print(f"{c.page_content} \n\n ******************* \n\n")

Skipping Document: c
Lab Preview Home School Helper
The “Home School Helper” is an artificial intelligent companion living on a screen or in a virtual reality environment. Each student will have their own. T IM&C
 Solution 
 1: Just like a real teacher, the Home School Helper understands the needs of the child and the context of its learning journey. It will be able to understand the child's emotions and change teaching strategy if, for example, the helper detects that the child is frustrated, de-motivated or does not have the right learning flow Tech focus: 
 2: IoT/Intelligent Edge, User Experience, Data Intelligence (Edge AI), Ethical AI, IoT integrating with Humans, Robotized processes 

 Outcomes 
 1: Potential: 50% reduction in cost of homeschooling, resulting in a total cost saving of 5% of a typical school budget. 
 2: Enhanced working environment for teachers. 
 3: Enhanced learning experience and subject knowledge for children with home schooling. 

 Lessons & exercises 
 1: 

## Using IBM Watsonx.ai 

In [25]:
from ibm_watson_machine_learning.foundation_models.utils.enums import ModelTypes
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
from ibm_watson_machine_learning.foundation_models.utils.enums import DecodingMethods
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.foundation_models.extensions.langchain import WatsonxLLM


In [26]:

# env_path = Path('../app') / '.env'
# load_dotenv(dotenv_path=env_path)

IBM_CLOUD_API_KEY = os.getenv("IBM_CLOUD_API_KEY", None)
WATSONX_AI_ENDPOINT = os.getenv("WATSONX_AI_ENDPOINT", None)
WX_PROJECT_ID = os.getenv("WX_PROJECT_ID", None)

wx_credentials = {
    "url": WATSONX_AI_ENDPOINT,
    "apikey": IBM_CLOUD_API_KEY
}

In [27]:
parameters = {
    GenParams.DECODING_METHOD: DecodingMethods.GREEDY,
    GenParams.MAX_NEW_TOKENS: 500,
    GenParams.MIN_NEW_TOKENS: 60,
    "repetition_penalty": 2
    # GenParams.REPETITION_PENALTY: 2
    # GenParams.TEMPERATURE: 0.5,
    # GenParams.TOP_K: 50,
    # GenParams.TOP_P: 1
}


wx_model = Model(
    model_id=ModelTypes.FLAN_UL2, 
    params=parameters, 
    credentials=wx_credentials,
    project_id=WX_PROJECT_ID)


watsonx_llm = WatsonxLLM(model=wx_model)

INFO:ibm_watson_machine_learning.client:Client successfully initialized
Client successfully initialized


In [28]:
def main(query, context):
   
    qa_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Answer:\n"""
  
    PROMPT = PromptTemplate(
        template=qa_template, input_variables=["context", "question"]
    )
    
    chain_type_kwargs = {"prompt": PROMPT}

    chain = load_qa_chain(watsonx_llm, chain_type="stuff", verbose=True)
    # chain = load_qa_chain(watsonx_llm, chain_type="stuff", prompt=PROMPT, verbose=True)
    # chain = RetrievalQA.from_chain_type(llm=watsonx_llm, chain_type="stuff", chain_type_kwargs=chain_type_kwargs)
    
    result = chain.run(input_documents=context, question=query)
    print(result)
    
    # chain({"input_documents": docs, "question": payload['query']}, return_only_outputs=True)

In [34]:
# query = "If my vehicle has airbags, why should I have to wear safety belts?"
# query = "How to Wear Safety Belts Properly"
# query = "What can happen if my shoulder belt is too loose ?"

# query = "What solutions do we have for Farmbot network ?"
# query = "Show me the outcomes of our Car Damage Detection solution"
# query = "What's our take on certifications using blockchain technology ?"

# query = "What solutions do we have for the Home School Helper?"
# query = "Do we have any cases where we used drones?"
# query = "Who’s managing the portfolio?"
query = "What were the outcomes of the Wingcopter project?"
# query = "I have a customer that’s interested in the digital assistant. Can you write me a summary of the solution?"
# query = "Can you give me summary of the AI Learning Helper case ?"


# context = fetchContextUsingVectorDB(query)
context = fetch_context_from_wd(query)
# print(context)

main(query, context)



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mUse the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Title: W i n g c o p t e r   |   Z a m w a m b a
Drone delivery platform to supply rural areas in Malawi (Africa) with medical goods and samples
 Solution 
 1: Delivery drones (VTOL) for medical goods transfers to health facilities  Payload up to 6 kg, speed up to 120 kph  Cold chain transports possible  CO2 neutral transportation  Platform for flight scheduling and medical goods orders  Based on SAP BTP, it.XIA and 
 2: Microsoft Azure 

 Outcomes 
 1: No dependence on road/land infrastructure  Delivery times for medical goods and samples went down from a day’s march to approx. 
 2: 20 minutes  Urgent deliveries can be prioritized, e.g. in life threatening situations  Supporting Malawi’s med