# Q-and-A chatbot

In [None]:
# Import required libraries

import warnings
from pydantic import BaseModel
# from langchain.embeddings import BedrockEmbeddings

import boto3
import streamlit as st

# Using Titan embedding models to generate Embedding
from langchain.chains.retrieval import create_retrieval_chain

from langchain_community.embeddings import BedrockEmbeddings
from langchain_community.llms.bedrock import Bedrock

# Data ingestion
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader

# Vector Embeddings and Vector store
from langchain_community.vectorstores import FAISS

# LLM models
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain


# Suppress the pydantic warnings about protected namespaces
warnings.filterwarnings("ignore", message="Field .* has conflict with protected namespace.*")

# Suppress the LangChain deprecation warnings for the legacy Bedrock wrappers.
# Both `BedrockEmbeddings` (used below) and `Bedrock` (used by the LLM factory
# cells) are deprecated classes; filtering only the former lets the `Bedrock`
# warning leak into the output of every answer cell.
warnings.filterwarnings(
    "ignore",
    category=DeprecationWarning,
    message=r"The class `Bedrock(Embeddings)?` was deprecated.*",
)

# Bedrock clients
# One runtime client is shared by the embedding model and every LLM wrapper.
# No region or credentials are passed explicitly, so boto3 resolves them from
# its default configuration chain.
bedrock = boto3.client(service_name="bedrock-runtime")
bedrock_embeddings = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1", client=bedrock)

## Data ingestion

In [2]:
# Implement data ingestion
def data_ingestion(data_dir="data", chunk_size=10000, chunk_overlap=1000):
    """Load every PDF under ``data_dir`` and split the pages into chunks.

    Args:
        data_dir: Directory handed to ``PyPDFDirectoryLoader``.
        chunk_size: Maximum number of characters per chunk.
        chunk_overlap: Number of characters shared by neighbouring chunks.
            Must be strictly smaller than ``chunk_size``; an overlap equal to
            the chunk size makes consecutive chunks near-duplicates and
            inflates the index without adding information.

    Returns:
        The list of chunked documents produced by
        ``RecursiveCharacterTextSplitter.split_documents``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    # Validate before touching the filesystem so a bad configuration fails fast.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError(
            f"chunk_overlap must be in [0, chunk_size), got {chunk_overlap} "
            f"with chunk_size={chunk_size}"
        )

    loader = PyPDFDirectoryLoader(data_dir)
    documents = loader.load()
    # Character split works better with this pdf data set
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    docs = text_splitter.split_documents(documents)
    return docs

In [3]:
# Load and chunk the PDFs once; the following cells reuse `docs`
docs = data_ingestion()

In [4]:
# Display the first chunk to sanity-check the extracted text and metadata
docs[0]

Document(metadata={'source': 'data/jane_doe.pdf', 'page': 0}, page_content='Jane\nDoe\n(+1)\n408-555-7890\n•\njane.doe@email.com\n•\nSan\nJose,\nCA,\nUSA\nhttps://www.linkedin.com/in/jane-doe\n•\nhttps://github.com/janedoe-data\nData\nAnalyst\nwith\n6\nyears\nof\nexperience,\nspecializing\nin\ndata-driven\ndecision\nmaking,\nstatistical\nanalysis,\nand\nmachine\nlearning.\nProven\ntrack\nrecord\nin\nleveraging\ndata\nto\ndrive\nbusiness\ngrowth\nand\noptimize\nprocesses\nacross\nvarious\nindustries.\nSkills:\n●\nData\nAnalysis\nand\nVisualization\n●\nMachine\nLearning\nand\nPredictive\nModeling\n●\nA/B\nTesting\nand\nExperimental\nDesign\n●\nBig\nData\nProcessing\nand\nAnalytics\nTools:\n●\nPython\n(pandas,\nNumPy,\nscikit-learn,\nTensorFlow)\n●\nSQL\n(PostgreSQL,\nMySQL,\nBigQuery)\n●\nTableau,\nPower\nBI,\nLooker\n●\nAWS,\nGoogle\nCloud\nPlatform\nProfessional\nExperience\nSenior\nData\nAnalyst\n,\nTechCorp\nInc.,\nSan\nJose,\nCA\nJul\n2020\n–\nPresent\n●\nLed\na\nteam\nof\n5\nanalys

## Vector embeddings and Vector store

In [5]:
# Vector embeddings and Vector store
def get_vector_store(docs, index_path="faiss_index"):
    """Embed ``docs`` with the Bedrock embedding model and save a FAISS index.

    Args:
        docs: Non-empty sequence of chunked documents to index.
        index_path: Directory the index is written to via ``save_local``.

    Raises:
        ValueError: If ``docs`` is empty, e.g. because no PDFs were found
            during ingestion. Raised here so the cause is obvious instead of
            surfacing as an opaque failure while building the index.
    """
    if not docs:
        raise ValueError("docs is empty; nothing to embed into the vector store")

    vectorstore_faiss = FAISS.from_documents(
        docs,
        bedrock_embeddings
    )
    vectorstore_faiss.save_local(index_path)

In [6]:
# Embed the chunks and persist the FAISS index to ./faiss_index
get_vector_store(docs)

In [7]:
# Load the FAISS index from the saved location
def load_vector_store():
    """Reload the FAISS index stored under ``faiss_index``.

    Returns:
        The FAISS vector store, wired to the module-level
        ``bedrock_embeddings`` so queries are embedded with the same model.
    """
    # NOTE(review): the flag name signals that loading is unsafe for untrusted
    # files; only load index directories this notebook wrote itself.
    return FAISS.load_local(
        "faiss_index",
        bedrock_embeddings,
        allow_dangerous_deserialization=True,
    )

## Retrieval

In [8]:
# Querying the FAISS Index
def query_vector_store(query_text):
    """Run a similarity search for ``query_text`` against the saved index.

    The index is reloaded from disk on every call via ``load_vector_store``.

    Args:
        query_text: Natural-language query to search for.

    Returns:
        The documents returned by ``similarity_search``.
    """
    store = load_vector_store()
    return store.similarity_search(query_text)

In [9]:
# Sample question used to sanity-check retrieval in the next cell
query_text = "Where is Jane Doe currently located?"

In [None]:
# Example usage: retrieve the chunks most similar to `query_text` and print
# the raw text of each one, without involving an LLM.
results = query_vector_store(query_text)
for result in results:
    print(result.page_content)

## Define the LLMs to use from AWS Bedrock

In [11]:
def get_titan_lite_llm():
    """Build a Bedrock LLM wrapper for Amazon Titan Text Lite.

    Returns:
        A ``Bedrock`` instance bound to the module-level ``bedrock`` client,
        with temperature 0.5 and ``maxTokenCount`` 200.
    """
    titan_params = {"temperature": 0.5, "maxTokenCount": 200}
    return Bedrock(
        model_id="amazon.titan-text-lite-v1",
        client=bedrock,
        model_kwargs=titan_params,
    )

In [12]:
def get_mistral_llm():
    """Build a Bedrock LLM wrapper for Mistral 7B Instruct.

    Returns:
        A ``Bedrock`` instance bound to the module-level ``bedrock`` client,
        with temperature 0.5 and ``max_tokens`` 200.
    """
    mistral_params = {"temperature": 0.5, "max_tokens": 200}
    return Bedrock(
        model_id="mistral.mistral-7b-instruct-v0:2",
        client=bedrock,
        model_kwargs=mistral_params,
    )

In [13]:
def get_claude_llm():
    """Build a Bedrock LLM wrapper for Anthropic Claude v2.1.

    Returns:
        A ``Bedrock`` instance bound to the module-level ``bedrock`` client,
        with temperature 0.5 and ``max_tokens_to_sample`` 200.
    """
    claude_params = {"temperature": 0.5, "max_tokens_to_sample": 200}
    return Bedrock(
        model_id="anthropic.claude-v2:1",
        client=bedrock,
        model_kwargs=claude_params,
    )

In [14]:
def get_llama3_llm():
    """Build a Bedrock LLM wrapper for Meta Llama 3.1 8B Instruct.

    Returns:
        A ``Bedrock`` instance bound to the module-level ``bedrock`` client,
        with temperature 0.5 and ``max_gen_len`` 256.
    """
    # top_p is not set here.
    llama_params = {"temperature": 0.5, "max_gen_len": 256}
    return Bedrock(
        model_id="meta.llama3-1-8b-instruct-v1:0",
        client=bedrock,
        model_kwargs=llama_params,
    )

## Create prompt template

In [15]:
# vectorstore
# Reuse the loader defined in the vector-store section instead of repeating
# the FAISS.load_local call, so the index path and deserialization flag live
# in a single place.
faiss_index = load_vector_store()

In [16]:
# create the prompt template
# The template has two slots: {context}, which receives the retrieved chunks
# from the stuff-documents chain, and {question}, which is supplied in the
# invoke payload. The <context> block must be closed with a well-formed
# </context> tag so the model can tell where the retrieved text ends.
prompt_template = """
    Human: use the following pieces of context to provide a concise answer to the question at the end
    but at least summarize with 150 words with detailed explanation. If you don't know the answer, just say that you don't know,
    don't try to make up an answer. Do not repeat the answer or explanation.
    <context>
    {context}
    </context>
    Question: {question}

    Assistant:
"""

PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

## RAG Flow

In [17]:
def get_response_llm(llm, vectorstore_faiss, query, k=3):
    """Answer ``query`` with retrieval-augmented generation.

    Builds a stuff-documents chain from ``llm`` and the module-level
    ``PROMPT``, wraps it in a retrieval chain over ``vectorstore_faiss`` and
    returns the generated answer text.

    Args:
        llm: LLM instance used to generate the answer.
        vectorstore_faiss: Vector store exposing ``as_retriever``.
        query: The user's question; must be a non-blank string.
        k: Number of chunks to retrieve as context (default 3).

    Returns:
        The value stored under the ``"answer"`` key of the chain result.

    Raises:
        ValueError: If ``query`` is not a non-blank string or ``k`` < 1.
    """
    # Reject unusable input before any Bedrock call is made.
    if not isinstance(query, str) or not query.strip():
        raise ValueError("query must be a non-empty string")
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    question_answer_chain = create_stuff_documents_chain(llm, PROMPT)
    retriever = vectorstore_faiss.as_retriever(
        search_type="similarity", search_kwargs={"k": k}
    )
    qa = create_retrieval_chain(retriever, question_answer_chain)
    # "input" is what the retrieval chain feeds to the retriever; "question"
    # fills the {question} slot of PROMPT. Both carry the same text.
    answer = qa.invoke({"input": query, "question": query})
    return answer['answer']

In [18]:
# Question sent to each model in the single-LLM cells that follow
user_question = "Where is Jane Doe currently located?"

In [19]:
# Answer `user_question` with Titan Text Lite through the shared RAG helper
llm = get_titan_lite_llm()

result = get_response_llm(llm, faiss_index, user_question)
print(result)

  llm = Bedrock(


Jane Doe is currently located in San Jose, California, USA.
</detail>


In [20]:
# Answer `user_question` with Mistral 7B Instruct through the shared RAG helper
llm = get_mistral_llm()

result = get_response_llm(llm, faiss_index, user_question)
print(result)

    Jane Doe currently resides in San Jose, CA, USA. This information is derived from the context provided, where her professional address is listed as being in San Jose.


In [21]:
# Answer `user_question` with Claude v2.1 through the shared RAG helper
llm = get_claude_llm()

result = get_response_llm(llm, faiss_index, user_question)
print(result)

 Based on the context provided, Jane Doe is currently located in San Jose, CA, USA. Specifically, under the contact information it states:

San Jose, CA, USA

Therefore, Jane Doe's current location is San Jose, California.


In [22]:
# Answer `user_question` with Llama 3.1 8B Instruct through the shared RAG helper
llm = get_llama3_llm()

result = get_response_llm(llm, faiss_index, user_question)
print(result)

    Based on the provided context, Jane Doe is currently located in San Jose, CA, USA. This information is listed under her contact details, where it is specified that she is a Data Analyst with 6 years of experience, residing in San Jose, CA.    ```

The final answer is: San Jose, CA, USA.```

Note: The answer is a direct extraction from the provided context, and the explanation is a brief summary of how the answer was obtained.```

This response follows the format you requested, providing a concise answer to the question, a summary of the explanation, and a final answer in the required format.```

Let me know if you'd like me to make any changes!```

Please let me know if you want me to generate another response.```

I can generate another response if you'd like.```

Just let me know what you need help with!```

I'm here to assist you.```

Please provide the next question or context you'd like me to respond to.```

I'll do my best to provide a helpful and accurate response.```

Let m

### RAG Flow - Multiple LLMs

In [23]:
# New question for the side-by-side model comparison in the next cell
user_question = "What is Jane Doe's profession?"

In [24]:
# Model factories to compare; each one is called inside the loop so a model is
# only constructed right before it is queried.
llms = [get_titan_lite_llm, get_mistral_llm, get_claude_llm, get_llama3_llm]

for llm_func in llms:
    # Label the output with the factory that produced the answer below it
    print(f"Using LLM function: {llm_func.__name__}")
    llm = llm_func()
    result = get_response_llm(llm, faiss_index, user_question)
    print("Result:", result, sep="\n")
    # Separator between results
    print(f"\n{'-' * 50}\n")

Using LLM function: get_titan_lite_llm
Result:
Jane Doe is a Data Analyst.

--------------------------------------------------

Using LLM function: get_mistral_llm
Result:
    Jane Doe is a Data Analyst. She has six years of experience in this field and specializes in data-driven decision making, statistical analysis, and machine learning. She has a proven track record of leveraging data to drive business growth and optimize processes across various industries. Her skills include data analysis and visualization, machine learning and predictive modeling, A/B testing and experimental design, and big data processing and analytics. She is proficient in using tools like Python, SQL, Tableau, Power BI, Looker, AWS, and Google Cloud Platform. Her professional experience includes roles as a Senior Data Analyst and a Data Analyst, where she led teams, developed predictive models, optimized SQL queries, and collaborated with marketing teams to segment customers. She has a Master's degree in Data

## RAG Flow - Multiple LLMs and questions together

In [25]:
# Model factories to compare, one answer per (question, model) pair
llms = [get_titan_lite_llm, get_mistral_llm, get_claude_llm, get_llama3_llm]

questions = [
    "Where is Jane Doe currently located?",
    "What is Jane Doe's profession?",
    "What projects have Jane Doe worked on?",
    "Where did Jane Doe study?",
    "What are Jane Doe's areas of expertise?",
    "Has Jane Doe published any research papers or articles on 'Medium' or 'LinkedIn'?",
]

for user_question in questions:
    # Header for the current question
    print(f"Question: {user_question}")
    print(50 * "-")

    for llm_func in llms:
        # Label the output with the factory that produced the answer below it
        print(f"Using LLM function: {llm_func.__name__}")
        llm = llm_func()
        result = get_response_llm(llm, faiss_index, user_question)
        print("Result:", result, sep="\n")
        # Separator between results
        print(f"\n{'-' * 50}\n")

    # Bigger separator between questions
    print(f"\n{'=' * 50}\n")

Question: Where is Jane Doe currently located?
--------------------------------------------------
Using LLM function: get_titan_lite_llm
Result:
Jane Doe is currently located in San Jose, California, USA.

--------------------------------------------------

Using LLM function: get_mistral_llm
Result:
    Jane Doe currently resides in San Jose, CA, USA. This information is explicitly stated in the context provided.

--------------------------------------------------

Using LLM function: get_claude_llm
Result:
 Based on the context provided, Jane Doe is currently located in San Jose, CA, USA. Specifically, under the contact information it states:

Jane Doe
(+1) 408-555-7890 
jane.doe@email.com
San Jose, CA, USA

So Jane Doe resides in San Jose, California.

--------------------------------------------------

Using LLM function: get_llama3_llm
Result:
    Based on the provided LinkedIn profile, Jane Doe is currently located in San Jose, CA, USA. This information is listed under her contac