# Build Index

In this notebook we will demonstrate how to:
- (1) Upload documents to S3
- (2) Build an index in Chroma from the given documents on S3
- (3) Retrieve documents from Chroma

**This notebook serves as a template, so you can easily replace the example dataset with your own to build a custom question-answering application.**

## Step 1. Python Installation

In [None]:
!pip install -U langchain 
#!pip install pdfplumber
#!pip install unstructured
!pip install chromadb
!pip install -U sentence-transformers
!pip install pydantic==1.10.11 #use 1.10.11 version due to stability

In [None]:
import json
import os

import boto3
import botocore
import pandas as pd
import sagemaker
from IPython.display import Image, display, JSON
from sagemaker.session import Session

# variables
# One SageMaker session is created and reused for both the default-bucket and the
# caller-identity lookups, instead of constructing a second Session for the bucket.
sagemaker_session = Session()
data_bucket = sagemaker_session.default_bucket()
region = boto3.session.Session().region_name
aws_role = sagemaker_session.get_caller_identity_arn()

# boto3 clients
s3 = boto3.client('s3')

# Placeholder: replace with the name of the LLM endpoint to query.
# NOTE(review): this notebook does not deploy an endpoint itself, so it presumably
# has to exist already - confirm.
endpoint_name = "<your llm endpoint name>" #your llm endpoint name
print(f"Region is {region}, IAM Role: {aws_role}, S3 Bucket: {data_bucket}")

In [None]:
def parse_response_model_flan_t5(query_response):
    """Extract the generated texts from an endpoint response.

    Args:
        query_response: Mapping whose ``"Body"`` entry is a readable object
            (it must expose ``read()``) holding a JSON document.

    Returns:
        The value stored under the ``"generated_texts"`` key of the decoded JSON.
    """
    raw_body = query_response["Body"].read()
    return json.loads(raw_body)["generated_texts"]

# Settings for the one model this notebook exercises: instance type, container
# environment, the function that parses its responses, and a prompt template.
_flan_t5_small_settings = {
    "instance type": "ml.g5.xlarge",
    "env": {"TS_DEFAULT_WORKERS_PER_MODEL": "1"},
    "parse_function": parse_response_model_flan_t5,
    "prompt": """Answer based on context:\n\n{context}\n\n{question}""",
}

# Registry of model settings keyed by model id.
_MODEL_CONFIG_ = {"huggingface-text2text-flan-t5-small": _flan_t5_small_settings}

# Sample customer utterance, phrased as a question for the LLM.
question = (
    "Here is what customer said in the call: "
    "'Yes, I have received a defective product, and I am extremely angry about it! "
    "This is unacceptable, and I want it resolved immediately!' "
    "What does customer want?"
)



In [None]:
def query_endpoint_with_json_payload(encoded_json, endpoint_name):
    """Invoke a SageMaker real-time endpoint with a JSON request body.

    Args:
        encoded_json: UTF-8 encoded JSON bytes to send as the request body.
        endpoint_name: Name of the SageMaker endpoint to invoke.

    Returns:
        The raw ``invoke_endpoint`` response; its ``"Body"`` entry is a stream
        that the model's parse function reads and decodes.
    """
    runtime_client = boto3.client("sagemaker-runtime")
    return runtime_client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType="application/json",
        Body=encoded_json,
    )


# Request sent to the endpoint: the question plus text-generation settings.
payload = {
    "text_inputs": question,
    "max_length": 100,
    "num_return_sequences": 1,
    "top_k": 10,
    "top_p": 0.95,
    "do_sample": True,
}

# Endpoint configured at the top of the notebook; used for any model whose
# config entry does not name its own endpoint.
default_endpoint_name = endpoint_name

for model_id in _MODEL_CONFIG_:
    model_config = _MODEL_CONFIG_[model_id]
    # A config entry may not carry an "endpoint_name" key; indexing it directly
    # would raise KeyError, so fall back to the notebook-level endpoint.
    endpoint_name = model_config.get("endpoint_name", default_endpoint_name)
    query_response = query_endpoint_with_json_payload(
        json.dumps(payload).encode("utf-8"), endpoint_name=endpoint_name
    )
    generated_texts = model_config["parse_function"](query_response)
    print(f"For model: {model_id}, the generated output is: {generated_texts[0]}\n")

In [None]:
# Keep the first generated text from the endpoint query above (the model's answer to
# "What does customer want?"); it is reused later as the topic of the follow-up question.
customer_ask = generated_texts[0]

## Step 2. Langchain workflow

In [None]:
from langchain.document_loaders import S3DirectoryLoader
from langchain.vectorstores import Chroma
from langchain.text_splitter import NLTKTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
import sagemaker

# Where the source documents live: the session's default bucket, under `prefix`.
data_bucket = sagemaker.Session().default_bucket()
prefix = 'llm/sample'

# 1) Load every document found under the S3 prefix.
loader = S3DirectoryLoader(data_bucket, prefix=prefix)
docs = loader.load()

# 2) Split the documents into chunks (chunk_size=550) with the NLTK-based splitter.
text_splitter = NLTKTextSplitter(chunk_size=550)
texts = text_splitter.split_documents(docs)

# 3) Embed the chunks with the library-default HuggingFace embedding model and
#    index them in a Chroma vector store.
embeddings = HuggingFaceEmbeddings()
vectordb = Chroma.from_documents(texts, embeddings)

In [None]:
from langchain import SagemakerEndpoint
from langchain.llms.sagemaker_endpoint import LLMContentHandler
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
import json
from typing import Dict

class QAContentHandler(LLMContentHandler):
    """Content handler that converts between prompts and JSON endpoint payloads.

    Requests are sent, and responses accepted, as ``application/json``.
    """

    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: str, model_kwargs: Dict) -> bytes:
        """Serialize the prompt and generation settings into a request body.

        The prompt is stored under ``"text_inputs"``; the entries of
        ``model_kwargs`` are merged in afterwards (so a ``"text_inputs"`` entry
        in ``model_kwargs`` would take precedence over the prompt).

        Returns:
            The UTF-8 encoded JSON document.
        """
        request_body = {"text_inputs": prompt}
        request_body.update(model_kwargs)
        return json.dumps(request_body).encode('utf-8')

    def transform_output(self, output: bytes) -> str:
        """Return the first generated text from the endpoint response.

        ``output`` is consumed through ``read()``, so despite the annotation it
        must be a readable object whose content is UTF-8 encoded JSON with a
        ``"generated_texts"`` list.
        """
        decoded_body = output.read().decode("utf-8")
        generated = json.loads(decoded_body)["generated_texts"]
        return generated[0]

# Handler that converts prompts to JSON requests and JSON responses to text.
qa_content_handler = QAContentHandler()

# Generation settings forwarded to the endpoint with every request. This name was
# referenced but never defined in the notebook (NameError on a fresh kernel); the
# values are the same generation settings used for the direct endpoint invocation.
FLAN_T5_PARAMETERS = {
    "max_length": 100,
    "num_return_sequences": 1,
    "top_k": 10,
    "top_p": 0.95,
    "do_sample": True,
}

# Prompt that restricts the model to the retrieved document text.
prompt_template=""" Given the following text from a document, answer the question to the best of your abilities. Answer only from the provided document,, if you do not know the answer 
just say you don't know. DO NOT make up an answer.

Document: {document}
Question: {question}
Answer:
"""

prompt=PromptTemplate(
    input_variables=["document", "question"], 
    template=prompt_template)

# Chain that fills the prompt and sends it to the SageMaker-hosted LLM.
qa_chain = LLMChain(
    llm=SagemakerEndpoint(
        endpoint_name=endpoint_name, # replace with your endpoint name if needed
        region_name=region,
        model_kwargs=FLAN_T5_PARAMETERS,
        content_handler=qa_content_handler
    ),
    prompt=prompt
)


In [None]:
# Fetch the 3 indexed chunks most similar to the question.
# see also : max_marginal_relevance_search_by_vector(query, k=3)
similar_docs = vectordb.similarity_search(question, k=3)

# Collect each hit's text and its 'source' metadata entry (None when absent).
context_list = []
metadata_list = []
for hit in similar_docs:
    context_list.append(hit.page_content)
    metadata_list.append(hit.metadata.get('source'))

# Join the chunk texts, separated by blank lines, into a single context string.
context = "\n\n".join(context_list)
context

In [None]:
# Ask the chain a follow-up question about the customer's request, using the
# retrieved context as the document.
qa_inputs = {
    'document': context,
    'question': f"Can you tell me more about {customer_ask}?",
}
qa_chain.run(qa_inputs)