# Initialization

In [70]:
import os
import logging

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [71]:
query = "List all the features provided by AWS on security governance?"
index_name = "aws-idx"
k = 5

In [72]:
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(openai_api_key=os.getenv('OPENAI_API_KEY'), model="text-embedding-ada-002")

# k-NN Search

In [73]:
from langchain_community.vectorstores import OpenSearchVectorSearch
client = OpenSearchVectorSearch(
    embedding_function=embeddings,
    index_name=index_name,
    opensearch_url="http://localhost:9200",
    http_auth=("admin", "Severus11#"),
    use_ssl = False,
    verify_certs = False,
    timeout=300,
    ssl_assert_hostname = False,
    ssl_show_warn = False,
)

In [74]:
knndocs = client.similarity_search(
    "What did the president say about Ketanji Brown Jackson",
    search_type="painless_scripting",
    space_type="cosineSimilarity",
    k=k
)
print(knndocs[:2])

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:opensearch:POST http://localhost:9200/aws-idx/_search [status:200 request:0.147s]


[Document(page_content='An Overview of the AWS Cloud Adoption Framework\nAWS Whitepaper\nPoint of contact\n• Dr. Saša Baškarada, Worldwide Lead, AWS Cloud Adoption Framework\n35', metadata={'file_path': 'C:/workspace/experiments/GenAI/Projects/Test Data/aws.pdf', 'creator': 'ZonBook XSL Stylesheets with Apache FOP', 'modDate': '', 'keywords': '', 'trapped': '', 'author': 'Amazon Web Services', 'subject': '', 'file_name': 'aws.pdf', 'format': 'PDF 1.4', 'source': 'C:/workspace/experiments/GenAI/Projects/Test Data/aws.pdf', 'total_pages': 42, 'title': 'An Overview of the AWS Cloud Adoption Framework - AWS Whitepaper', 'creationDate': 'D:20240525051807Z', 'terms': ['aws cloud adoption framework aws whitepaper point', 'aws cloud adoption framework 35', 'contact • dr', 'worldwide lead', 'saša baškarada', 'overview'], 'producer': 'Apache FOP Version 2.6', 'page': 38}), Document(page_content='An Overview of the AWS Cloud Adoption Framework\nAWS Whitepaper\nAlso available on Audible, Kindle, a

# Neural Search

In [77]:
neural_search_query = {
    'size': k,
    'query': {
        "neural": {
            "bert_embeddings": {
                "query_text": query,
                "model_id": "acBUIZABJGEOAwdvUors",
                "k": k
            }
        }
    }
}


In [81]:
from langchain.schema.document import Document

neural_response = client.client.search(
    body = neural_search_query,
    index = index_name
)
neuraldocs = []
for item in neural_response['hits']['hits']:
    neuraldocs.append(Document(page_content=item['_source']['text'], metadata=item['_source']['metadata']))

print(neuraldocs[:2])

INFO:opensearch:POST http://localhost:9200/aws-idx/_search [status:200 request:0.115s]


[Document(page_content='An Overview of the AWS Cloud Adoption Framework\nAWS Whitepaper\nNotices\nCustomers are responsible for making their own independent assessment of the information in \nthis document. This document: (a) is for informational purposes only, (b) represents current AWS \nproduct oﬀerings and practices, which are subject to change without notice, and (c) does not create \nany commitments or assurances from AWS and its aﬃliates, suppliers or licensors. AWS products or \nservices are provided “as is” without warranties, representations, or conditions of any kind, whether \nexpress or implied. The responsibilities and liabilities of AWS to its customers are controlled by \nAWS agreements, and this document is not part of, nor does it modify, any agreement between \nAWS and its customers.\n© 2021 Amazon Web Services, Inc. or its aﬃliates. All rights reserved.\n38', metadata={'file_path': 'C:/workspace/experiments/GenAI/Projects/Test Data/aws.pdf', 'creator': 'ZonBook XSL 

# Neural Sparse Search

In [84]:
neural_sparse_search_query = {
    'size': k,
    'query': {
        "neural_sparse": {
            "oss_sparse_embeddings": {
                "query_text": query,
                "model_id": "asBUIZABJGEOAwdvU4oe"
            }
        }
    }
}


In [85]:
from langchain.schema.document import Document

neural_sparse_response = client.client.search(
    body = neural_sparse_search_query,
    index = index_name
)
neuralsparsedocs = []
for item in neural_sparse_response['hits']['hits']:
    neuralsparsedocs.append(Document(page_content=item['_source']['text'], metadata=item['_source']['metadata']))

print(neuralsparsedocs[:2])

INFO:opensearch:POST http://localhost:9200/aws-idx/_search [status:200 request:5.337s]


[Document(page_content='An Overview of the AWS Cloud Adoption Framework\nAWS Whitepaper\nSecurity perspective: compliance and assurance\nThe security perspective helps you achieve the conﬁdentiality, integrity, and availability of your \ndata and cloud workloads. It comprises nine capabilities shown in the following ﬁgure. Common \nstakeholders include CISO, CCO, internal audit leaders, and security architects and engineers.\nAWS CAF Security perspective capabilities\n• Security governance – Develop, maintain, and eﬀectively communicate security roles, \nresponsibilities, accountabilities, policies, processes, and procedures. Ensuring clear lines of \naccountability is critical to the eﬀectiveness of your security program. Understanding your assets, \nsecurity risks, and compliance requirements that apply to your industry and/or organization \nwill help you prioritize your security eﬀorts. Providing ongoing direction and advice will help \naccelerate your transformation by allowing you

# Keyword Search - BM25(tf-idf)

In [90]:
keyword_search_query = {
    'size': k,
    'query': {
        "match": {
            "text": {
                "query": query,
                "analyzer": "english"
            }
        }
    }
}


In [91]:
from langchain.schema.document import Document

keyword_response = client.client.search(
    body = keyword_search_query,
    index = index_name
)
keyworddocs = []
for item in keyword_response['hits']['hits']:
    keyworddocs.append(Document(page_content=item['_source']['text'], metadata=item['_source']['metadata']))

print(keyworddocs[:2])

INFO:opensearch:POST http://localhost:9200/aws-idx/_search [status:200 request:0.334s]


[Document(page_content="An Overview of the AWS Cloud Adoption Framework\nAWS Whitepaper\nAn Overview of the AWS Cloud Adoption Framework: AWS \nWhitepaper\nCopyright © 2024 Amazon Web Services, Inc. and/or its aﬃliates. All rights reserved.\nAmazon's trademarks and trade dress may not be used in connection with any product or service \nthat is not Amazon's, in any manner that is likely to cause confusion among customers, or in any \nmanner that disparages or discredits Amazon. All other trademarks not owned by Amazon are \nthe property of their respective owners, who may or may not be aﬃliated with, connected to, or \nsponsored by Amazon.", metadata={'file_path': 'C:/workspace/experiments/GenAI/Projects/Test Data/aws.pdf', 'creator': 'ZonBook XSL Stylesheets with Apache FOP', 'modDate': '', 'keywords': '', 'trapped': '', 'author': 'Amazon Web Services', 'subject': '', 'file_name': 'aws.pdf', 'format': 'PDF 1.4', 'source': 'C:/workspace/experiments/GenAI/Projects/Test Data/aws.pdf', 'to

## Test with OpenAi

In [29]:
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain_core.callbacks import StdOutCallbackHandler

In [31]:
handler = StdOutCallbackHandler()
model = ChatOpenAI(
    temperature=0, 
    model_name="gpt-4",
    callbacks=[handler]
)

In [32]:
class Response(BaseModel):
    answer: str = Field(description="the answer of the question")
    score: int = Field(description="the LLM evaluation score for the answer")
    justification: str = Field(description="the justification from LLM for the generated answer")

In [33]:
parser = JsonOutputParser(pydantic_object=Response)

In [34]:
format_instruction = parser.get_format_instructions()

In [35]:
print(format_instruction)

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"properties": {"answer": {"title": "Answer", "description": "the answer of the question", "type": "string"}, "score": {"title": "Score", "description": "the LLM evaluation score for the answer", "type": "integer"}, "justification": {"title": "Justification", "description": "the justification from LLM for the generated answer", "type": "string"}}, "required": ["answer", "score", "justification"]}
```


In [36]:
template = """
Given the following context:
{context}
The question:
{question}
Answer the user query as json object with following three fields:
1. answer: The answer to the question
2. score: A score between 0 to 10
3. justification: A justification from AI
"""

In [37]:
contexts = [d.page_content for d in knndocs]
context = '\n'.join(contexts)
print(context)

An Overview of the AWS Cloud Adoption Framework
AWS Whitepaper
Point of contact
• Dr. Saša Baškarada, Worldwide Lead, AWS Cloud Adoption Framework
35
An Overview of the AWS Cloud Adoption Framework
AWS Whitepaper
Also available on Audible, Kindle, and as an eBook.
iv
An Overview of the AWS Cloud Adoption Framework
AWS Whitepaper
Identify lead curators with responsibility for moderating the Data Catalog. In line with your 
data monetization strategy, catalog key data products, including structured and unstructured 
data. Identify and capture relevant technical and business metadata, including lineage. Leverage 
standard ontologies, business glossaries, and automation (including machine learning) to tag, 
index, and auto-classify data. Augment with manual tagging as necessary and appropriately 
handle any personally identiﬁable information (PII). Consider crowdsourcing data enrichment 
through social curation. In other words, consider empowering data consumers to rate, review, 
and annot

In [48]:
prompt = PromptTemplate(
    template=template,
    input_variables=["context", "question"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
    output_parser=parser
)

In [49]:
print(prompt)

input_variables=['context', 'question'] output_parser=JsonOutputParser(pydantic_object=<class '__main__.Response'>) partial_variables={'format_instructions': 'The output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"properties": {"answer": {"title": "Answer", "description": "the answer of the question", "type": "string"}, "score": {"title": "Score", "description": "the LLM evaluation score for the answer", "type": "integer"}, "justification": {"title": "Justification", "description": "the justification from LLM for the generated answer", "type": "string"}}, "required": ["answer", "score", "

In [50]:
verbose=True

In [51]:
from langchain.chains import LLMChain
chain = LLMChain(llm=model, prompt=prompt, verbose=True)

In [52]:
result = chain.invoke({"question": query, "context": context})



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Given the following context:
An Overview of the AWS Cloud Adoption Framework
AWS Whitepaper
Point of contact
• Dr. Saša Baškarada, Worldwide Lead, AWS Cloud Adoption Framework
35
An Overview of the AWS Cloud Adoption Framework
AWS Whitepaper
Also available on Audible, Kindle, and as an eBook.
iv
An Overview of the AWS Cloud Adoption Framework
AWS Whitepaper
Identify lead curators with responsibility for moderating the Data Catalog. In line with your 
data monetization strategy, catalog key data products, including structured and unstructured 
data. Identify and capture relevant technical and business metadata, including lineage. Leverage 
standard ontologies, business glossaries, and automation (including machine learning) to tag, 
index, and auto-classify data. Augment with manual tagging as necessary and appropriately 
handle any personally identiﬁable information (PII). Consider crowdsourcing data enri

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



[1m> Finished chain.[0m


In [53]:
print(result)

{'question': 'List all the features provided by AWS on security governance?', 'context': 'An Overview of the AWS Cloud Adoption Framework\nAWS Whitepaper\nPoint of contact\n• Dr. Saša Baškarada, Worldwide Lead, AWS Cloud Adoption Framework\n35\nAn Overview of the AWS Cloud Adoption Framework\nAWS Whitepaper\nAlso available on Audible, Kindle, and as an eBook.\niv\nAn Overview of the AWS Cloud Adoption Framework\nAWS Whitepaper\nIdentify lead curators with responsibility for moderating the Data Catalog. In line with your \ndata monetization strategy, catalog key data products, including structured and unstructured \ndata. Identify and capture relevant technical and business metadata, including lineage. Leverage \nstandard ontologies, business glossaries, and automation (including machine learning) to tag, \nindex, and auto-classify data. Augment with manual tagging as necessary and appropriately \nhandle any personally identiﬁable information (PII). Consider crowdsourcing data enrichmen