In [None]:
import getpass
import os
import sys
from typing import Annotated, TypedDict

from dotenv import find_dotenv, load_dotenv
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_core.documents import Document
from langchain_core.messages import AnyMessage, HumanMessage, SystemMessage, ToolMessage
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter


In [13]:
# Find the nearest .env file and load its variables into the environment.
# Assigning to `_` keeps the boolean return value out of the cell output.
# NOTE(review): presumably this supplies the OpenAI API key used further down - confirm.
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)

In [9]:
# A LangChain Document has three key elements:
#   page_content - a string holding the content
#   metadata     - a dict of arbitrary metadata
#   id           - (optional) a string identifier for the document
pet_facts = (
    "Dogs are great companions, known for their loyalty and friendliness.",
    "Cats are independent pets that often enjoy their own space.",
)

# Each document gets its own metadata dict (a new literal per iteration).
documents = [
    Document(page_content=fact, metadata={"source": "mammal-pets-doc"})
    for fact in pet_facts
]

# Show the whole list, then the two populated fields of the first entry.
print(documents)
first_document = documents[0]
print(first_document.page_content)
print(first_document.metadata)



[Document(page_content='Dogs are great companions, known for their loyalty and friendliness.', metadata={'source': 'mammal-pets-doc'}), Document(page_content='Cats are independent pets that often enjoy their own space.', metadata={'source': 'mammal-pets-doc'})]
Dogs are great companions, known for their loyalty and friendliness.
{'source': 'mammal-pets-doc'}


In [10]:
# Load a PDF from disk.
# PyPDFLoader produces one Document per PDF page; each one exposes the page
# text as `page_content` and its origin (source path, page number) as `metadata`.
file_path = "examples/AWS_Certified_Developer_Associate_Updated_June_2018_Exam_Guide_v1.3.pdf"
loader = PyPDFLoader(file_path)
docs = loader.load()

# Number of pages that were loaded.
page_count = len(docs)
print(page_count)

3


In [11]:
# Peek at the first loaded page: its first 200 characters, then its metadata.
first_page = docs[0]
preview = first_page.page_content[:200]
print(f"{preview}\n")
print(first_page.metadata)

Version 1.3 DVA-C01                                                                                                                                                                      Page |1  
 
 
A

{'source': 'examples/AWS_Certified_Developer_Associate_Updated_June_2018_Exam_Guide_v1.3.pdf', 'page': 0}


In [14]:
# Split every loaded page into overlapping chunks for embedding.
#   chunk_size=1000     - upper bound on chunk length (characters with the
#                         splitter's default length function)
#   chunk_overlap=200   - neighbouring chunks share up to 200 characters so
#                         text cut at a boundary still appears whole somewhere
#   add_start_index=True - record each chunk's offset within its source
#                         document in the chunk metadata ("start_index")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)
all_splits = text_splitter.split_documents(docs)

# A bare `len(all_splits)` in the middle of a cell is silently discarded
# (only the last expression is displayed), so print the count explicitly.
print(len(all_splits))
print(all_splits[0].page_content)

Version 1.3 DVA-C01                                                                                                                                                                      Page |1  
 
 
AWS Certified Developer–Associate  
(DVA-C01) Examination Guide 
 
Introduction  
This AWS Certified Developer-Associate Examination (DVA-001) is intended for individuals who perform a 
Developer role.   
 
It validates an examinee’s ability to: 
  
 Demonstrate an understanding of core AWS services, uses, and basic AWS architecture best practices . 
 Demonstrate proficiency in developing, deploying, and debugging cloud-based applications using AWS. 
 
Examination Prerequisite 
There are no prerequisites for taking the Developer-Associate examination. 
 
Recommended AWS Knowledge 
 One or more years of hands-on experience developing and maintaining an AWS based application 
 In-depth knowledge of at least one high-level programming language


# Embeddings

In [16]:
# Embed the first two chunks to show what an embedding looks like.
# `OpenAIEmbeddings` is imported in the notebook's top import cell.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
vector_1 = embeddings.embed_query(all_splits[0].page_content)
vector_2 = embeddings.embed_query(all_splits[1].page_content)

# Vectors produced by the same model must share one dimensionality.
assert len(vector_1) == len(vector_2), "embedding lengths differ"
print(f"Generated vectors of length {len(vector_1)}\n")
print(vector_1[:10])




Generated vectors of length 3072

[-0.018799082711563133, -0.01684536731497232, -0.006476204171409481, 0.029218896297402392, 0.04165031256394557, 0.006610069680074223, -0.008972617513173404, 0.03884275134478352, -0.004536961061508215, 0.02856765783187212]


# Vector Stores

In [21]:
# Embed every chunk and index it in a Chroma vector store.
# `Chroma` and `OpenAIEmbeddings` are imported in the notebook's top import cell.
# No persist_directory is passed, so the index is not written to disk here.
# NOTE: this uses a fresh OpenAIEmbeddings() with its default model, not the
# `embeddings` instance (text-embedding-3-large) from the embeddings demo;
# query vectors must come from the same default model to match dimensions.
# NOTE(review): re-running this cell in the same kernel may add the chunks to
# the same in-memory collection again and yield duplicate hits - confirm.
db = Chroma.from_documents(all_splits, OpenAIEmbeddings())





In [22]:
# Text query: the store embeds the string and returns the closest chunks.
query = "What is the Recommended AWS Knowledge"
docs = db.similarity_search(query)

# Show the text of the top-ranked chunk.
best_match = docs[0]
print(best_match.page_content)

Recommended AWS Knowledge 
 One or more years of hands-on experience developing and maintaining an AWS based application 
 In-depth knowledge of at least one high-level programming language 
 Understanding of core AWS services, uses, and basic AWS architecture best practices  
 Proficiency in developing, deploying, and debugging cloud-based applications using AWS 
 Ability to use the AWS service APIs, AWS CLI, and SDKs to write applications 
 Ability to identify key features of AWS services  
 Understanding of the AWS shared responsibility model 
 Understanding of application lifecycle management 
 Ability to use a CI/CD pipeline to deploy applications on AWS 
 Ability to use or interact with AWS services  
 Ability to apply a basic understanding of cloud-native applications to write code 
 Ability to write code using AWS security best practices (e.g., not using secret and access keys in the code, 
instead using IAM roles)


## Similarity search by vector

In [23]:
# Same search, but embed the query ourselves and hand the raw vector to the store.
# A default OpenAIEmbeddings() is used, matching how the store was built.
query_embedder = OpenAIEmbeddings()
embedding_vector = query_embedder.embed_query(query)
docs = db.similarity_search_by_vector(embedding_vector)

# Show the text of the top-ranked chunk.
top_hit = docs[0]
print(top_hit.page_content)

Recommended AWS Knowledge 
 One or more years of hands-on experience developing and maintaining an AWS based application 
 In-depth knowledge of at least one high-level programming language 
 Understanding of core AWS services, uses, and basic AWS architecture best practices  
 Proficiency in developing, deploying, and debugging cloud-based applications using AWS 
 Ability to use the AWS service APIs, AWS CLI, and SDKs to write applications 
 Ability to identify key features of AWS services  
 Understanding of the AWS shared responsibility model 
 Understanding of application lifecycle management 
 Ability to use a CI/CD pipeline to deploy applications on AWS 
 Ability to use or interact with AWS services  
 Ability to apply a basic understanding of cloud-native applications to write code 
 Ability to write code using AWS security best practices (e.g., not using secret and access keys in the code, 
instead using IAM roles)


In [31]:
docs = await db.asimilarity_search(query)
docs

[Document(page_content='Recommended AWS Knowledge \n\uf0b7 One or more years of hands-on experience developing and maintaining an AWS based application \n\uf0b7 In-depth knowledge of at least one high-level programming language \n\uf0b7 Understanding of core AWS services, uses, and basic AWS architecture best practices  \n\uf0b7 Proficiency in developing, deploying, and debugging cloud-based applications using AWS \n\uf0b7 Ability to use the AWS service APIs, AWS CLI, and SDKs to write applications \n\uf0b7 Ability to identify key features of AWS services  \n\uf0b7 Understanding of the AWS shared responsibility model \n\uf0b7 Understanding of application lifecycle management \n\uf0b7 Ability to use a CI/CD pipeline to deploy applications on AWS \n\uf0b7 Ability to use or interact with AWS services  \n\uf0b7 Ability to apply a basic understanding of cloud-native applications to write code \n\uf0b7 Ability to write code using AWS security best practices (e.g., not using secret and access

# Creating a retriever from a vectorstore

In [33]:
# Wrap the vector store in a retriever with default settings and query it.
retriever = db.as_retriever()
docs = retriever.invoke(
    "What is the Recommended AWS Knowledge"
)
print(docs)




[Document(page_content='Recommended AWS Knowledge \n\uf0b7 One or more years of hands-on experience developing and maintaining an AWS based application \n\uf0b7 In-depth knowledge of at least one high-level programming language \n\uf0b7 Understanding of core AWS services, uses, and basic AWS architecture best practices  \n\uf0b7 Proficiency in developing, deploying, and debugging cloud-based applications using AWS \n\uf0b7 Ability to use the AWS service APIs, AWS CLI, and SDKs to write applications \n\uf0b7 Ability to identify key features of AWS services  \n\uf0b7 Understanding of the AWS shared responsibility model \n\uf0b7 Understanding of application lifecycle management \n\uf0b7 Ability to use a CI/CD pipeline to deploy applications on AWS \n\uf0b7 Ability to use or interact with AWS services  \n\uf0b7 Ability to apply a basic understanding of cloud-native applications to write code \n\uf0b7 Ability to write code using AWS security best practices (e.g., not using secret and access

## Maximum marginal relevance retrieval
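By default, the vector store retriever uses similarity search. If the underlying vector store supports maximum marginal relevance (MMR) search, which balances relevance to the query against diversity among the returned documents, you can select it by passing `search_type="mmr"`.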



In [34]:
# Retriever that uses maximum marginal relevance instead of plain similarity.
retriever1 = db.as_retriever(
    search_type="mmr",
)
docs = retriever1.invoke(
    "What is the Recommended AWS Knowledge"
)
print(docs)








[Document(page_content='Recommended AWS Knowledge \n\uf0b7 One or more years of hands-on experience developing and maintaining an AWS based application \n\uf0b7 In-depth knowledge of at least one high-level programming language \n\uf0b7 Understanding of core AWS services, uses, and basic AWS architecture best practices  \n\uf0b7 Proficiency in developing, deploying, and debugging cloud-based applications using AWS \n\uf0b7 Ability to use the AWS service APIs, AWS CLI, and SDKs to write applications \n\uf0b7 Ability to identify key features of AWS services  \n\uf0b7 Understanding of the AWS shared responsibility model \n\uf0b7 Understanding of application lifecycle management \n\uf0b7 Ability to use a CI/CD pipeline to deploy applications on AWS \n\uf0b7 Ability to use or interact with AWS services  \n\uf0b7 Ability to apply a basic understanding of cloud-native applications to write code \n\uf0b7 Ability to write code using AWS security best practices (e.g., not using secret and access

# Passing search parameters
We can pass parameters to the underlying vector store's search methods using `search_kwargs`.



In [39]:
# Retriever that applies a 0.7 score threshold to search results.
threshold_settings = {"score_threshold": 0.7}
retriever = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs=threshold_settings,
)

# Query about a topic the PDF presumably does not cover, to see what the threshold lets through.
docs = retriever.invoke("Whos is Jeff Bezos")
print(docs)
print(len(docs))


[]
0




In [42]:
# Rebuild the score-threshold retriever here instead of relying on whichever
# cell last assigned the global `retriever`: the recorded execution counts show
# the k=1 cell (In [41]) ran before this one (In [42]), so the global may not
# have been the threshold retriever when the saved output was produced.
retriever = db.as_retriever(
    search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.7}
)

# On-topic query: matching chunks should clear the 0.7 threshold.
docs = retriever.invoke("What is the Recommended AWS Knowledge")
print(docs)
len(docs)


[Document(page_content='Recommended AWS Knowledge \n\uf0b7 One or more years of hands-on experience developing and maintaining an AWS based application \n\uf0b7 In-depth knowledge of at least one high-level programming language \n\uf0b7 Understanding of core AWS services, uses, and basic AWS architecture best practices  \n\uf0b7 Proficiency in developing, deploying, and debugging cloud-based applications using AWS \n\uf0b7 Ability to use the AWS service APIs, AWS CLI, and SDKs to write applications \n\uf0b7 Ability to identify key features of AWS services  \n\uf0b7 Understanding of the AWS shared responsibility model \n\uf0b7 Understanding of application lifecycle management \n\uf0b7 Ability to use a CI/CD pipeline to deploy applications on AWS \n\uf0b7 Ability to use or interact with AWS services  \n\uf0b7 Ability to apply a basic understanding of cloud-native applications to write code \n\uf0b7 Ability to write code using AWS security best practices (e.g., not using secret and access

1

# Specifying top k
We can also cap the number of documents the retriever returns by setting `k` in `search_kwargs`.

In [41]:
# Ask the retriever for only the single best match (k=1).
top_k_settings = {"k": 1}
retriever = db.as_retriever(search_kwargs=top_k_settings)
docs = retriever.invoke(
    "What is the Recommended AWS Knowledge"
)
len(docs)

1