# Semantic Search using pgvector and Amazon Aurora PostgreSQL

## Setup
Install the Python libraries this notebook requires.

In [None]:
# Download the Bedrock dependencies by running the local helper script.
# NOTE(review): presumably this populates ./bedrock-python-sdk, which the next
# cell installs wheels from - confirm against the script.
!bash ./download-dependencies.sh

In [None]:
# Install the awscli, boto3 and botocore wheels found under ./bedrock-python-sdk,
# forcing a reinstall over whatever versions are already present.
%pip install --no-build-isolation --force-reinstall \
    ./bedrock-python-sdk/awscli-*-py3-none-any.whl \
    ./bedrock-python-sdk/boto3-*-py3-none-any.whl \
   ./bedrock-python-sdk/botocore-*-py3-none-any.whl
# Pin LangChain to 0.0.249 and constrain pypdf to >=3.8,<4.
%pip install --quiet langchain==0.0.249 "pypdf>=3.8,<4"

In [None]:
!pip install -r requirements.txt

In [None]:
# --- Amazon Bedrock ---------------------------------------------------------
# Region and endpoint handed to the boto3 "bedrock" client.
BEDROCK_REGION = "us-west-2"
BEDROCK_ENDPOINT_URL = "https://prod.us-west-2.frontend.bedrock.aws.dev"
# Model id passed to BedrockEmbeddings.
embeddings_model_endpoint_name = "amazon.titan-embed-g1-text-02"

# --- Input documents --------------------------------------------------------
# Local directory given to the PDF directory loader.
input_data_dir = "data"
# Only referenced by the commented-out S3 loader later in the notebook.
SOURCE_BUCKET = "llmwhitepapers09152023"

# --- Chunking ---------------------------------------------------------------
# The text splitter measures length with len(), so these are character counts.
CHUNK_SIZE_FOR_DOC_SPLIT = 600
CHUNK_OVERLAP_FOR_DOC_SPLIT = 0

## Open-source extension pgvector for PostgreSQL

[pgvector](https://github.com/pgvector/pgvector) is an open-source extension for PostgreSQL that allows you to store and search vector embeddings for exact and approximate nearest neighbors. It is designed to work seamlessly with other PostgreSQL features, including indexing and querying.

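The LangChain `PGVector` integration needs a connection string for the database. The next cell reads the database credentials from AWS Secrets Manager and enables the `vector` extension.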

In [None]:
import psycopg2
from pgvector.psycopg2 import register_vector
import boto3 
import json 

# Read the database credentials from AWS Secrets Manager. The secret string is
# parsed as JSON and must contain the keys host, port, username and password.
client = boto3.client('secretsmanager')

response = client.get_secret_value(
    SecretId='apgpg-pgvector-secret'
)
database_secrets = json.loads(response['SecretString'])

dbhost = database_secrets['host']
dbport = database_secrets['port']
dbuser = database_secrets['username']
dbpass = database_secrets['password']

# Name the database explicitly: without dbname libpq falls back to a database
# named after the user, whereas the PGVector connection string built later in
# this notebook targets "postgres". Extensions are per-database, so the
# extension has to be created in the same database the vector store uses.
dbconn = psycopg2.connect(
    host=dbhost,
    port=dbport,
    user=dbuser,
    password=dbpass,
    dbname="postgres",
    connect_timeout=10,
)
try:
    # CREATE EXTENSION should take effect immediately, without an explicit commit.
    dbconn.set_session(autocommit=True)
    with dbconn.cursor() as cur:
        cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
    # Registering the vector type on this connection happens only after the
    # extension has been created.
    register_vector(dbconn)
finally:
    # Always release the connection, even when one of the steps above fails.
    dbconn.close()
print ("Extension created.")
     

In [None]:
#from PyPDF2 import PdfReader
from langchain.vectorstores.pgvector import PGVector, DistanceStrategy
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import BedrockEmbeddings
from langchain.llms.bedrock import Bedrock
from langchain.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from typing import List, Tuple
#from langchain.document_loaders import S3DirectoryLoader


def get_vectorstore(docs):
    """Return a PGVector store bound to the notebook's collection.

    Reads the module-level names ``boto3_bedrock``,
    ``embeddings_model_endpoint_name``, ``collection_name`` and
    ``connection_string`` at call time, so they must be defined before the
    first call.

    Args:
        docs: Documents to pass to ``PGVector.from_documents``, or ``None`` to
            open the store without inserting anything.

    Returns:
        A ``PGVector`` instance for ``collection_name``.
    """
    embeddings = BedrockEmbeddings(client=boto3_bedrock, model_id=embeddings_model_endpoint_name)
    if docs is None:
        # Pass collection_name here as well; otherwise this branch would open
        # the library's default collection instead of the one the documents
        # are written to by the branch below.
        return PGVector(
            connection_string=connection_string,
            embedding_function=embeddings,
            collection_name=collection_name,
        )
    return PGVector.from_documents(
        documents=docs,
        embedding=embeddings,
        collection_name=collection_name,
        connection_string=connection_string,
    )


from urllib.parse import quote

# Bedrock client used to compute embeddings.
boto3_bedrock = boto3.client(
    service_name='bedrock',
    region_name=BEDROCK_REGION,
    endpoint_url=BEDROCK_ENDPOINT_URL
)

embeddings = BedrockEmbeddings(client=boto3_bedrock, model_id=embeddings_model_endpoint_name)
collection_name = "llm_docs"

# The connection string is a URL, so the password is percent-encoded first:
# characters such as "@", "/" or "%" in a raw password would otherwise be
# misread as URL syntax. quote(..., safe="") leaves alphanumeric passwords
# unchanged.
connection_string = PGVector.connection_string_from_db_params(
    driver="psycopg2",
    user=dbuser,
    password=quote(dbpass, safe=""),
    host=dbhost,
    port=dbport,
    database="postgres",
)

# Split on paragraph breaks first, then single newlines, sentences and spaces;
# chunk length is measured in characters (len).
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " "],
    chunk_size=CHUNK_SIZE_FOR_DOC_SPLIT,
    chunk_overlap=CHUNK_OVERLAP_FOR_DOC_SPLIT,
    length_function=len
)
loader = PyPDFDirectoryLoader(input_data_dir)
docs = loader.load_and_split(text_splitter)

# s3loader = S3DirectoryLoader(SOURCE_BUCKET)
# s3documents = s3loader.load()

# NOTE(review): re-running this cell presumably inserts the same chunks into
# the collection again - confirm before re-executing against a populated table.
vectorstore = get_vectorstore(docs)

print('documents loaded ...')

## Similarity search with score

Run a similarity search using the [similarity_search_with_score](https://python.langchain.com/docs/modules/data_connection/vectorstores/integrations/pgvector) method of LangChain's `PGVector` vector store, which returns each matching document together with its score.

In [None]:
# Natural-language question used for the similarity search.
query = "What security measures does SageMaker have?"
# The result is iterated as (document, score) pairs by the next cell.
docs_with_score = vectorstore.similarity_search_with_score(query)

In [None]:
# Show every hit framed by 80-dash rules: score, chunk text, then metadata.
for doc, score in docs_with_score:
    print(
        "-" * 80,
        f"Score:  {score!s}",
        doc.page_content,
        doc.metadata,
        "-" * 80,
        sep="\n",
    )

## Calculate cosine similarity

Create a store that uses the cosine distance strategy, and a retriever that returns only the single best match (`k=1`).

In [None]:
# Open the existing collection with the cosine distance strategy. The
# collection_name variable is reused instead of repeating the "llm_docs"
# literal, so this store always points at the collection the documents were
# loaded into.
store = PGVector(
    connection_string=connection_string,
    embedding_function=embeddings,
    collection_name=collection_name,
    distance_strategy=DistanceStrategy.COSINE
)

# k=1: the retriever is asked for the single closest document only.
retriever = store.as_retriever(search_kwargs={"k": 1})

In [None]:
# Ask a second question through the retriever configured with k=1; as the
# cell's last expression, the returned documents are displayed as its output.
retriever.get_relevant_documents(query='What ML governance tools does SageMaker provide?')