In [1]:
import json
import os
import sys
import numpy as np
from urllib.request import urlretrieve
import ssl
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain.embeddings import BedrockEmbeddings
from langchain.llms.bedrock import Bedrock

import boto3

# Bedrock runtime client shared by the LLM and the embedding model below.
# Credentials are looked up in the environment under the lower-case names
# 'aws_access_key_id' / 'aws_secret_access_key'; a missing variable yields None.
bedrock_runtime = boto3.client(
    service_name="bedrock-runtime",
    region_name="us-west-2",
    aws_access_key_id=os.environ.get("aws_access_key_id"),
    aws_secret_access_key=os.environ.get("aws_secret_access_key"),
)

In [2]:
# Text-generation model; completions are capped at 200 sampled tokens.
llm = Bedrock(
    model_id="anthropic.claude-v2",
    client=bedrock_runtime,
    model_kwargs={"max_tokens_to_sample": 200},
)

# Embedding model used for both document chunks and queries.
bedrock_embeddings = BedrockEmbeddings(
    model_id="amazon.titan-embed-text-v1",
    client=bedrock_runtime,
)

In [3]:
# Download the source PDFs into ./data.
#
# NOTE(review): this cell previously set
#   ssl._create_default_https_context = ssl._create_unverified_context
# which switches off TLS certificate verification for every urllib HTTPS request
# made in this kernel. That override has been removed so downloads are verified.
# If verification fails locally, fix the machine's CA bundle rather than
# disabling verification.
os.makedirs("data", exist_ok=True)
files = [
    "https://www.cdc.gov/vaccines/imz-managers/downloads/COVID-19-Vaccination-Program-Interim_Playbook.pdf",
    "https://iris.who.int/bitstream/handle/10665/89966/9789241506021_eng.pdf",
]
for url in files:
    # The local file name is the last path segment of the URL.
    file_path = os.path.join("data", url.rpartition("/")[2])
    if os.path.exists(file_path):
        # Already fetched by an earlier run: keeps "Restart & Run All" cheap.
        continue
    # Download to a temporary name first so an interrupted transfer never leaves
    # a truncated file under the final ".pdf" name.
    partial_path = file_path + ".part"
    urlretrieve(url, partial_path)
    os.replace(partial_path, file_path)

In [4]:
# Read every PDF found under ./data/ into LangChain documents.
loader = PyPDFDirectoryLoader("./data/")
documents = loader.load()

# Break the loaded documents into chunks of at most 1000 characters,
# with 100 characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
text_to_process = text_splitter.split_documents(documents)

In [5]:
def compute_avg_length(text_to_process):
    """Return the average ``page_content`` length of a collection, floored to an int.

    Args:
        text_to_process: A sized collection (e.g. a list) of objects that each
            expose a ``page_content`` string.

    Returns:
        int: Total number of characters divided by the number of items, using
        integer (floor) division. Returns 0 for an empty collection instead of
        raising ``ZeroDivisionError``.
    """
    item_count = len(text_to_process)
    if item_count == 0:
        # Nothing was loaded or split; report 0 rather than dividing by zero.
        return 0
    total_length = sum(len(text.page_content) for text in text_to_process)
    return total_length // item_count

# Compare the average text length before splitting (documents as returned by the
# loader) and after splitting (chunks produced by the text splitter).
# compute_avg_length uses integer division, so both averages are whole numbers.
initial_avg_length = compute_avg_length(documents)
final_avg_length = compute_avg_length(text_to_process)

# Report the counts and averages so the effect of the split is visible.
print(f'Initial average length across {len(documents)} loaded documents is {initial_avg_length} characters.')
print(f'Post-split, there are {len(text_to_process)} documents, an increase from the original {len(documents)} documents.')
print(f'Final average length across {len(text_to_process)} documents is {final_avg_length} characters.')

Initial average length across 166 loaded documents is 3108 characters.
Post-split, there are 639 documents, an increase from the original 166 documents.
Final average length across 639 documents is 836 characters.


In [6]:
# Sanity check: embed the text of the first chunk and inspect the resulting vector.
# The recorded output below shows a vector of shape (1536,).
example_embedding = np.array(bedrock_embeddings.embed_query(text_to_process[0].page_content))
print("Example embedding for a document segment: ", example_embedding)
print("Dimensions of the embedding: ", example_embedding.shape)

Example embedding for a document segment:  [ 1.203125    0.25195312 -0.19238281 ...  0.296875   -0.234375
 -0.53515625]
Dimensions of the embedding:  (1536,)


In [7]:
from langchain.chains.question_answering import load_qa_chain
from langchain.vectorstores import FAISS
from langchain.indexes import VectorstoreIndexCreator
from langchain.indexes.vectorstore import VectorStoreIndexWrapper

# Embed every chunk with the Bedrock embedding model and index the vectors in FAISS.
vectorstore_faiss = FAISS.from_documents(
    documents=text_to_process,
    embedding=bedrock_embeddings,
)

# Wrap the FAISS store in LangChain's index wrapper.
wrapper_store_faiss = VectorStoreIndexWrapper(vectorstore=vectorstore_faiss)

In [8]:
# Natural-language question that is embedded and matched against the indexed chunks.
query = """What measures are recommended for monitoring and evaluating the effectiveness of the vaccination program?"""

In [9]:
# Embed the query with the same model that embedded the indexed chunks.
# This calls the embeddings object directly (as the example-embedding cell does)
# instead of vectorstore_faiss.embedding_function(query): depending on the
# installed LangChain version, that attribute is either a bare callable or an
# Embeddings object, and calling the latter fails.
query_embedding = bedrock_embeddings.embed_query(query)
np.array(query_embedding)

array([-1.0839844e-01, -1.7675781e-01,  6.7968750e-01, ...,
        8.9645386e-04,  9.0234375e-01, -2.1972656e-01])

In [10]:
# Retrieve the chunks whose embeddings are closest to the query embedding.
# No result count is passed, so the store's default applies; the recorded output
# below shows 4 documents returned.
matched_documents = vectorstore_faiss.similarity_search_by_vector(query_embedding)
print(f'{len(matched_documents)} documents retrieved relevant to the query.')
print('----------------------------------------')
# Print each match with a 1-based index, followed by a separator line.
for idx, match_doc in enumerate(matched_documents):
    print(f'## Document {idx+1}: {match_doc.page_content}.......')
    print('----------------------------------------')

4 documents retrieved relevant to the query.
----------------------------------------
## Document 1: Immediate Priorities for Immunization Programs Related to Data Reporting:
• Determine and implement a solution for documenting vaccine administration in temporary or high
-volume settings (e.g.,
VAMS or similar application, IIS or module
that interfaces with the IIS, or other jurisdiction -based solution)
• Ensure system capacity for data exchange,
security, storage, and reporting
• Enroll vaccination provider facilities/organizations anticipated to vaccinate essential workers
• Connect IIS to the IZ Gateway
• Establish required data use agreements
• Assess and improve data quality
o Ensure data  are available, secure,
complete, timely, valid, accurate,
consistent, and unique.......
----------------------------------------
## Document 2: o Focus on ensuring equitable vaccination  access across the entire pop ulation.
Monitor vaccine
uptake and coverage ; reassess strategy to increase up