In [1]:
# Client Setup
import boto3

#client = boto3.client("bedrock-runtime", region_name="us-east-1")
session = boto3.Session(profile_name="bedrock-dev")
client = session.client("bedrock-runtime", region_name="us-west-2")
# Claude model ID â€” must be correct and available in the region
model_id = "us.anthropic.claude-3-sonnet-20240229-v1:0"

# Note: You might have to request access to this model on the AWS Bedrock console
#embedding_model_id = "amazon.titan-embed-text-v2:0"
embedding_model_id = "amazon.titan-embed-text-v1"


In [2]:
# Chunk by section
import re


def chunk_by_section(document_text):
    pattern = r"\n## "
    return re.split(pattern, document_text)

In [3]:
# Embedding Generation
import json


def generate_embedding_org(
    text,
    embedding_model_id="amazon.titan-embed-text-v1",
    dimensions=1024,
    normalize=True,
):
    #request_body = {
    #    "inputText": text,
    #    "dimensions": dimensions,
    #    "normalize": normalize,
    #}
    request_body = {
        "inputText": text,
    }

    request_json = json.dumps(request_body)
    response = client.invoke_model(
        modelId=embedding_model_id,
        body=request_json,
        accept="application/json",
        contentType="application/json",
    )
    response_body = json.loads(response.get("body").read())

    return response_body["embedding"]

In [4]:
def generate_embedding(text):
    request_body = {
        "inputText": text
    }

    request_json = json.dumps(request_body)

    response = client.invoke_model(
        modelId=embedding_model_id,
        body=request_json,
        accept="application/json",
        contentType="application/json",
    )

    response_body = json.loads(response['body'].read())
    return response_body['embedding']


In [5]:
# VectorIndex implementation
import math
from typing import Callable, Optional, Any, List, Dict, Tuple


class VectorIndex:
    def __init__(
        self,
        distance_metric: str = "cosine",
        embedding_fn: Optional[Callable[[str], List[float]]] = None,
    ):
        self.vectors: List[List[float]] = []
        self.documents: List[Dict[str, Any]] = []
        self._vector_dim: Optional[int] = None
        if distance_metric not in ["cosine", "euclidean"]:
            raise ValueError("distance_metric must be 'cosine' or 'euclidean'")
        self._distance_metric = distance_metric
        self._embedding_fn = embedding_fn

    def add_document(self, document: Dict[str, Any]):
        if not self._embedding_fn:
            raise ValueError(
                "Embedding function not provided during initialization."
            )
        if not isinstance(document, dict):
            raise TypeError("Document must be a dictionary.")
        if "content" not in document:
            raise ValueError(
                "Document dictionary must contain a 'content' key."
            )

        content = document["content"]
        if not isinstance(content, str):
            raise TypeError("Document 'content' must be a string.")

        vector = self._embedding_fn(content)
        self.add_vector(vector=vector, document=document)

    def search(
        self, query: Any, k: int = 1
    ) -> List[Tuple[Dict[str, Any], float]]:
        if not self.vectors:
            return []

        if isinstance(query, str):
            if not self._embedding_fn:
                raise ValueError(
                    "Embedding function not provided for string query."
                )
            query_vector = self._embedding_fn(query)
        elif isinstance(query, list) and all(
            isinstance(x, (int, float)) for x in query
        ):
            query_vector = query
        else:
            raise TypeError(
                "Query must be either a string or a list of numbers."
            )

        if self._vector_dim is None:
            return []

        if len(query_vector) != self._vector_dim:
            raise ValueError(
                f"Query vector dimension mismatch. Expected {self._vector_dim}, got {len(query_vector)}"
            )

        if k <= 0:
            raise ValueError("k must be a positive integer.")

        if self._distance_metric == "cosine":
            dist_func = self._cosine_distance
        else:
            dist_func = self._euclidean_distance

        distances = []
        for i, stored_vector in enumerate(self.vectors):
            distance = dist_func(query_vector, stored_vector)
            distances.append((distance, self.documents[i]))

        distances.sort(key=lambda item: item[0])

        return [(doc, dist) for dist, doc in distances[:k]]

    def add_vector(self, vector: List[float], document: Dict[str, Any]):
        if not isinstance(vector, list) or not all(
            isinstance(x, (int, float)) for x in vector
        ):
            raise TypeError("Vector must be a list of numbers.")
        if not isinstance(document, dict):
            raise TypeError("Document must be a dictionary.")
        if "content" not in document:
            raise ValueError(
                "Document dictionary must contain a 'content' key."
            )

        if not self.vectors:
            self._vector_dim = len(vector)
        elif len(vector) != self._vector_dim:
            raise ValueError(
                f"Inconsistent vector dimension. Expected {self._vector_dim}, got {len(vector)}"
            )

        self.vectors.append(list(vector))
        self.documents.append(document)

    def _euclidean_distance(
        self, vec1: List[float], vec2: List[float]
    ) -> float:
        if len(vec1) != len(vec2):
            raise ValueError("Vectors must have the same dimension")
        return math.sqrt(sum((p - q) ** 2 for p, q in zip(vec1, vec2)))

    def _dot_product(self, vec1: List[float], vec2: List[float]) -> float:
        if len(vec1) != len(vec2):
            raise ValueError("Vectors must have the same dimension")
        return sum(p * q for p, q in zip(vec1, vec2))

    def _magnitude(self, vec: List[float]) -> float:
        return math.sqrt(sum(x * x for x in vec))

    def _cosine_distance(self, vec1: List[float], vec2: List[float]) -> float:
        if len(vec1) != len(vec2):
            raise ValueError("Vectors must have the same dimension")

        mag1 = self._magnitude(vec1)
        mag2 = self._magnitude(vec2)

        if mag1 == 0 and mag2 == 0:
            return 0.0
        elif mag1 == 0 or mag2 == 0:
            return 1.0

        dot_prod = self._dot_product(vec1, vec2)
        cosine_similarity = dot_prod / (mag1 * mag2)
        cosine_similarity = max(-1.0, min(1.0, cosine_similarity))

        return 1.0 - cosine_similarity

    def __len__(self) -> int:
        return len(self.vectors)

    def __repr__(self) -> str:
        has_embed_fn = "Yes" if self._embedding_fn else "No"
        return f"VectorIndex(count={len(self)}, dim={self._vector_dim}, metric='{self._distance_metric}', has_embedding_fn='{has_embed_fn}')"

In [11]:
with open("./013_report.md", "r") as f:
    text = f.read()

In [12]:
# 1. Chunk the text by section
chunks = chunk_by_section(text)
chunks[3]

'Methodology\n\nThe insights compiled within this Annual Interdisciplinary Research Review represent a synthesis of findings drawn from standard departmental reporting cycles, specialized project updates, and cross-functional review meetings conducted throughout the year. Data sources included internal project databases, laboratory notebooks, financial reporting systems, legal case summaries, security incident logs, and minutes from dedicated working groups. A central review committee, comprising representatives nominated by each division head, was tasked with identifying key developments and potential cross-domain implications. This committee utilized a standardized reporting template to capture essential details, including unique identifiers (project codes, error numbers, case references, etc.) and progress metrics. Subsequent analysis focused on identifying thematic overlaps, shared challenges, and opportunities for synergistic development, forming the basis of this consolidated rep

In [13]:
# 2. Generate embeddings for each chunk
# loop and generating embedding for each one

embeddings = [generate_embedding(chunk) for chunk in chunks]
embeddings[0]

[1.0703125,
 0.54296875,
 -0.21484375,
 -0.43359375,
 0.045654296875,
 -0.412109375,
 -0.0556640625,
 -0.0003414154052734375,
 -0.2109375,
 -0.1953125,
 0.55078125,
 -0.1328125,
 0.171875,
 -0.1630859375,
 -0.13671875,
 -0.51171875,
 -0.04736328125,
 -0.486328125,
 -0.69921875,
 0.1845703125,
 -0.0732421875,
 0.173828125,
 0.34375,
 -0.52734375,
 -0.294921875,
 0.08935546875,
 0.57421875,
 0.255859375,
 0.07177734375,
 0.466796875,
 -0.2216796875,
 1.1015625,
 0.3359375,
 0.1318359375,
 0.2138671875,
 0.279296875,
 -0.11181640625,
 0.2177734375,
 0.26171875,
 0.4609375,
 -0.38671875,
 -0.72265625,
 0.76953125,
 0.1513671875,
 -0.70703125,
 -0.71875,
 0.12890625,
 -0.37109375,
 0.34375,
 -0.279296875,
 0.365234375,
 -0.1298828125,
 -0.59765625,
 0.2890625,
 -0.119140625,
 -0.2236328125,
 0.48828125,
 0.400390625,
 0.1318359375,
 -0.16015625,
 0.384765625,
 -0.279296875,
 0.1650390625,
 1.1875,
 -0.0025177001953125,
 -0.361328125,
 0.2314453125,
 0.08642578125,
 0.2392578125,
 0.3203125,

In [14]:
# 3. Create a vector store and add each embedding to it
store = VectorIndex()


for embedding, chunk in zip(embeddings,chunks):
    # we generate don't have info about vector nnumber
    # we can associate chunk with it just to remmeber
    store.add_vector(embedding, {"content": chunk}) 
    
    

In [15]:
# 4. Some time later, a user will ask a question. Generate an embedding for it
user_embedding = generate_embedding("what happened with INC-2023-Q4-Q11")

In [None]:
# 5. Search the store with the embedding, find the 2 most relevant chunks
# INC-2023-Q4-011 --> this is appearing in section2. now search with this terms.
# to improve search we will implement parallel lexical search system
results = store.search(user_embedding, 2)

for doc, distance in results:
    print(distance,"\n", doc["content"][0:200], "\n")

0.4319377534761152 
 Section 10: Cybersecurity Analysis - Incident Response Report: INC-2023-Q4-011

The Cybersecurity Operations Center successfully contained and remediated a targeted intrusion attempt tracked as `INC-2 

0.4808950090253611 
 Section 3: Financial Analysis - Q3 Performance and Outlook

Quarterly financial analysis revealed a complex picture. Overall group revenue saw modest growth of 3.1% year-over-year, primarily driven by 

