In [8]:
import json
import sqlite3
from typing import Tuple, Iterator
from typing import Sequence, List, Optional
from langchain.schema import BaseStore
from pypika import Query, Table, Field, Column


class ChromaStore(BaseStore[str, bytes]):
    """Key/value store for bytes kept in a per-user table of a SQLite file.

    The table is named ``docstore_<user_id>`` and lives in
    ``<path>/chroma.sqlite3``. Each value is stored as the JSON encoding of
    the UTF-8-decoded bytes, so values must be valid UTF-8.

    All statements bind keys and values as SQL parameters instead of
    formatting them into the statement text.
    """

    # Number of keys bound per SELECT. Kept well below 999, the default limit
    # on bound parameters in older SQLite builds.
    _KEY_BATCH_SIZE = 500

    def __init__(self, path, user_id):
        """Remember the storage location and make sure the table exists.

        Args:
            path: Directory that contains (or will contain) ``chroma.sqlite3``.
            user_id: Suffix used to build the table name ``docstore_<user_id>``.
        """
        self.path = path
        self.table_name = "docstore_{}".format(user_id)
        # The pypika objects are no longer used to build statements; they are
        # kept as attributes so code that reads them keeps working.
        self.table = Table(self.table_name)
        self.id_column = Field('id')
        self.data_column = Field('data')
        self._create_table()

    def get_connection(self):
        """Open a new connection to ``<path>/chroma.sqlite3``. The caller closes it."""
        return sqlite3.connect('{path}/chroma.sqlite3'.format(path=self.path))

    def _quoted_table(self) -> str:
        """Return the table name as a double-quoted SQL identifier."""
        return '"{}"'.format(self.table_name.replace('"', '""'))

    def _fetch(self, sql: str, params: Sequence = ()) -> list:
        """Run one read statement and return all rows, closing the connection."""
        connection = self.get_connection()
        try:
            return connection.execute(sql, params).fetchall()
        finally:
            connection.close()

    def _write(self, statements) -> None:
        """Run ``(sql, rows)`` pairs with ``executemany`` in a single transaction."""
        connection = self.get_connection()
        try:
            # `with connection` only commits or rolls back; it does not close.
            with connection:
                for sql, rows in statements:
                    connection.executemany(sql, rows)
        finally:
            connection.close()

    def _create_table(self):
        """Create the backing table if it does not exist yet."""
        ddl = (
            'CREATE TABLE IF NOT EXISTS {table} '
            '("id" VARCHAR(50) NOT NULL, "data" VARCHAR(2500) NOT NULL)'
        ).format(table=self._quoted_table())
        connection = self.get_connection()
        try:
            with connection:
                connection.execute(ddl)
        finally:
            connection.close()

    @staticmethod
    def _decode(stored: Optional[str]) -> Optional[bytes]:
        """Turn a stored JSON string back into bytes; ``None`` stays ``None``."""
        if stored is None:
            return None
        return json.loads(stored).encode("utf-8")

    def mget(self, keys: Sequence[str]) -> List[Optional[bytes]]:
        """Return the values for ``keys``, in the same order as ``keys``.

        Keys that are not present yield ``None`` at their position.
        """
        wanted = [str(key) for key in keys]
        found = {}
        for start in range(0, len(wanted), self._KEY_BATCH_SIZE):
            batch = wanted[start:start + self._KEY_BATCH_SIZE]
            sql = 'SELECT "id", "data" FROM {table} WHERE "id" IN ({marks}) ORDER BY rowid'.format(
                table=self._quoted_table(),
                marks=", ".join("?" * len(batch)),
            )
            # Rows arrive in insertion order, so if a table still holds
            # duplicate ids the most recently inserted value wins.
            for row_id, data in self._fetch(sql, batch):
                found[row_id] = data
        return [self._decode(found.get(key)) for key in wanted]

    def mset(self, key_value_pairs: Sequence[Tuple[str, bytes]]) -> None:
        """Store each ``(key, value)`` pair, replacing any existing value.

        The table has no unique constraint on ``id``, so replacement is done
        as delete-then-insert inside one transaction.
        """
        latest = {}
        for key, value in key_value_pairs:
            latest[str(key)] = json.dumps(value.decode('utf-8'))
        if not latest:
            return
        table = self._quoted_table()
        self._write([
            ('DELETE FROM {table} WHERE "id" = ?'.format(table=table),
             [(key,) for key in latest]),
            ('INSERT INTO {table} ("id", "data") VALUES (?, ?)'.format(table=table),
             list(latest.items())),
        ])

    def mdelete(self, keys: Sequence[str]) -> None:
        """Delete the rows for ``keys``; keys that do not exist are ignored."""
        key_rows = [(str(key),) for key in keys]
        if not key_rows:
            return
        self._write([
            ('DELETE FROM {table} WHERE "id" = ?'.format(table=self._quoted_table()), key_rows),
        ])

    def yield_keys(self, prefix: Optional[str] = None) -> Iterator[str]:
        """Yield every stored key, or only those starting with ``prefix``.

        The prefix is compared literally and case-sensitively; ``%`` and
        ``_`` in it are not treated as wildcards.
        """
        sql = 'SELECT DISTINCT "id" FROM {table}'.format(table=self._quoted_table())
        params = ()
        if prefix:
            sql += ' WHERE substr("id", 1, ?) = ?'
            params = (len(prefix), prefix)
        for (key,) in self._fetch(sql, params):
            yield key

In [9]:
import uuid
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_chroma import Chroma
from langchain_core.documents import Document
from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_postgres import PGVector
from langchain.storage._lc_store import create_kv_docstore


# Single source of truth for the on-disk location and the collection name.
# Both the Chroma vector store and the ChromaStore docstore are built from
# these values, so they cannot drift apart.
PERSIST_DIRECTORY = "../data/"
COLLECTION_NAME = "summaries"

embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Vector store: holds the embedded documents that are searched at query time.
vectorstore = Chroma(
    collection_name=COLLECTION_NAME,
    embedding_function=embedding_function,
    persist_directory=PERSIST_DIRECTORY,
)

# Docstore: ChromaStore opens `<path>/chroma.sqlite3` under the same directory
# and uses the table `docstore_<COLLECTION_NAME>`.
cs = ChromaStore(PERSIST_DIRECTORY, COLLECTION_NAME)

store = create_kv_docstore(cs)

# Metadata key the retriever is configured with to link the two stores.
id_key = "doc_id"

retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)



In [28]:
import chromadb
from chromadb.config import Settings

# Open the persistent Chroma database stored under ../data/.
# NOTE(review): `genai_model` in a later cell calls `client.chat.completions`,
# which is not a Chroma client; this assignment uses the same global name.
# Confirm which cell is meant to own `client`.
client = chromadb.Client(
    Settings(
        persist_directory="../data/",
        is_persistent=True,
    )
)

# Sanity check: report whether the "summaries" collection exists.
if any(collection.name == "summaries" for collection in client.list_collections()):
    print("Heelo")

Heelo


In [25]:
# NOTE(review): no visible cell assigns `coll` (the only assignment above is
# commented out), so this cell depends on leftover kernel state.
coll

Collection(name=summaries)

In [4]:
# Run the multi-vector retriever on a sample question and keep the result.
docs = retriever.invoke("Please Explain Attention Mechanism")

In [6]:
# Query the underlying vector store directly to see the matching documents
# together with their scores.
retriever.vectorstore.similarity_search_with_score("Please Explain Attention Mechanism")

[(Document(metadata={'doc_id': '6a9d4db7-a0a2-462c-9127-7dc1d8b7b355'}, page_content='The image presents a flowchart illustrating the process of using Scaled Dot-Product Attention (SDPA) to combine the outputs of multiple attention heads in a neural network.\n\n** Overview **\nThe flowchart is divided into several sections, each representing a different stage in the process.\n\n** Main Points **\n\n* **Input**\n\t+ Three input vectors: V, K, and Q\n\t+ Each vector has an arrow pointing to a box labeled "Linear"\n* **Linear Transformation**\n\t+ Each'),
  0.9818795248202128),
 (Document(metadata={'doc_id': 'bf818020-71c4-4575-a026-7d40aaafe723'}, page_content='The image presents a diagram illustrating the architecture of two neural network models: Scaled Dot-Product Attention and Multi-Head Attention. The diagram is divided into two sections, with the left side representing Scaled Dot-Product Attention and the right side representing Multi-Head Attention.\n\n**Scaled Dot-Product Attenti

In [7]:
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.messages import SystemMessage, HumanMessage
from base64 import b64decode
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate


import base64
from IPython.display import Image, display

def display_base64_image(base64_code):
    """Decode a base64 string and show the resulting image in the notebook.

    Args:
        base64_code: Base64-encoded image bytes.
    """
    display(Image(data=base64.b64decode(base64_code)))


def _is_base64_payload(content):
    """Return True when ``content`` is non-empty, strictly valid base64."""
    if not content:
        # Empty input decodes without error, but it is not an image.
        return False
    try:
        # validate=True rejects characters outside the base64 alphabet.
        # Without it they are silently discarded, so ordinary prose can
        # decode "successfully" and be mistaken for an image.
        b64decode(content, validate=True)
    except (ValueError, TypeError):
        # binascii.Error is a ValueError; TypeError covers non-string content.
        return False
    return True


def parse_docs(docs):
    """Split retrieved documents into base64 image payloads and text documents.

    Args:
        docs: Iterable of objects with a ``page_content`` attribute.

    Returns:
        A dict with two keys:
        ``"images"`` is the list of ``page_content`` strings that are valid base64;
        ``"texts"`` is the list of the remaining document objects, unchanged.

    Note:
        Validation is strict, so base64 containing line breaks is treated as
        text. Short alphanumeric strings whose length is a multiple of four
        are still valid base64 and are classified as images.
    """
    b64 = []
    text = []
    for doc in docs:
        content = getattr(doc, "page_content", None)
        if _is_base64_payload(content):
            b64.append(content)
        else:
            text.append(doc)
    return {"images": b64, "texts": text}


def build_prompt(kwargs):
    """Build a single multimodal user message from retrieved context.

    Args:
        kwargs: Mapping with ``"context"`` (a dict holding ``"texts"``, a list
            of objects with ``page_content``, and ``"images"``, a list of
            base64 strings) and ``"question"`` (the user's question).

    Returns:
        A one-element list holding a ``{"role": "user", "content": [...]}``
        message. The content starts with one text part and is followed by
        one ``image_url`` part per image.
    """
    context = kwargs["context"]
    question = kwargs["question"]

    # Text chunks are concatenated back to back, with no separator.
    joined_text = "".join(chunk.page_content for chunk in context["texts"])

    # Spelled out line by line so the indentation and trailing spaces of the
    # prompt are explicit.
    prompt_text = (
        "\n"
        "    Answer the question based only on the following context, which can include text, tables, and the images. \n"
        "    \n"
        f"    Context: {joined_text}\n"
        "    \n"
        f"    Question: {question}\n"
        "    "
    )

    text_part = {"type": "text", "text": prompt_text}
    image_parts = [
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
        }
        for encoded in context["images"]
    ]
    return [{"role": "user", "content": [text_part, *image_parts]}]
def genai_model(prompt_list):
    """Send chat messages to the model and return the reply text.

    Uses the module-level ``client`` and its ``chat.completions.create`` call.

    Args:
        prompt_list: List of chat messages, passed through as ``messages``.

    Returns:
        The ``message.content`` of the first returned choice.
    """
    completion = client.chat.completions.create(
        messages=prompt_list,
        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
        max_tokens=1200,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# Retrieval chain that returns its sources along with the answer:
#   1. "context": retrieve documents and split them with parse_docs;
#      "question": pass the input string through unchanged.
#   2. add "response" by building the prompt, calling the model and parsing
#      the result to a string.
chain_with_sources = {
    "context": retriever | RunnableLambda(parse_docs),
    "question": RunnablePassthrough(),
} | RunnablePassthrough().assign(
    response=(
        RunnableLambda(build_prompt)
        | RunnableLambda(genai_model)
        | StrOutputParser()
    )
)

response = chain_with_sources.invoke(
    "Explain Attention Mechanism"
)

print("Response:", response['response'])

print("\n\nContext:")
for text in response['context']['texts']:
    # Not every document carries a page number, so fall back to a
    # placeholder instead of raising KeyError.
    print("Page number: ", text.metadata.get('page_number', 'unknown'))
    print("\n" + "-"*50 + "\n")
for image in response['context']['images']:
    display_base64_image(image)

NameError: name 'StrOutputParser' is not defined

In [27]:
from unstructured.partition.docx import partition_docx

class WordDocParser:
    """Partition a .docx file into text chunks, tables and base64 images."""

    def __init__(self, docx_filename):
        """Store the path of the document to parse.

        Args:
            docx_filename: Path to the .docx file.
        """
        self.docx_filename = docx_filename

    @staticmethod
    def _is_kind(element, kind_name):
        """Return True when ``kind_name`` occurs in the element's type string.

        This is a substring match on ``str(type(element))``, so ``"Table"``
        also matches any type whose name merely contains ``Table``.
        """
        return kind_name in str(type(element))

    def parse_docx(self):
        """Partition the document and sort the chunks by kind.

        Returns:
            A tuple ``(texts_list, tables_list, images_list)``. The first two
            hold the chunks whose type matches ``CompositeElement`` and
            ``Table``; the third holds base64 image strings found inside the
            composite chunks.
        """
        chunks = partition_docx(
            filename=self.docx_filename,
            infer_table_structure=True,
            strategy="hi_res",
            extract_image_block_types=["Image"],
            extract_image_block_to_payload=True,
            chunking_strategy="by_title",
            max_characters=4000,
            combine_text_under_n_chars=1000,
            new_after_n_chars=3000,
        )

        tables_list = []
        texts_list = []

        for chunk in chunks:
            if self._is_kind(chunk, "Table"):
                tables_list.append(chunk)
            if self._is_kind(chunk, "CompositeElement"):
                texts_list.append(chunk)

        images_list = self._get_images_base64(chunks)
        return texts_list, tables_list, images_list

    def _get_images_base64(self, chunks):
        """Collect the base64 payloads of images nested in composite chunks.

        Chunks with no ``orig_elements`` and images with no payload are
        skipped, so the result contains only non-empty payloads.

        Args:
            chunks: Iterable of partitioned elements.

        Returns:
            List of ``image_base64`` values, in document order.
        """
        images_b64 = []
        for chunk in chunks:
            if not self._is_kind(chunk, "CompositeElement"):
                continue
            # orig_elements can be missing or None; treat both as "no children".
            originals = getattr(chunk.metadata, "orig_elements", None) or []
            for el in originals:
                if not self._is_kind(el, "Image"):
                    continue
                payload = getattr(el.metadata, "image_base64", None)
                if payload:
                    images_b64.append(payload)
        return images_b64


In [28]:
# Parse a sample document; the result is the (texts, tables, images) tuple
# returned by parse_docx.
# NOTE(review): this is a hard-coded absolute path inside one user's home
# directory, so the cell only runs on that machine. Consider a configurable,
# relative data path.
az = WordDocParser(r"C:\Users\rajneesh.jha\Downloads\GGX Demo Details V1.docx").parse_docx()

In [29]:
# Show the (texts, tables, images) tuple produced by parse_docx.
az

([<unstructured.documents.elements.CompositeElement at 0x19968823d90>,
  <unstructured.documents.elements.CompositeElement at 0x199688212d0>,
  <unstructured.documents.elements.CompositeElement at 0x19968821590>],
 [],
 [])

In [26]:
# az[0] is the list of text chunks; show the text of the first one.
az[0][0].text

'GGX Demo:\n\nOverview of GGX:\n\nIntroduction to GGX project and platform\n\nRisks associated with the GenAi Pipeline\n\nRegulatory requirements for GenAI pipelines\n\nData Vault:\n\nA place to host all the useful datasets and track it within Corridor environment.\n\nAbility to connect to the client’s data lake or upload custom datasets.\n\nRun pre configured multiple data related checks.\n\nGGX curated validation datasets for testing/validation of different GGX components.\n\nOpen Source datasets already present in the platform for some of the famous benchmarking tests.\n\nIntent Recognition Validation Data: https://genai2.corridorplatforms.com/data-vault/table-registry/35/details\n\nResponse Validation Data: https://genai2.corridorplatforms.com/data-vault/table-registry/37/details\n\nGGX Prompt Injection Strategies:\n\nhttps://genai2.corridorplatforms.com/data-vault/table-registry/38/details\n\nGenAi Studio:\n\nCentral place to organize all the GenAI related work with proper access 

In [34]:
from gtts import gTTS
import pygame
from io import BytesIO

# The text that you want to convert to audio
mytext = """Attention in this context refers to an attention function, which is a mechanism that maps a query and a set of key-value pairs to an output. The input consists of queries and keys of dimension dk, and values of dimension dv. The attention function computes the dot products of the queries with all keys, applies a softmax function to obtain the weights on the query with all keys, divides each by √ dk, and then multiplies the values with these weights. This process is also known as Scaled Dot-Product Attention.

Attention is a crucial component of various deep learning models, including the Transformer model. It allows the model to focus on specific parts of the input data, weigh them appropriately, and produce an output based on these weighted inputs. In the context of the Transformer model, attention is used in multiple layers, including encoder-decoder attention layers, self-attention layers in the encoder, and self-attention layers in the decoder. It plays a key role in enhancing the model's ability to capture complex relationships between different elements of the input data.

In summary, attention refers to the attention function used in deep learning models, which computes the weighted sum of values based on their compatibility with queries, as determined by the dot product of the queries with keys, normalized by a scaling factor."""

# Language in which you want to convert
language = 'en'

# Passing the text and language to the engine
tts = gTTS(text=mytext, lang=language, slow=False)

# Create an in-memory bytes buffer to save the audio
audio_stream = BytesIO()
tts.write_to_fp(audio_stream)

# Rewind the buffer to the beginning
audio_stream.seek(0)

# Initialize the mixer module
pygame.mixer.init()

try:
    # Load the mp3 data from the bytes stream
    pygame.mixer.music.load(audio_stream, "mp3")

    # Play the loaded mp3 data
    pygame.mixer.music.play()

    # Block until playback finishes. Sleep between polls instead of spinning,
    # so the wait does not keep a CPU core fully busy.
    while pygame.mixer.music.get_busy():
        pygame.time.wait(100)
finally:
    # Release the audio device even if playback is interrupted.
    pygame.mixer.quit()


In [38]:
import speech_recognition as sr

# Initialize recognizer
recognizer = sr.Recognizer()

# Reset the result so a failed run cannot leave an earlier value in `text`.
text = None

# Use the microphone as the audio source; it is only held while recording.
with sr.Microphone() as source:
    print("Adjusting for ambient noise... Please wait.")
    recognizer.adjust_for_ambient_noise(source, duration=1)
    print("Listening for your speech...")
    audio = recognizer.listen(source)

# Transcribe after the microphone has been released, and report the result.
print("Processing audio...")
try:
    text = recognizer.recognize_google(audio)
    print(f"You said: {text}")
except sr.UnknownValueError:
    print("Sorry, I could not understand the audio.")
except sr.RequestError as e:
    print(f"Error with Google Speech Recognition service: {e}")


Adjusting for ambient noise... Please wait.
Listening for your speech...
Processing audio...
You said: what is attention
