In [6]:
import os
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_upstage import UpstageEmbeddings
from langchain_upstage import UpstageLayoutAnalysisLoader
from langchain.docstore.document import Document
from langchain_upstage import ChatUpstage
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter
)

In [7]:
load_dotenv('../')
api_key = os.environ['UPSTAGE_API_KEY']

In [8]:
# load pdf into html file
pdf_filepath="./sample_pdfs/Attention Is All You Need.pdf"
print('parsing pdf into html...')
layzer = UpstageLayoutAnalysisLoader(pdf_filepath, output_type="html")
docs = layzer.load()
print('Done')

parsing pdf into html...


KeyboardInterrupt: 

In [None]:
# split text into text chunks
text_spliiter = RecursiveCharacterTextSplitter.from_language(
    chunk_size=1000, chunk_overlap=100, language=Language.HTML
)
splits = text_spliiter.split_documents(docs)
print(f"Splits: {len(splits)}")
unique_splits=[]
for split in splits:
    if split not in unique_splits:
        unique_splits.append(split)
print(f"unique_splits: {len(unique_splits)}")

Splits: 78
unique_splits: 77


In [None]:
# generate embeddings for the given text chunks and save it
embedding_name="attention"
persist_directory=f"./chroma_db/{embedding_name}/"

vectorstore = Chroma.from_documents(
    documents=unique_splits,
    ids=[doc.page_content for doc in unique_splits],
    embedding=UpstageEmbeddings(model="solar-embedding-1-large"),
    persist_directory=persist_directory,
)

# vectorstore = Chroma(
#     persist_directory=f"./chroma_db/{embedding_name}",
#     embedding_function=UpstageEmbeddings(model="solar-embedding-1-large"),
# )

In [None]:
vectorstore=None

In [None]:
vectorstore

In [None]:
vectorstore = Chroma(
    persist_directory=persist_directory,
    embedding_function=UpstageEmbeddings(model="solar-embedding-1-large"),
    collection_name="paper"
)
vectorstore.get()

{'ids': ['<br>',
  '<br><h1 id=\'5\' style=\'font-size:16px\'>Attention Visualizations</h1><figure><img id=\'6\' style=\'font-size:20px\' alt="It is in this spirit that a majority of American governments have passed new laws since 2009 making the registration or voting process more difficult . <EOS>',
  "<br><p id='1' data-category='paragraph' style='font-size:18px'>Provided proper attribution is provided, Google hereby grants permission to<br>reproduce the tables and figures in this paper solely for use in journalistic or<br>scholarly works.</p><h1 id='2' style='font-size:20px'>Attention Is All You Need</h1><table id='3' style='font-size:16px'><tr><td>Ashish Vaswani ∗</td><td>Noam Shazeer ∗</td><td>Niki Parmar ∗</td><td>Jakob Uszkoreit ∗</td></tr><tr><td>Google Brain</td><td>Google Brain</td><td>Google Research</td><td>Google Research</td></tr><tr><td>avaswani@google.com</td><td>noam@google.com</td><td>nikip@google.com</td><td>usz@google.com</td></tr></table><table id='4' style='font-

In [None]:
# generate function for generate embedding

def generate_embeddings(pdf_filepath, embedding_name):
    """
    Generate Embeddings for the given pdf, return 1 if success otherwise return 0
    """
    try:
        layzer = UpstageLayoutAnalysisLoader(pdf_filepath, output_type="html")
        docs = layzer.load()
        # split text into text chunks
        text_spliiter = RecursiveCharacterTextSplitter.from_language(
            chunk_size=1000, chunk_overlap=100, language=Language.HTML
        )
        splits = text_spliiter.split_documents(docs)
        unique_splits=[]
        for split in splits:
            if split not in unique_splits:
                unique_splits.append(split)
        persist_directory=f"./chroma_db/{embedding_name}/"
        vectorstore = Chroma.from_documents(
            documents=unique_splits,
            ids=[doc.page_content for doc in unique_splits],
            embedding=UpstageEmbeddings(model="solar-embedding-1-large"),
            persist_directory=persist_directory,
        )
        print(f"embedding saved in ./chroma_db/{embedding_name}/")
        return 1
    except Exception as e:
        print(e)
        return 0

In [None]:
def retrieve(retriever, input_query):
    return retriever.invoke(input_query)

def get_retriever(embedding_name):
    vector_db = Chroma(
        persist_directory=f"./chroma_db/{embedding_name}/",
        embedding_function=UpstageEmbeddings(model="solar-embedding-1-large")
    )
    return vector_db.as_retriever()

def inference(question, embedding_names):
    name_list = os.listdir('./chroma_db/')
    embedding_names = list(set(embedding_names))
    # check if their are any invalid names of embeddings
    print("[[CHECKING EMBEDDING NAMES...]]")
    for name in embedding_names:
        if name not in embedding_names:
            raise Exception(f"{name} embedding not found.")

    print("[[LOADING EMBEDDINGS...]]")
    # accumulate retrivers into a single list
    retrievers=[]
    for name in embedding_names:
        retriever = get_retriever(name)
        print(retriever)
        retrievers.append((retriever, name))
    # generate documents
    print("[[RETRIEVING RELEVANT DOCS...]]")
    context=""
    for retriever in retrievers:
        result = retrieve(retriever[0], question)
        print(result)
        #TODO post process result
        refined_result=""+f"from [{retriever[1]}]"
        context+=refined_result+"\n\n"
    
    system_msg = SystemMessagePromptTemplate.from_template(
        "You are an assistant for question-answering tasks.\n"
        "Use the following pieces of retrieved context to answer the question.\n"
        "If you don't know the answer, just say that you don't know.\n"
        "Use three sentences maximum and keep the answer concise.\n"
    )
    human_msg = HumanMessagePromptTemplate.from_template(
    "Question: {question}\n\n" 
    "Context: {context}" 
    "Answer:"
    )
    chat_prompt = ChatPromptTemplate.from_messages(
        [system_msg, human_msg]
    )
    model = ChatUpstage(api_key=api_key)
    chain = chat_prompt | model
    output = chain.invoke({'question': question, 'context': context})
    return output.content
    

    
    
        

In [None]:
inference("What is attention?", ['attention'])

[[CHECKING EMBEDDING NAMES...]]
[[LOADING EMBEDDINGS...]]
tags=['Chroma', 'UpstageEmbeddings'] vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x147992620>
[[RETRIEVING RELEVANT DOCS...]]
[Document(metadata={'total_pages': 15}, page_content='<p id=\'31\' data-category=\'paragraph\' style=\'font-size:16px\'>An attention function can be described as mapping a query and a set of key-value pairs to an output,<br>where the query, keys, values, and output are all vectors. The output is computed as a weighted sum</p><h1 id=\'33\' style=\'font-size:20px\'>Scaled Dot-Product Attention</h1><figure><img id=\'34\' alt="" data-coord="top-left:(351,196); bottom-right:(505,465)" /></figure><br><h1 id=\'35\' style=\'font-size:20px\'>Multi-Head Attention</h1><figure><img id=\'36\' alt="" data-coord="top-left:(715,168); bottom-right:(975,505)" /></figure><p id=\'37\' data-category=\'paragraph\' style=\'font-size:20px\'>Figure 2: (left) Scaled Dot-Product Attention. (right) Multi-Head Attenti

'Attention is a cognitive process that involves selecting and focusing on relevant stimuli while ignoring irrelevant ones. It allows individuals to concentrate on specific tasks or information, enabling effective processing and understanding.'

In [1]:
from utils import inference, generate_embeddings
from pprint import pprint

# generate_embeddings('./sample_pdfs/BERT-Pre-training of Deep Bidirectional Transformers for Language Understanding.pdf', 'bert')
# generate_embeddings('./sample_pdfs/Language Models are Unsupervised Multitask Learners.pdf', 'gpt')

output = inference("What is attention mechanism and how is it used in bert and gpt?", ['attention', 'bert', 'gpt'])
pprint(output)

[[CHECKING EMBEDDING NAMES...]]
[[LOADING EMBEDDINGS...]]
tags=['Chroma', 'UpstageEmbeddings'] vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x107feae30>
tags=['Chroma', 'UpstageEmbeddings'] vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x148dbc700>
tags=['Chroma', 'UpstageEmbeddings'] vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x1492b6830>
[[RETRIEVING RELEVANT DOCS...]]
[Document(metadata={'total_pages': 15}, page_content="<p id='14' data-category='paragraph' style='font-size:18px'>Attention mechanisms have become an integral part of compelling sequence modeling and transduc-<br>tion models in various tasks, allowing modeling of dependencies without regard to their distance in<br>the input or output sequences [ 2 , 19 ]. In all but a few cases [ 27 ], however, such attention mechanisms<br>are used in conjunction with a recurrent network.</p><br><p id='15' data-category='paragraph' style='font-size:18px'>In this work we propose the Transfor