# Multimodal RAG

In [48]:
import os
import sys
import glob

# Resolve the kit directory (parent of the working directory) and the
# repository root (parent of the kit directory).
current_dir = os.getcwd()
kit_dir = os.path.abspath(os.path.join(current_dir, ".."))
repo_dir = os.path.abspath(os.path.join(kit_dir, ".."))

# Make both directories importable. Entries already present are skipped so
# that re-running this cell does not keep growing sys.path.
for _import_root in (kit_dir, repo_dir):
    if _import_root not in sys.path:
        sys.path.append(_import_root)

from utils.sambanova_endpoint import SambaNovaEndpoint
from dotenv import load_dotenv
load_dotenv(os.path.join(repo_dir,'.env'))

import requests
import json
import base64
from pprint import pprint

## Utils

In [49]:
def image_to_base64(image_path):
    """Return the contents of the file at ``image_path`` as a base64 string.

    The file is opened in binary mode, read in full, base64-encoded and the
    resulting bytes are decoded into a ``str``.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode()

## Multimodal call

In [60]:
# sambastudio call
def llava_call(prompt, image_path, timeout=120):
    """Send a prompt and one image to the SambaStudio generic predict endpoint.

    The endpoint URL is built from the ``BASE_URL``, ``PROJECT_ID`` and
    ``ENDPOINT_ID`` environment variables, and ``API_KEY`` is sent in the
    ``key`` header.

    Args:
        prompt: Full prompt text sent as the instance's ``prompt`` field.
        image_path: Path of the image file; its bytes are sent base64-encoded
            in the instance's ``image_content`` field.
        timeout: Seconds passed to ``requests.post`` as its timeout, so that an
            unresponsive endpoint cannot hang the notebook indefinitely.

    Returns:
        The ``completion`` field of the first entry in the response's
        ``predictions`` list.

    Raises:
        ValueError: If any required environment variable is unset or empty.
        requests.HTTPError: If the endpoint answers with an error status.
        RuntimeError: If the response body lacks the expected
            ``predictions[0]["completion"]`` structure.
    """
    # Validate configuration first: a missing variable would otherwise be
    # interpolated as the literal text "None" and produce a confusing failure.
    settings = {
        name: os.environ.get(name)
        for name in ("BASE_URL", "PROJECT_ID", "ENDPOINT_ID", "API_KEY")
    }
    missing = [name for name, value in settings.items() if not value]
    if missing:
        raise ValueError(
            "Missing required environment variables: " + ", ".join(missing)
        )

    image = image_to_base64(image_path)
    endpoint_url = (
        f"{settings['BASE_URL']}/api/predict/generic/"
        f"{settings['PROJECT_ID']}/{settings['ENDPOINT_ID']}"
    )
    endpoint_key = settings["API_KEY"]
    # Define the data payload; every generation parameter is sent as a
    # {"type", "value"} pair with a string value.
    data = {
        "instances": [{
            "prompt": prompt,
            "image_content": image
        }],
        "params": {
            "do_sample": {"type": "bool", "value": "false"},
            "max_tokens_to_generate": {"type": "int", "value": "512"},
            "temperature": {"type": "float", "value": "1"},
            "top_k": {"type": "int", "value": "50"},
            "top_logprobs": {"type": "int", "value": "0"},
            "top_p": {"type": "float", "value": "1"}
        }
    }
    # Define headers
    headers = {
        "Content-Type": "application/json",
        "key": endpoint_key
    }
    response = requests.post(
        endpoint_url, headers=headers, data=json.dumps(data), timeout=timeout
    )
    # Surface HTTP-level failures explicitly rather than as a KeyError below.
    response.raise_for_status()
    payload = response.json()
    try:
        return payload["predictions"][0]['completion']
    except (KeyError, IndexError, TypeError) as exc:
        raise RuntimeError(
            f"Unexpected response from endpoint: {payload!r}"
        ) from exc

### QA LLaVA call

In [61]:
# Ask a question about the sample image. The prompt embeds the user question
# after the "<image>\n" marker and ends with "ASSISTANT:".
# NOTE(review): "humans question" reads like a typo for "human's questions";
# left untouched because editing the prompt changes what the model receives.
prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the humans question. USER: <image>\nhow many birds could you find at 4pm?. ASSISTANT:"
# Sample image shipped with the kit; `image_path` is reused by the next cell.
image_path = os.path.join(kit_dir,"data","sample_docs","sample.png")
llava_call(prompt, image_path)

'At 4 pm, you could find approximately 10 birds on the tree.'

### Summary LLaVA call

In [59]:
# Ask for a detailed description of the same sample image.
# Relies on `image_path` defined in the previous cell, so that cell must run first.
prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the humans question. USER: <image>\nDescribe the image in detail. Be specific about graphs, such as bar plots, scatter plots, or others. ASSISTANT:"
llava_call(prompt, image_path)

'The image displays a graph showing the number of birds on a tree at different times of the day. The graph is a combination of a bar plot and a scatter plot, with the bar plot showing the number of birds at various times of the day, and the scatter plot showing the number of birds on a tree at a specific time.\n\nThe graph is divided into two main sections. The first section, which is the bar plot, shows the number of birds on a tree at different times of the day, with the bars extending from 10 am to 11 pm. The second section, which is the scatter plot, shows the number of birds on a tree at a specific time, with the x-axis representing the time and the y-axis representing the number of birds.\n\nThe graph is labeled with the time of the day, and the number of birds is represented by the number of orange dots on the graph. The dots are scattered throughout the graph, with some appearing closer to the bars and others appearing closer to the scatter plot.'

## PDF extraction

In [63]:
from typing import Any

from pydantic import BaseModel
from unstructured.partition.pdf import partition_pdf

# Source PDF, and the directory (the PDF path without its extension) that
# receives the extracted images.
file_path = os.path.join(kit_dir, "data", "sample_docs", "sample.pdf")
output_path = os.path.splitext(file_path)[0]

# Use layout model (YOLOX) to get bounding boxes (for tables) and find titles.
# Titles are any sub-section of the document.
layout_options = dict(
    strategy="hi_res",
    hi_res_model_name="yolox",
    infer_table_structure=True,
)

# Chunk by title, with the size limits applied to each chunk.
chunking_options = dict(
    chunking_strategy="by_title",
    max_characters=1000,
    new_after_n_chars=800,
    combine_text_under_n_chars=500,
)

# Extract embedded images and write them under output_path.
image_options = dict(
    extract_images_in_pdf=True,
    extract_image_block_output_dir=output_path,
)

# Get elements
raw_pdf_elements = partition_pdf(
    filename=file_path,
    **layout_options,
    **chunking_options,
    **image_options,
)

### View Elements

In [152]:
# Dump every partitioned element: index banner, Python type, metadata dict
# and text, followed by a separator.
for idx, pdf_element in enumerate(raw_pdf_elements):
    report_lines = (
        f"\033[95m ELEMENT {idx}\033[00m",
        f"TYPE: {type(pdf_element)}",
        f"META: {pdf_element.metadata.to_dict()}",
        f"TEXT: {pdf_element.text}",
        "\n\n##########\n",
    )
    print("\n".join(report_lines))

[95m ELEMENT 0[00m
TYPE: <class 'unstructured.documents.elements.CompositeElement'>
META: {'filetype': 'application/pdf', 'languages': ['eng'], 'last_modified': '2024-05-06T12:02:48', 'page_number': 1, 'file_directory': '/Users/jorgep/Documents/ask_public_own/ai-starter-kit/multimodal_knowledge_retriever/data/sample_docs', 'filename': 'sample.pdf'}
TEXT: 4 2 0 2

r p A 8 ] L C . s c [ 1 v 9 2 8 5 0 . 4 0 4 2 : v i X r a

SambaLingo: Teaching Large Language Models New Languages

Zoltan Csaki, Bo Li, Jonathan Li, Qiantong Xu, Pian Pawakapan, Leon Zhang, Yun Du, Hengyu Zhao, Changran Hu & Urmish Thakker SambaNova Systems zoltan.csaki@sambanovasystems.com

Abstract


##########

[95m ELEMENT 1[00m
TYPE: <class 'unstructured.documents.elements.CompositeElement'>
META: {'filetype': 'application/pdf', 'languages': ['eng'], 'last_modified': '2024-05-06T12:02:48', 'page_number': 1, 'file_directory': '/Users/jorgep/Documents/ask_public_own/ai-starter-kit/multimodal_knowledge_retriever/data/s

In [65]:
# Tally how many elements of each Python type the partitioner produced,
# keyed by the string form of the type.
category_counts = {}
for pdf_element in raw_pdf_elements:
    type_label = str(type(pdf_element))
    category_counts[type_label] = category_counts.get(type_label, 0) + 1

# Distinct element types seen.
# TableChunk appears when a Table exceeds the max chars set above.
unique_categories = set(category_counts)
category_counts

{"<class 'unstructured.documents.elements.CompositeElement'>": 102,
 "<class 'unstructured.documents.elements.Table'>": 8,
 "<class 'unstructured.documents.elements.TableChunk'>": 2}

In [181]:
from langchain.schema import Document


# Categorize by type
# Each entry is (substring looked for in the element's type repr, label stored
# in metadata["type"]). Markers are checked in order and the first match wins;
# elements matching neither are skipped. The Table marker is a substring match,
# so it also covers TableChunk.
type_markers = (
    ("unstructured.documents.elements.Table", "table"),
    ("unstructured.documents.elements.CompositeElement", "text"),
)

categorized_elements = []
for pdf_element in raw_pdf_elements:
    type_repr = str(type(pdf_element))
    for marker, label in type_markers:
        if marker in type_repr:
            element_meta = pdf_element.metadata.to_dict()
            element_meta["type"] = label
            categorized_elements.append(
                Document(page_content=str(pdf_element), metadata=element_meta)
            )
            break

# Tables
table_docs = list(
    filter(lambda doc: doc.metadata["type"] == "table", categorized_elements)
)
print(len(table_docs))

# Text
text_docs = list(
    filter(lambda doc: doc.metadata["type"] == "text", categorized_elements)
)
print(len(text_docs))

10
102


## Text and table summaries

In [211]:
from langchain_community.llms.sambanova import Sambaverse
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import load_prompt

In [212]:
# Load the summarization prompt template shipped with the kit.
# NOTE(review): presumably the template exposes an `element` variable, since
# the chain below feeds its input under that key — confirm against the YAML.
prompt = load_prompt(os.path.join(kit_dir, "prompts", "llama70b-summary.yaml"))

# Summary chain
# LLM used for summarization: the Llama 2 70B chat model requested through Sambaverse.
model = Sambaverse(
    sambaverse_model_name="Meta/llama-2-70b-chat-hf",
    model_kwargs={
            "do_sample": True, 
            "max_tokens_to_generate": 256,
            "temperature": 0.01,
            "process_prompt": True,
            "select_expert": "llama-2-70b-chat-hf"
            # Optional generation parameters, currently disabled:
            #"stop_sequences": { "type":"str", "value":""},
            # "repetition_penalty": {"type": "float", "value": "1"},
            # "top_k": {"type": "int", "value": "50"},
            # "top_p": {"type": "float", "value": "1"}
        }
)
# Wrap the raw input as {"element": <input>}, render the prompt, call the
# model, then pass the result through StrOutputParser.
summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

### Text Summaries

In [215]:
# Apply to text
# Empty chunks are skipped. text_summaries is always bound (an empty list when
# there is nothing to summarize) so later cells never hit a NameError.
texts = [i.page_content for i in text_docs if i.page_content != ""]
text_summaries = (
    summarize_chain.batch(texts, {"max_concurrency": 1}) if texts else []
)

In [216]:
# Display the summaries generated for the text chunks.
text_summaries

[' The table shows a schedule with four rows and two columns. The first column contains numbers, while the second column has abbreviations. The text discusses SambaLingo, a method for teaching large language models new languages. The authors are Zoltan Csaki et al., and the abstract is not provided.',
 ' This paper investigates the adaptation of large language models (LLMs) to new languages, addressing the gap in their capabilities and availability across diverse languages. The study comprehensively covers key components such as vocabulary extension, direct preference optimization, and data scarcity problem for human alignment in low-resource languages. The experiments are scaled across 9 languages and 2 parameter scales, outperforming all prior published baselines. The paper also makes all evaluation code and checkpoints publicly available.',
 ' The table provides information on the top 100 most cited papers in the field of artificial intelligence, including their authors, publication

### Table summaries

In [220]:
# Apply to tables
# table_summaries is always bound (an empty list when the PDF has no tables)
# so later cells never hit a NameError.
tables = [i.page_content for i in table_docs]
table_summaries = (
    summarize_chain.batch(tables, {"max_concurrency":1}) if tables else []
)

In [221]:
# Display the summaries generated for the table elements.
table_summaries

[' The table compares several natural language processing (NLP) datasets, including mc4, Wikipedia FLORES-200, SIB-200, BELEBELE, and XNLI, XStoryCloze, XCOPA, and XWinograd. The datasets are categorized into tasks such as translation text classification, question answering, and knowledge natural language understanding. The table also shows the number of languages and the metric used for each dataset, primarily perplexity and accuracy.',
 ' The table compares the performance of various language models on several tasks, including language translation, question answering, and text summarization. The models are ranked based on their performance, with the best-performing model for each task listed first. The table shows that the FLORES EN→X model performs well on translation tasks, while the Belebele model performs well on question answering tasks. The SIB-200 and XNLI models perform well on text summarization tasks.',
 ' The table shows the number of added tokens for various languages, in

## Image summary

In [222]:
# Build the LLaVA prompt from the kit's template and a fixed instruction.
# (The hard-coded prompt string that used to precede this line was overwritten
# immediately and never used, so it has been dropped.)
image_prompt = load_prompt(os.path.join(kit_dir, "prompts", "llava-summary.yaml"))
prompt = image_prompt.format(instruction = "Describe the image in detail. Be specific about graphs include name of axis, labels, legends and important numerical information")

# Collect the images extracted from the PDF: JPGs first, then PNGs. Each group
# is sorted because glob returns matches in arbitrary order, which would make
# the order of the summaries differ between runs.
image_paths = []
image_paths.extend(sorted(glob.glob(os.path.join(output_path, '*.jpg'))))
image_paths.extend(sorted(glob.glob(os.path.join(output_path, '*.png'))))

image_summaries = []
image_docs = []

# The loop variable is deliberately not called `image_path`, so the sample
# image path bound by the earlier LLaVA cells is not overwritten.
for extracted_image_path in image_paths:
    result = llava_call(prompt, extracted_image_path)
    image_summaries.append(result)
    image_docs.append(Document(page_content=result, metadata={"type": "image", 'file_directory': extracted_image_path }))

In [223]:
# Display the LLaVA-generated description of each extracted image.
image_summaries

['The image is a table with a list of information, including a section titled "Model Completion." The table is filled with various data points and text, making it difficult to discern specific details. However, the table appears to be a part of a research paper or a study, possibly related to the use of Google and its assistants.',
 'The image is a table with a Japanese language text, possibly a study guide or a test. The table is divided into two sections, with the left section containing a list of questions and the right section providing answers. The questions are written in Japanese, while the answers are in English.\n\nThere are several questions and answers in the table, with some questions and answers placed in the left section and others in the right section. The table is organized in a way that makes it easy to understand and follow.',
 'The image is a table with a lot of text in a foreign language. The table is divided into two sections, with the left section containing a lis

## Add to vectorstore

In [224]:
import uuid

from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryByteStore
from utils.sambanova_endpoint import SambaNovaEmbeddingModel
from langchain_community.vectorstores import Chroma
# Document was already imported from langchain.schema in an earlier cell.
from langchain_core.documents import Document

# The vectorstore to use to index the child chunks
# (here: the generated summaries, embedded with SambaNovaEmbeddingModel).
vectorstore = Chroma(
    collection_name="summaries", embedding_function=SambaNovaEmbeddingModel()
)

# The storage layer for the parent documents
# NOTE(review): a byte store is passed as `docstore` and later receives
# Document objects rather than bytes; this presumably works only because the
# store is in-memory — confirm whether InMemoryStore or `byte_store=` was intended.
store = InMemoryByteStore()  # <- Can we extend this to images
# Metadata key that links each indexed summary to its parent document id.
id_key = "doc_id"

# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)

In [225]:
# NOTE: this cell is not idempotent; every run adds another copy of the
# summaries to the vectorstore under fresh ids.

# Add texts
if texts:
    # Summaries were generated only for non-empty chunks, so each summary must
    # be paired with the chunk it came from. Generating ids per entry of
    # text_docs would shift every summary after an empty chunk onto the wrong
    # parent document.
    summarized_text_docs = [doc for doc in text_docs if doc.page_content != ""]
    if len(summarized_text_docs) != len(text_summaries):
        raise ValueError(
            f"Got {len(text_summaries)} text summaries for "
            f"{len(summarized_text_docs)} non-empty text documents; "
            "re-run the text summary cell."
        )
    doc_ids = [str(uuid.uuid4()) for _ in summarized_text_docs]
    summary_texts = [
        Document(page_content=summary, metadata={id_key: doc_id})
        for doc_id, summary in zip(doc_ids, text_summaries)
    ]
    retriever.vectorstore.add_documents(summary_texts)
    retriever.docstore.mset(list(zip(doc_ids, summarized_text_docs)))

# Add tables
if tables:
    if len(table_docs) != len(table_summaries):
        raise ValueError(
            f"Got {len(table_summaries)} table summaries for "
            f"{len(table_docs)} table documents; re-run the table summary cell."
        )
    table_ids = [str(uuid.uuid4()) for _ in table_docs]
    summary_tables = [
        Document(page_content=summary, metadata={id_key: table_id})
        for table_id, summary in zip(table_ids, table_summaries)
    ]
    retriever.vectorstore.add_documents(summary_tables)
    retriever.docstore.mset(list(zip(table_ids, table_docs)))

# Add images
if image_summaries: #TODO add origin image as metadata
    img_ids = [str(uuid.uuid4()) for _ in image_summaries]
    summary_img = [
        Document(page_content=summary, metadata={id_key: img_id, "path" : img_path})
        for img_id, summary, img_path in zip(img_ids, image_summaries, image_paths)
    ]
    retriever.vectorstore.add_documents(summary_img)
    # Store the image summary as the raw document
    retriever.docstore.mset(list(zip(img_ids, image_docs)))

In [226]:
# Sanity-check retrieval with a question about the paper; the output below
# shows the stored source chunks rather than their summaries.
retriever.invoke("whats the final training loss in arabic language")

[Document(page_content='Table 9: Monolith multilingual continuous training vs language experts, averaged over all 9 languages.\n\nA.3 Qualitative Results\n\nFor Arabic, we compare our 7B arabic expert with aya-101 ( ¨Ust ¨un et al., 2024), Jais-13b-chat (Sengupta et al., 2023), and Bloomchat-v1 (SambaNova Systems, 2023) and use prompts from x-self-instruct-seed-32 (Systems, 2023a) and xOA22 (Systems, 2023b). Our Arabic chat model reaches 87.96% win rate compared to Jais-13B-chat, 99.06% win rate compared to Aya101, and 68.52% compared to Bloomchat-v1. For Japanese, we compare our Japanese chat model with ELYZA-japanese-Llama-2-7b-instruct (Sasaki et al., 2023) on 100 randomly sampled prompts aya dataset ( ¨Ust ¨un et al., 2024), reaching a win rate of 53.5% For Turkish, we compare our Turkish chat model against aya-101 ( ¨Ust ¨un et al., 2024) using prompts from the test set of aya dataset ( ¨Ust ¨un et al., 2024), leading to win-rate of 92.4%.', metadata={'filetype': 'application/pdf'

In [227]:
# Second retrieval check, this time with a comparison question.
retriever.invoke("how is the performance of sambalingo vs aya101 in arabic and turkish language")

[Document(page_content='4.3.2 Qualitative Results\n\nMeasuring win-rate using GPT-4 as a judge only works in scenarios where a human aligned or instruction tuned model is available in a language. Given this constraint, we were only able to find relevant comparisons for Arabic, Japanese and Turkish, and do not have qualitative evaluations for our models in the other 6 languages. The results of our evalutation are shown in Figure 3. Our SambaLingo models consistently out-perform other models in the same language. For details about the native speaker-curated prompts, see Appendix A.3.\n\n(a) SambaLingo vs Aya101  (b) SambaLingo vs BloomChat-v1 \n\n(c) SambaLingo vs ELEYZA-7b-instruct\n\n(d) SambaLingo vs Jais-13b-chat\n\nFigure 3: GPT4 evaluation result', metadata={'filetype': 'application/pdf', 'languages': ['eng'], 'last_modified': '2024-05-06T12:02:48', 'page_number': 6, 'file_directory': '/Users/jorgep/Documents/ask_public_own/ai-starter-kit/multimodal_knowledge_retriever/data/sample_

## Retrieval

In [228]:
from langchain.chains import RetrievalQA

# Custom QA prompt shipped with the kit. Note that this rebinds the
# notebook-level name `prompt` used by earlier cells.
prompt = load_prompt(os.path.join(kit_dir,"prompts","llama70b-knowledge_retriever-custom_qa_prompt.yaml"))

# QA chain over the multi-vector retriever, using the same LLM as the summary
# chain. Inputs are read from "question"; results are returned under "answer"
# together with the source documents.
chain = RetrievalQA.from_llm(
    llm = model,
    retriever=retriever,
    return_source_documents=True,
    input_key="question",
    output_key="answer"
)
# Replace the prompt of the chain's inner LLM chain with the custom template.
# NOTE(review): assumes the template only uses variables the chain supplies —
# confirm against the YAML.
chain.combine_documents_chain.llm_chain.prompt=prompt


In [229]:
# Ask the QA chain a question; the result holds the question, the answer and
# the source documents.
chain.invoke({"question": "whats the final training loss in arabic language"})

{'question': 'whats the final training loss in arabic language',
 'answer': ' According to the provided context, specifically Table 9, the final training loss for Arabic is not explicitly stated. The table only provides information on the win rates of different chat models compared to each other. Therefore, I do not have information regarding the final training loss in Arabic.',
 'source_documents': [Document(page_content='Table 9: Monolith multilingual continuous training vs language experts, averaged over all 9 languages.\n\nA.3 Qualitative Results\n\nFor Arabic, we compare our 7B arabic expert with aya-101 ( ¨Ust ¨un et al., 2024), Jais-13b-chat (Sengupta et al., 2023), and Bloomchat-v1 (SambaNova Systems, 2023) and use prompts from x-self-instruct-seed-32 (Systems, 2023a) and xOA22 (Systems, 2023b). Our Arabic chat model reaches 87.96% win rate compared to Jais-13B-chat, 99.06% win rate compared to Aya101, and 68.52% compared to Bloomchat-v1. For Japanese, we compare our Japanese c

In [230]:
# Pass the input under the chain's declared input key ("question"), matching
# the previous QA call, rather than as a bare string.
chain.invoke({"question": "how is the performance of sambalingo vs aya101 in arabic and turkish language"})

{'question': 'how is the performance of sambalingo vs aya101 in arabic and turkish language',
 'answer': " According to the provided context, specifically Figure 3, we can see the performance comparison between SambaLingo and Aya101 in Arabic and Turkish languages.\n\nIn Arabic, SambaLingo outperforms Aya101, with a win rate of 72.5% compared to Aya101's 27.5%.\n\nIn Turkish, SambaLingo also outperforms Aya101, with a win rate of 68.8% compared to Aya101's 31.2%.\n\nTherefore, the performance of SambaLingo vs Aya101 in Arabic and Turkish languages shows that SambaLingo consistently outperforms Aya101 in both languages.",
 'source_documents': [Document(page_content='4.3.2 Qualitative Results\n\nMeasuring win-rate using GPT-4 as a judge only works in scenarios where a human aligned or instruction tuned model is available in a language. Given this constraint, we were only able to find relevant comparisons for Arabic, Japanese and Turkish, and do not have qualitative evaluations for our mo