In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and local source edits are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import dspy
from dspy.datasets import DataLoader
from dspy.evaluate.metrics import answer_exact_match

import dotenv
import litellm

# Switch on litellm's `suppress_debug_info` flag (presumably this hides its
# debug/info banners in cell output — confirm against the litellm docs).
litellm.suppress_debug_info = True

# Load variables from a .env file into the process environment (python-dotenv).
# NOTE(review): presumably this is where the hosted-provider API keys come from,
# since no key is passed explicitly for those models — confirm.
dotenv.load_dotenv()

def debug_exact_match(example, pred, trace=None, frac=1.0):
    """Verbose wrapper around ``answer_exact_match`` for debugging a metric.

    Prints the example's inputs, its gold answer and the prediction (in that
    order), then returns whatever ``answer_exact_match`` returns for the same
    arguments.

    Args:
        example: Object exposing ``inputs()`` and an ``answer`` attribute.
        pred: The prediction to compare against the example.
        trace: Forwarded unchanged to ``answer_exact_match``.
        frac: Forwarded unchanged to ``answer_exact_match``.
    """
    example_inputs = example.inputs()
    print(example_inputs)

    gold_answer = example.answer
    print(gold_answer)

    print(pred)

    # `trace` and `frac` are deliberately not printed; they are only forwarded.
    verdict = answer_exact_match(example, pred, trace, frac)
    return verdict

In [4]:
import dotenv

dotenv.load_dotenv()

# --- Locally served models -------------------------------------------------
# All three local models below point at the same api_base and use a
# placeholder api_key.

# vllm serve Qwen/Qwen2-VL-7B-Instruct --trust-remote-code --limit-mm-per-prompt image=16 --seed 42 --pipeline-parallel-size 2
qwen_lm = dspy.LM(
    model="openai/Qwen/Qwen2-VL-7B-Instruct",
    api_base="http://localhost:8000/v1",
    api_key="sk-fake-key",
    max_tokens=5000,
)

# Hosted model; no api_key is passed explicitly here.
haiku_lm = dspy.LM(
    model="anthropic/claude-3-haiku-20240307",
    max_tokens=4096,
)

# vllm serve meta-llama/Llama-3.2-11B-Vision-Instruct --trust-remote-code --limit-mm-per-prompt image=16 --seed 42 --enforce-eager --max-num-seqs 48
llama_lm = dspy.LM(
    model="openai/meta-llama/Llama-3.2-11B-Vision-Instruct",
    api_base="http://localhost:8000/v1",
    api_key="sk-fake-key",
    max_tokens=5000,
)

internlm_lm = dspy.LM(
    model="openai/OpenGVLab/InternVL2-8B",
    api_base="http://localhost:8000/v1",
    api_key="sk-fake-key",
    max_tokens=5000,
)

# Hosted model; no api_key is passed explicitly here.
gpt_lm = dspy.LM(
    model="openai/gpt-4o-mini",
    max_tokens=5000,
)

# NOTE(review): internlm_lm is not part of this list — confirm that is intended.
all_lms = [
    qwen_lm,
    haiku_lm,
    llama_lm,
    gpt_lm,
]

# gpt-4o-mini is the default LM for every DSPy module in this notebook.
dspy.settings.configure(lm=gpt_lm)

In [5]:
import os
from huggingface_hub import snapshot_download
# Capture the current working directory. In the visible cells it is only
# referenced by the disabled HF_HOME override below.
cwd = os.getcwd()
# Disabled: redirect the Hugging Face cache to a `.cache` folder under `cwd`.
# os.environ.pop("HF_HOME", None)
# os.environ["HF_HOME"] = f"{cwd}/.cache"

In [6]:

# This download is flaky: keep retrying it, deleting .cache between attempts,
# until it works. Excluding the *mi_phone* files through `ignore_patterns`
# (already applied below) sometimes helps.
documents_path = snapshot_download(
    "yubo2333/MMLongBench-Doc",
    repo_type="dataset",
    allow_patterns="documents/*",
    ignore_patterns="*mi_phone*",
    max_workers=1,
)
print(documents_path)



Fetching 134 files:   0%|          | 0/134 [00:00<?, ?it/s]

/home/ubuntu/.cache/huggingface/hub/datasets--yubo2333--MMLongBench-Doc/snapshots/38bceac8784469e70ad783dbf26c0b6ff08e0a9a


In [7]:
# Load the MMLongBench-Doc dataset by its Hugging Face id through DSPy's DataLoader.
# NOTE(review): no input keys are specified here, and the recorded output of this
# cell shows `input_keys=set()` on the example — confirm whether downstream code
# needs them set.
dataset = DataLoader().from_huggingface("yubo2333/MMLongBench-Doc")

# Print the first record of the "train" split to inspect the available fields.
print(dataset["train"][0])

Example({'doc_id': 'PH_2016.06.08_Economy-Final.pdf', 'doc_type': 'Research report / Introduction', 'question': 'According to the report, how do 5% of the Latinos see economic upward mobility for their children?', 'answer': 'Less well-off', 'evidence_pages': '[5]', 'evidence_sources': "['Chart']", 'answer_format': 'Str'}) (input_keys=set())


In [8]:
from byaldi import RAGMultiModalModel
import glob
from pathlib import Path
# Optionally, you can specify an `index_root`, which is where it'll save the index. It defaults to ".byaldi/".


def create_all_indexes():
    """Create one byaldi index per PDF found under ``{documents_path}/documents``.

    Reads the module-level ``documents_path``. For each PDF the index name is
    ``mmlongbench-doc-<pdf stem>``; if ``.byaldi/<index name>`` already exists
    the PDF is skipped, otherwise a ColPali model is loaded and the PDF is
    indexed under that name. Progress is reported with ``print``.
    """
    byaldi_root = Path(".byaldi")
    pdf_files = glob.glob(f"{documents_path}/documents/*.pdf")

    for pdf_file in pdf_files:
        index_name = f"mmlongbench-doc-{Path(pdf_file).stem}"

        if (byaldi_root / index_name).exists():
            print(f"Index {index_name} already exists")
            continue

        # A new model instance is loaded for every PDF that still needs an index.
        # NOTE(review): presumably so each index starts from a clean state —
        # confirm before hoisting this out of the loop.
        rag_model = RAGMultiModalModel.from_pretrained("vidore/colpali-v1.2", verbose=0)
        print(f"Creating index {index_name}")
        rag_model.index(
            input_path=pdf_file,  # the document to index
            index_name=index_name,  # directory name used under the index root
            store_collection_with_index=False,  # do not store the base64-encoded documents
            # `doc_ids` and `metadata` are left unset (optional arguments).
            overwrite=False,  # never replace an index that already exists
            verbose=0,
        )

# Toggle for the index-building pass; when True, create_all_indexes() is run.
CREATE_INDEX = True
if CREATE_INDEX:
    create_all_indexes()
# NOTE(review): the disabled branch below refers to `RAG` and `index_name`,
# neither of which is defined at module level in the visible cells.
# else:
#     RAG.from_index(index_name)

Index mmlongbench-doc-nielsen2015musicbizpresentation-final-150526143534-lva1-app6891_95 already exists
Index mmlongbench-doc-disciplined-agile-business-analysis-160218012713_95 already exists
Index mmlongbench-doc-nova_y70 already exists
Index mmlongbench-doc-STEPBACK already exists
Index mmlongbench-doc-efd88e41c5f2606c57929cac6c1c0605 already exists
Index mmlongbench-doc-csewt7zsecmmbzjufbyx-signature-24d91a254426c21c3079384270e1f138dc43a271cfe15d6d520d68205855b2a3-poli-150306115347-conversion-gate01_95 already exists
Index mmlongbench-doc-COSTCO_2021_10K already exists
Index mmlongbench-doc-formwork-150318073913-conversion-gate01_95 already exists
Index mmlongbench-doc-germanwingsdigitalcrisisanalysis-150403064828-conversion-gate01_95 already exists
Index mmlongbench-doc-PP_2019.01.17_Trump-economy_FINAL2 already exists
Index mmlongbench-doc-AMAZON_2017_10K already exists
Index mmlongbench-doc-a4f3ced0696009fec3179f493e4f28c4 already exists
Index mmlongbench-doc-2311.16502v3 alread

In [9]:
from dsp.utils import deduplicate

class SimplifiedBaleen(dspy.Module):
    """Multi-hop retrieve-then-answer module.

    For each hop, a dedicated ChainOfThought turns the question, the context
    gathered so far and the supplied images into a search query; the retrieved
    passages are merged (de-duplicated) into the context. A final
    ChainOfThought answers the question from the accumulated context.

    Args:
        passages_per_hop: ``k`` passed to ``dspy.Retrieve``.
        max_hops: Number of query/retrieve rounds, and the number of query
            generators created.
    """

    def __init__(self, passages_per_hop=3, max_hops=2):
        super().__init__()

        # One independent query generator per hop.
        self.generate_query = [
            dspy.ChainOfThought("images: List[dspy.Image], context: str, question: str -> query: str")
            for _ in range(max_hops)
        ]
        self.retrieve = dspy.Retrieve(k=passages_per_hop)
        self.generate_answer = dspy.ChainOfThought("context: str, question: str -> answer: str")
        self.max_hops = max_hops

    def forward(self, question, images=None):
        """Answer ``question`` after ``max_hops`` rounds of query + retrieval.

        Args:
            question: The question to answer.
            images: Optional iterable of ``dspy.Image`` objects passed to every
                query generator. Defaults to an empty list, so existing
                callers that only pass ``question`` keep working.

        Returns:
            ``dspy.Prediction`` with ``context`` (the de-duplicated passages
            collected over all hops) and ``answer``.
        """
        # The query signature declares an `images` input, so a value has to be
        # supplied on every call; without it the module is invoked with a
        # missing declared input.
        hop_images = [] if images is None else list(images)
        # NOTE(review): `context` is a list of passages while both signatures
        # annotate it as `str` — confirm whether `list[str]` is the intended type.
        context = []

        for hop in range(self.max_hops):
            query = self.generate_query[hop](
                images=hop_images, context=context, question=question
            ).query
            passages = self.retrieve(query).passages
            context = deduplicate(context + passages)

        pred = self.generate_answer(context=context, question=question)
        return dspy.Prediction(context=context, answer=pred.answer)

In [25]:

# Colour name -> six-digit hex code (no leading '#').
colors = dict(
    White="FFFFFF",
    Red="FF0000",
    Green="00FF00",
    Blue="0000FF",
    Yellow="FFFF00",
    Cyan="00FFFF",
    Magenta="FF00FF",
    Gray="808080",
    Orange="FFA500",
    Purple="800080",
)


def get_color_image_url(color, file_extension="png"):
    """Return the placehold.co URL of a 300px image filled with ``color``.

    Args:
        color: A key of the module-level ``colors`` mapping; an unknown name
            raises ``KeyError``.
        file_extension: File extension appended to the URL.
    """
    hex_code = colors[color]
    # The same hex code is used for both colour segments of the URL.
    return f"https://placehold.co/300/{hex_code}/{hex_code}.{file_extension}"

# Build dspy.Image objects from two solid-colour placeholder URLs.
green_image = dspy.Image.from_url(
    get_color_image_url("Green")
)
blue_image = dspy.Image.from_url(
    get_color_image_url("Blue")
)

# Keyword arguments for the predictor: both images plus one question.
inputs = dict(
    images=[green_image, blue_image],
    question="What is the color of the images?",
)

# Typed signature: a list of images and a question in, a list of strings out.
image_list_cot = dspy.ChainOfThought(
    "images: List[dspy.Image], question: str -> answer: List[str]"
)
print(image_list_cot(**inputs))

# Dump the prompt/response recorded on gpt_lm to see how the images were formatted.
gpt_lm.inspect_history()

Prediction(
    reasoning='The first image is a bright green color, and the second image is a deep blue color.',
    answer=['green', 'blue']
)




[34m[2024-11-13T00:24:32.152468][0m

[31mSystem message:[0m

Your input fields are:
1. `images` (list[Image])
2. `question` (str)

Your output fields are:
1. `reasoning` (str)
2. `answer` (list[str])

All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## images ## ]]
{images}

[[ ## question ## ]]
{question}

[[ ## reasoning ## ]]
{reasoning}

[[ ## answer ## ]]
{answer}        # note: the value you produce must be pareseable according to the following JSON schema: {"type": "array", "items": {"type": "string"}}

[[ ## completed ## ]]

In adhering to this structure, your objective is: 
        Given the fields `images`, `question`, produce the fields `answer`.


[31mUser message:[0m

[[ ## images ## ]]
[
[34m<image_url: https://placehold.co/300/00FF00/00FF00.png>[0m

,
[34m<image_url

In [24]:
# Print the interaction recorded on gpt_lm (system and user messages) for inspection.
# NOTE(review): this cell's execution count (24) is lower than the preceding
# cell's (25), so its recorded output comes from an earlier, out-of-order run.
gpt_lm.inspect_history()





[34m[2024-11-13T00:21:35.001699][0m

[31mSystem message:[0m

Your input fields are:
1. `images` (list[Image])
2. `question` (str)

Your output fields are:
1. `reasoning` (str)
2. `answer` (list[str])

All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## images ## ]]
{images}

[[ ## question ## ]]
{question}

[[ ## reasoning ## ]]
{reasoning}

[[ ## answer ## ]]
{answer}        # note: the value you produce must be pareseable according to the following JSON schema: {"type": "array", "items": {"type": "string"}}

[[ ## completed ## ]]

In adhering to this structure, your objective is: 
        Given the fields `images`, `question`, produce the fields `answer`.


[31mUser message:[0m

[34m<image_url: https://placehold.co/300/00FF00/00FF00.png>[0m

[[ ## images ## ]]
[
[34m<image_url: https://placehold.co/300/0000FF/0000FF.png>[0m

, 

]

[[ ## question ## ]]
What is the color of the images?

Respond with the corresponding ou