In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import dspy
from dspy.datasets import DataLoader
from dspy.evaluate.metrics import answer_exact_match

import dotenv
import litellm

# Turn off litellm's debug-info output so cell outputs stay readable.
litellm.suppress_debug_info = True

# Load variables from a local .env file into the environment.
# Presumably this is where the hosted-provider API keys come from — TODO confirm.
dotenv.load_dotenv()

def debug_exact_match(example, pred, trace=None, frac=1.0):
    """Exact-match metric that also prints what is being compared.

    Prints the example's inputs, its gold answer and the prediction (in that
    order), then returns whatever ``answer_exact_match`` returns for the same
    arguments.

    Args:
        example: Example providing ``inputs()`` and ``answer``.
        pred: Prediction to score.
        trace: Passed through to ``answer_exact_match`` unchanged.
        frac: Passed through to ``answer_exact_match`` unchanged.
    """
    model_inputs = example.inputs()
    print(model_inputs)
    gold_answer = example.answer
    print(gold_answer)
    print(pred)
    verdict = answer_exact_match(example, pred, trace, frac)
    return verdict

In [3]:
import dotenv
dotenv.load_dotenv()
# Language-model handles used throughout the notebook.
# NOTE(review): qwen_lm, llama_lm and internlm_lm all point at the same local endpoint
# (http://localhost:8000/v1), so presumably only one of them is served at a time — confirm.
# Launch command for the Qwen server:
# vllm serve Qwen/Qwen2-VL-7B-Instruct --trust-remote-code --limit-mm-per-prompt image=16 --seed 42 --pipeline-parallel-size 2
qwen_lm = dspy.LM(model="openai/Qwen/Qwen2-VL-7B-Instruct", api_base="http://localhost:8000/v1", api_key="sk-fake-key", max_tokens=5000)
haiku_lm = dspy.LM(model="anthropic/claude-3-haiku-20240307", max_tokens=4096)
# Launch command for the Llama server:
# vllm serve meta-llama/Llama-3.2-11B-Vision-Instruct --trust-remote-code --limit-mm-per-prompt image=16 --seed 42 --enforce-eager --max-num-seqs 48
llama_lm = dspy.LM(model="openai/meta-llama/Llama-3.2-11B-Vision-Instruct", api_base="http://localhost:8000/v1", api_key="sk-fake-key", max_tokens=5000)
internlm_lm = dspy.LM(model="openai/OpenGVLab/InternVL2-8B", api_base="http://localhost:8000/v1", api_key="sk-fake-key", max_tokens=5000)
gpt_lm = dspy.LM(model="openai/gpt-4o-mini", max_tokens=5000)
# NOTE(review): internlm_lm is defined above but is not part of all_lms — confirm whether that is intentional.
all_lms = [qwen_lm, haiku_lm, llama_lm, gpt_lm]

# gpt_lm becomes the default LM for dspy calls in the cells below.
dspy.settings.configure(lm=gpt_lm)

In [4]:
import os
from huggingface_hub import snapshot_download
# Current working directory; in the visible cells it is only referenced by the disabled override below.
cwd = os.getcwd()
# Disabled: replace HF_HOME with a `.cache` directory under the working directory.
# os.environ.pop("HF_HOME", None)
# os.environ["HF_HOME"] = f"{cwd}/.cache"

In [None]:

# Download only the `documents/` folder of the MMLongBench-Doc dataset repository.
# The download is flaky: if it fails, delete `.cache` and retry until it succeeds.
# Excluding files that match "*mi_phone*" (the ignore_patterns argument below) sometimes helps.
documents_path = snapshot_download("yubo2333/MMLongBench-Doc", allow_patterns="documents/*", ignore_patterns="*mi_phone*", repo_type="dataset", max_workers=1)
# Show where the snapshot was stored locally; later cells build file paths from this value.
print(documents_path)



In [None]:
# Load the MMLongBench-Doc question set from Hugging Face via DSPy's DataLoader.
dataset = DataLoader().from_huggingface("yubo2333/MMLongBench-Doc")

# Sanity check: show the first record of the "train" split.
print(dataset["train"][0])

In [None]:
# Show the last three recorded gpt_lm interactions.
# NOTE(review): no cell above this one calls the LM, so on a fresh top-to-bottom run there is
# presumably nothing to show yet — confirm whether this cell belongs further down.
gpt_lm.inspect_history(n=3)

In [None]:
from byaldi import RAGMultiModalModel
import glob
from pathlib import Path
# Optionally, you can specify an `index_root`, which is where it'll save the index. It defaults to ".byaldi/".


def create_all_indexes(paths=None):
    """Create one byaldi index per document, skipping documents already indexed.

    Each document is indexed under ``.byaldi/mmlongbench-doc-<file stem>``. If
    that directory already exists, the document is reported and left untouched.

    Args:
        paths: Iterable of document paths to index. ``None`` (the default) is
            treated as "nothing to index" rather than raising ``TypeError``.

    Returns:
        None. Progress is reported with ``print``.
    """
    index_root = ".byaldi"
    # exist_ok=True replaces the separate existence check and tolerates the
    # directory appearing between a check and the creation.
    os.makedirs(index_root, exist_ok=True)

    if paths is None:
        return

    for document_path in paths:
        index_name = f"mmlongbench-doc-{Path(document_path).stem}"
        index_path = Path(index_root) / index_name
        if index_path.exists():
            print(f"Index {index_name} already exists at {index_path}")
            continue
        print(f"Creating index {index_name}")
        # A fresh model is loaded for every document that still needs an index.
        # NOTE(review): loading once outside the loop would save time, but only if a
        # single model instance can safely build several indexes — confirm first.
        RAG = RAGMultiModalModel.from_pretrained("vidore/colpali-v1.2", verbose=0)
        print("model loaded")
        RAG.index(
            input_path=document_path, # The path to your documents
            index_name=index_name, # The name you want to give to your index. It'll be saved at `index_root/index_name/`.
            store_collection_with_index=False, # Whether the index should store the base64 encoded documents.
            # doc_ids=[0, 1, 2], # Optionally, you can specify a list of document IDs. They must be integers and match the number of documents you're passing. Otherwise, doc_ids will be automatically created.
            # metadata=[{"author": "John Doe", "date": "2021-01-01"}], # Optionally, you can specify a list of metadata for each document. They must be a list of dictionaries, with the same length as the number of documents you're passing.
            overwrite=True, # Whether to overwrite an index if it already exists. If False, it'll return None and do nothing if `index_root/index_name` exists.
        )

import concurrent.futures
# Collect every PDF under the downloaded snapshot's `documents/` folder.
document_paths = glob.glob(f"{documents_path}/documents/*.pdf")
print(f"Creating indexes for {len(document_paths)} documents")
# NOTE(review): num_threads is only referenced by the disabled threaded variant below.
num_threads = 4
# Disabled threaded variant, step 1: split the paths into one chunk per thread.
# chunk_size = (len(document_paths) + num_threads - 1) // num_threads
# chunks = [document_paths[i:i + chunk_size] for i in range(0, len(document_paths), chunk_size)]
# Active path: index all documents sequentially in this process.
create_all_indexes(document_paths)
    # Disabled threaded variant, step 2: hand each chunk to a worker thread.
    # with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
    #     executor.map(create_all_indexes, chunks)
    # else:
    #     RAG.from_index(index_name)

In [None]:
# Document whose questions are evaluated in the cells below.
document_name = "BESTBUY_2023_10K.pdf"
# Derive the index name with Path(...).stem, the same rule create_all_indexes uses.
# Splitting on the first "." instead would produce a different (non-existent) index
# name for any file whose name contains extra dots, e.g. "a.b.pdf".
index_name = f"mmlongbench-doc-{Path(document_name).stem}"
# Load the previously built index for this document.
first_index = RAGMultiModalModel.from_index(index_name)
# print(first_index.search("What is the color of the images?"))

In [None]:
# Cache of converted PDF pages keyed by document path; filled lazily by get_evidence_pages.
# Presumably kept in its own cell so re-running the helper cell does not discard the cache — confirm.
loaded_documents = {}

In [None]:
from collections import Counter
# from pypdf import PdfReader
from pdf2image import convert_from_path
from matplotlib import pyplot as plt
from IPython.display import Image

import ast

# Keep only the questions asked about the selected document.
# doc_id is compared with document_name directly: rebuilding the file name from
# index_name via Path(...).stem + ".pdf" drops part of any name containing extra
# dots (e.g. "a.b.pdf" would be rebuilt as "a.pdf") and would then match nothing.
questions_about_index = [x for x in dataset["train"] if x["doc_id"] == document_name]

print(questions_about_index[0])
# Let's get a baseline by cheating and putting in the evidence pages as given by the dataset

def get_evidence_pages(question, document_path):
    """Return a copy of ``question`` with its evidence pages attached as images.

    The document's pages are converted to PNG images once and cached in the
    module-level ``loaded_documents`` dict, keyed by ``document_path``.

    Args:
        question: Record with an ``"evidence_pages"`` entry holding a Python
            literal (parsed with ``ast.literal_eval``) of page numbers.
        document_path: Path of the PDF the question refers to.

    Returns:
        The copied record with ``"evidence_page_images_pil"`` (page images) and
        ``"images"`` (the same pages wrapped with ``dspy.Image.from_PIL``)
        added, and with ``"images"`` and ``"question"`` marked as inputs.
    """
    enriched = question.copy()
    raw_pages = enriched["evidence_pages"]

    # Convert the PDF only on first use; later calls reuse the cached pages.
    if document_path in loaded_documents:
        page_images = loaded_documents[document_path]
    else:
        page_images = convert_from_path(document_path, fmt="png")
        loaded_documents[document_path] = page_images

    # Each listed page number is shifted down by one before indexing the page list.
    page_indices = [int(page) - 1 for page in ast.literal_eval(raw_pages)]

    pil_pages = [page_images[idx] for idx in page_indices]
    enriched["evidence_page_images_pil"] = pil_pages
    enriched["images"] = [dspy.Image.from_PIL(page_images[idx]) for idx in page_indices]
    return enriched.with_inputs("images", "question")


# Full path of the selected PDF inside the downloaded snapshot.
document_path = f"{documents_path}/documents/{document_name}"
# Attach the evidence-page images to every question about this document.
questions = [get_evidence_pages(question, document_path) for question in questions_about_index]
# Number of prepared questions.
print(len(questions))
# Image(question["evidence_page_images_pil"][0])

In [25]:
from typing import List
from dspy.evaluate.metrics import answer_exact_match

# DSPy signature for answering one question from a list of page images.
# NOTE(review): documented with `#` comments on purpose. DSPy reads a Signature's
# docstring as the prompt instructions, so adding a docstring here would change
# what is sent to the model.
class MMLongBenchDocSignature(dspy.Signature):
    # Input: the images the model may look at.
    images: List[dspy.Image] = dspy.InputField()
    # Input: the question to answer.
    question: str = dspy.InputField()
    # Output: the answer text; the field description asks for it to be succinct.
    answer: str = dspy.OutputField(desc="The answer to the question. This should be as succinct as possible.")

# Evaluator over `questions` using the exact-match metric on 10 threads.
# return_outputs=True is presumably why the call site unpacks (score, outputs) — confirm against the DSPy version in use.
evaluate = dspy.Evaluate(metric=answer_exact_match, num_threads=10, return_outputs=True, devset=questions)


In [None]:
# Baseline program: chain-of-thought prediction over the evidence-page signature.
image_description = dspy.ChainOfThought(MMLongBenchDocSignature)

# Disabled manual loop that prints each prediction next to the gold answer.
# NOTE(review): it reads question["evidence_page_images"], but get_evidence_pages stores
# the pages under "evidence_page_images_pil" and "images" — fix the key before re-enabling.
# for question in questions:
#     answer = image_description(images=question["evidence_page_images"], question=question["question"])
#     print("question:", question["question"])
#     print("Predicted answer:", answer)
#     print("Actual answer:", question["answer"])
#     print("="*100)
# devset is passed again here although `evaluate` was already constructed with devset=questions.
score, outputs = evaluate(image_description, devset=questions)
print(score)
print(outputs)

## Working above

In [9]:
from dsp.utils import deduplicate

class SimplifiedBaleen(dspy.Module):
    """Multi-hop pipeline: generate a query, retrieve passages, repeat, then answer.

    For each hop a dedicated chain-of-thought predictor turns the accumulated
    context and the question (plus images, when supplied) into a search query.
    The retrieved passages are merged into the context with duplicates removed,
    and a final chain-of-thought predictor answers from that context.

    Args:
        passages_per_hop: Number of passages requested from the retriever per hop.
        max_hops: Number of query/retrieve rounds; one query generator is
            created per hop.
    """

    def __init__(self, passages_per_hop=3, max_hops=2):
        super().__init__()

        # One independent query generator per hop.
        self.generate_query = [dspy.ChainOfThought("images: List[dspy.Image], context: str, question: str -> query: str") for _ in range(max_hops)]
        self.retrieve = dspy.Retrieve(k=passages_per_hop)
        self.generate_answer = dspy.ChainOfThought("context: str, question: str -> answer: str")
        self.max_hops = max_hops

    def forward(self, question, images=None):
        """Answer ``question`` after ``max_hops`` rounds of retrieval.

        Args:
            question: The question to answer.
            images: Optional images for the query generators, whose signature
                declares an ``images`` input. When ``None`` the field is not
                passed at all, so callers that only pass ``question`` get the
                same calls as before this parameter existed.

        Returns:
            ``dspy.Prediction`` with ``context`` (the de-duplicated passages)
            and ``answer``.
        """
        context = []

        # Inputs shared by every hop; `images` is included only when supplied.
        query_inputs = {"question": question}
        if images is not None:
            query_inputs["images"] = images

        for hop in range(self.max_hops):
            query = self.generate_query[hop](context=context, **query_inputs).query
            passages = self.retrieve(query).passages
            context = deduplicate(context + passages)

        pred = self.generate_answer(context=context, question=question)
        return dspy.Prediction(context=context, answer=pred.answer)

In [None]:

# Colour name -> six-digit hex code (no leading "#") used to build placeholder-image URLs.
colors = dict(
    White="FFFFFF",
    Red="FF0000",
    Green="00FF00",
    Blue="0000FF",
    Yellow="FFFF00",
    Cyan="00FFFF",
    Magenta="FF00FF",
    Gray="808080",
    Orange="FFA500",
    Purple="800080",
)


def get_color_image_url(color, file_extension="png"):
    """Return the placehold.co URL for a 300-size image in a single colour.

    Args:
        color: Key of the ``colors`` table (e.g. ``"Green"``); an unknown name
            raises ``KeyError``.
        file_extension: Extension appended to the URL.

    Returns:
        The URL string, with the colour's hex code used in both colour slots.
    """
    hex_code = colors[color]
    return f"https://placehold.co/300/{hex_code}/{hex_code}.{file_extension}"

# Smoke test for list-of-image inputs: two solid-colour placeholder images.
green_image = dspy.Image.from_url(get_color_image_url("Green"))
blue_image = dspy.Image.from_url(get_color_image_url("Blue"))

# Keyword arguments for the predictor; the keys match the input fields of the signature below.
inputs = {
    "images": [green_image, blue_image],
    "question": "What is the color of the images?"
}
# Inline string signature: a list of images and a question in, a list of strings out.
image_list_cot = dspy.ChainOfThought("images: List[dspy.Image], question: str -> answer: List[str]")
print(image_list_cot(**inputs))
# Show the most recent gpt_lm interaction to inspect the request that was sent.
gpt_lm.inspect_history()

In [None]:
# Show the most recent gpt_lm interaction again.
# NOTE(review): the previous cell already ends with this same call — presumably left over from debugging.
gpt_lm.inspect_history()