In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports edited modules
# before each cell runs, so changes to local packages are picked up live.
%load_ext autoreload
%autoreload 2

In [40]:
from datasets import load_dataset
from PIL import Image
from numpy.random import permutation
import os
import glob
import nest_asyncio
nest_asyncio.apply()

from llama_index.core import Settings
from llama_index.core.agent import StructuredPlannerAgent, FunctionCallingAgentWorker
from llama_index.tools.code_interpreter import CodeInterpreterToolSpec

from plotreader.document import DirectoryHandler, MultimodalDirectoryHandler

# Show which LLM llama_index is currently configured with
# (the recorded run printed claude-3-5-sonnet-20240620).
# NOTE(review): presumably the default is set when plotreader is imported — confirm.
print(Settings.llm.model)


claude-3-5-sonnet-20240620


In [3]:
# Load the "FigQA" configuration of the futurehouse/lab-bench dataset.
# The recorded output shows a single 'train' split with 181 rows.
ds = load_dataset("futurehouse/lab-bench", "FigQA")
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'ideal', 'distractors', 'canary', 'subtask', 'figure', 'figure-path'],
        num_rows: 181
    })
})

In [4]:
def get_randomized_options(sample):
    """Return the answer options for a sample in random order.

    The options are the sample's distractors plus its ideal answer.

    Args:
        sample: Mapping with a 'distractors' entry (a list of answer strings)
            and an 'ideal' entry (the correct answer).

    Returns:
        The shuffled options as returned by ``numpy.random.permutation``
        (a NumPy array), so the order differs between calls unless the
        NumPy global random state is seeded.
    """
    # Build a new list instead of appending to sample['distractors']:
    # appending in place would add another copy of the ideal answer to the
    # sample every time this function (or the calling cell) is re-run.
    options = list(sample['distractors']) + [sample['ideal']]

    return permutation(options)

# Template for the question text; the {question} and {answer_options}
# placeholders are filled in with str.format when the query is built.
figqa_prompt_template = "Question: {question}\nChoose from the following answer options: {answer_options}"

In [54]:
dl_imagedir = "./data_images"
# Remove files left in the image directory by a previous run.
# A plain loop is used because the removal is done purely for its side effect.
# NOTE(review): presumably this is where the figure handler writes extracted
# page images — confirm against plotreader.
for stale_path in glob.glob(os.path.join(dl_imagedir, "*")):
    os.remove(stale_path)

sample = ds['train'][1]
# Saving does not create missing parent directories, so make sure the target
# directory exists; otherwise this cell fails on a fresh checkout.
os.makedirs('storage/tmp', exist_ok=True)
# NOTE(review): assumes sample['figure'] is a PIL image (it exposes .save) — confirm.
sample['figure'].save('storage/tmp/tmp.pdf')

In [55]:
# Handler for the figure saved under ./storage/tmp.
# NOTE(review): the meaning of each keyword is defined by plotreader's
# MultimodalDirectoryHandler, which is not visible here — confirm there.
#
# parsing_instructions is left as None. A previously tried instruction
# prompt is kept here for reference (disabled):
#   Extract as much information and describe them so someone could potentially simulate new data and plot similar figures.
#   Attempt to extract all of the quantitative information from these figures including the values used to generate lines and other visual information.
#   Attempt to estimate the values at each plotted point (not interpolated points) along with any associated labels. Return tables of the values only.
fig_handler = MultimodalDirectoryHandler(
    name='figure',
    dirpath='./storage/tmp',
    desc='The figure to use to answer the question.',
    storage_dir='./storage',
    use_cache=False,
    parsing_instructions=None,
)

In [56]:
# Tools offered to the agent: the figure query-engine tool followed by the
# code-interpreter tools.
tools = [fig_handler.query_engine_tool(), *CodeInterpreterToolSpec().to_tool_list()]

# Worker that executes individual steps by calling the tools above.
tool_agent_worker = FunctionCallingAgentWorker.from_tools(tools, verbose=True, max_function_calls=5)

# Planner agent built on top of the worker; it receives the same tool list.
agent = StructuredPlannerAgent(tool_agent_worker, tools=tools, verbose=True)

Started parsing the file under job_id a0fb60fa-1159-4a4b-99da-b2b7c789b6d1
> Image for page 1: [{'name': 'page_1.jpg', 'height': 0, 'width': 0, 'x': 0, 'y': 0, 'type': 'full_page_screenshot'}]


In [57]:
# FIG_READER_PROMPT = """
# Your first main task is to describe the meaning of each visual signifier for each panel. 
# First, aggregate all signifiers and their various meanings across the full figure.
# Be careful, some signifiers may be used multiple times and in different ways across panels.
# If a signifier in a particular panel is ambiguous, make an educated guess based on other information in the figure 
# and other uses of that signifier.
# Make sure to save your final description of what all of the signifiers mean for each panel.
# After your first task, your second main task is to answer the following multiple choice question. 
# First, decide which panels are relevant.
# Then retrieve the signifier descriptions for those panels as a reference to help you answer the question.
# If something is ambiguous, assume you are incorrect about an assumption, not that the question/answers have an issue.
# Look to other parts of the figure or other information in the relevant panels to help correct your assumption.
# {figqa_input}
# """

In [58]:
# Prompt given to the agent: a six-step plan followed by the formatted FigQA
# question, which replaces the {figqa_input} placeholder via str.format.
# NOTE(review): the text says "first main task" but no second task follows in
# this version of the prompt — confirm the wording is intended.
FIG_READER_PROMPT = """
Your first main task is to answer the question at the end of this prompt.
Follow more or less this plan to do so:
1. Summarize the figure.
2. Determine which panels are needed to answer the question.
3. Determine which auxiliary panels might contain similar or relevant information or have similar graph styles.
4. Explain what might be ambiguous in this plot and use the information selected in (2) and (3) to resolve it.
5. Explain what every signifier means in the plot.
6. Answer the question and explain which visual features enabled you to answer it.
DO NOT ANSWER A DIFFERENT QUESTION IF YOU THINK INFORMATION IS MISSING OR THE QUESTION IS NOT WELL SPECIFIED. TRY TO FIGURE IT OUT!
{figqa_input}
"""

In [59]:

# Fill the question template with this sample's question and its shuffled
# answer options, then embed the result in the agent prompt.
figqa_prompt = figqa_prompt_template.format(
    question=sample['question'],
    answer_options=get_randomized_options(sample),
)
query = FIG_READER_PROMPT.format(figqa_input=figqa_prompt)

# Show the final prompt exactly as it will be sent to the agent.
print(query)


Your first main task is to answer the question at the end of this prompt.
Follow more or less this plan to do so:
1. Summarize the figure.
2. Determine which panels are needed to answer the question.
3. Determine which auxiliary panels might contain similar or relevant information or have similar graph styles.
4. Explain what might be ambiguous in this plot and use the information selected in (2) and (3) to resolve it.
5. Explain what every signifier means in the plot.
6. Answer the question and explain which visual features enabled you to answer it.
DO NOT ANSWER A DIFFERENT QUESTION IF YOU THINK INFORMATION IS MISSING OR THE QUESTION IS NOT WELL SPECIFIED. TRY TO FIGURE IT OUT!
Question: According to panel c, how many WT samples had 6 LD per IBA1+ cell?
Choose from the following answer options: ['5' '4' '2' '3' '6' '1' '7']



In [60]:
# Run the planner agent on the assembled prompt; the returned Response object
# is displayed as the cell output.
agent.query(query)

=== Initial plan ===
Summarize_Figure:
Use the figure_multimodal_vector_tool to query and summarize the overall content of the figure. -> A brief summary of the figure, including the number of panels and their general content.
deps: []


Identify_Relevant_Panels:
Analyze the figure summary to determine which panel(s) are needed to answer the question about WT samples with 6 LD per IBA1+ cell in panel c. -> Identification of panel c as the primary relevant panel, and any other panels that might provide context.
deps: ['Summarize_Figure']


Analyze_Panel_C:
Use the figure_multimodal_vector_tool to closely examine panel c, focusing on the WT samples and the data point for 6 LD per IBA1+ cell. -> Detailed description of panel c, including the data representation for WT samples and the specific data point for 6 LD per IBA1+ cell.
deps: ['Identify_Relevant_Panels']


Count_WT_Samples:
Based on the analysis of panel c, count the number of WT samples that had 6 LD per IBA1+ cell. -> The exact 

Response(response='To answer the question "According to panel c, how many WT samples had 6 LD per IBA1+ cell?", let\'s compile the findings from our previous analyses:\n\n1. In our initial analysis, we noted that there appeared to be no data points exactly at 6 LDs per cell for WT samples.\n\n2. However, upon careful reexamination specifically focused on this question, we found that there are exactly 2 WT samples (data points) that have precisely 6 lipid droplets (LDs) per IBA1+ cell.\n\n3. The visual features of panel C, including the clear y-axis labeling, the blue color-coding for WT samples, and the precise scaling of the y-axis, allowed for the identification of these specific data points.\n\n4. The dots representing these samples align exactly with the y-axis value of 6 in the WT (left) column of the dot plot.\n\nGiven this information, we can confidently answer the question:\n\nAccording to panel c, 2 WT samples had 6 LD per IBA1+ cell.\n\nThis answer is based on the most detail

In [61]:
# Ground-truth answer for this sample, shown for comparison with the agent's response.
sample['ideal']

'3'