In [None]:
import pandas as pd

# Load the main MathVerse "testmini" split from the Hugging Face Hub.
df = pd.read_parquet("hf://datasets/AI4Math/MathVerse/testmini.parquet")

# OPTIONAL: the "text only" split. Remove the next two statements if you do
# not want the text-only problems appended to the main split.
df2 = pd.read_parquet("hf://datasets/AI4Math/MathVerse/testmini_text_only.parquet")
# ignore_index=True relabels the rows 0..n-1; otherwise each split keeps its
# own row labels and the combined index can contain duplicates.
df = pd.concat([df, df2], ignore_index=True)

# Strip parentheses from the answers. regex=False states explicitly that "("
# and ")" are literal characters rather than regular-expression syntax.
# NOTE(review): this removes parentheses from every answer, not only from
# option letters -- confirm that is intended for free-form answers.
df["answer"] = (
    df["answer"]
    .str.replace("(", "", regex=False)
    .str.replace(")", "", regex=False)
)


In [None]:
# Display the combined DataFrame to inspect what was loaded.
df

In [None]:
## NOTE: we chose to save the mathverse images locally so it integrates seamlessly with evaluate_MLLMs.py. This is optional. 

import ast
from PIL import Image
import io


def process_image_column(row):
    """Decode one stringified ``image`` cell into a PIL image.

    Parameters
    ----------
    row : str
        Text form of an image record, expected to be a dict literal with a
        ``'bytes'`` key, e.g. ``"{'bytes': b'...', 'path': None}"``.

    Returns
    -------
    PIL.Image.Image or None
        The decoded image, or ``None`` when ``row`` is not a string that
        looks like a dict literal, cannot be parsed as a plain literal, or
        carries no image bytes.
    """
    if not isinstance(row, str) or not row.startswith("{") or not row.endswith("}"):
        return None  # Ignore invalid rows

    # ast.literal_eval only accepts Python literals, so unlike eval() it
    # cannot run code embedded in the cell text.
    try:
        image_data = ast.literal_eval(row)
    except (ValueError, TypeError, SyntaxError):
        return None  # Brace-delimited, but not a plain literal

    if not isinstance(image_data, dict):
        return None  # e.g. a set literal such as "{1, 2}"

    image_bytes = image_data.get("bytes")
    if not image_bytes:
        return None

    return Image.open(io.BytesIO(image_bytes))

# Stringify each image cell first: process_image_column only handles the
# textual form of a record and returns None for anything else.
df["processed_image"] = df["image"].astype(str).apply(process_image_column)

In [None]:
import os
from PIL import Image
import numpy as np

# Directory (relative to the current working directory) that holds the
# exported images. exist_ok=True makes re-running this cell safe when the
# directory is already there.
output_directory = "images/mathverse_images"
os.makedirs(output_directory, exist_ok=True)

# Placeholder image used for rows that have no picture of their own
def create_white_image(output_directory):
    """Ensure a white 224x224 RGB PNG exists in ``output_directory``.

    The file is named ``white_image.png`` and is written only when it is not
    already present.

    Returns
    -------
    str
        Path of the placeholder image.
    """
    target = os.path.join(output_directory, "white_image.png")
    if os.path.exists(target):
        # Already generated earlier; keep the existing file untouched.
        return target

    pixels = np.full((224, 224, 3), 255, dtype=np.uint8)
    Image.fromarray(pixels).save(target)
    return target

# Shrink oversized images; anything within the limits is left as it is
def resize_image(image, max_size=(1024, 1024)):
    """Downscale ``image`` in place when it exceeds ``max_size``.

    Parameters
    ----------
    image : PIL.Image.Image
        Image to check; it is modified in place by ``thumbnail`` when either
        dimension exceeds the limit.
    max_size : tuple
        ``(max_width, max_height)`` limit, 1024x1024 by default.

    Returns
    -------
    The same ``image`` object that was passed in.
    """
    width, height = image.size[0], image.size[1]
    fits = width <= max_size[0] and height <= max_size[1]
    if fits:
        return image

    image.thumbnail(max_size, Image.LANCZOS)  # LANCZOS replaces the old ANTIALIAS name
    return image

# Create the white placeholder image and get its path
white_image_path = create_white_image(output_directory)

# One path per DataFrame row, in row order
image_paths = []

# Save images and populate paths
for idx, image in enumerate(df["processed_image"], start=1):
    if image is None:
        # Rows without an image share the dedicated white placeholder
        image_paths.append(white_image_path)
        continue

    # File names are 1-based positions in the DataFrame
    image_path = os.path.join(output_directory, f"image_{idx}.png")
    if not os.path.exists(image_path):
        # Write the file so the stored path really points at an image.
        # Existing files are kept, which makes re-running this cell cheap;
        # delete the output directory to force regeneration.
        resize_image(image).save(image_path)
    image_paths.append(image_path)

# Add the paths to a new column in the DataFrame
df["path"] = image_paths

print(f"Images saved to {output_directory} and paths added to DataFrame.")


In [None]:
# Columns written to the CSV: selected dataset fields plus the local image
# path. The raw and decoded image columns are left out.
export_columns = [
    "sample_index",
    "problem_index",
    "problem_version",
    "question",
    "answer",
    "question_type",
    "metadata",
    "query_wo",
    "query_cot",
    "question_for_eval",
    "path",
]
df[export_columns].to_csv("mathverse.csv", index=False)

In [None]:
################## Pre-processing for VC-CoT experiments ##################

In [None]:
import pandas as pd

# We use the multiple-choice, vision-dominant split with a letter answer;
# you can change these filters as you see fit!
df = (
    pd.read_csv("mathverse.csv")
    .loc[lambda frame: frame["problem_version"] == "Vision Dominant"]
    .loc[lambda frame: frame["question_type"] == "multi-choice"]
    .loc[lambda frame: frame["answer"].isin(["A", "B", "C", "D", "E", "F"])]
)

In [None]:
# Independent copies of the filtered frame, one per additional prompt
# variant, so that each variant gets its own "prompt" and "type" columns.
df1, df2 = df.copy(), df.copy()

In [None]:
# MathVerse CoT: use the chain-of-thought query stored in the "query_cot" column
df = df.assign(prompt=df["query_cot"], type="mathverse_cot")

In [None]:
# Direct prompting: use the query stored in the "query_wo" column
df1 = df1.assign(prompt=df1["query_wo"], type="direct")

In [None]:
# Example of VC-CoT. You can tweak this based on the needs of different models.

# Instruction text placed in front of every question
vc_cot_instruction = (
    "Examine the mathematical diagram carefully, noting all present shapes, numbers, and letters. "
    "Establish their spatial and numerical relationships, and apply logical reasoning to determine the correct answer. "
    "Conclude your response by providing the correct option letter, e.g., A, B, C, D, at the end. "
)
df2["prompt"] = vc_cot_instruction + df2["question"]
df2["type"] = "VC-CoT"

#Other examples that work well: 
#"First, identify and list all the provided visual cues. Next, infer any missing details using geometric principles. Answer the question and provide the correct option letter, e.g., A, B, C, D, at the end. "
#"Carefully inspect the visual representation, identifying its mathematical components including numbers, figures, and key relationships. Follow a structured approach to analyze these elements, derive necessary conclusions, and provide the final answer by stating the correct option letter, e.g., A, B, C, D, at the end." 

In [None]:
# Stack the three prompt variants (mathverse_cot, direct, VC-CoT) into one
# frame; the "type" column tells them apart.
df_final = pd.concat([df, df1, df2])

In [None]:
# Write the combined prompt table to disk without the DataFrame index.
df_final.to_csv("mathverse_revised.csv", index=False)