### Once you have run evaluate_MLLMs.py for all models (or only the ones you want),
### you can modify the code below to combine their outputs and calculate accuracies.

In [None]:
import os
import pandas as pd

#MODIFY THIS BASED ON THE TASKS YOU WANT TO EVALUATE
base_dfs = {"shape_id":None, "sides_id": None, "two_shapes": None, "abstract": None}  

#MODIFY THIS BASED ON THE MODELS YOU WANT TO EVALUATE
model_columns = {
    "llava-1.5": "generate_text_llava_1.5",
    "llava-1.6": "generate_text_llava_1.6",
    "qwen": "generate_text_qwen",
    "internvl": "generate_text_internvl",
    "llava-one": "generate_text_llava-one",
    "llama-3.2": "generate_text_llama-3.2",
    "gpt-4-turbo": "generate_text_gpt-4-turbo",
    "gpt-4o": "generate_text_gpt-4o",
    "janus": "generate_text_janus",
    "molmo": "generate_text_molmo",
    "math-llava": "generate_text_math-llava",
    "g-llava": "generate_text_g-llava",
    "math-puma": "generate_text_math-puma",
    
}
# Per-task store of each model's `generated_text` Series, keyed by output column name.
additional_columns = {task: {col: None for col in model_columns.values()} for task in base_dfs.keys()}
# Full DataFrame loaded for each (task, output column). The base frame is chosen only
# after every file has been read, so the result does not depend on directory order.
loaded_frames = {task: {} for task in base_dfs}

# Scan the current directory for per-model result files
for file in os.listdir("."):
    if not file.endswith(".csv") or "text_only" in file:
        continue
    for task in base_dfs:
        if task not in file:
            continue
        for model_version, column_name in model_columns.items():
            if f"{model_version}_" not in file:
                continue
            print(f"Processing {file} for {task}...")
            df = pd.read_csv(file)
            print(len(df))
            loaded_frames[task][column_name] = df
            additional_columns[task][column_name] = df["generated_text"]

# Build and save one combined DataFrame per task
preferred_base_column = model_columns.get("llava-1.5")
for task in base_dfs:
    frames = loaded_frames[task]
    if not frames:
        print(f"No result files found for {task}; nothing to combine.")
        continue

    # llava-1.5 supplies the shared (non-model) columns when it is available.
    # If it was removed from model_columns or its file is missing, fall back to the
    # first configured model that has a file instead of silently dropping the task.
    if preferred_base_column in frames:
        base_column = preferred_base_column
    else:
        base_column = next(col for col in model_columns.values() if col in frames)

    base_df = frames[base_column].copy()
    base_df.rename(columns={"generated_text": base_column}, inplace=True)

    # Append every other model's generated text as its own column
    for column_name in model_columns.values():
        if column_name == base_column or column_name not in frames:
            continue
        column_data = additional_columns[task][column_name]
        if len(column_data) != len(base_df):
            # Assignment aligns on the index, so a length mismatch would otherwise
            # only show up as unexplained NaN or missing rows.
            print(
                f"Warning: {column_name} has {len(column_data)} rows but the base "
                f"for {task} has {len(base_df)}; rows are aligned by index."
            )
        base_df[column_name] = column_data

    base_dfs[task] = base_df

    # Save the final combined DataFrame for each task
    output_file = f"final_combined_{task}_generated_texts.csv"
    base_df.to_csv(output_file, index=False)
    print(f"Saved combined DataFrame for {task} to '{output_file}'.")


In [None]:
import pandas as pd

# Define file names for the combined outputs
two_shapes_file = "final_combined_two_shapes_generated_texts.csv"
shape_id_file = "final_combined_shape_id_generated_texts.csv"
sides_id_file = "final_combined_sides_id_generated_texts.csv"

# Load the DataFrames
two_shapes_df = pd.read_csv(two_shapes_file)
shape_id_df = pd.read_csv(shape_id_file)
sides_id_df = pd.read_csv(sides_id_file)

In [None]:
########################### shape_id ###########################

In [None]:
# Shape names that are treated as equivalent to "square" when scoring shape_id answers.
words_to_replace = ["diamond", "rhombus", "quadrilateral"]
replacement_word = "square"

# Process only columns that start with 'generate_text_' (the per-model answer columns)
for col in shape_id_df.columns:
    if col.startswith("generate_text_"):
        shape_id_df[col] = (
            shape_id_df[col]
            .astype(str)  # Convert all values to string (non-string cells are stringified too)
            .str.lower()   # Lowercase the text
            # regex=True makes each key a pattern that is replaced wherever it occurs
            # inside the text, not only when the whole cell equals the word.
            .replace({word: replacement_word for word in words_to_replace}, regex=True)
        )

In [None]:
from difflib import SequenceMatcher

# Define a function to calculate inclusion accuracy
def is_inclusive_match(true_shape, predicted_shape):
    """
    Return True if the predicted text mentions the true shape.

    Both values are stripped and lower-cased, then the true shape is searched
    for as a substring of the prediction. Missing values (NaN/None) on either
    side count as a miss.

    One generic answer that lists several shapes at once is never accepted:
    it would otherwise match "circle", "square" and "triangle" by substring
    even though it does not identify the shape.
    """
    if pd.isna(predicted_shape) or pd.isna(true_shape):  # Handle NaN cases
        return False

    predicted_shape = str(predicted_shape).strip().lower()
    true_shape = str(true_shape).strip().lower()

    # Exclude the generic sentence from inclusion accuracy. This has to run
    # before the substring test, otherwise the sentence is accepted first.
    if predicted_shape == "the image contains a variety of shapes, including circles, squares, and triangles.":
        return False

    return true_shape in predicted_shape

# Models to evaluate
models = ['generate_text_llava_1.5', 'generate_text_llava_1.6',
          'generate_text_qwen', 'generate_text_internvl', 'generate_text_llava-one',
          'generate_text_llama-3.2', 'generate_text_gpt-4-turbo', 'generate_text_gpt-4o',
          'generate_text_molmo', 'generate_text_janus',
          "generate_text_math-llava","generate_text_g-llava", "generate_text_math-puma" ]

# Create correctness, inclusion correctness, and extended accuracy columns for each model
# Flag, for every model, exact matches and inclusive (substring) matches
for model in models:
    exact_col = f'{model}_is_correct'
    inclusive_col = f'{model}_is_inclusive'

    cleaned_prediction = shape_id_df[model].str.strip().str.lower()
    shape_id_df[exact_col] = shape_id_df['shape'] == cleaned_prediction
    shape_id_df[inclusive_col] = shape_id_df.apply(
        lambda row: is_inclusive_match(row['shape'], row[model]), axis=1
    )

# Per-model tables of accuracy broken down by true shape
model_correctness = {}

for model in models:
    flag_columns = [f'{model}_is_correct', f'{model}_is_inclusive']

    # Mean of the boolean flags within each shape = accuracy for that shape
    per_shape = shape_id_df.groupby('shape')[flag_columns].mean()
    per_shape.columns = ['Exact Accuracy', 'Inclusion Accuracy']

    # Turn the shape index back into a regular column before storing
    model_correctness[model] = per_shape.reset_index()

# Display the correctness and inclusion accuracy per model
for model, correctness_df in model_correctness.items():
    print(f"\nAccuracy for {model}:")
    print(correctness_df[['shape','Inclusion Accuracy']])


In [None]:
########################### sides_id ###########################

In [None]:
sides_id_df

In [None]:
import re
def extract_number(text):
    """
    Return the first whole number found in *text* as a digit string.

    Spelled-out numbers from "one" to "ten" (any letter case) are converted to
    digits first, so "Five sides" yields "5". Returns "" when no number is
    found or when the value is missing (None/NaN). Other non-string values,
    such as numeric cells read from a CSV, are converted to text first.
    """
    # Missing cells carry no number; NaN is the only float unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)

    words_to_numbers = {
        "one": "1", "two": "2", "three": "3", "four": "4", "five": "5", "six": "6",
        "seven": "7", "eight": "8", "nine": "9", "ten": "10"
    }

    # Replace word numbers with digits in a single pass
    word_pattern = r'\b(' + '|'.join(words_to_numbers) + r')\b'
    text = re.sub(
        word_pattern,
        lambda found: words_to_numbers[found.group(1).lower()],
        text,
        flags=re.IGNORECASE,
    )

    # Extract the first numeric value in the string
    match = re.search(r'\b\d+\b', text)
    return match.group(0) if match else ""  # Return extracted number or empty string



In [None]:
# Apply extract_number to all columns starting with "generate_text"
# DataFrame.apply hands each whole column (a Series) to the function, which
# extract_number cannot process with `re`. Map it over the individual cells of
# every generated-text column instead.
for col in sides_id_df.columns[sides_id_df.columns.str.startswith("generate_text")]:
    # astype(str) guarantees each cell is text (missing values included)
    sides_id_df[col] = sides_id_df[col].astype(str).map(extract_number)


In [None]:
models = ['generate_text_llava_1.5', 'generate_text_llava_1.6',
          'generate_text_qwen', 'generate_text_internvl', 'generate_text_llava-one',
          'generate_text_llama-3.2', 'generate_text_gpt-4-turbo', 'generate_text_gpt-4o',
          'generate_text_molmo', 'generate_text_janus',
          "generate_text_math-llava","generate_text_g-llava", "generate_text_math-puma" ]

#'generate_text_llava-one',
shape_to_sides = {
    "triangle": 3,
    "square": 4,
    "pentagon": 5,
    "hexagon": 6,
    "heptagon": 7,
    "octagon": 8
}

# Create a new column "sides" based on the "shape" column
sides_id_df["sides"] = sides_id_df["shape"].str.lower().map(shape_to_sides)

# Function to compute accuracy between "sides" and model predictions
def compute_sides_accuracy(df, model_columns, sides_column):
    """
    Overall side-count accuracy of each model.

    Each column named in `model_columns` is converted to numbers (values that
    cannot be parsed become NaN and therefore never match) and compared
    row-wise with `df[sides_column]`.

    Returns a DataFrame indexed by model column with one column, 'Accuracy',
    holding the fraction of matching rows.
    """
    truth = df[sides_column]
    scores = {
        name: (pd.to_numeric(df[name], errors='coerce') == truth).mean()
        for name in model_columns
    }
    return pd.DataFrame.from_dict(scores, orient='index', columns=['Accuracy'])

# Compute accuracy for each model
sides_accuracy_df = compute_sides_accuracy(sides_id_df, models, "sides")
sides_accuracy_df

In [None]:
def compute_per_shape_accuracy(df, model_columns, shape_column, sides_column):
    """
    Side-count accuracy of each model, split by true shape.

    Predictions are converted to numbers (unparseable values become NaN and
    never match) and compared with `df[sides_column]` inside every group of
    rows that share a value of `df[shape_column]`.

    Returns a DataFrame with one row per model column and one column per
    shape, in order of first appearance of the shapes.
    """
    shapes = df[shape_column].unique()
    # Row labels belonging to each shape, computed once and reused per model
    row_labels = {shape: df.index[df[shape_column] == shape] for shape in shapes}
    truth = df[sides_column]

    table = {shape: {} for shape in shapes}
    for name in model_columns:
        numeric = pd.to_numeric(df[name], errors='coerce')
        for shape, labels in row_labels.items():
            table[shape][name] = (numeric.loc[labels] == truth.loc[labels]).mean()

    # Outer keys (shapes) become columns, inner keys (models) become rows
    return pd.DataFrame(table)


# Compute accuracy per shape per model
per_shape_accuracy_df = compute_per_shape_accuracy(sides_id_df, models, "shape", "sides")

per_shape_accuracy_df

In [None]:
########################### two_shapes ###########################

In [None]:
import pandas as pd

# Define file names for the combined outputs
two_shapes_file = "final_combined_two_shapes_generated_texts.csv"
two_shapes_df = pd.read_csv(two_shapes_file)

In [None]:
columns_to_process = ['generate_text_llava_1.5', 'generate_text_llava_1.6', 'generate_text_qwen',
                      'generate_text_internvl', 'generate_text_llava-one', 'generate_text_llama-3.2',
                      'generate_text_gpt-4-turbo', 'generate_text_gpt-4o', 'generate_text_janus', 'generate_text_molmo',
                     'generate_text_math-puma', 'generate_text_math-llava', 'generate_text_g-llava']

# Replace "diamond", "quadrilateral", and "rhombus" with "square" in the specified columns
for column in columns_to_process:
    two_shapes_df[column] = two_shapes_df[column].str.replace(
        r'\b(diamond|quadrilateral|rhombus)\b', 'square', case=False, regex=True
    )

In [None]:
two_shapes_df["generate_text_janus"] = two_shapes_df["generate_text_janus"].str.split("\n\n### Final Answer:").str[0]
two_shapes_df["generate_text_janus"][0]

In [None]:
import re
# Refine function to clean up redundant line breaks
def combine_steps_final_cleaned(text):
    """
    Fold a four-step answer into three steps.

    When the text contains a "\\n4." step (and does not simply end in "4."),
    the content of step 3 is appended to the text before it and the "\\n4."
    marker is renumbered to "\\n3.". Otherwise the text is returned stripped,
    with runs of line breaks collapsed to a single one.

    Non-string values (for example NaN for a missing answer) are returned
    unchanged.
    """
    if not isinstance(text, str):
        return text

    has_step_four = "\n4." in text and not text.endswith("4.")

    # Only merge when there really is a step 3 to merge. Without this guard a
    # text with "4." but no "3." raised IndexError.
    if has_step_four and "\n3." in text:
        # Split on the first "3." only so nothing after a later "\n3." is lost
        before_three, after_three = text.split("\n3.", 1)
        combined_part = before_three.strip() + " " + after_three.strip()
        # Renumber "4." to "3." now that the old step 3 has been merged away
        return combined_part.replace("\n4.", "\n3.").strip()

    # No usable step 4/3 pair: return the text with redundant line breaks removed
    return re.sub(r'\n+', '\n', text.strip())

# Apply the refined function to the DataFrame
two_shapes_df['generate_text_qwen'] = two_shapes_df['generate_text_qwen'].apply(combine_steps_final_cleaned)

# Display the updated text
two_shapes_df['generate_text_qwen'].iloc[10]


In [None]:
two_shapes_df['generate_text_janus'] = two_shapes_df['generate_text_janus'].apply(combine_steps_final_cleaned)


In [None]:
two_shapes_df['ground_truth_shapes'] = two_shapes_df.apply(lambda row: [row['shape1'], row['shape2']], axis=1)
two_shapes_df['ground_truth_shapes'] = two_shapes_df['ground_truth_shapes'].apply(sorted)

# Step 2: Create a dictionary for the number of sides each shape has
shape_sides_dict = {
    'triangle': 3,
    'square': 4,
    'rectangle': 4,
    'circle': 0,  # Assuming circles have no sides
    'ellipse': 0,  # Assuming ellipses have no sides
    'pentagon': 5,
    'hexagon': 6
}

# Step 3: Count the total number of sides based on the ground truth shapes
def calculate_total_sides(shapes, sides_dict):
    """Sum the side counts of `shapes`; shapes missing from `sides_dict` add nothing."""
    total = 0
    for shape in shapes:
        if shape in sides_dict:
            total += sides_dict[shape]
    return total

two_shapes_df['total_ground_truth_sides'] = two_shapes_df['ground_truth_shapes'].apply(
    lambda shapes: calculate_total_sides(shapes, shape_sides_dict)
)

# Display the updated dataframe with relevant columns
two_shapes_df[['ground_truth_shapes', 'total_ground_truth_sides']].head()


In [None]:
import pandas as pd
import re
from collections import Counter

# Shape names that are recognised in a model's answer
common_shapes = ["triangle", "square", "rectangle", "circle", "ellipse", "pentagon", "hexagon", "octagon", "diamond", "heptagon", "cube"]

def extract_shapes(text):
    """
    Count the shapes named in step 1 of a model answer.

    Only the text between "1." and "2." is inspected. Each name in
    `common_shapes` is counted as a whole word, so a plural such as
    "triangles" is not counted as "triangle".

    Returns a dict mapping shape name to its number of occurrences (shapes
    that do not occur are omitted). Returns {} when the "1." ... "2." section
    is absent or when `text` is not a string (e.g. NaN for a missing answer).
    """
    if not isinstance(text, str):
        return {}

    # Extract text between "1." and "2."
    match = re.search(r"1\.\s(.*?)\s2\.", text, re.DOTALL)
    if match is None:
        return {}

    shape_text = match.group(1).lower()

    # Count exact whole-word matches of every known shape, keeping only hits
    shape_counts = {}
    for shape in common_shapes:
        occurrences = len(re.findall(fr'\b{shape}\b', shape_text))
        if occurrences:
            shape_counts[shape] = occurrences
    return shape_counts

def ground_truth_to_counter(shape_list):
    """Return a plain dict mapping each shape in `shape_list` to how often it occurs."""
    tally = Counter()
    tally.update(shape_list)
    return dict(tally)

def compare_counts(row):
    """
    Tell whether the extracted shape counts equal the ground truth.

    `row` must provide 'ground_truth_shapes' (a list of shape names) and
    'shape_counts' (a dict of extracted counts); the result is True only when
    the two describe exactly the same shapes with the same multiplicities.
    """
    expected_counts = ground_truth_to_counter(row['ground_truth_shapes'])
    extracted_counts = row['shape_counts']
    return extracted_counts == expected_counts

# Process for every model in two_shapes_df
models = ['generate_text_llava_1.5', 'generate_text_llava_1.6',
          'generate_text_qwen', 'generate_text_internvl', 'generate_text_llava-one',
          'generate_text_llama-3.2', 'generate_text_gpt-4-turbo',
          'generate_text_gpt-4o', 'generate_text_janus', 'generate_text_molmo',
         'generate_text_math-puma', 'generate_text_math-llava', 'generate_text_g-llava']

# Initialize results dictionary to hold correct counts per model
results = {}

for model in models:
    print(f"\nProcessing model: {model}")

    counts_col = f'{model}_shape_counts'
    first_step_col = f'{model}_correct_first_step'

    # Shapes (with multiplicity) that the model named in step 1 of its answer
    two_shapes_df[counts_col] = two_shapes_df[model].apply(extract_shapes)

    # Step 1 is correct when those counts equal the ground-truth shapes
    two_shapes_df[first_step_col] = [
        compare_counts({'ground_truth_shapes': truth, 'shape_counts': counts})
        for truth, counts in zip(two_shapes_df['ground_truth_shapes'], two_shapes_df[counts_col])
    ]

    # Keep a per-model view for inspection
    results[model] = two_shapes_df[[counts_col, 'ground_truth_shapes', first_step_col]]

# Display results for each model
for model, result_df in results.items():
    print(f"\nResults for {model}:")
    print(result_df)


In [None]:
two_shapes_df["generate_text_janus"] = two_shapes_df["generate_text_janus"].str.replace("\n", " ").str.replace("    ", "")#[4]
#For each shape, specify the number of sides it has:
two_shapes_df["generate_text_janus"] = two_shapes_df["generate_text_janus"].str.replace("For each shape, specify the number of sides it has:", "")

In [None]:
#### total number of sides:
two_shapes_df["generate_text_llama-3.2"] = two_shapes_df["generate_text_llama-3.2"].str.replace("total number of sides:", " ")
two_shapes_df["generate_text_llama-3.2"] = two_shapes_df["generate_text_llama-3.2"].str.replace("infinite", "10000")
two_shapes_df["generate_text_llama-3.2"] = two_shapes_df["generate_text_llama-3.2"].str.replace(":", ".")


In [None]:
import pandas as pd
import re

# Reference number of sides for every shape name that can be extracted
shape_sides_reference = {
    "triangle": 3, "square": 4, "rectangle": 4, "circle": 0,
    "pentagon": 5, "hexagon": 6, "ellipse": 0, "diamond": 4, "cube": 12, "heptagon": 7, "octagon": 8
}

# Spelled-out numbers mapping
number_mapping = {
    "zero": 0, "one": 1, "two": 2, "three": 3, "four": 4,
    "five": 5, "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10
}

# Function to convert spelled-out numbers to integers
def convert_to_number(word):
    return number_mapping.get(word, None)  # Returns None if not in mapping

# A shape name (singular or plural) followed, on the same line, by the first
# number that appears after it. Compiled once instead of on every call.
_SHAPE_WITH_SIDES_RE = re.compile(
    r"\b(triangle|triangles|square|squares|rectangle|rectangles|circle|circles|pentagon|pentagons|hexagon|hexagons|ellipse|ellipses|diamond|diamonds|cube|cubes|heptagon|heptagons|octagon|octagons)\b.*?(\d+)"
)

def extract_shapes_and_sides(text, ground_truth_shapes):
    """
    Extract the (shape, sides) statements from step 2 of a model answer.

    Only the text between "2." and "3." is inspected. Every shape name found
    there is paired with the first number following it; plural names are
    reduced to the singular and repeated shapes have their sides added up.

    Returns a tuple `(shape_sides_dict, correct)`:
      * `shape_sides_dict` maps shape name to the summed sides stated for it;
      * `correct` is False as soon as one stated number differs from
        `shape_sides_reference`, and False when the step-2 section is missing
        or `text` is not a string (e.g. NaN for a missing answer).

    If the section contains "no shapes", the result is `({}, True)` only when
    `ground_truth_shapes` is empty.

    NOTE(review): when the section exists but contains no recognisable
    shape/number pair, the result is `({}, True)`. Confirm that is intended.
    """
    if not isinstance(text, str):
        return {}, False

    # Extract text between "2." and "3."
    match = re.search(r"2\.(.*?)3\.", text, re.DOTALL)
    if not match:
        return {}, False  # No match found

    shapes_text = match.group(1).lower()

    # Check if the text explicitly states "no shapes"
    if "no shapes" in shapes_text:
        return {}, len(ground_truth_shapes) == 0  # Correct if ground truth also has no shapes

    shape_sides_dict = {}
    correct = True

    for shape, sides in _SHAPE_WITH_SIDES_RE.findall(shapes_text):
        # Convert plural to singular for consistency
        if shape.endswith("s"):
            shape = shape[:-1]

        sides = int(sides)  # the pattern only captures digits

        # Repeated mentions of a shape accumulate their sides
        shape_sides_dict[shape] = shape_sides_dict.get(shape, 0) + sides

        # Validate each individual statement against the reference
        if shape in shape_sides_reference and sides != shape_sides_reference[shape]:
            correct = False

    return shape_sides_dict, correct


# Process for every model in `two_shapes_df`
for model in models:
    print(f"\nProcessing model: {model}")
    
    # Apply extraction and validation to each model's generated text
    two_shapes_df[f'{model}_shape_sides_dict'], two_shapes_df[f'{model}_correct_second_step'] = zip(
        *two_shapes_df.apply(
            lambda row: extract_shapes_and_sides(row[model], row['ground_truth_shapes']), axis=1
        )
    )

# Display results for each model
for model in models:
    print(f"\nResults for {model}:")
    print(two_shapes_df[[model, f'{model}_shape_sides_dict', f'{model}_correct_second_step']])


In [None]:
import pandas as pd
import re

# Spelled-out numbers understood when reading a total from text
number_mapping = {
    "zero": 0, "one": 1, "two": 2, "three": 3, "four": 4,
    "five": 5, "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10
}

def extract_total_sides(text):
    """
    Extract the total number of sides stated in a model answer.

    Lookup order:
      1. "The total number of sides is <digits>" (case-insensitive);
      2. "The total number of sides is <word>" where the word is a spelled-out
         number from zero to ten;
      3. otherwise the last number mentioned anywhere in the text, as digits
         or as a whole spelled-out word.

    Returns the number as an int, or None when nothing is found or `text` is
    not a string (e.g. NaN for a missing answer).
    """
    if not isinstance(text, str):
        return None

    # Match numeric total sides
    match = re.search(r"The total number of sides is (\d+)\.?", text, re.IGNORECASE)
    if match:
        return int(match.group(1))

    # Match verbal total sides
    match_verbal = re.search(r"The total number of sides is (\w+)\.?", text, re.IGNORECASE)
    if match_verbal:
        verbal_number = match_verbal.group(1).lower()
        if verbal_number in number_mapping:
            return number_mapping[verbal_number]
        # The word after "is" was not a number (e.g. "is therefore 7"), so keep
        # looking instead of giving up with None.

    # Fallback: the last number in the text. Spelled-out numbers must be whole
    # words so that "done" or "often" are not read as "one" or "ten".
    all_numbers = re.findall(
        r"\d+|\b(?:zero|one|two|three|four|five|six|seven|eight|nine|ten)\b", text, re.IGNORECASE
    )
    if all_numbers:
        last_number = all_numbers[-1].lower()
        return int(last_number) if last_number.isdigit() else number_mapping.get(last_number)

    return None

# Function to check if the addition is correct based on shape_sides_dict
def is_math_correct(row, model):
    """
    Check the model's addition against its own per-shape side counts.

    Reads `row[f'{model}_shape_sides_dict']` and `row[f'{model}_extracted_sum']`
    and returns whether the extracted sum equals the sum of the dict's values
    (None values count as 0). Returns False when the dict is empty/None or the
    extracted sum is None.
    """
    per_shape_sides = row[f'{model}_shape_sides_dict']
    reported_total = row[f'{model}_extracted_sum']

    if per_shape_sides and reported_total is not None:
        expected_total = sum(0 if sides is None else sides for sides in per_shape_sides.values())
        return reported_total == expected_total

    # Nothing to add up, or no total was reported
    return False
    

# Process for every model in `two_shapes_df`
for model in models:
    print(f"\nProcessing model: {model}")
    
    # Extract total number of sides for each model's generated text
    two_shapes_df[f'{model}_extracted_sum'] = two_shapes_df[model].apply(extract_total_sides)
    
    # Compare extracted sum with total ground truth sides
    two_shapes_df[f'{model}_correct_third_step'] = two_shapes_df[f'{model}_extracted_sum'] == two_shapes_df['total_ground_truth_sides']
    
    # Validate if the addition is correct based on shape_sides_dict
    two_shapes_df[f'{model}_correct_third_based_second'] = two_shapes_df.apply(
        lambda row: is_math_correct(row, model), axis=1
    )

# Display results for each model
for model in models:
    print(f"\nResults for {model}:")
    print(two_shapes_df[
        [model, f'{model}_extracted_sum', 'total_ground_truth_sides', f'{model}_correct_third_step', f'{model}_correct_third_based_second']
    ])


In [None]:
# Phrases that mark an answer as a refusal / non-answer (matched case-insensitively)
invalid_phrases = [
    "the image contains no shapes",
    "i'm sorry",
    "there is no image provided for analysis"
]

# Convert invalid phrases to a single regex pattern (case insensitive)
invalid_pattern = re.compile('|'.join(map(re.escape, invalid_phrases)), re.IGNORECASE)

def invalidate_steps(row, model):
    """
    Return the four step-correctness flags of `model` for one row.

    If the model's answer contains any of `invalid_phrases`, every flag is
    forced to False; otherwise the flags already stored in `row` are returned
    unchanged. The result is a dict keyed by the flag column names.
    """
    step_columns = [
        f'{model}_correct_first_step',
        f'{model}_correct_second_step',
        f'{model}_correct_third_step',
        f'{model}_correct_third_based_second',
    ]

    # str() keeps a missing answer (NaN) from raising inside the regex search,
    # matching how the invalid-row filter elsewhere in this notebook reads text.
    is_invalid = invalid_pattern.search(str(row[model])) is not None

    return {col: False if is_invalid else row[col] for col in step_columns}

# Apply the invalidation for each model
for model in models:
    updated_steps = two_shapes_df.apply(lambda row: invalidate_steps(row, model), axis=1)
    updated_steps_df = pd.DataFrame(updated_steps.tolist())  # Convert to a DataFrame
    two_shapes_df.update(updated_steps_df)  # Update the original DataFrame with invalidated steps


In [None]:
invalid_phrases = [
    "the image contains no shapes",
    "i'm sorry",
    "there is no image provided for analysis"
]

# Convert invalid phrases to a single regex pattern (case insensitive)
invalid_pattern = re.compile('|'.join(map(re.escape, invalid_phrases)), re.IGNORECASE)

# Filter rows containing any of the invalid phrases
invalid_rows = two_shapes_df[two_shapes_df[models].apply(
    lambda row: any(invalid_pattern.search(str(row[model])) for model in models), axis=1
)]


In [None]:
invalid_rows["generate_text_qwen"]

In [None]:
invalid_rows["background_color"].value_counts()

In [None]:
def correct_step_1_false_predictions(row, model):
    """
    Re-score step 1 for answers that name two identical shapes in the plural.

    When the ground truth is two identical shapes and the answer contains
    "two <word> <shape>s" or "two shapes: <word> <shape>s" (case-insensitive),
    step 1 is counted as correct. In every other case, including a
    non-string answer such as NaN, the stored
    `row[f'{model}_correct_first_step']` value is returned unchanged.
    """
    generated_text = row[model]
    ground_truth_shapes = row['ground_truth_shapes']

    same_shape_twice = (
        len(ground_truth_shapes) == 2 and ground_truth_shapes[0] == ground_truth_shapes[1]
    )

    # A missing answer cannot be searched; keep whatever was scored before
    if same_shape_twice and isinstance(generated_text, str):
        # Escape the name so it is always matched literally inside the pattern
        shape = re.escape(str(ground_truth_shapes[0]))
        # "two [color] [shape]s", optionally written as "two shapes: [color] [shape]s"
        pattern = rf"two\s+(?:shapes:\s+)?\w+\s+{shape}s"
        if re.search(pattern, generated_text, re.IGNORECASE):
            return True  # Correct the prediction to True

    return row[f'{model}_correct_first_step']  # Return the original correctness value


# Apply the correction for each model
for model in models:
    two_shapes_df[f'{model}_correct_first_step'] = two_shapes_df.apply(
        lambda row: correct_step_1_false_predictions(row, model), axis=1
    )

In [None]:
import pandas as pd
from collections import Counter

# Convert ground truth shapes to tuples for grouping
two_shapes_df['ground_truth_shapes_tuple'] = two_shapes_df['ground_truth_shapes'].apply(tuple)

# Define a function to calculate accuracy per shape combination
def calculate_accuracy_per_combo(two_shapes_df, models):
    """
    Step-1 accuracy of every model for each ground-truth shape combination.

    For each model a boolean column `<model>_correct` is added to
    `two_shapes_df` (the frame is modified in place); it is True where
    `<model>_shape_counts` equals the counts of `ground_truth_shapes`. The
    column is then averaged per value of `ground_truth_shapes_tuple`.

    Returns a dict mapping each model to a DataFrame with the columns
    `ground_truth_shapes_tuple` and `accuracy`, ordered by shape combination.
    """
    group_key = 'ground_truth_shapes_tuple'
    combo_accuracies = {}

    for model in models:
        counts_col = f'{model}_shape_counts'
        flag_col = f'{model}_correct'

        # Row-wise comparison of extracted counts with the ground-truth counts
        two_shapes_df[flag_col] = two_shapes_df.apply(
            lambda row: row[counts_col] == dict(Counter(row['ground_truth_shapes'])),
            axis=1
        )

        # Mean of the flag inside each shape combination = accuracy for that combination
        per_combo = two_shapes_df.groupby(group_key)[flag_col].mean()
        table = per_combo.reset_index()
        table = table.rename(columns={flag_col: 'accuracy'})

        # Order the rows by shape combination (not by accuracy)
        table = table.sort_values(by=group_key, ascending=True)

        combo_accuracies[model] = table.reset_index(drop=True)

    return combo_accuracies


# Calculate accuracy per shape combination for each model
combo_accuracies = calculate_accuracy_per_combo(two_shapes_df, models)

# Display results
for model, accuracy_df in combo_accuracies.items():
    print(f"\nAccuracy per Shape Combination for {model}:")
    print(accuracy_df)


In [None]:
# Function to calculate accuracy for a given column
def calculate_accuracy(column):
    """Return the share of True values in `two_shapes_df[column]`, as a percentage."""
    flags = two_shapes_df[column]
    true_count = flags.sum()
    return (true_count / len(flags)) * 100

# Loop through models and calculate step-wise accuracies
for model in models:
    print(f"\nStep-wise Accuracies for {model}:")
    
    # Accuracy for Step 1 (shape extraction)
    step_1_accuracy = calculate_accuracy(f'{model}_correct_first_step')
    
    # Accuracy for Step 2 (shape-side mapping validation)
    step_2_accuracy = calculate_accuracy(f'{model}_correct_second_step')
    
    # Accuracy for Step 3 (total side extraction validation)
    step_3_accuracy = calculate_accuracy(f'{model}_correct_third_step')
    
    # Accuracy for Step 3 based on Step 2 (math correctness)
    step_3_2_accuracy = calculate_accuracy(f'{model}_correct_third_based_second')
    
    # Display the results
    print(f"Step 1 Accuracy (correct_first_step): {step_1_accuracy:.2f}%")
    print(f"Step 2 Accuracy (correct_second_step): {step_2_accuracy:.2f}%")
    print(f"Step 3 Based on 2 Accuracy: {step_3_2_accuracy:.2f}%")
    print(f"Step 3 Accuracy (correct_third_step): {step_3_accuracy:.2f}%")
    


In [None]:
########################### abstract_shapes ###########################

In [None]:
df = pd.read_csv('final_combined_abstract_generated_texts.csv')
df

In [None]:
## Drop every row that shares its colors with: output/irregular_poly/shape_151_sides_7.png
reference_path = "output/irregular_poly/shape_151_sides_7.png"
reference_rows = df.loc[df["path"] == reference_path, ["background_color", "shape_color"]]

bg_color_to_remove = None
shape_color_to_remove = None

if reference_rows.empty:
    # Without the reference image there is no color pair to remove
    print(f"Reference image '{reference_path}' not found; no rows removed.")
    df_filtered = df.copy()
else:
    # Define the colors from the specific image. They are compared as strings,
    # the same way the columns are compared below.
    bg_color_to_remove = str(reference_rows["background_color"].iloc[0])
    shape_color_to_remove = str(reference_rows["shape_color"].iloc[0])

    # Drop rows where both `background_color` and `shape_color` match
    same_colors = (
        (df["background_color"].astype(str) == bg_color_to_remove)
        & (df["shape_color"].astype(str) == shape_color_to_remove)
    )
    # .copy() makes the result an independent frame, so later column
    # assignments on `df` do not write into a slice of the original.
    df_filtered = df[~same_colors].copy()

# Display changes
print(f"Original DF Size: {len(df)}, New DF Size: {len(df_filtered)}")

# Update the dataframe
df = df_filtered


In [None]:
df["shape_type"] = df["shape_type"].fillna("irregular_polygon")

In [None]:
df["generate_text_llava-one"] = df["generate_text_llava-one"].str.split(".assistant\n").str[-1]

In [None]:
# Keep only the text after the last ".assistant\n\n" marker, then delete every period.
# regex=False states explicitly that "." is a literal character; the default for
# `regex` differs between pandas versions and "." as a pattern would match anything.
df["generate_text_llama-3.2"] = (
    df["generate_text_llama-3.2"].str.split(".assistant\n\n").str[-1].str.replace(".", "", regex=False)
)

In [None]:
df["generate_text_llama-3.2"] = df["generate_text_llama-3.2"].apply(lambda x: x.split("** ")[-1] if isinstance(x, str) else x)

In [None]:
# Delete every period from the janus answers. regex=False states explicitly that "."
# is a literal character rather than the regex wildcard, whatever the pandas version.
df["generate_text_janus"] = df["generate_text_janus"].str.replace(".", "", regex=False)

In [None]:
import re

# Spelled-out English numbers (zero to twenty) and the digit strings that replace them
num_translation = {
    "zero": "0", "one": "1", "two": "2", "three": "3", "four": "4", "five": "5",
    "six": "6", "seven": "7", "eight": "8", "nine": "9", "ten": "10",
    "eleven": "11", "twelve": "12", "thirteen": "13", "fourteen": "14", "fifteen": "15",
    "sixteen": "16", "seventeen": "17", "eighteen": "18", "nineteen": "19", "twenty": "20"
}

def clean_and_convert_numbers(text):
    """
    Lower-case `text`, delete its periods and turn number words into digits.

    Only whole words listed in `num_translation` are converted. Values that
    are not strings are returned unchanged.
    """
    if not isinstance(text, str):
        return text

    cleaned = text.lower().replace(".", "")  # periods are removed, spaces are kept
    for word in num_translation:
        # \b on both sides: replace the word only when it stands alone
        cleaned = re.sub(rf"\b{word}\b", num_translation[word], cleaned)
    return cleaned

# Apply the function to both columns
columns_to_clean = [ "generate_text_llama-3.2"]
df[columns_to_clean] = df[columns_to_clean].applymap(clean_and_convert_numbers)


In [None]:

# Re-definition of clean_and_convert_numbers for this cell
def clean_and_convert_numbers(text):
    """
    Lower-case `text`, delete its periods and turn number words into digits.

    Whole words found in `num_translation` are replaced by their digit string.
    Non-string values are passed through untouched.
    """
    if isinstance(text, str):
        cleaned = text.lower().replace(".", "")  # periods are removed, spaces are kept
        for word, digits in num_translation.items():
            cleaned = re.sub(rf"\b{word}\b", digits, cleaned)  # whole-word match only
        return cleaned
    return text

# Apply the function to both columns
columns_to_clean = ["generate_text_janus", "generate_text_molmo", "generate_text_math-puma"]
df[columns_to_clean] = df[columns_to_clean].applymap(clean_and_convert_numbers)

df["generate_text_janus"].value_counts()

In [None]:
import pandas as pd
import re

def extract_last_number(text):
    """
    Return the last run of digits in `text` as a string.

    Returns None when `text` is not a string or contains no digits.
    """
    if isinstance(text, str):
        last = None
        # Walk through every digit run and remember the most recent one
        for found in re.finditer(r'\d+', text):
            last = found.group(0)
        return last
    return None  # non-string values carry no number

# Apply function to the column
df["generate_text_janus"] = df["generate_text_janus"].apply(extract_last_number)
df["generate_text_janus"].value_counts()

In [None]:
df["generate_text_math-puma"] = df["generate_text_math-puma"].apply(extract_last_number)
df["generate_text_math-puma"].value_counts()

In [None]:
model_columns = [
    "generate_text_llava_1.5",
    "generate_text_llava_1.6",
    "generate_text_qwen",
    "generate_text_internvl",
    "generate_text_llava-one",
    "generate_text_llama-3.2",
    "generate_text_gpt-4o",
    "generate_text_gpt-4-turbo",
    "generate_text_janus", 
    "generate_text_molmo",
    "generate_text_g-llava",
    "generate_text_math-llava",
    "generate_text_math-puma",
]

# Strip spaces only in model-generated text columns
df[model_columns] = df[model_columns].applymap(lambda x: x.strip() if isinstance(x, str) else x)

# Convert `num_sides` to string for comparison
df["num_sides"] = df["num_sides"].astype(str).str.strip()

# Compute accuracy for each model
accuracy_results = {}
for model in model_columns:
    correct_predictions = (df[model].astype(str).str.strip() == df["num_sides"]).sum()
    accuracy = correct_predictions / len(df)  # Percentage accuracy
    accuracy_results[model] = accuracy

# Convert results to a DataFrame for visualization
accuracy_df = pd.DataFrame(accuracy_results.items(), columns=["Model", "Accuracy"])

accuracy_df

In [None]:

df[model_columns] = df[model_columns].applymap(lambda x: x.strip() if isinstance(x, str) else x)

# Convert `num_sides` to string for comparison
df["num_sides"] = df["num_sides"].astype(str).str.strip()

# Compute accuracy per shape_type
accuracy_results = []
for model in model_columns:
    shape_accuracy = df.groupby("shape_type").apply(
        lambda group: (group[model].astype(str).str.strip() == group["num_sides"]).mean()
    ).reset_index()
    shape_accuracy.columns = ["shape_type", model]
    accuracy_results.append(shape_accuracy)

# Merge accuracy results into a single DataFrame
accuracy_df = accuracy_results[0]
for acc_df in accuracy_results[1:]:
    accuracy_df = accuracy_df.merge(acc_df, on="shape_type")

accuracy_df

In [None]:
########################## visual cues and cot ##########################

In [None]:
import os
import pandas as pd

# Tasks to combine; one combined CSV is written per task name.
base_dfs = {"heptagons_with_visual_cues": None, "arrow_on_plus_with_visual_cues": None}
# Maps the model tag looked for in a result-file name to the output column name.
model_columns = {
    "llava-1.6": "generate_text_llava_1.6",
    "llava-1.5": "generate_text_llava_1.5",
    "qwen": "generate_text_qwen",
    "internvl": "generate_text_internvl",
    "llava-one": "generate_text_llava-one",
    "llama-3.2": "generate_text_llama-3.2",
    "molmo": "generate_text_molmo",
    "janus": "generate_text_janus",
    "gpt-4-turbo": "generate_text_gpt-4-turbo",
    "gpt-4o": "generate_text_gpt-4o",
    "math-llava": "generate_text_math-llava",
    "g-llava": "generate_text_g-llava",
    "math-puma": "generate_text_math-puma",
}
additional_columns = {task: {col: None for col in model_columns.values()} for task in base_dfs.keys()}

# Iterate over files in the current directory. The listing is sorted so that,
# if several files match the same task/model pair, the file that ends up being
# used does not depend on the order in which the OS returns directory entries.
for file in sorted(os.listdir(".")):
    if not file.endswith(".csv"):
        continue
    for task in base_dfs:
        if task not in file:
            continue
        for model_version, column_name in model_columns.items():
            if f"{model_version}_llava_" not in file:
                continue
            print(f"Processing {file} for {task}...")
            df = pd.read_csv(file)
            print(len(df))

            if base_dfs[task] is None and model_version == "llava-1.5":
                # The llava-1.5 file is the base: keep all of its columns.
                base_dfs[task] = df.rename(columns={"generated_text": column_name})
            else:
                # Just take the generated_text column; it is renamed when appended.
                additional_columns[task][column_name] = df["generated_text"]

# Append the additional columns to the base DataFrames
for task, base_df in base_dfs.items():
    if base_df is None:
        # Without a llava-1.5 file there is no base frame, so nothing can be saved.
        print(f"WARNING: no llava-1.5 result file found for {task}; no combined file written.")
        continue

    for column_name, column_data in additional_columns[task].items():
        if column_data is None:
            continue
        if len(column_data) != len(base_df):
            # Assignment aligns on the index, so a length mismatch would
            # silently produce missing or dropped rows.
            print(
                f"WARNING: {column_name} has {len(column_data)} rows but the base "
                f"frame for {task} has {len(base_df)}."
            )
        base_df[column_name] = column_data

    # Save the final combined DataFrame for each task
    output_file = f"final_combined_{task}_generated_texts.csv"
    base_df.to_csv(output_file, index=False)
    print(f"Saved combined DataFrame for {task} to '{output_file}'.")


In [None]:
df = pd.read_csv('final_combined_heptagons_with_visual_cues_generated_texts.csv')
df

In [None]:
import string

# Spelled-out numbers (zero to ten) and their integer values.
word_to_number = {
    "zero": 0, "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
    "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10
}

# List of columns to process
columns_to_map = [
    "generate_text_llava_1.5", "generate_text_llava_1.6", "generate_text_llava-one",
    "generate_text_qwen", "generate_text_internvl",
    "generate_text_llama-3.2", "generate_text_gpt-4-turbo",
    "generate_text_gpt-4o","generate_text_molmo",  "generate_text_janus",
    "generate_text_math-puma","generate_text_math-llava",  "generate_text_g-llava"
]


def map_words_to_numbers(text):
    """Replace spelled-out numbers (zero..ten) in *text* with digits.

    The text is split on whitespace and re-joined with single spaces. A token
    is converted when, ignoring case and any leading/trailing ASCII
    punctuation, it is one of the keys of ``word_to_number``; the surrounding
    punctuation is kept (``"seven."`` becomes ``"7."``). Tokens that merely
    contain a number word (``"someone"``, ``"seven-sided"``) are left as is.

    Args:
        text: The string to process.

    Returns:
        The processed string.
    """
    mapped_words = []
    for word in text.split():
        # Compare only the core of the token so that "seven." or "(seven)"
        # are recognised as well as a bare "seven".
        core = word.strip(string.punctuation)
        number = word_to_number.get(core.lower())
        if number is None:
            mapped_words.append(word)
        else:
            mapped_words.append(word.replace(core, str(number), 1))
    return " ".join(mapped_words)

# Apply the mapping to each column
for col in columns_to_map:
    df[col] = df[col].astype(str).apply(map_words_to_numbers)

In [None]:
df["generate_text_llava-one"] = df["generate_text_llava-one"].str.split("assistant").str[-1]
df["generate_text_llava-one"].value_counts()

In [None]:
df["generate_text_janus"].value_counts()

In [None]:
import re
import numpy as np

# List of columns to process
columns_to_extract = [
    "generate_text_llava_1.5", "generate_text_llava_1.6", "generate_text_llava-one",
    "generate_text_qwen", "generate_text_internvl",
    "generate_text_llama-3.2", "generate_text_gpt-4-turbo",
    "generate_text_gpt-4o", "generate_text_molmo",  "generate_text_janus", 
     "generate_text_math-puma","generate_text_math-llava",  "generate_text_g-llava"
]

# Function to extract the last number
def extract_last_number(text, *, verbose=False):
    """Return the last run of digits in *text* as an int.

    Args:
        text: The string to search.
        verbose: When True, print every digit run found. It is off by default
            because the function is applied to every row of every model
            column, which would otherwise flood the output.

    Returns:
        The last digit run converted to ``int``, or ``None`` when *text*
        contains no digits, when the conversion fails, or when the value
        exceeds the int64 maximum.
    """
    # Find all numbers in the text
    numbers = re.findall(r'\d+', text)

    if verbose:
        print(f"Extracted numbers: {numbers}")

    if not numbers:
        return None  # No number found

    last_digits = numbers[-1]
    try:
        value = int(last_digits)
    except ValueError:
        print(f"Failed to convert: {last_digits}")
        return None

    # Reject values that would not fit into an int64 column.
    if value > np.iinfo(np.int64).max:
        print(f"Overflow detected: {value}")
        return None

    return value


# Apply the extraction to each column
for col in columns_to_extract:
    df[col] = df[col].astype(str).apply(extract_last_number)


In [None]:
df['num_sides'] = 7

In [None]:
import pandas as pd


# Collect every column whose name starts with "generate_text_" together with
# the same name with that marker removed, then rename the columns accordingly.
generate_text_columns = []
updated_columns = []
for col in df.columns:
    if col.startswith("generate_text_"):
        generate_text_columns.append(col)
        updated_columns.append(col.replace("generate_text_", ""))

rename_map = dict(zip(generate_text_columns, updated_columns))
df = df.rename(columns=rename_map)

In [None]:
import pandas as pd

# Assuming `df` is your DataFrame with relevant columns
generate_text_columns = [
    "generate_text_llava_1.5", "generate_text_llava_1.6", "generate_text_llava-one",
    "generate_text_qwen", "generate_text_internvl",
    "generate_text_llama-3.2", "generate_text_gpt-4-turbo",
    "generate_text_gpt-4o", "generate_text_janus", "generate_text_molmo",
     "generate_text_math-puma","generate_text_math-llava",  "generate_text_g-llava"
]

# Remove "generate_text_" prefix from column names
updated_columns = [col.replace("generate_text_", "") for col in generate_text_columns]

# Rename the columns in the DataFrame
df = df.rename(columns=dict(zip(generate_text_columns, updated_columns)))

df = df[df["type"].isin(["ABC", "ABC_random", "123","123_random", "plain", "CoT_123", "CoT_123_random", "CoT_ABC", "CoT_ABC_random", "CoT_plain"])]
#df = df[df["type"].isin(["ABC_random", "123_random", "plain",  "CoT_123_random", "CoT_ABC_random", "CoT_plain"])]
# Function to calculate accuracy for each column
def calculate_accuracy_by_type(df, model_column, group_column, target_column):
    """Return the percentage of rows where the model matches the target, per group.

    The input frame is not modified: the correctness flags are kept in a
    separate Series instead of being written to a ``correct`` column of *df*
    (the caller passes a filtered slice, where such a write triggers pandas'
    SettingWithCopyWarning).

    Args:
        df: DataFrame holding the predictions, the target and the group column.
        model_column: Name of the column with the model's predictions.
        group_column: Name of the column to group by.
        target_column: Name of the column with the expected values.

    Returns:
        A Series named ``"correct"``, indexed by the values of *group_column*,
        with the accuracy of each group in percent (0-100).
    """
    correct = (df[model_column] == df[target_column]).rename("correct")
    return correct.groupby(df[group_column]).mean() * 100

# Create a summary table for accuracies by model and type
accuracy_tables = {}

for model_column in updated_columns:
    accuracy_tables[model_column] = calculate_accuracy_by_type(df, model_column, "type", "num_sides")

# Combine accuracy tables into a single DataFrame
accuracy_summary = pd.DataFrame(accuracy_tables)
accuracy_summary


In [None]:
###################### arrow_on_plus_with_visual_cues for VC-CoT ######################

In [None]:
df = pd.read_csv('final_combined_arrow_on_plus_with_visual_cues_generated_texts.csv')
df

In [None]:
import string

# Spelled-out numbers (zero to ten) and their integer values.
word_to_number = {
    "zero": 0, "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
    "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10
}

# List of columns to process
columns_to_map = [
    "generate_text_llava_1.5", "generate_text_llava_1.6", "generate_text_llava-one",
    "generate_text_qwen", "generate_text_internvl",
    "generate_text_llama-3.2", "generate_text_gpt-4-turbo", "generate_text_molmo","generate_text_janus",
    "generate_text_gpt-4o",
    "generate_text_math-llava",  "generate_text_g-llava",  "generate_text_math-puma"
]


def map_words_to_numbers(text):
    """Replace spelled-out numbers (zero..ten) in *text* with digits.

    The text is split on whitespace and re-joined with single spaces. A token
    is converted when, ignoring case and any leading/trailing ASCII
    punctuation, it is one of the keys of ``word_to_number``; the surrounding
    punctuation is kept (``"fifteen"`` is not in the table, ``"ten."`` becomes
    ``"10."``). Tokens that merely contain a number word (``"someone"``,
    ``"seven-sided"``) are left as is.

    Args:
        text: The string to process.

    Returns:
        The processed string.
    """
    mapped_words = []
    for word in text.split():
        # Compare only the core of the token so that "ten." or "(ten)" are
        # recognised as well as a bare "ten".
        core = word.strip(string.punctuation)
        number = word_to_number.get(core.lower())
        if number is None:
            mapped_words.append(word)
        else:
            mapped_words.append(word.replace(core, str(number), 1))
    return " ".join(mapped_words)

# Apply the mapping to each column
for col in columns_to_map:
    df[col] = df[col].astype(str).apply(map_words_to_numbers)

In [None]:
df["generate_text_llava-one"] = df["generate_text_llava-one"].str.split("assistant").str[-1]
df["generate_text_llava-one"].value_counts()

In [None]:
import re

# List of columns to process. Each column must be listed exactly once: the
# extraction below stringifies the column and keeps the last digit run, so a
# column listed twice is processed twice, and on the second pass a value that
# has become a float (e.g. 15.0 -> "15.0") would be turned into 0.
columns_to_extract = [
    "generate_text_llava_1.5", "generate_text_llava_1.6", "generate_text_llava-one",
    "generate_text_qwen", "generate_text_internvl", "generate_text_molmo","generate_text_janus",
    "generate_text_llama-3.2", "generate_text_gpt-4-turbo",
    "generate_text_gpt-4o",
    "generate_text_math-llava",  "generate_text_g-llava",  "generate_text_math-puma"
]

# Function to extract the last number
def extract_last_number(text):
    """Return the last run of digits in *text* as an int.

    Args:
        text: The string to search.

    Returns:
        The last digit run converted to ``int``, or ``None`` when *text*
        contains no digits, when the digit run cannot be converted, or when
        the value exceeds the signed 64-bit maximum.
    """
    # Find all numbers in the text
    numbers = re.findall(r'\d+', text)
    if not numbers:
        return None

    try:
        value = int(numbers[-1])
    except ValueError:
        # Recent Python versions refuse to convert extremely long digit
        # strings; treat such degenerate output as "no answer".
        return None

    # Reject values that would not fit into a signed 64-bit integer column.
    if value > 2**63 - 1:
        return None
    return value

# Apply the extraction to each column
for col in columns_to_extract:
    df[col] = df[col].astype(str).apply(extract_last_number)


In [None]:
df['num_sides'] =15

In [None]:
# Collect every column whose name starts with "generate_text_" together with
# the same name with that marker removed, then rename the columns accordingly.
generate_text_columns = []
updated_columns = []
for col in df.columns:
    if not col.startswith("generate_text_"):
        continue
    generate_text_columns.append(col)
    updated_columns.append(col.replace("generate_text_", ""))

df = df.rename(columns=dict(zip(generate_text_columns, updated_columns)))


In [None]:
import pandas as pd

# Assuming `df` is your DataFrame with relevant columns
generate_text_columns = [
    "generate_text_llava_1.5", "generate_text_llava_1.6", "generate_text_llava-one",
    "generate_text_qwen", "generate_text_internvl", "generate_text_molmo","generate_text_janus",
    "generate_text_llama-3.2", "generate_text_gpt-4-turbo",
    "generate_text_gpt-4o",  
    "generate_text_math-llava",  "generate_text_g-llava",  "generate_text_math-puma"
]

# Remove "generate_text_" prefix from column names
updated_columns = [col.replace("generate_text_", "") for col in generate_text_columns]

# Rename the columns in the DataFrame
df = df.rename(columns=dict(zip(generate_text_columns, updated_columns)))

df = df[df["type"].isin(["ABC", "ABC_random", "123","123_random", "plain", "CoT_123", "CoT_123_random", "CoT_ABC", "CoT_ABC_random", "CoT_plain"])]
# Function to calculate accuracy for each column
def calculate_accuracy_by_type(df, model_column, group_column, target_column):
    """Return the percentage of rows where the model matches the target, per group.

    The input frame is not modified: the correctness flags are kept in a
    separate Series instead of being written to a ``correct`` column of *df*
    (the caller passes a filtered slice, where such a write triggers pandas'
    SettingWithCopyWarning).

    Args:
        df: DataFrame holding the predictions, the target and the group column.
        model_column: Name of the column with the model's predictions.
        group_column: Name of the column to group by.
        target_column: Name of the column with the expected values.

    Returns:
        A Series named ``"correct"``, indexed by the values of *group_column*,
        with the accuracy of each group in percent (0-100).
    """
    correct = (df[model_column] == df[target_column]).rename("correct")
    return correct.groupby(df[group_column]).mean() * 100

# Create a summary table for accuracies by model and type
accuracy_tables = {}

for model_column in updated_columns:
    accuracy_tables[model_column] = calculate_accuracy_by_type(df, model_column, "type", "num_sides")

# Combine accuracy tables into a single DataFrame
accuracy_summary = pd.DataFrame(accuracy_tables)
accuracy_summary


In [None]:
###################### MathVerse for VC-CoT ######################

In [None]:
import os
import pandas as pd

# Tasks to combine; one combined CSV is written per task name.
base_dfs = {"mathverse_CoT": None}
# Maps the model tag looked for in a result-file name to the output column name.
model_columns = {
    "llava-1.6": "generate_text_llava_1.6",
    "llava-1.5": "generate_text_llava_1.5",
    "qwen": "generate_text_qwen",
    "internvl": "generate_text_internvl",
    "llava-one": "generate_text_llava-one",
    "llama-3.2": "generate_text_llama-3.2",
    "molmo": "generate_text_molmo",
    "janus": "generate_text_janus",
    "gpt-4-turbo": "generate_text_gpt-4-turbo",
    "gpt-4o": "generate_text_gpt-4o",
    "math-llava": "generate_text_math-llava",
    "g-llava": "generate_text_g-llava",
    "math-puma": "generate_text_math-puma"
}


additional_columns = {task: {col: None for col in model_columns.values()} for task in base_dfs.keys()}

# Iterate over files in the current directory. The listing is sorted so that,
# if several files match the same task/model pair, the file that ends up being
# used does not depend on the order in which the OS returns directory entries.
for file in sorted(os.listdir(".")):
    if not file.endswith(".csv"):
        continue
    for task in base_dfs:
        if task not in file:
            continue
        for model_version, column_name in model_columns.items():
            if f"{model_version}_llava_" not in file:
                continue
            print(f"Processing {file} for {task}...")
            df = pd.read_csv(file)
            print(len(df))

            if base_dfs[task] is None and model_version == "llava-1.5":
                # The llava-1.5 file is the base: keep all of its columns.
                base_dfs[task] = df.rename(columns={"generated_text": column_name})
            else:
                # Just take the generated_text column; it is renamed when appended.
                additional_columns[task][column_name] = df["generated_text"]

# Append the additional columns to the base DataFrames
for task, base_df in base_dfs.items():
    if base_df is None:
        # Without a llava-1.5 file there is no base frame, so nothing can be saved.
        print(f"WARNING: no llava-1.5 result file found for {task}; no combined file written.")
        continue

    for column_name, column_data in additional_columns[task].items():
        if column_data is None:
            continue
        if len(column_data) != len(base_df):
            # Assignment aligns on the index, so a length mismatch would
            # silently produce missing or dropped rows.
            print(
                f"WARNING: {column_name} has {len(column_data)} rows but the base "
                f"frame for {task} has {len(base_df)}."
            )
        base_df[column_name] = column_data

    # Save the final combined DataFrame for each task
    output_file = f"final_combined_{task}_generated_texts.csv"
    base_df.to_csv(output_file, index=False)
    print(f"Saved combined DataFrame for {task} to '{output_file}'.")


In [None]:
df = pd.read_csv('final_combined_mathverse_CoT_generated_texts.csv')

In [None]:
import re

# Columns holding the raw model generations (names start with "generate_text_").
model_columns = [name for name in df.columns if name.startswith("generate_text_")]


def _last_choice_letter(text):
    """Return the last character of *text* that is one of A-F, or None.

    Non-string values (e.g. missing cells) and strings without any of these
    letters give None. The pattern has no word boundaries, so a capital A-F
    inside a longer word also counts as a match.
    """
    if not isinstance(text, str):
        return None
    letters = re.findall(r'[ABCDEF]', text)
    if not letters:
        return None
    return letters[-1]


# Work on a copy so the original generations stay untouched, and add one
# "last_match_<column>" column per model.
last_match_df = df.copy()
for col in model_columns:
    last_match_df[f"last_match_{col}"] = df[col].apply(_last_choice_letter)

In [None]:
last_match_df

In [None]:
from sklearn.metrics import f1_score, accuracy_score

# Extract model columns that start with "last_match_generate_text_"
model_columns = [col for col in last_match_df.columns if col.startswith("last_match_generate_text_")]

# Drop rows with missing values in either any model column or 'answer'.
# NOTE: a row is removed for every model as soon as one model has no extracted letter.
# `.copy()` makes the result an independent frame, so the assignments below do
# not write into a view of `last_match_df` (avoids SettingWithCopyWarning).
df_cleaned = last_match_df.dropna(subset=model_columns + ['answer']).copy()

# Convert columns to string type to avoid type issues
df_cleaned[model_columns] = df_cleaned[model_columns].astype(str)
df_cleaned['answer'] = df_cleaned['answer'].astype(str)

# Initialize dictionary to store results per model
model_metrics = {}

# Compute metrics per model column. Groups are iterated explicitly instead of
# using `groupby(...).apply`, which warns in newer pandas versions when the
# applied function receives the grouping column. Groups produced by `groupby`
# are never empty, so no emptiness check is needed.
for model_col in model_columns:
    type_names = []
    metric_rows = []
    for type_name, group in df_cleaned.groupby("type"):
        type_names.append(type_name)
        metric_rows.append({
            'F1-score': f1_score(group['answer'], group[model_col], average='macro'),
            'Accuracy': accuracy_score(group['answer'], group[model_col]),
        })
    # One row per `type`, with the columns 'F1-score' and 'Accuracy'.
    results = pd.DataFrame(metric_rows, index=pd.Index(type_names, name="type"))
    model_metrics[model_col] = results

In [None]:
model_metrics

In [None]:
# Models to show in the summary table (keys of `model_metrics`).
selected_models = [
    "last_match_generate_text_molmo",
    "last_match_generate_text_janus",
    "last_match_generate_text_gpt-4o",
    "last_match_generate_text_gpt-4-turbo"
]

# Define the types to focus on
selected_types = ["mathverse_cot", "direct", "VC-CoT"]

# Pick the "Accuracy" values of the selected types for each selected model.
# `.loc` raises a KeyError if one of the selected types is absent from a
# model's metrics table.
filtered_accuracy_results = {}
for model, data in model_metrics.items():
    if model not in selected_models:
        continue
    filtered_accuracy_results[model] = data.loc[selected_types, "Accuracy"]

# Convert to DataFrame for better readability
filtered_accuracy_df = pd.DataFrame(filtered_accuracy_results)

filtered_accuracy_df