In [3]:
import pandas as pd
import numpy as np
import re

# One results file per task, all following the './<task>_results.csv' pattern
csv_files = [
    f'./{task}_results.csv'
    for task in (
        'direction',
        'length',
        'position_common_scale',
        'position_non_aligned_scale',
        'angle',
        'area',
        'volume',
        'curvature',
        'shading',
    )
]

# Function to extract the numeric answer from a raw response
def extract_digits(x):
    """Pull a single number out of a raw answer and return it as a float.

    Parameters
    ----------
    x : object
        Raw answer cell. Any non-missing value is converted with ``str``
        before it is searched.

    Returns
    -------
    float
        ``np.nan`` when ``x`` is missing or contains no digits; the LAST
        number in the text when the stripped text starts with ``'user'``;
        the FIRST number in the text otherwise.

    Notes
    -----
    Only unsigned decimal numbers are recognised: a leading minus sign is
    ignored, and a thousands separator splits the number in two.
    """
    if pd.isna(x):
        return np.nan

    # Line breaks are kept as separators. Removing them would glue the trailing
    # digits of one line onto the leading digits of the next and produce a
    # number that never appeared in the answer.
    text = str(x).strip()

    numbers = re.findall(r'\d+\.?\d*', text)
    if not numbers:
        return np.nan

    # NOTE(review): answers starting with 'user' presumably echo the prompt
    # before the actual reply, hence the last number - confirm against the data.
    chosen = numbers[-1] if text.startswith('user') else numbers[0]
    return float(chosen)

# Function to load and process CSV files into dataframes
def load_and_process_data(csv_files, drop_na=False):
    """Read each results CSV and add a ``parsed_answers`` column to it.

    Parameters
    ----------
    csv_files : iterable of str
        Paths of the CSV files to read. Every file must contain a
        ``raw_answers`` column.
    drop_na : bool, default False
        When true, rows whose answer could not be parsed are removed.

    Returns
    -------
    dict
        Maps a task name to its dataframe. The task name is the last
        '/'-separated component of the path with ``'_results.csv'`` removed.
        ``parsed_answers`` holds strings with one decimal place; answers that
        could not be parsed stay NaN.

    Notes
    -----
    Prints the row count of every file as loaded, i.e. before any row is
    removed, regardless of ``drop_na``.
    """
    dataframes = {}

    for file_path in csv_files:
        # Load the CSV file into a dataframe
        df = pd.read_csv(file_path)

        # Apply extract_digits function to 'raw_answers' column
        df['parsed_answers'] = df['raw_answers'].apply(extract_digits)

        # Print row count before dropping NaN values
        print(f"Row count before dropping NaN for {file_path}: {len(df)}")

        # Drop rows with NaN in 'parsed_answers' if specified
        if drop_na:
            # Explicit copy: the column is reassigned just below, and writing
            # into the filtered result directly raises SettingWithCopyWarning
            # on pandas versions without Copy-on-Write.
            df = df.dropna(subset=['parsed_answers']).copy()

        # Format 'parsed_answers' to one decimal point; NaN entries are
        # skipped by na_action and therefore stay NaN.
        df['parsed_answers'] = df['parsed_answers'].map(
            '{:.1f}'.format, na_action='ignore'
        )

        # Store the dataframe in the dictionary, keyed by filename without extension
        task_name = file_path.split('/')[-1].replace('_results.csv', '')
        dataframes[task_name] = df

    return dataframes

# Load every task twice: first keeping the rows whose answer could not be
# parsed, then with those rows removed.
data_with_na, data_no_na = (
    load_and_process_data(csv_files, drop_na=False),
    load_and_process_data(csv_files, drop_na=True),
)

# Pick the 'volume' task out of each variant (None if the task is absent).
df_volume_with_na, df_volume_no_na = (
    frames.get('volume') for frames in (data_with_na, data_no_na)
)


Row count before dropping NaN for ./direction_results.csv: 305
Row count before dropping NaN for ./length_results.csv: 305
Row count before dropping NaN for ./position_common_scale_results.csv: 305
Row count before dropping NaN for ./position_non_aligned_scale_results.csv: 305
Row count before dropping NaN for ./angle_results.csv: 305
Row count before dropping NaN for ./area_results.csv: 305
Row count before dropping NaN for ./volume_results.csv: 305
Row count before dropping NaN for ./curvature_results.csv: 305
Row count before dropping NaN for ./shading_results.csv: 305
Row count before dropping NaN for ./direction_results.csv: 305
Row count before dropping NaN for ./length_results.csv: 305
Row count before dropping NaN for ./position_common_scale_results.csv: 305
Row count before dropping NaN for ./position_non_aligned_scale_results.csv: 305
Row count before dropping NaN for ./angle_results.csv: 305
Row count before dropping NaN for ./area_results.csv: 305
Row count before dropping 

In [4]:
# Lift the pandas display limits so frames are shown without truncation
for unlimited_option in (
    'display.max_rows',      # show all rows
    'display.max_columns',   # show all columns
    'display.width',         # no fixed overall width
    'display.max_colwidth',  # show the full content of each column
):
    pd.set_option(unlimited_option, None)

# Show the 'volume' table loaded with drop_na=False (rows with unparseable answers are kept).
df_volume_with_na

Unnamed: 0,model_name,run,image_path,ground_truth,raw_answers,forced_repetitions,time_ms,parsed_answers
0,gpt4o,run_0,EXP1-Results/volume/volume_image_1.png,27.0,1,0.0,6126.12462,1.0
1,gpt4o,run_0,EXP1-Results/volume/volume_image_2.png,2744.0,1,0.0,6017.877579,1.0
2,gpt4o,run_0,EXP1-Results/volume/volume_image_3.png,64.0,1,0.0,5959.122181,1.0
3,gpt4o,run_0,EXP1-Results/volume/volume_image_4.png,1.0,1,0.0,5919.401407,1.0
4,gpt4o,run_0,EXP1-Results/volume/volume_image_5.png,1000.0,64,0.0,5819.411278,64.0
5,gpt4o,run_0,EXP1-Results/volume/volume_image_6.png,343.0,1,0.0,6105.591536,1.0
6,gpt4o,run_0,EXP1-Results/volume/volume_image_7.png,343.0,1,0.0,6164.758444,1.0
7,gpt4o,run_0,EXP1-Results/volume/volume_image_8.png,512.0,1,0.0,5991.663218,1.0
8,gpt4o,run_0,EXP1-Results/volume/volume_image_9.png,216.0,1,0.0,5822.170973,1.0
9,gpt4o,run_0,EXP1-Results/volume/volume_image_10.png,64.0,1,0.0,5873.377323,1.0


In [9]:
# Show the 'volume' table loaded with drop_na=True (rows with unparseable answers are removed).
df_volume_no_na

Unnamed: 0,model_name,run,image_path,ground_truth,raw_answers,forced_repetitions,time_ms,parsed_answers
0,gpt4o,run_0,EXP1-Results/volume/volume_image_1.png,27.0,1,0.0,6126.12462,1.0
1,gpt4o,run_0,EXP1-Results/volume/volume_image_2.png,2744.0,1,0.0,6017.877579,1.0
2,gpt4o,run_0,EXP1-Results/volume/volume_image_3.png,64.0,1,0.0,5959.122181,1.0
3,gpt4o,run_0,EXP1-Results/volume/volume_image_4.png,1.0,1,0.0,5919.401407,1.0
4,gpt4o,run_0,EXP1-Results/volume/volume_image_5.png,1000.0,64,0.0,5819.411278,64.0
5,gpt4o,run_0,EXP1-Results/volume/volume_image_6.png,343.0,1,0.0,6105.591536,1.0
6,gpt4o,run_0,EXP1-Results/volume/volume_image_7.png,343.0,1,0.0,6164.758444,1.0
7,gpt4o,run_0,EXP1-Results/volume/volume_image_8.png,512.0,1,0.0,5991.663218,1.0
8,gpt4o,run_0,EXP1-Results/volume/volume_image_9.png,216.0,1,0.0,5822.170973,1.0
9,gpt4o,run_0,EXP1-Results/volume/volume_image_10.png,64.0,1,0.0,5873.377323,1.0
