## Result Visualization Scripts


In [20]:
import pandas as pd
import numpy as np
import glob
import os
import seaborn as sns
import matplotlib.pyplot as plt
import re

sns.set_theme(style="whitegrid")
plt.rcParams['figure.figsize'] = [14, 8]
plt.rcParams['font.size'] = 12

In [21]:
RESULTS_FOLDER = "../results_csv/"
FIGURE_FOLDER = "../figures/"
TABLE_FOLDER = "../tables/"

os.makedirs(FIGURE_FOLDER, exist_ok=True)
os.makedirs(TABLE_FOLDER, exist_ok=True)

In [22]:
def read_csv_file(filepath):
    try:
        df = pd.read_csv(filepath)
        print(f"Loaded: {filepath}")
        return df
    except Exception as e:
        print(f"Error reading {filepath}: {e}")
        return None

In [33]:
def parse_filename(filename):
    filename = filename.replace("_table.csv", "")
    parts = filename.split('_')
    
    if parts[0] == "complexity":
        task = "complexity"
        if 'Regressor' in parts[1] or 'Classifier' in parts[1]:
            model_type = parts[1]
            metric = parts[2]
            split = parts[3]
            # Check if submetric exists
            if len(parts) > 4:
                submetric = '_'.join(parts[4:])
            else:
                submetric = "complexity"
            layer = "overall"
        else:
            model_type = parts[1] + "_" + parts[2]
            metric = parts[3]
            split = parts[4]
            if len(parts) > 5:
                submetric = parts[5]
            else:
                submetric = "complexity"
            layer = parts[6] if len(parts) > 6 else "overall"

    elif parts[0] == "single" and parts[1] == "submetric":
        task = "single_submetric"
        model_type = parts[2] + "_" + parts[3]
        metric = parts[4]
        split = parts[5]
        if len(parts) > 7:
            submetric = '_'.join(parts[6:-1])
            layer = parts[-1]
        else:
            submetric = '_'.join(parts[6:])
            layer = "overall"

    elif parts[0] == "question" and parts[1] == "type":
        task = "question_type"
        model_type = parts[2] + "_" + parts[3]
        metric = parts[4]
        split = parts[5]
        if len(parts) > 6:
            submetric = parts[6]
        else:
            submetric = "overall"
        layer = parts[7] if len(parts) > 7 else "overall"

    else:
        raise ValueError(f"Unknown filename format: {filename}")

    return {
        "task": task,
        "model_type": model_type,
        "metric": metric,
        "split": split,
        "submetric": submetric,
        "layer": layer
    }


In [34]:
def compute_selectivity(df, metric_type):
    """Compute selectivity based on metric type"""
    if metric_type == "classification":
        df['control_mean'] = df[['control1', 'control2', 'control3']].mean(axis=1)
        df['selectivity'] = df['real'] - df['control_mean']
    elif metric_type == "regression":
        df['control_mean'] = df[['control1', 'control2', 'control3']].mean(axis=1)
        df['selectivity'] = df['control_mean'] - df['real']
    else:
        raise ValueError(f"Unknown metric type: {metric_type}")
    return df

def detect_metric_type(metric_name):
    name = metric_name.lower()
    if "accuracy" in name:
        return "classification"
    elif "mse" in name:
        return "regression"
    else:
        raise ValueError(f"Cannot infer metric type from: {metric_name}")


all_results = []

for filepath in glob.glob(os.path.join(RESULTS_FOLDER, "*.csv")):
    filename = os.path.basename(filepath)
    meta = parse_filename(filename)
    
    df = read_csv_file(filepath)
    if df is None:
        continue
    
    metric_type = detect_metric_type(meta['metric'])
    df = compute_selectivity(df, metric_type)
    
    # Add metadata columns
    for key, value in meta.items():
        df[key] = value
    
    all_results.append(df)

# Combine everything into one big DataFrame
results_df = pd.concat(all_results, ignore_index=True)
print(f"\nLoaded {len(results_df)} results.")




Loaded: ../results_csv/single_submetric_lm_probe_mse_test_avg_max_depth_layer11_table.csv
Loaded: ../results_csv/complexity_DummyRegressor_mse_test_avg_max_depth_table.csv
Loaded: ../results_csv/complexity_lm_probe_mse_test_complexity_layer6_table.csv
Loaded: ../results_csv/complexity_XGBRegressor_mse_test_avg_links_len_table.csv
Loaded: ../results_csv/single_submetric_lm_probe_mse_test_avg_verb_edges_layer11_table.csv


IndexError: list index out of range

In [35]:
for f in glob.glob(os.path.join(RESULTS_FOLDER, "*.csv")):
    filename = os.path.basename(f)
    print(f"Processing: {filename}")
    print(filename.replace("_table.csv", "").split('_'))  # Show split parts


Processing: single_submetric_lm_probe_mse_test_avg_max_depth_layer11_table.csv
['single', 'submetric', 'lm', 'probe', 'mse', 'test', 'avg', 'max', 'depth', 'layer11']
Processing: complexity_DummyRegressor_mse_test_avg_max_depth_table.csv
['complexity', 'DummyRegressor', 'mse', 'test', 'avg', 'max', 'depth']
Processing: complexity_lm_probe_mse_test_complexity_layer6_table.csv
['complexity', 'lm', 'probe', 'mse', 'test', 'complexity', 'layer6']
Processing: complexity_XGBRegressor_mse_test_avg_links_len_table.csv
['complexity', 'XGBRegressor', 'mse', 'test', 'avg', 'links', 'len']
Processing: single_submetric_lm_probe_mse_test_avg_verb_edges_layer11_table.csv
['single', 'submetric', 'lm', 'probe', 'mse', 'test', 'avg', 'verb', 'edges', 'layer11']
Processing: question_type_DummyClassifier_accuracy_test_table.csv
['question', 'type', 'DummyClassifier', 'accuracy', 'test']
Processing: question_type_lm_probe_accuracy_test_overall_layer2_table.csv
['question', 'type', 'lm', 'probe', 'accuracy'

In [36]:
for f in glob.glob(os.path.join(RESULTS_FOLDER, "*.csv")):
    filename = os.path.basename(f)
    print(f"Processing: {filename}")
    parts = filename.replace("_table.csv", "").split('_')
    print(parts)
    try:
        meta = parse_filename(filename)
        print(meta)
    except Exception as e:
        print(f"ERROR: {e}")


Processing: single_submetric_lm_probe_mse_test_avg_max_depth_layer11_table.csv
['single', 'submetric', 'lm', 'probe', 'mse', 'test', 'avg', 'max', 'depth', 'layer11']
{'task': 'single_submetric', 'model_type': 'lm_probe', 'metric': 'mse', 'split': 'test', 'submetric': 'avg_max_depth', 'layer': 'layer11'}
Processing: complexity_DummyRegressor_mse_test_avg_max_depth_table.csv
['complexity', 'DummyRegressor', 'mse', 'test', 'avg', 'max', 'depth']
{'task': 'complexity', 'model_type': 'DummyRegressor', 'metric': 'mse', 'split': 'test', 'submetric': 'avg_max_depth', 'layer': 'overall'}
Processing: complexity_lm_probe_mse_test_complexity_layer6_table.csv
['complexity', 'lm', 'probe', 'mse', 'test', 'complexity', 'layer6']
{'task': 'complexity', 'model_type': 'lm_probe', 'metric': 'mse', 'split': 'test', 'submetric': 'complexity', 'layer': 'layer6'}
Processing: complexity_XGBRegressor_mse_test_avg_links_len_table.csv
['complexity', 'XGBRegressor', 'mse', 'test', 'avg', 'links', 'len']
{'task':

In [None]:
all_results = []

for filepath in glob.glob(os.path.join(RESULTS_FOLDER, "*.csv")):
    filename = os.path.basename(filepath)
    meta = parse_filename(filename)
    
    df = read_csv_file(filepath)
    if df is None:
        continue
    
    metric_type = detect_metric_type(meta['metric'])
    df = compute_selectivity(df, metric_type)
    
    # Add metadata columns
    for key, value in meta.items():
        df[key] = value
    
    all_results.append(df)

# Combine everything into one big DataFrame
results_df = pd.concat(all_results, ignore_index=True)
print(f"\nLoaded {len(results_df)} results.")

Loaded: ../results_csv/single_submetric_lm_probe_mse_test_avg_max_depth_layer11_table.csv
Loaded: ../results_csv/complexity_DummyRegressor_mse_test_avg_max_depth_table.csv


ValueError: Cannot infer metric type from: test

In [None]:
display(results_df.head())

In [None]:
subset = results_df[
    (results_df['task'] == 'question_type') & 
    (results_df['model_type'] == 'lmprobe') & 
    (results_df['split'] == 'test_set')
]

plt.figure(figsize=(14, 8))
sns.barplot(x='language', y='selectivity', data=subset, palette="viridis")
plt.title("Question Type Classification Selectivity (LMProbe)", fontsize=16)
plt.xlabel("Language", fontsize=14)
plt.ylabel("Selectivity", fontsize=14)
plt.xticks(rotation=45)
plt.ylim(-0.1, 0.6)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig(os.path.join(FIGURE_FOLDER, "selectivity_question_type_lmprobe.png"), dpi=300)
plt.show()

In [None]:
table = subset[['language', 'real', 'control_mean', 'selectivity']].round(3)
table.to_csv(os.path.join(TABLE_FOLDER, "selectivity_question_type_lmprobe.csv"), index=False)
table.to_latex(os.path.join(TABLE_FOLDER, "selectivity_question_type_lmprobe.tex"), index=False)

print("Saved Figure and Table for Question Type Selectivity.")

In [None]:
subset_submetrics = results_df[
    (results_df['task'].isin(['token_count', 'lexical_density', 'avg_dep_len', 'max_tree_depth', 'subordinate_clauses', 'verb_complexity'])) &
    (results_df['model_type'] == 'lmprobe') &
    (results_df['split'] == 'test_set')
]

heatmap_data = subset_submetrics.pivot(index='task', columns='language', values='selectivity')

plt.figure(figsize=(14, 8))
sns.heatmap(heatmap_data, annot=True, cmap="coolwarm", center=0)
plt.title("Submetric Selectivity Heatmap (LMProbe)", fontsize=16)
plt.xlabel("Language", fontsize=14)
plt.ylabel("Submetric", fontsize=14)
plt.tight_layout()
plt.savefig(os.path.join(FIGURE_FOLDER, "submetric_selectivity_heatmap_lmprobe.png"), dpi=300)
plt.show()

print("Saved Heatmap for Submetric Selectivity.")