## Result Analysis and Visuals

In [11]:
import os
import json
import re
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

In [30]:
# --- Filesystem layout -------------------------------------------------------
# Inputs are read from RESULTS_DIR; every artefact this notebook writes goes
# somewhere below OUTPUT_DIR.
BASE_DIR = "/home/robin/Research/qtype-eval/scripts/visualization"
RESULTS_DIR = os.path.join(BASE_DIR, "combined_results/experiment_results")
OUTPUT_DIR = os.path.join(BASE_DIR, "combined_results/analysis_output")
FIGURES_DIR = os.path.join(OUTPUT_DIR, "figures")
TABLES_DIR = os.path.join(OUTPUT_DIR, "tables")

# --- Analysis constants ------------------------------------------------------
# Layers iterated by the table and layer-wise plot helpers.
LAYERS = [2, 6, 11, 12]

# Regression targets; also used as substrings to recognise a submetric in
# result keys and file paths.
SUBMETRICS = [
    "avg_links_len",
    "avg_max_depth",
    "avg_sub_chain_len",
    "avg_verb_edges",
    "lexical_density",
    "n_tokens",
    "complexity",
]

# Language codes; the list order is the iteration order of tables and plots.
LANGUAGES = ["en", "ru", "fi", "ar", "id", "ja", "ko"]

# Model families looked up when building tables and figures.
MODEL_TYPES = ["lm_probe", "tfidf"]

# Make sure every output directory exists before any cell writes to it.
for _output_dir in (OUTPUT_DIR, FIGURES_DIR, TABLES_DIR):
    os.makedirs(_output_dir, exist_ok=True)

In [31]:
def _merge_concatenated_objects(text):
    """Decode JSON objects written back to back in ``text`` and merge them.

    Objects may be separated by whitespace and/or commas. Keys of later
    objects overwrite equal keys of earlier ones.

    Raises:
        ValueError: if any top-level value is not a JSON object or the text is
            not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    separators = " \t\r\n,"
    decoder = json.JSONDecoder()
    merged = {}
    remaining = text.lstrip(separators)
    while remaining:
        # raw_decode does not skip leading whitespace, hence the lstrip calls.
        obj, consumed = decoder.raw_decode(remaining)
        if not isinstance(obj, dict):
            raise ValueError(f"expected a JSON object, got {type(obj).__name__}")
        merged.update(obj)
        remaining = remaining[consumed:].lstrip(separators)
    return merged


def _salvage_key_value_lines(content):
    """Last-resort parse: keep only lines shaped like ``"key": ...`` and wrap them in braces."""
    kept = [
        line for line in content.split('\n')
        if line.strip().startswith('"') and ':' in line
    ]
    combined = '{' + '\n'.join(kept) + '}'
    combined = combined.replace('}{', '},{')
    return json.loads(combined)


def load_json_file(filepath):
    """Load a result file as a dict, tolerating a few malformed layouts.

    The text is parsed unmodified first, so well-formed files are never
    rewritten. Only if that fails are the recovery strategies tried, in order:

    1. insert the missing comma between adjacent ``}`` and ``{``;
    2. treat the file as several JSON objects written back to back and merge
       them into a single dict;
    3. keep only the lines that look like ``"key": value`` pairs.

    A file whose content does not start with ``{`` is wrapped in braces before
    parsing.

    Args:
        filepath: path of the JSON file to read (decoded as UTF-8).

    Returns:
        The decoded dict, or ``{}`` when the file cannot be read or none of
        the strategies produces valid JSON. Errors are printed, never raised.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()

        text = content.strip()
        if not text.startswith('{'):
            text = '{' + text + '}'

        try:
            return json.loads(text)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in {filepath}: {e}")

        recovery_attempts = (
            lambda: json.loads(re.sub(r'}\s*{', '},{', text)),
            lambda: _merge_concatenated_objects(text),
            lambda: _salvage_key_value_lines(content),
        )
        last_error = None
        for attempt in recovery_attempts:
            try:
                return attempt()
            except Exception as fallback_e:
                last_error = fallback_e

        print(f"Fallback parsing also failed: {last_error}")
        return {}
    except Exception as e:
        print(f"Error loading file {filepath}: {e}")
        return {}

def find_json_files(base_dir):
    """Return the paths of all ``*.json`` files found recursively under ``base_dir``.

    The search location and the number of hits are printed.
    """
    print(f"Searching for JSON files in: {base_dir}")
    json_paths = glob.glob(os.path.join(base_dir, "**", "*.json"), recursive=True)
    print(f"Found {len(json_paths)} JSON files.")
    return json_paths

In [32]:
def _layer_from_path(filepath):
    """Return the integer following ``layer``/``layer_`` in the lower-cased path, or ``None``."""
    layer_match = re.search(r'layer[_]?(\d+)', filepath.lower())
    return int(layer_match.group(1)) if layer_match else None


def _infer_languages(key, entry, filepath):
    """Work out ``(source_language, target_language, is_crosslingual)`` for one entry.

    Explicit fields of ``entry`` are read first, then the key is checked for
    cross-lingual markers, and finally the language codes in ``LANGUAGES`` are
    searched for in the key and the file path.
    """
    is_crosslingual = False
    source_language = None
    target_language = None

    if "language" in entry:
        target_language = entry["language"]
    elif "eval_language" in entry:
        target_language = entry["eval_language"]

    if "train_language" in entry and "eval_language" in entry:
        source_language = entry["train_language"]
        target_language = entry["eval_language"]
        is_crosslingual = source_language != target_language

    # NOTE(review): `"to_" in key` is a plain substring test and would also
    # match keys such as "auto_..."; kept as-is to avoid reclassifying
    # existing data, but "_to_" is probably what is meant -- confirm.
    if "_crosslingual_" in key or "_cross_" in key or "to_" in key or "cross_lingual" in filepath:
        is_crosslingual = True
        cross_match = re.search(r"([a-z]{2})_to_([a-z]{2})", key)
        if cross_match:
            source_language, target_language = cross_match.groups()
        elif "source" in entry:
            source_language = entry["source"]

    if target_language is None:
        for lang in LANGUAGES:
            if f"_{lang}" in key or f"/{lang}/" in filepath:
                target_language = lang
                break

    return source_language, target_language, is_crosslingual


def _infer_task(key, entry, filepath):
    """Return ``(submetric, task)`` from the entry fields, falling back to key/path substrings."""
    submetric = None
    task = None

    if "submetric" in entry:
        submetric = entry["submetric"]
    elif "task" in entry:
        task = entry["task"]

    if submetric is None:
        for candidate in SUBMETRICS:
            if candidate in key or candidate in filepath:
                submetric = candidate
                break

    # Parenthesised on purpose: without the parentheses `and` binds tighter
    # than `or`, so a path containing "question_types" would override a task
    # that the entry states explicitly.
    if task is None and ("question_types" in key or "question_types" in filepath):
        task = "question_types"

    return submetric, task


def collect_results(results_dir, verbose=True):
    """Collect and organize all result files into a structured dictionary.

    Every JSON file below ``results_dir`` is loaded. Each top-level entry that
    is a dict and carries at least one of ``train_metrics`` / ``test_metrics``
    / ``val_metrics`` is annotated in place with ``experiment_type`` and,
    where they are missing and can be inferred, ``layer``, ``language``,
    ``source_language``, ``is_control``, ``submetric`` and ``task``.

    Args:
        results_dir: directory searched recursively for ``*.json`` files.
        verbose: when true (the default) a multi-line summary is printed for
            every accepted entry. Warnings and totals are always printed.

    Returns:
        dict mapping a result key to its annotated entry. The key is the JSON
        key, suffixed with ``_layer<N>`` when a layer was found in the path
        and ``layer<N>`` is not already part of the key. If two files yield
        the same result key the later one wins and a warning is printed.
    """
    import traceback  # only needed for the per-file error report below

    print(f"Collecting results from: {results_dir}")
    results = {}

    json_files = find_json_files(results_dir)
    metric_fields = ("train_metrics", "test_metrics", "val_metrics")

    for filepath in json_files:
        filename = os.path.basename(filepath)

        try:
            layer = _layer_from_path(filepath)
            data = load_json_file(filepath)

            if not data:
                print(f"Warning: Empty or invalid data in {filepath}")
                continue

            for key, entry in data.items():
                if not isinstance(entry, dict):
                    print(f"Warning: Entry '{key}' in {filepath} is not a dictionary")
                    continue

                if not any(isinstance(entry.get(field), dict) for field in metric_fields):
                    print(f"Warning: Entry '{key}' in {filepath} has no metrics")
                    continue

                source_language, target_language, is_crosslingual = _infer_languages(key, entry, filepath)
                submetric, task = _infer_task(key, entry, filepath)

                is_control = entry.get("is_control", False)
                if "control" in key.lower() and "is_control" not in entry:
                    is_control = True

                # Annotate the entry; values already present in the file win.
                entry["experiment_type"] = "crosslingual" if is_crosslingual else "monolingual"

                if "layer" not in entry and layer is not None:
                    entry["layer"] = layer

                if target_language is not None and "language" not in entry:
                    entry["language"] = target_language

                if source_language is not None and "source_language" not in entry:
                    entry["source_language"] = source_language

                if is_control and "is_control" not in entry:
                    entry["is_control"] = True

                if submetric is not None and "submetric" not in entry:
                    entry["submetric"] = submetric

                if task is not None and "task" not in entry:
                    entry["task"] = task

                result_key = key
                if layer is not None and f"layer{layer}" not in key:
                    result_key = f"{key}_layer{layer}"

                if verbose:
                    print(f"Found entry: {result_key}")
                    print(f"  Layer: {entry.get('layer', 'unknown')}")
                    print(f"  Language: {entry.get('language', 'unknown')}")
                    print(f"  Task: {entry.get('task', entry.get('submetric', 'unknown'))}")
                    print(f"  Is control: {entry.get('is_control', False)}")
                    print(f"  Experiment type: {entry['experiment_type']}")
                    if "test_metrics" in entry:
                        for metric, value in entry["test_metrics"].items():
                            print(f"  {metric}: {value}")

                if result_key in results:
                    print(f"Warning: Duplicate result key '{result_key}' in {filepath}; overwriting earlier entry")
                results[result_key] = entry

        except Exception as e:
            print(f"Error processing {filename}: {e}")
            traceback.print_exc()

    print(f"Successfully processed {len(results)} results")
    return results

In [33]:
def calculate_selectivity(results):
    """Calculate selectivity (task performance - average control task performance).

    Results are grouped by model type, task, language (or source/target pair
    for cross-lingual runs), layer and experiment type. Within a group the
    non-control result is compared with the mean of the control results on
    ``r2`` (regression / submetric tasks) or ``accuracy`` (classification /
    question types).

    A result whose ``test_metrics`` lacks the chosen metric (or holds ``None``)
    is left out instead of being counted as 0, because a 0 would silently
    distort the average. A group without a usable regular value or without any
    usable control value is skipped with a warning.

    Args:
        results: dict of result key -> entry dict. Entries that are not dicts
            or have no dict-valued ``test_metrics`` are ignored.

    Returns:
        dict mapping the group key
        ``{model_type}_{task}_{language}_layer{layer}_{experiment_type}``
        (with ``{source}_to_{language}`` for cross-lingual groups) to
        ``regular - mean(controls)``.
    """
    print("Calculating selectivity...")
    selectivity = {}

    grouped_results = defaultdict(lambda: {"regular": None, "controls": []})

    for key, result in results.items():
        if not isinstance(result, dict) or not isinstance(result.get("test_metrics"), dict):
            continue

        model_type = result.get("model_type", "unknown")
        language = result.get("language", "unknown")
        experiment_type = result.get("experiment_type", "monolingual")
        layer = result.get("layer", 0)
        is_control = result.get("is_control", False) or "control" in key.lower()

        # Work out which task this result belongs to.
        task = "unknown"
        if "submetric" in result:
            task = result["submetric"]
        elif result.get("task") == "question_types" or "question_types" in key:
            task = "question_types"

        if task == "unknown":
            continue

        if experiment_type == "crosslingual" and "source_language" in result:
            source_language = result["source_language"]
            group_key = f"{model_type}_{task}_{source_language}_to_{language}_layer{layer}_{experiment_type}"
        else:
            group_key = f"{model_type}_{task}_{language}_layer{layer}_{experiment_type}"

        if is_control:
            grouped_results[group_key]["controls"].append(result)
        else:
            grouped_results[group_key]["regular"] = result

    for group_key, group_data in grouped_results.items():
        regular_result = group_data["regular"]
        control_results = group_data["controls"]
        if not (regular_result and control_results):
            continue

        task_type = regular_result.get("task_type", "unknown")
        regular_metrics = regular_result["test_metrics"]

        # Pick the metric: declared task type first, then the group key, and
        # finally whatever the regular result actually reports.
        if "regression" in task_type or any(submetric in group_key for submetric in SUBMETRICS):
            metric_name = "r2"
        elif "classification" in task_type or "question_types" in group_key:
            metric_name = "accuracy"
        elif "r2" in regular_metrics:
            metric_name = "r2"
        elif "accuracy" in regular_metrics:
            metric_name = "accuracy"
        else:
            print(f"Warning: Unknown metric type for {group_key}")
            continue

        regular_value = regular_metrics.get(metric_name)
        if regular_value is None:
            print(f"Warning: Regular result for {group_key} has no '{metric_name}' metric; skipping")
            continue

        control_values = [
            control["test_metrics"][metric_name]
            for control in control_results
            if control["test_metrics"].get(metric_name) is not None
        ]
        skipped = len(control_results) - len(control_values)
        if skipped:
            print(f"Warning: {skipped} control result(s) for {group_key} have no '{metric_name}' metric; ignored")
        if not control_values:
            continue

        avg_control_value = sum(control_values) / len(control_values)

        print(f"Group: {group_key}")
        print(f"  - Regular: {regular_value:.4f}")
        print(f"  - Controls: {', '.join([f'{v:.4f}' for v in control_values])}")
        print(f"  - Avg Control: {avg_control_value:.4f}")
        print(f"  - Selectivity: {regular_value - avg_control_value:.4f}")

        selectivity[group_key] = regular_value - avg_control_value

    print(f"Calculated selectivity for {len(selectivity)} tasks")
    return selectivity

In [34]:
def _complexity_table_cell(results, selectivity, key_pattern, experiment_type,
                           source_lang=None, target_lang=None):
    """Build one ``{"r2", "selectivity"}`` table cell, or return ``None`` if nothing matches.

    The first entry of ``results`` whose key contains ``key_pattern`` and that
    is a non-control result of the requested experiment type is used. For
    cross-lingual tables the entry must also carry the requested source and
    target language.
    """
    is_monolingual = experiment_type == "monolingual"
    # Entries without an explicit experiment type are accepted for the table
    # being built.
    default_type = "monolingual" if is_monolingual else "crosslingual"

    for key, result in results.items():
        if key_pattern not in key or not isinstance(result, dict):
            continue
        # A missing "is_control" field means a regular (non-control) result.
        if result.get("is_control", False) or "control" in key.lower():
            continue
        if result.get("experiment_type", default_type) != experiment_type:
            continue
        if not is_monolingual and (
            result.get("source_language", "") != source_lang
            or result.get("language", "") != target_lang
        ):
            continue

        metrics = result.get("test_metrics")
        r2 = metrics.get("r2", "N/A") if isinstance(metrics, dict) else "N/A"
        sel = next((value for name, value in selectivity.items() if key_pattern in name), "N/A")

        return {
            "r2": round(r2, 3) if isinstance(r2, (int, float)) else r2,
            "selectivity": round(sel, 3) if isinstance(sel, (int, float)) else sel,
        }
    return None


def generate_complexity_table(results, selectivity, experiment_type="monolingual"):
    """Generate a table for complexity regression performance by layer and language.

    Args:
        results: dict of result key -> entry, searched by key substring
            ``{model_type}_complexity_{language}_layer{layer}_{experiment_type}``
            (``{source}_to_{target}`` replaces ``{language}`` for
            cross-lingual tables).
        selectivity: dict of group key -> selectivity, searched with the same
            pattern.
        experiment_type: ``"monolingual"`` builds one row per language; any
            other value builds one row per ordered pair of distinct languages.

    Returns:
        Nested dict ``model_type -> language (or pair) -> layer ->
        {"r2", "selectivity"}``, values rounded to 3 decimals or ``"N/A"``.
        Layers without a matching result are omitted. The same structure is
        written to ``complexity_{experiment_type}_table.json`` in
        ``TABLES_DIR``.
    """
    print(f"Generating complexity table for {experiment_type} experiments...")
    table_data = {}

    if experiment_type == "monolingual":
        # (row label, source language, target language)
        rows = [(language, None, language) for language in LANGUAGES]
    else:
        rows = [
            (f"{source_lang}_to_{target_lang}", source_lang, target_lang)
            for source_lang in LANGUAGES
            for target_lang in LANGUAGES
            if source_lang != target_lang
        ]

    for model_type in MODEL_TYPES:
        model_table = table_data[model_type] = {}

        for row_label, source_lang, target_lang in rows:
            row = model_table[row_label] = {}

            for layer in LAYERS:
                key_pattern = f"{model_type}_complexity_{row_label}_layer{layer}_{experiment_type}"
                cell = _complexity_table_cell(
                    results, selectivity, key_pattern, experiment_type,
                    source_lang=source_lang, target_lang=target_lang,
                )
                if cell is not None:
                    row[layer] = cell

    output_path = os.path.join(TABLES_DIR, f"complexity_{experiment_type}_table.json")
    with open(output_path, 'w') as f:
        json.dump(table_data, f, indent=2)

    print(f"Saved complexity table to {output_path}")
    return table_data

In [35]:
def prepare_layer_wise_data(results, model_type="lm_probe", submetric="complexity", experiment_type="monolingual"):
    """Prepare data for layer-wise performance visualization.

    For every language in ``LANGUAGES`` and layer in ``LAYERS`` the first
    non-control entry of ``results`` whose key contains
    ``{model_type}_{submetric}_{language}_layer{layer}_{experiment_type}`` is
    used. An entry without an ``is_control`` field counts as non-control.

    Args:
        results: dict of result key -> entry dict.
        model_type: model family part of the key pattern.
        submetric: submetric name, or ``"question_types"`` (read as
            ``accuracy`` instead of ``r2``).
        experiment_type: only ``"monolingual"`` is handled; any other value
            yields an empty dict.

    Returns:
        dict language -> list of ``{"layer": int, "value": metric}`` in
        ``LAYERS`` order. Layers without a match are omitted; a matched entry
        that lacks the metric contributes 0.
    """
    data = {}
    if experiment_type != "monolingual":
        return data

    metric_name = "accuracy" if submetric == "question_types" else "r2"

    for language in LANGUAGES:
        data[language] = []

        for layer in LAYERS:
            # The pattern already contains the submetric, so a key that
            # matches it needs no separate task/submetric check.
            key_pattern = f"{model_type}_{submetric}_{language}_layer{layer}_{experiment_type}"

            match = next(
                (
                    result
                    for key, result in results.items()
                    if key_pattern in key
                    and isinstance(result, dict)
                    and isinstance(result.get("test_metrics"), dict)
                    and not result.get("is_control", False)
                    and "control" not in key.lower()
                    and result.get("experiment_type", "monolingual") == experiment_type
                ),
                None,
            )

            if match is not None:
                data[language].append({
                    "layer": layer,
                    "value": match["test_metrics"].get(metric_name, 0),
                })

    return data

def plot_layer_wise_performance(results, submetric="complexity", model_type="lm_probe", experiment_type="monolingual"):
    """Plot layer-wise performance for a given submetric and model type.

    One line per language is drawn on a new 12x8 figure, which is left as the
    current pyplot figure so that the caller can save and close it.

    Args:
        results: dict of result key -> entry, passed to ``prepare_layer_wise_data``.
        submetric: submetric name, or ``"question_types"`` for the accuracy plot.
        model_type: model family to plot.
        experiment_type: forwarded to ``prepare_layer_wise_data``.

    Returns:
        The ``matplotlib.pyplot`` module when something was drawn, ``None``
        (after printing a warning, without creating a figure) when no language
        has data.
    """
    print(f"Plotting layer-wise performance for {submetric}, {model_type}, {experiment_type}...")

    layer_wise_data = prepare_layer_wise_data(results, model_type, submetric, experiment_type)

    # Keep only the languages that actually have points.
    non_empty_data = {lang: points for lang, points in layer_wise_data.items() if points}

    if not non_empty_data:
        print(f"Warning: No data found for {submetric}, {model_type}, {experiment_type}")
        return None

    try:
        plt.style.use('seaborn-v0_8-colorblind')  # style name in newer matplotlib
    except OSError:
        # plt.style.use raises OSError for an unknown style name.
        plt.style.use('seaborn-colorblind')  # style name in older matplotlib

    plt.figure(figsize=(12, 8))

    colors = plt.cm.tab10(np.linspace(0, 1, len(LANGUAGES)))
    markers = ['o', 's', '^', 'D', 'v', '<', '>']

    # Tie colour and marker to the language's position in LANGUAGES so that a
    # language looks the same in every figure, even when others are missing.
    style_index = {lang: idx for idx, lang in enumerate(LANGUAGES)}

    for position, (language, points) in enumerate(non_empty_data.items()):
        idx = style_index.get(language, position)
        layers = [point["layer"] for point in points]
        values = [point["value"] for point in points]

        plt.plot(
            layers, values,
            label=language.upper(),
            color=colors[idx % len(colors)],
            marker=markers[idx % len(markers)],
            linewidth=2,
            markersize=8
        )

    plt.xlabel('Layer', fontsize=14)

    if submetric == "question_types":
        plt.ylabel('Accuracy', fontsize=14)
        plt.title(f'Layer-wise Performance for Question Type Classification ({model_type})', fontsize=16)
    else:
        plt.ylabel('R² Score', fontsize=14)
        title_submetric = submetric.replace('_', ' ').title()
        plt.title(f'Layer-wise Performance for {title_submetric} ({model_type})', fontsize=16)

    plt.xticks(LAYERS)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend(loc='best')
    plt.tight_layout()

    return plt

In [52]:
# Load results
print("Loading results data...")
results = collect_results(RESULTS_DIR)
print(f"Loaded {len(results)} result entries")

# Build a JSON-safe copy first so that the output file is only opened once the
# payload is known to be writable.
serializable_results = {}
for key, value in results.items():
    try:
        # Probe: json.dumps raises TypeError for unsupported objects and
        # ValueError for circular references.
        json.dumps(value)
    except (TypeError, ValueError):
        print(f"Warning: Result for key '{key}' is not JSON serializable, converting to string")
        serializable_results[key] = str(value)
    else:
        serializable_results[key] = value

# Save raw result entries for debugging
with open(os.path.join(OUTPUT_DIR, 'parsed_results.json'), 'w') as f:
    json.dump(serializable_results, f, indent=2)

# Calculate selectivity (the function prints its own summary line)
selectivity = calculate_selectivity(results)

# Save selectivity results
with open(os.path.join(OUTPUT_DIR, 'selectivity.json'), 'w') as f:
    json.dump({str(k): v for k, v in selectivity.items()}, f, indent=2)

Loading results data...
Collecting results from: /home/robin/Research/qtype-eval/scripts/visualization/combined_results/experiment_results
Searching for JSON files in: /home/robin/Research/qtype-eval/scripts/visualization/combined_results/experiment_results
Found 52 JSON files.
Found entry: dummy_complexity_all
  Layer: unknown
  Language: unknown
  Task: complexity
  Is control: False
  Experiment type: monolingual
  mse: 0.04718531668186188
  rmse: 0.21722181447051278
  mae: 0.17157766222953796
  r2: -0.042618393898010254
Found entry: dummy_complexity_control1_all
  Layer: unknown
  Language: unknown
  Task: complexity
  Is control: True
  Experiment type: monolingual
  mse: 0.04718531668186188
  rmse: 0.21722181447051278
  mae: 0.17157766222953796
  r2: -0.042618393898010254
Found entry: dummy_complexity_control2_all
  Layer: unknown
  Language: unknown
  Task: complexity
  Is control: True
  Experiment type: monolingual
  mse: 0.04718531668186188
  rmse: 0.21722181447051278
  mae: 

In [20]:
# --- Monolingual tables ---
print("\nGenerating tables for monolingual experiments...")
mono_complexity_table = generate_complexity_table(results, selectivity, experiment_type="monolingual")
# TODO: the question-type table (generate_question_type_table) and the
# per-layer submetric tables (generate_submetric_table) are disabled here;
# neither helper is defined in the cells shown above.

# --- Cross-lingual tables (only when at least one such result was loaded) ---
print("\nChecking for cross-lingual experiments...")
crosslingual_experiments = any(
    isinstance(entry, dict) and entry.get("experiment_type", "") == "crosslingual"
    for entry in results.values()
)

if not crosslingual_experiments:
    print("No cross-lingual experiments found.")
else:
    print("Cross-lingual experiments found. Generating tables...")
    cross_complexity_table = generate_complexity_table(results, selectivity, experiment_type="crosslingual")
    # TODO: cross-lingual question-type and per-layer submetric tables are
    # disabled for the same reason as the monolingual ones.


Generating tables for monolingual experiments...
Generating complexity table for monolingual experiments...
Saved complexity table to /home/robin/Research/qtype-eval/scripts/visualization/combined_results/analysis_output/tables/complexity_monolingual_table.json

Checking for cross-lingual experiments...
Cross-lingual experiments found. Generating tables...
Generating complexity table for crosslingual experiments...
Saved complexity table to /home/robin/Research/qtype-eval/scripts/visualization/combined_results/analysis_output/tables/complexity_crosslingual_table.json


In [41]:
# One figure per (target, model type), saved as PNG and PDF. Order:
# complexity first, then question types, then the remaining submetrics.
# plot_layer_wise_performance creates its own figure, so none is opened here.
plot_targets = ["complexity", "question_types"] + [sm for sm in SUBMETRICS if sm != "complexity"]

for submetric in plot_targets:
    for model_type in MODEL_TYPES:
        fig = plot_layer_wise_performance(results, submetric, model_type, "monolingual")
        if fig:
            figure_stem = os.path.join(FIGURES_DIR, f'layer_wise_{submetric}_{model_type}')
            plt.savefig(f'{figure_stem}.png', dpi=300)
            plt.savefig(f'{figure_stem}.pdf')
        # Close the current figure (if any) so figures do not pile up.
        plt.close()

print("\nAll visualizations generated. Check the output directory for results.")

Plotting layer-wise performance for complexity, lm_probe, monolingual...
Plotting layer-wise performance for complexity, tfidf, monolingual...
Plotting layer-wise performance for question_types, lm_probe, monolingual...
Plotting layer-wise performance for question_types, tfidf, monolingual...
Plotting layer-wise performance for avg_links_len, lm_probe, monolingual...
Plotting layer-wise performance for avg_links_len, tfidf, monolingual...
Plotting layer-wise performance for avg_max_depth, lm_probe, monolingual...
Plotting layer-wise performance for avg_max_depth, tfidf, monolingual...
Plotting layer-wise performance for avg_sub_chain_len, lm_probe, monolingual...
Plotting layer-wise performance for avg_sub_chain_len, tfidf, monolingual...
Plotting layer-wise performance for avg_verb_edges, lm_probe, monolingual...
Plotting layer-wise performance for avg_verb_edges, tfidf, monolingual...
Plotting layer-wise performance for lexical_density, lm_probe, monolingual...
Plotting layer-wise pe