## Result Analysis and Visuals

In [15]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec
from collections import defaultdict
import os
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.ticker import MaxNLocator
import matplotlib as mpl

plt.rcParams['font.family'] = 'Arial'
plt.rcParams['font.size'] = 10
plt.rcParams['axes.linewidth'] = 1.0
plt.rcParams['axes.labelsize'] = 12
plt.rcParams['axes.titlesize'] = 14
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10
plt.rcParams['legend.fontsize'] = 10
plt.rcParams['figure.titlesize'] = 16
plt.rcParams['figure.dpi'] = 300

model_colors = {
    "Dummy": "#E1E1E1",
    "LogisticRegression": "#6EB5FF",
    "Ridge": "#6EB5FF",
    "XGBRegressor": "#3D85C6",
    "XGBClassifier": "#3D85C6",
    "glot500": "#073763"
}

language_colors = {
    "ar": "#C44E52",  # Arabic - red
    "en": "#4C72B0",  # English - blue
    "fi": "#55A868",  # Finnish - green
    "id": "#8172B3",  # Indonesian - purple
    "ja": "#CCB974",  # Japanese - yellow
    "ko": "#64B5CD",  # Korean - teal
    "ru": "#DB8E00"   # Russian - orange
}

cmap_blues = LinearSegmentedColormap.from_list(
    "custom_blues", ["#FFFFFF", "#EBF1F9", "#D6E3F3", "#C1D6ED", "#ADCAE7", "#98BDE0", "#83B0DA", "#6EA4D4", "#5997CE", "#448BC8", "#2F7EC2"])


In [26]:
# Cell 2: Load the experiment results
RESULTS_FILE = '/home/robin/Research/qtype-eval/scripts/visualization/combined_results/combined_results.json'

with open(RESULTS_FILE, 'r') as f:
    results = json.load(f)

print(f"Loaded {len(results)} experiment results from {RESULTS_FILE}")

Loaded 880 experiment results from /home/robin/Research/qtype-eval/scripts/visualization/combined_results/combined_results.json


In [28]:
def create_structured_dataframe(results):
    data_rows = []
    
    for exp_key, exp_data in results.items():
        parts = exp_key.split('_')
        source = parts[0]  # e.g., tfidf or glot500
        
        # Handle different experiment naming patterns
        if 'single_submetric' in exp_key:
            # Extract model, task, control index, submetric and language
            model = 'lm_probe'
            task = 'complexity'  # Single submetric experiments are for complexity
            
            # Find control index if present
            control_idx = None
            for part in parts:
                if part.startswith('control') and part[7:].isdigit():
                    control_idx = part[7:]
                    break
            
            # Extract submetric
            submetric = None
            for sm in ['avg_links_len', 'avg_max_depth', 'avg_subordinate_chain_len', 'avg_verb_edges', 'lexical_density', 'n_tokens']:
                if sm in exp_key:
                    submetric = sm
                    break
            
            # Extract language
            language = None
            for lang in ['ar', 'en', 'fi', 'id', 'ja', 'ko', 'ru']:
                if f"_{lang}" in exp_key or exp_key.endswith(f"_{lang}"):
                    language = lang
                    break
        
        elif 'question_type' in exp_key:
            idx = exp_key.index('question_type')
            if 'lm_probe' in exp_key:
                model = 'lm_probe'
            else:
                model = '_'.join(parts[1:idx])
            task = 'question_type'
            
            # Find control index if present
            control_idx = None
            for part in parts:
                if part.startswith('control') and part[7:].isdigit():
                    control_idx = part[7:]
                    break
            
            # Extract language if present
            language = None
            for lang in ['ar', 'en', 'fi', 'id', 'ja', 'ko', 'ru']:
                if f"_{lang}" in exp_key or exp_key.endswith(f"_{lang}"):
                    language = lang
                    break
            
            submetric = None  # Question type task doesn't have submetrics
        
        elif 'complexity' in exp_key:
            idx = exp_key.index('complexity')
            if 'lm_probe' in exp_key:
                model = 'lm_probe'
            else:
                model = '_'.join(parts[1:idx])
            task = 'complexity'
            
            # Find control index if present
            control_idx = None
            for part in parts:
                if part.startswith('control') and part[7:].isdigit():
                    control_idx = part[7:]
                    break
            
            # Look for specific submetrics in traditional complexity experiments
            submetric = None
            for sm in ['avg_links_len', 'avg_max_depth', 'avg_subordinate_chain_len', 'avg_verb_edges', 'lexical_density', 'n_tokens']:
                if sm in exp_key:
                    submetric = sm
                    break
                    
            # Extract language if present
            language = None
            for lang in ['ar', 'en', 'fi', 'id', 'ja', 'ko', 'ru']:
                if f"_{lang}" in exp_key or exp_key.endswith(f"_{lang}"):
                    language = lang
                    break
        
        else:
            # Default parsing for other experiment types
            if len(parts) >= 3:
                model = parts[1]
                task = parts[2] if len(parts) > 2 else None
            else:
                model = None
                task = None
            
            control_idx = None
            submetric = None
            language = None
        
        # Extract scores from the experiment data
        if isinstance(exp_data, dict):
            score = exp_data.get('score')
            scores_by_lang = exp_data.get('scores', {})
            languages_in_exp = exp_data.get('languages', [])
        else:
            score = exp_data
            scores_by_lang = {}
            languages_in_exp = []
        
        # Create a base row with common metadata
        base_row = {
            'experiment_key': exp_key,
            'source': source,
            'model': model,
            'task': task,
            'control_index': control_idx,
            'submetric': submetric,
        }
        
        # If we have language-specific scores and no specific language was extracted
        if language is None and scores_by_lang:
            for lang, lang_score in scores_by_lang.items():
                lang_row = base_row.copy()
                lang_row['language'] = lang
                lang_row['score'] = lang_score
                data_rows.append(lang_row)
        elif language is None and languages_in_exp:
            # If we have a list of languages but no language-specific scores
            for lang in languages_in_exp:
                lang_row = base_row.copy()
                lang_row['language'] = lang
                lang_row['score'] = score  # Use the same score for all languages
                data_rows.append(lang_row)
        else:
            # Either we have a specific language or no language information
            row = base_row.copy()
            row['language'] = language
            row['score'] = score
            data_rows.append(row)
    
    # Convert to DataFrame
    df = pd.DataFrame(data_rows)
    
    # Clean up - handle NaN and None values appropriately
    df['score'] = pd.to_numeric(df['score'], errors='coerce')
    
    return df

# Create the dataframe
df = create_structured_dataframe(results)

# Display basic information about the dataframe
print(f"Created dataframe with {len(df)} rows")
print("\nDataframe columns:", df.columns.tolist())
print("\nUnique sources:", df['source'].unique())
print("Unique models:", df['model'].unique())
print("Unique tasks:", df['task'].unique())
print("Unique languages:", df['language'].dropna().unique())
print("Unique submetrics:", df['submetric'].dropna().unique())
print("Unique control indices:", df['control_index'].dropna().unique())

# Show a sample of the dataframe
print("\nSample of the dataframe:")
display(df)

Created dataframe with 1318 rows

Dataframe columns: ['experiment_key', 'source', 'model', 'task', 'control_index', 'submetric', 'language', 'score']

Unique sources: ['tfidf' 'glot500']
Unique models: ['Ridge_complexity_control1_avg_max_depth'
 'XGBRegressor_complexity_control1_n_tokens'
 'XGBRegressor_complexity_avg_max_depth'
 'XGBRegressor_complexity_control2'
 'XGBRegressor_complexity_control2_avg_max_depth'
 'XGBRegressor_complexity_control1_avg_subordinate_chain_len'
 'DummyClassifier_question_type' 'XGBRegressor_complexity_control1'
 'XGBRegressor_complexity_control1_avg_verb_edges'
 'XGBRegressor_complexity_control2_lexical_density'
 'Ridge_complexity_control1_avg_subordinate_chain_len'
 'XGBRegressor_complexity_control3_avg_max_depth'
 'Ridge_complexity_avg_subordinate_chain_len' 'all'
 'LogisticRegression_question_type_control1' 'Ridge_complexity'
 'Ridge_complexity_control1_avg_links_len'
 'XGBRegressor_complexity_control3_avg_links_len'
 'Ridge_complexity_control3_avg_verb

Unnamed: 0,experiment_key,source,model,task,control_index,submetric,language,score
0,tfidf_Ridge_complexity_control1_avg_max_depth,tfidf,Ridge_complexity_control1_avg_max_depth,complexity,1,avg_max_depth,ar,
1,tfidf_Ridge_complexity_control1_avg_max_depth,tfidf,Ridge_complexity_control1_avg_max_depth,complexity,1,avg_max_depth,en,
2,tfidf_Ridge_complexity_control1_avg_max_depth,tfidf,Ridge_complexity_control1_avg_max_depth,complexity,1,avg_max_depth,fi,
3,tfidf_Ridge_complexity_control1_avg_max_depth,tfidf,Ridge_complexity_control1_avg_max_depth,complexity,1,avg_max_depth,id,
4,tfidf_Ridge_complexity_control1_avg_max_depth,tfidf,Ridge_complexity_control1_avg_max_depth,complexity,1,avg_max_depth,ja,
...,...,...,...,...,...,...,...,...
1313,glot500_lm_probe_single_submetric_control3_avg...,glot500,lm_probe,complexity,3,avg_verb_edges,id,
1314,glot500_lm_probe_single_submetric_control2_avg...,glot500,lm_probe,complexity,2,avg_verb_edges,id,
1315,glot500_lm_probe_single_submetric_control1_avg...,glot500,lm_probe,complexity,1,avg_verb_edges,ru,
1316,glot500_lm_probe_single_submetric_control3_avg...,glot500,lm_probe,complexity,3,avg_verb_edges,ru,


In [32]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML

# Main function to run the parser on JSON data from uploaded files
async def main():
    """
    Main function to parse and analyze experiment results.
    
    This should be run in a Jupyter notebook cell after uploading your JSON data files.
    """
    # Extract JSON data from the uploaded files
    json_chunks = []
    
    try:
        # Assuming you have the file content available
        # For document 3 (paste.txt)
        paste_txt_content = await window.fs.readFile('paste.txt', { encoding: 'utf8' })
        json_chunks.append("{" + paste_txt_content + "}")
        print("Successfully loaded paste.txt")
    except Exception as e:
        print(f"Error loading paste.txt: {e}")
    
    try:
        # For document 4 (paste-2.txt)
        paste2_txt_content = await window.fs.readFile('paste-2.txt', { encoding: 'utf8' })
        json_chunks.append("{" + paste2_txt_content + "}")
        print("Successfully loaded paste-2.txt")
    except Exception as e:
        print(f"Error loading paste-2.txt: {e}")
    
    try:
        # For document 5 (paste-3.txt)
        paste3_txt_content = await window.fs.readFile('paste-3.txt', { encoding: 'utf8' })
        json_chunks.append("{" + paste3_txt_content + "}")
        print("Successfully loaded paste-3.txt")
    except Exception as e:
        print(f"Error loading paste-3.txt: {e}")
    
    # Check if we have any data
    if not json_chunks:
        print("No JSON data loaded. Please upload your experiment results files.")
        return
    
    # Parse all chunks
    results_df, overview = parse_multiple_json_chunks(json_chunks)
    
    # Display overview
    print("\nOverview of Experiments:")
    display(overview)
    
    # Display key metrics by experiment type
    print("\nPerformance Metrics by Experiment Type:")
    metrics_cols = [col for col in results_df.columns if col.startswith('test_')]
    if metrics_cols:
        display(results_df.groupby(['source', 'task', 'experiment_type', 'is_control'])[metrics_cols].mean().reset_index())
    
    # Plot results for regression tasks
    regression_df = results_df[results_df['task_type'] == 'regression']
    if len(regression_df) > 0:
        print("\nRegression Task Performance:")
        if 'test_r2' in regression_df.columns:
            plot_experiment_results(regression_df, metric='test_r2', task='complexity')
    
    # Plot results for classification tasks
    classification_df = results_df[results_df['task_type'] == 'classification']
    if len(classification_df) > 0:
        print("\nClassification Task Performance:")
        if 'test_accuracy' in classification_df.columns:
            plot_experiment_results(classification_df, metric='test_accuracy', task='question_type')
    
    # Check for cross-lingual experiments
    cross_lingual_df = results_df[results_df['experiment_type'] == 'cross_lingual']
    if len(cross_lingual_df) > 0:
        print("\nCross-Lingual Performance:")
        plot_cross_lingual_heatmap(results_df, task='complexity', metric='test_r2')
        if 'question_type' in cross_lingual_df['task'].values:
            plot_cross_lingual_heatmap(results_df, task='question_type', metric='test_accuracy')
    
    # Analyze per-language metrics for tfidf models
    if 'tfidf' in results_df['source'].values:
        print("\nAnalyzing TFIDF Per-Language Performance:")
        analyze_per_language_metrics(results_df)
    
    return results_df, overview

# Function to generate experiment overview
def generate_experiment_overview(results_df):
    """Generate an overview of experiments by source, task, and type."""
    # Convert to pandas DataFrame if not already
    if not isinstance(results_df, pd.DataFrame):
        results_df = pd.DataFrame(results_df)
    
    # Create a summary DataFrame
    overview = []
    
    # Group by source, task, and experiment type
    grouped = results_df.groupby(['source', 'task', 'experiment_type', 'is_control'])
    
    for (source, task, exp_type, is_control), group in grouped:
        control_status = 'control' if is_control else 'non_control'
        languages = []
        
        # Get unique languages
        if 'language' in group.columns:
            languages = group['language'].dropna().unique().tolist()
        elif 'train_language' in group.columns:
            train_langs = group['train_language'].dropna().unique().tolist()
            eval_langs = group['eval_language'].dropna().unique().tolist()
            languages = list(set(train_langs + eval_langs))
        elif 'languages' in group.columns:
            # Handle comma-separated language strings
            all_langs = []
            for lang_str in group['languages'].dropna():
                all_langs.extend(lang_str.split(','))
            languages = list(set(all_langs))
        
        # Get unique submetrics
        submetrics = group['submetric'].dropna().unique().tolist() if 'submetric' in group.columns else []
        
        overview.append({
            'source': source,
            'task': task,
            'experiment_type': exp_type,
            'control_status': control_status,
            'count': len(group),
            'languages': ', '.join(languages) if languages else '',
            'submetrics': ', '.join(submetrics) if submetrics else '',
            'models': ', '.join(group['model'].unique().tolist())
        })
    
    return pd.DataFrame(overview)

# Function to handle parsing data from multiple chunks
def parse_multiple_json_chunks(json_chunks):
    """
    Parse ML experiment results from multiple JSON chunks.
    
    Args:
        json_chunks: List of JSON strings containing experiment results
        
    Returns:
        Tuple of (results_df, overview_df)
    """
    # Combine all chunks into a single JSON object
    combined_data = {}
    
    for chunk in json_chunks:
        try:
            # Parse the chunk
            chunk_data = json.loads(chunk)
            
            # Add to combined data
            combined_data.update(chunk_data)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON chunk: {e}")
            continue
    
    # Process the combined data
    results = parse_experiment_results(combined_data)
    
    # Convert to DataFrame
    results_df = pd.DataFrame(results)
    
    # Generate overview
    overview_df = generate_experiment_overview(results_df)
    
    return results_df, overview_df

# Function to visualize results
def plot_experiment_results(df, metric='test_r2', task=None, by='source', is_control=None, submetric=None):
    """
    Plot experiment results based on specified filters.
    
    Args:
        df: DataFrame with experiment results
        metric: Metric to plot (default: test_r2)
        task: Filter by task (default: None to show all)
        by: Group by column (default: 'source')
        is_control: Filter by control status (default: None to show all)
        submetric: Filter by submetric (default: None to show all)
    """
    # Filter data based on parameters
    plot_df = df.copy()
    
    if task:
        plot_df = plot_df[plot_df['task'] == task]
    
    if is_control is not None:
        plot_df = plot_df[plot_df['is_control'] == is_control]
    
    if submetric:
        plot_df = plot_df[plot_df['submetric'] == submetric]
    
    # Check if we have data after filtering
    if len(plot_df) == 0:
        print("No data available with the current filters.")
        return
    
    # Check if the metric exists in the DataFrame
    if metric not in plot_df.columns:
        print(f"Metric '{metric}' not found in data. Available metrics: {[c for c in df.columns if c.startswith('test_') or c.startswith('val_')]}")
        return
    
    # Create the plot
    plt.figure(figsize=(12, 6))
    
    # Create a grouped boxplot
    sns.boxplot(x=by, y=metric, data=plot_df)
    
    # Add individual points
    sns.stripplot(x=by, y=metric, data=plot_df, color='black', size=4, alpha=0.5)
    
    # Add titles and labels
    plt.title(f'{metric} by {by}' + (f' for {task}' if task else '') + 
              (f' (control: {is_control})' if is_control is not None else '') +
              (f' - {submetric}' if submetric else ''))
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.xticks(rotation=45)
    
    # Adjust layout
    plt.tight_layout()
    plt.show()

# Function to create a cross-lingual performance heatmap
def plot_cross_lingual_heatmap(df, task='complexity', metric='test_r2'):
    """Create a heatmap of cross-lingual performance"""
    # Filter for cross-lingual experiments
    cross_df = df[(df['experiment_type'] == 'cross_lingual') & (df['task'] == task)]
    
    if len(cross_df) == 0:
        print(f"No cross-lingual data found for task: {task}")
        return
    
    # Check if we have the necessary columns
    if 'train_language' not in cross_df.columns or 'eval_language' not in cross_df.columns:
        print("Missing language columns for cross-lingual analysis")
        return
    
    # Create a pivot table for the heatmap
    heatmap_data = cross_df.pivot_table(
        index='train_language', 
        columns='eval_language', 
        values=metric,
        aggfunc='mean'
    )
    
    # Plot the heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(heatmap_data, annot=True, cmap='YlGnBu', fmt='.2f', linewidths=.5)
    plt.title(f'Cross-Lingual Performance: {task} - {metric}')
    plt.tight_layout()
    plt.show()
    
    return heatmap_data

# Function to analyze per-language metrics from tfidf models
def analyze_per_language_metrics(df):
    """Extract and analyze per-language metrics from tfidf models"""
    # Filter for tfidf models
    tfidf_df = df[df['source'] == 'tfidf']
    
    # Check if we have any tfidf models
    if len(tfidf_df) == 0:
        print("No tfidf models found in the data")
        return
    
    # Create a DataFrame to store per-language metrics
    per_lang_metrics = []
    
    for _, row in tfidf_df.iterrows():
        if 'per_language_metrics' in row and isinstance(row['per_language_metrics'], dict):
            test_metrics = row['per_language_metrics'].get('test', {})
            
            # Extract metrics for each language
            for lang, metrics in test_metrics.items():
                per_lang_metrics.append({
                    'experiment_id': row['experiment_id'],
                    'model': row['model'],
                    'task': row['task'],
                    'is_control': row['is_control'],
                    'control_index': row['control_index'],
                    'submetric': row['submetric'],
                    'language': lang,
                    'test_mse': metrics.get('mse'),
                    'test_rmse': metrics.get('rmse'),
                    'test_mae': metrics.get('mae'),
                    'test_r2': metrics.get('r2')
                })
    
    if len(per_lang_metrics) > 0:
        per_lang_df = pd.DataFrame(per_lang_metrics)
        print("\nPer-Language Performance for TFIDF Models:")
        display(per_lang_df.groupby(['language', 'task', 'is_control'])['test_r2'].mean().reset_index())
        
        # Create a visualization
        plt.figure(figsize=(14, 6))
        sns.barplot(x='language', y='test_r2', hue='is_control', data=per_lang_df)
        plt.title('Per-Language Performance (TFIDF Models)')
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.show()
        
        return per_lang_df
    else:
        print("No per-language metrics found in the data")
        return None

# Function to parse experiment results
def parse_experiment_results(json_data):
    """
    Parse ML experiment results from JSON data.
    
    Args:
        json_data: Dictionary containing experiment results
        
    Returns:
        List of dictionaries with parsed results
    """
    # Initialize the results array
    results = []
    
    # Process each experiment in the JSON data
    for key, value in json_data.items():
        try:
            # Skip any potential empty or invalid entries
            if not value or not isinstance(value, dict):
                continue
            
            # Parse the experiment key to extract metadata
            key_parts = key.split('_')
            source = key_parts[0]
            
            # Create a results object with common fields
            result_obj = {
                'experiment_id': key,
                'source': source
            }
            
            # Handle the two main experiment types: glot500 and tfidf
            if source == 'glot500':
                # Extract model name (usually 'lm_probe')
                result_obj['model'] = '_'.join(key_parts[1:3])
                
                # Handle the experiment_type
                if 'experiment_type' in value and value['experiment_type'] == 'cross_lingual' or \
                   ('train_language' in value and 'eval_language' in value):
                    result_obj['experiment_type'] = 'cross_lingual'
                    result_obj['train_language'] = value.get('train_language')
                    result_obj['eval_language'] = value.get('eval_language')
                else:
                    result_obj['experiment_type'] = 'monolingual'
                    # Handle ko and other nested structures
                    if 'language' in value:
                        result_obj['language'] = value['language']
                    elif 'ko' in value and isinstance(value['ko'], dict):
                        result_obj['language'] = 'ko'
                        # Use the nested metrics for ko
                        if 'test_metrics' in value['ko']:
                            value['test_metrics'] = value['ko']['test_metrics']
                            value['train_metrics'] = value['ko']['train_metrics']
                            value['val_metrics'] = value['ko']['val_metrics']
                    else:
                        # Try to extract language from key
                        lang_candidates = [part for part in key_parts if len(part) == 2]
                        if lang_candidates:
                            result_obj['language'] = lang_candidates[0]
                        else:
                            result_obj['language'] = 'unknown'
                
                # Extract task details
                result_obj['task'] = value.get('task')
                result_obj['task_type'] = value.get('task_type', '')  # Classification or regression
                
                # Handle control info
                result_obj['is_control'] = value.get('is_control', False)
                result_obj['control_index'] = value.get('control_index')
                
                # Handle submetric
                result_obj['submetric'] = value.get('submetric', '')
                
                # Handle layer info 
                result_obj['layer'] = value.get('layer')
                
                # Extract metrics based on task type
                result_obj['train_time'] = value.get('train_time')
                
                # Add test metrics
                if 'test_metrics' in value:
                    if value.get('task_type') == 'classification':
                        result_obj['test_loss'] = value['test_metrics'].get('loss')
                        result_obj['test_accuracy'] = value['test_metrics'].get('accuracy')
                        result_obj['test_f1'] = value['test_metrics'].get('f1')
                    else:  # regression
                        result_obj['test_loss'] = value['test_metrics'].get('loss')
                        result_obj['test_mse'] = value['test_metrics'].get('mse')
                        result_obj['test_rmse'] = value['test_metrics'].get('rmse')
                        result_obj['test_r2'] = value['test_metrics'].get('r2')
                
                # Add val metrics
                if 'val_metrics' in value:
                    if value.get('task_type') == 'classification':
                        result_obj['val_loss'] = value['val_metrics'].get('loss')
                        result_obj['val_accuracy'] = value['val_metrics'].get('accuracy')
                        result_obj['val_f1'] = value['val_metrics'].get('f1')
                    else:  # regression
                        result_obj['val_loss'] = value['val_metrics'].get('loss')
                        result_obj['val_mse'] = value['val_metrics'].get('mse')
                        result_obj['val_rmse'] = value['val_metrics'].get('rmse')
                        result_obj['val_r2'] = value['val_metrics'].get('r2')
            
            elif source == 'tfidf':
                # Extract model name (e.g., Ridge, XGBRegressor)
                result_obj['model'] = key_parts[1]
                
                # Set experiment type (always monolingual for tfidf)
                result_obj['experiment_type'] = 'monolingual'
                
                # Extract task
                result_obj['task'] = key_parts[2]
                result_obj['task_type'] = 'regression'  # TFIDF models are regression
                
                # Handle control info
                control_parts = [part for part in key_parts if part.startswith('control')]
                if control_parts:
                    result_obj['is_control'] = True
                    result_obj['control_index'] = int(control_parts[0].replace('control', ''))
                else:
                    result_obj['is_control'] = False
                    result_obj['control_index'] = None
                
                # Handle submetric (if present)
                submetric_candidates = [
                    'avg_max_depth', 'n_tokens', 'avg_subordinate_chain_len', 
                    'avg_verb_edges', 'lexical_density'
                ]
                detected_submetrics = [sm for sm in submetric_candidates if sm in key or 
                                     (isinstance(value.get('submetric'), str) and sm in value.get('submetric', ''))]
                
                if detected_submetrics:
                    result_obj['submetric'] = detected_submetrics[0]
                elif 'submetric' in value and value['submetric']:
                    result_obj['submetric'] = value['submetric']
                else:
                    result_obj['submetric'] = ''
                
                # Extract languages
                if 'languages' in value:
                    result_obj['languages'] = ','.join(value['languages'])
                
                # Extract metrics
                result_obj['train_time'] = value.get('training_time')
                
                # Add test metrics
                if 'test_metrics' in value:
                    result_obj['test_mse'] = value['test_metrics'].get('mse')
                    result_obj['test_rmse'] = value['test_metrics'].get('rmse')
                    result_obj['test_mae'] = value['test_metrics'].get('mae')
                    result_obj['test_r2'] = value['test_metrics'].get('r2')
                
                # Add validation metrics
                if 'val_metrics' in value:
                    result_obj['val_mse'] = value['val_metrics'].get('mse')
                    result_obj['val_rmse'] = value['val_metrics'].get('rmse')
                    result_obj['val_mae'] = value['val_metrics'].get('mae')
                    result_obj['val_r2'] = value['val_metrics'].get('r2')
                
                # Store per-language metrics reference
                if 'per_language_metrics' in value:
                    result_obj['per_language_metrics'] = value['per_language_metrics']
            
            # Add the processed result to the results array
            results.append(result_obj)
            
        except Exception as e:
            print(f"Error processing experiment {key}: {e}")
    
    return results

In [34]:

# Option 1: Run the main function to automatically process uploaded files
results_df, overview = await main()

# Option 2: Manually process JSON strings or files
json_chunks = []

# Example: Manually specify JSON content as strings
json_chunks.append("""
{
  "glot500_lm_probe_single_submetric_control2_avg_subordinate_chain_len_fi_layer12": {
    "train_time": 80.71137046813965,
    "train_metrics": {
      "loss": 0.013749146613602837,
      "mse": 0.013802728615701199,
      "rmse": 0.11748501443035703,
      "r2": 0.008942842483520508
    },
    "val_metrics": {
      "loss": 0.05693912319839001,
      "mse": 0.05693540349602699,
      "rmse": 0.23861140688581298,
      "r2": -0.1697772741317749
    },
    "test_metrics": {
      "loss": 0.054117132776549885,
      "mse": 0.054204776883125305,
      "rmse": 0.232819193545389,
      "r2": -0.22100341320037842
    },
    "language": "fi",
    "task": "single_submetric",
    "task_type": "regression",
    "model_type": "lm_probe",
    "is_control": true,
    "control_index": 2,
    "submetric": "avg_subordinate_chain_len",
    "source": "glot500",
    "layer": 12
  }
}
""")

# Parse the manually provided JSON chunks
manual_results_df, manual_overview = parse_multiple_json_chunks(json_chunks)

# Display the overview
print("Overview of Experiments:")
display(manual_overview)

# Additional analysis examples:

# 1. Compare performance across different tasks
if 'task' in manual_results_df.columns and 'test_r2' in manual_results_df.columns:
    task_performance = manual_results_df.groupby('task')['test_r2'].mean().reset_index()
    display(task_performance)
    
    plt.figure(figsize=(10, 6))
    sns.barplot(x='task', y='test_r2', data=task_performance)
    plt.title('Performance by Task')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()

# 2. Compare control vs non-control experiments
if 'is_control' in manual_results_df.columns and 'test_r2' in manual_results_df.columns:
    control_comparison = manual_results_df.groupby(['task', 'is_control'])['test_r2'].mean().reset_index()
    display(control_comparison)
    
    plt.figure(figsize=(12, 6))
    sns.barplot(x='task', y='test_r2', hue='is_control', data=control_comparison)
    plt.title('Control vs Non-Control Performance')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()

# 3. Find the best performing model for each task
if len(manual_results_df) > 0:
    best_models = manual_results_df.loc[manual_results_df.groupby('task')['test_r2'].idxmax()]
    print("Best Performing Models by Task:")
    display(best_models[['task', 'model', 'source', 'test_r2']])

Error loading paste.txt: name 'window' is not defined
Error loading paste-2.txt: name 'window' is not defined
Error loading paste-3.txt: name 'window' is not defined
No JSON data loaded. Please upload your experiment results files.


TypeError: cannot unpack non-iterable NoneType object