## Data Analysis

In [2]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset
import seaborn as sns
import re
from collections import defaultdict
from IPython.display import display, HTML


  from .autonotebook import tqdm as notebook_tqdm


### dataset table

In [3]:
dataset = load_dataset("rokokot/question-type-and-complexity", name="base", split="train")
lang_map = {'ar': 'Arabic', 'en': 'English', 'fi': 'Finnish','id': 'Indonesian', 'ja': 'Japanese','ko': 'Korean', 'ru': 'Russian'}

splits = ['train', 'validation', 'test']
all_results = []

for split in splits:
  dataset = load_dataset('rokokot/question-type-and-complexity', name='base', split=split)
  
  for lang in list(lang_map.keys()):
    lang_data = dataset.filter(lambda x: x['language'] == lang)

    n_questions = len(lang_data)

    question_types = lang_data['question_type']

    polar_count = sum(1 for qt in question_types if qt == 1)
    content_count = sum(1 for qt in question_types if qt == 0)

    polar_pct = round((polar_count / n_questions) * 100, 1)
    content_pct = round((content_count / n_questions) * 100, 1)

    avg_complexity = round(np.mean(lang_data['complexity_score']), 2)

    all_results.append({'Language': lang_map[lang],'Questions': n_questions,'Polar (%)': polar_pct,'Content (%)': content_pct,'Avg. Complexity': avg_complexity})
stats_df = pd.DataFrame(all_results)
print(stats_df.to_string(index=False))

  Language  Questions  Polar (%)  Content (%)  Avg. Complexity
    Arabic        995       49.9         50.1             1.50
   English       1192       50.0         50.0             1.60
   Finnish       1195       50.0         50.0             1.37
Indonesian        954       47.9         52.1             1.86
  Japanese       1191       50.0         50.0             1.60
    Korean        739       46.1         53.9             1.97
   Russian       1194       50.0         50.0             1.76
    Arabic         44       45.5         54.5             1.73
   English         72       50.0         50.0             1.74
   Finnish         63       47.6         52.4             1.64
Indonesian         72       50.0         50.0             2.01
  Japanese         46       52.2         47.8             1.71
    Korean         72       50.0         50.0             2.05
   Russian         72       50.0         50.0             1.83
    Arabic         77       28.6         71.4          

In [3]:
def analyze_averages():
    splits = ["train", "validation", "test"]
    lang_map = {
        'ar': 'Arabic', 'en': 'English', 'fi': 'Finnish',
        'id': 'Indonesian', 'ja': 'Japanese',
        'ko': 'Korean', 'ru': 'Russian'
    }
    
    combined_stats = {lang: {'Questions': 0, 'Polar': 0, 'Content': 0, 'Complexity': []} 
                     for lang in lang_map.values()}
    
    total_questions = 0

    for split in splits:
        try:
            dataset = load_dataset("rokokot/question-type-and-complexity", name="base", split=split)
            
            for lang_code, lang_name in lang_map.items():
                lang_data = dataset.filter(lambda x: x['language'] == lang_code)
                
                if len(lang_data) == 0:
                    print(f"No data for {lang_name} in {split} split")
                    continue
                
                combined_stats[lang_name]['Questions'] += len(lang_data)
                total_questions += len(lang_data)
                
                question_types = lang_data['question_type']
                polar_count = sum(1 for qt in question_types if qt == 1)
                combined_stats[lang_name]['Polar'] += polar_count
                combined_stats[lang_name]['Content'] += (len(lang_data) - polar_count)
                
                combined_stats[lang_name]['Complexity'].extend(lang_data['complexity_score'])
                
        except Exception as e:
            print(f"Error processing {split} split: {e}")
    
    results = []
    for lang_name, stats in combined_stats.items():
        if stats['Questions'] > 0:
            polar_pct = round((stats['Polar'] / stats['Questions']) * 100, 1)
            content_pct = round((stats['Content'] / stats['Questions']) * 100, 1)
            avg_complexity = round(np.mean(stats['Complexity']), 2) if stats['Complexity'] else 0
            dataset_pct = round((stats['Questions'] / total_questions) * 100, 1)

            results.append({'Language': lang_name,'Dataset %': dataset_pct,'Polar %': polar_pct,'Content %': content_pct,'Avg. Complexity': avg_complexity
            })
    
    stats_df = pd.DataFrame(results)
    
    return stats_df

stats_df = analyze_averages()
print(stats_df.to_string(index=False))


  Language  Dataset %  Polar %  Content %  Avg. Complexity
    Arabic       12.9     48.3       51.7             1.55
   English       15.9     50.0       50.0             1.61
   Finnish       15.9     49.9       50.1             1.40
Indonesian       13.2     48.2       51.8             1.88
  Japanese       15.4     50.8       49.2             1.66
    Korean       10.7     46.9       53.1             1.98
   Russian       16.0     50.0       50.0             1.76


In [59]:
original_df = pd.read_csv("/home/robin/Research/qtype-eval/visualization/base_csv_files/original_base.csv")
probe_df = pd.read_csv("/home/robin/Research/qtype-eval/visualization/base_csv_files/probe_base.csv")
finetune_df = pd.read_csv("/home/robin/Research/qtype-eval/visualization/base_csv_files/finetune_base.csv")

In [None]:
target_columns = [
    'language', 'real', 'control1', 'control2', 'control3', 'control_mean', 
    'selectivity', 'normalized_selectivity', 'task', 'model_type', 'metric', 'submetric'
]

In [66]:
original_filtered = original_df[original_df['model_type'] != 'lm_probe'].copy()
original_filtered = original_filtered[original_filtered['split'] == 'test'].copy()
finetune_real = finetune_df[finetune_df['control_index'].isna()].copy()

In [72]:
finetune_real.tail(100)

Unnamed: 0,experiment_type,language,task,submetric,control_index,metric,value
320,finetune,fi,single_submetric,n_tokens,,loss,0.011284
321,finetune,fi,single_submetric,n_tokens,,mse,0.011379
322,finetune,fi,single_submetric,n_tokens,,rmse,0.106674
323,finetune,fi,single_submetric,n_tokens,,r2,0.176997
324,finetune,id,single_submetric,avg_links_len,,loss,0.031529
...,...,...,...,...,...,...,...
415,finetune,ru,single_submetric,lexical_density,,r2,-0.067467
416,finetune,ru,single_submetric,n_tokens,,loss,0.005862
417,finetune,ru,single_submetric,n_tokens,,mse,0.005938
418,finetune,ru,single_submetric,n_tokens,,rmse,0.077056


In [68]:
print(f"Selected {len(finetune_real)} rows from finetune data where control_index is None")

finetune_processed = pd.DataFrame()
finetune_processed['language'] = finetune_real['language']
finetune_processed['real'] = finetune_real['value']
finetune_processed['task'] = finetune_real['task']
finetune_processed['metric'] = finetune_real['metric']
finetune_processed['submetric'] = finetune_real['submetric']
finetune_processed['model_type'] = 'finetune'
finetune_processed['control1'] = np.nan
finetune_processed['control2'] = np.nan
finetune_processed['control3'] = np.nan
finetune_processed['control_mean'] = np.nan
finetune_processed['selectivity'] = np.nan
finetune_processed['normalized_selectivity'] = np.nan


Selected 231 rows from finetune data where control_index is None


In [77]:
finetune_processed.head(5)

Unnamed: 0,language,real,task,metric,submetric,model_type,control1,control2,control3,control_mean,selectivity,normalized_selectivity
0,ar,0.042425,complexity,loss,,finetune,,,,,,
1,ar,0.042143,complexity,mse,,finetune,,,,,,
2,ar,0.205289,complexity,rmse,,finetune,,,,,,
3,ar,0.273465,complexity,r2,,finetune,,,,,,
4,ar,0.742846,question_type,loss,,finetune,,,,,,


In [79]:
# Print diagnostic information
print(f"Total probe rows: {len(probe_df)}")
print(f"Unique languages: {probe_df['language'].unique()}")
print(f"Unique metrics: {probe_df['metric'].unique()}")
print(f"Unique control_index values: {probe_df['control_index'].unique()}")
print(f"Unique layer_index values: {probe_df['layer_index'].unique()}")

# Get rows where control_index is None (these are the real values)
real_rows = probe_df[probe_df['control_index'].isna()].copy()
print(f"Found {len(real_rows)} real value rows in probe data")

# Initialize our results DataFrame
probe_processed = pd.DataFrame()

# For each real value row, find the matching control rows
results = []
for _, row in real_rows.iterrows():
    language = row['language']
    task = row['task']
    submetric = row['submetric']
    metric = row['metric']
    layer_idx = row['layer_index']
    real_value = row['value']
    
    # Create new result row
    result_row = {
        'language': language,
        'task': task,
        'submetric': submetric,
        'metric': metric,
        'model_type': 'probe',
        'real': real_value,
    }
    
    # Find control rows with the same parameters but control_index = 1.0, 2.0, 3.0
    control_rows = probe_df[
        (probe_df['language'] == language) & 
        (probe_df['task'] == task) & 
        (probe_df['submetric'] == submetric) & 
        (probe_df['metric'] == metric) & 
        (probe_df['layer_index'] == layer_idx) &
        probe_df['control_index'].notna()  # Must have a control_index
    ]
    
    # Extract control values
    for control_idx in [1.0, 2.0, 3.0]:
        control_matches = control_rows[control_rows['control_index'] == control_idx]
        if len(control_matches) > 0:
            result_row[f'control{int(control_idx)}'] = control_matches.iloc[0]['value']
    
    # Calculate control_mean if we have controls
    controls = []
    for control_key in ['control1', 'control2', 'control3']:
        if control_key in result_row and pd.notna(result_row.get(control_key)):
            try:
                controls.append(float(result_row[control_key]))
            except (ValueError, TypeError):
                pass
    
    if controls:
        result_row['control_mean'] = sum(controls) / len(controls)
        
        # Calculate selectivity and normalized_selectivity
        if pd.notna(result_row.get('real')):
            try:
                real_val = float(result_row['real'])
                control_mean = float(result_row['control_mean'])
                result_row['selectivity'] = abs(real_val - control_mean)
                
                # Calculate normalized_selectivity
                if control_mean != 0:
                    result_row['normalized_selectivity'] = (abs(real_val - control_mean) / control_mean) * 100
                else:
                    result_row['normalized_selectivity'] = 0
            except (ValueError, TypeError):
                pass
    
    results.append(result_row)

# Convert to DataFrame
probe_processed = pd.DataFrame(results)
print(f"Created {len(probe_processed)} processed probe rows")


Total probe rows: 10785
Unique languages: ['ar' 'en' 'fi' 'id' 'ja' 'ko' 'ru']
Unique metrics: ['loss' 'accuracy' 'f1' 'precision' 'recall' 'mse' 'rmse' 'r2']
Unique control_index values: [nan  1.  2.  3.]
Unique layer_index values: [ 1  2  3  4  5  6  7  8  9 10 11 12]
Found 2768 real value rows in probe data
Created 2768 processed probe rows


In [81]:
probe_processed.tail(5)

Unnamed: 0,language,task,submetric,metric,model_type,real,control1,control2,control3,control_mean,selectivity,normalized_selectivity
2763,ru,single_submetric,lexical_density,r2,probe,-0.271337,-0.161527,-0.192802,-0.194643,-0.18299,0.088347,-48.279518
2764,ru,single_submetric,n_tokens,loss,probe,0.015568,0.012798,0.012395,0.017626,0.014273,0.001295,9.073085
2765,ru,single_submetric,n_tokens,mse,probe,0.01564,0.012907,0.012508,0.017693,0.014369,0.001271,8.844056
2766,ru,single_submetric,n_tokens,rmse,probe,0.125061,0.113607,0.111841,0.133016,0.119488,0.005573,4.664013
2767,ru,single_submetric,n_tokens,r2,probe,-0.374906,-0.134595,-0.099598,-0.555374,-0.263189,0.111717,-42.447513


In [84]:
original_filtered.head(50)

Unnamed: 0,language,real,control1,control2,control3,control_mean,selectivity,normalized_selectivity,task,model_type,metric,split,submetric,layer
7,ar,0.052279,0.052279,0.052279,0.052279,0.052279,0.0,0.0,single_submetric,DummyRegressor,mse,test,avg_max_depth,overall
8,en,0.028999,0.028999,0.028999,0.028999,0.028999,3.469447e-18,1.196421e-14,single_submetric,DummyRegressor,mse,test,avg_max_depth,overall
9,fi,0.03527,0.03527,0.03527,0.03527,0.03527,0.0,0.0,single_submetric,DummyRegressor,mse,test,avg_max_depth,overall
10,id,0.032611,0.032611,0.032611,0.032611,0.032611,0.0,0.0,single_submetric,DummyRegressor,mse,test,avg_max_depth,overall
11,ja,0.09447,0.09447,0.09447,0.09447,0.09447,0.0,0.0,single_submetric,DummyRegressor,mse,test,avg_max_depth,overall
12,ko,0.035441,0.035441,0.035441,0.035441,0.035441,0.0,0.0,single_submetric,DummyRegressor,mse,test,avg_max_depth,overall
13,ru,0.02476,0.02476,0.02476,0.02476,0.02476,3.469447e-18,1.401259e-14,single_submetric,DummyRegressor,mse,test,avg_max_depth,overall
21,ar,0.045155,0.059633,0.054521,0.058539,0.057564,0.01240899,21.55675,single_submetric,XGBRegressor,mse,test,avg_links_len,overall
22,en,0.00935,0.01218,0.011723,0.011791,0.011898,0.002548323,21.41765,single_submetric,XGBRegressor,mse,test,avg_links_len,overall
23,fi,0.021578,0.022874,0.021671,0.022502,0.022349,0.0007704478,3.447359,single_submetric,XGBRegressor,mse,test,avg_links_len,overall


In [106]:
len(original_filtered)
original_filtered.to_csv('/home/robin/Research/qtype-eval/visualization/results/baseline_models.csv', index=False)

In [85]:
for df in [original_filtered, probe_processed, finetune_processed]:
    for col in target_columns:
        if col not in df.columns:
            df[col] = np.nan

# Combine all processed data (using only the target columns in the specified order)
merged_df = pd.concat(
    [
        original_filtered[target_columns],
        probe_processed[target_columns],
        finetune_processed[target_columns]
    ],
    ignore_index=True
)

In [95]:
merged_df['split'] = merged_df['split'].fillna('test')

In [98]:

print("\nColumns in merged file:")
print(merged_df.columns.tolist())

merged_df.tail(5)


Columns in merged file:
['language', 'model_type', 'task', 'split', 'submetric', 'layer', 'metric', 'real', 'control1', 'control2', 'control3', 'control_mean', 'selectivity', 'normalized_selectivity']


Unnamed: 0,language,model_type,task,split,submetric,layer,metric,real,control1,control2,control3,control_mean,selectivity,normalized_selectivity
3162,ru,finetune,single_submetric,test,lexical_density,,r2,-0.067467,,,,,,
3163,ru,finetune,single_submetric,test,n_tokens,,loss,0.005862,,,,,,
3164,ru,finetune,single_submetric,test,n_tokens,,mse,0.005938,,,,,,
3165,ru,finetune,single_submetric,test,n_tokens,,rmse,0.077056,,,,,,
3166,ru,finetune,single_submetric,test,n_tokens,,r2,0.478031,,,,,,


In [87]:
len(merged_df)

3167

In [90]:
merged_df.head(50)

Unnamed: 0,language,model_type,task,split,submetric,layer,metric,real,control1,control2,control3,control_mean,selectivity,normalized_selectivity
0,ar,DummyRegressor,single_submetric,test,avg_max_depth,overall,mse,0.052279,0.052279,0.052279,0.052279,0.052279,0.0,0.0
1,en,DummyRegressor,single_submetric,test,avg_max_depth,overall,mse,0.028999,0.028999,0.028999,0.028999,0.028999,3.469447e-18,1.196421e-14
2,fi,DummyRegressor,single_submetric,test,avg_max_depth,overall,mse,0.03527,0.03527,0.03527,0.03527,0.03527,0.0,0.0
3,id,DummyRegressor,single_submetric,test,avg_max_depth,overall,mse,0.032611,0.032611,0.032611,0.032611,0.032611,0.0,0.0
4,ja,DummyRegressor,single_submetric,test,avg_max_depth,overall,mse,0.09447,0.09447,0.09447,0.09447,0.09447,0.0,0.0
5,ko,DummyRegressor,single_submetric,test,avg_max_depth,overall,mse,0.035441,0.035441,0.035441,0.035441,0.035441,0.0,0.0
6,ru,DummyRegressor,single_submetric,test,avg_max_depth,overall,mse,0.02476,0.02476,0.02476,0.02476,0.02476,3.469447e-18,1.401259e-14
7,ar,XGBRegressor,single_submetric,test,avg_links_len,overall,mse,0.045155,0.059633,0.054521,0.058539,0.057564,0.01240899,21.55675
8,en,XGBRegressor,single_submetric,test,avg_links_len,overall,mse,0.00935,0.01218,0.011723,0.011791,0.011898,0.002548323,21.41765
9,fi,XGBRegressor,single_submetric,test,avg_links_len,overall,mse,0.021578,0.022874,0.021671,0.022502,0.022349,0.0007704478,3.447359


In [101]:
output_file = '/home/robin/Research/qtype-eval/visualization/results/merged_results.csv'
merged_df.to_csv(output_file, index=False)


In [None]:
import pandas as pd
import numpy as np

original_df = pd.read_csv("/home/robin/Research/qtype-eval/visualization/base_csv_files/original_base.csv")
probe_df = pd.read_csv("/home/robin/Research/qtype-eval/visualization/base_csv_files/probe_base.csv")
finetune_df = pd.read_csv("/home/robin/Research/qtype-eval/visualization/base_csv_files/finetune_base.csv")
# Define the target columns for the final CSV in the exact order specified
target_columns = [
    'language', 'real', 'control1', 'control2', 'control3', 'control_mean', 
    'selectivity', 'normalized_selectivity', 'task', 'model_type', 'metric', 'submetric', 'split', 'layer'
]

print("Filtering original data...")
original_filtered = original_df[original_df['model_type'] != 'lm_probe'].copy()
original_filtered = original_filtered[original_filtered['split'] == 'test'].copy()

original_filtered['layer'] = None

print("Processing finetune data...")
finetune_real = finetune_df[finetune_df['control_index'].isna()].copy()
print(f"Selected {len(finetune_real)} rows from finetune data where control_index is None")

finetune_processed = pd.DataFrame()
finetune_processed['language'] = finetune_real['language']
finetune_processed['real'] = finetune_real['value']
finetune_processed['task'] = finetune_real['task']
finetune_processed['metric'] = finetune_real['metric']
finetune_processed['submetric'] = finetune_real['submetric']
finetune_processed['model_type'] = 'finetune'
finetune_processed['split'] = 'test'
finetune_processed['layer'] = None  # Explicitly set to None for finetune experiments
finetune_processed['control1'] = np.nan
finetune_processed['control2'] = np.nan
finetune_processed['control3'] = np.nan
finetune_processed['control_mean'] = np.nan
finetune_processed['selectivity'] = np.nan
finetune_processed['normalized_selectivity'] = np.nan

# Step 3: Process probe data - IMPROVED APPROACH with proper layer handling
print("Processing probe data with improved control extraction...")

# Get all unique combinations of parameters 
unique_combinations = probe_df.drop_duplicates(
    subset=['language', 'task', 'submetric', 'metric', 'layer_index']
).copy()

print(f"Found {len(unique_combinations)} unique parameter combinations in probe data")

# Process each unique combination
results = []
for _, row in unique_combinations.iterrows():
    language = row['language']
    task = row['task']
    submetric = row['submetric']
    metric = row['metric']
    layer_idx = row['layer_index']
    
    # Get all rows matching this combination
    matching_rows = probe_df[
        (probe_df['language'] == language) & 
        (probe_df['task'] == task) & 
        (probe_df['submetric'] == submetric) & 
        (probe_df['metric'] == metric) & 
        (probe_df['layer_index'] == layer_idx)
    ]
    
    # Skip if no matching rows
    if len(matching_rows) == 0:
        continue
    
    # Create a new row for this combination
    result_row = {
        'language': language,
        'task': task,
        'submetric': submetric,
        'metric': metric,
        'model_type': 'probe',
        'split': 'test',
        'layer': f'layer{layer_idx}'  # Properly include layer information for probe experiments
    }
    
    # Find real value (NaN control_index)
    real_rows = matching_rows[matching_rows['control_index'].isna()]
    if len(real_rows) > 0:
        result_row['real'] = real_rows.iloc[0]['value']
    else:
        # Skip if no real value found
        print(f"Warning: No real value found for {language}/{task}/{metric}")
        continue
    
    # Find control values
    for control_idx in [1.0, 2.0, 3.0]:
        control_rows = matching_rows[matching_rows['control_index'] == control_idx]
        if len(control_rows) > 0:
            result_row[f'control{int(control_idx)}'] = control_rows.iloc[0]['value']
    
    # Calculate control_mean if we have at least one control
    controls = []
    for control_key in ['control1', 'control2', 'control3']:
        if control_key in result_row and pd.notna(result_row.get(control_key)):
            try:
                controls.append(float(result_row[control_key]))
            except (ValueError, TypeError):
                pass
    
    if controls:
        result_row['control_mean'] = sum(controls) / len(controls)
        
        # Calculate selectivity and normalized_selectivity
        if pd.notna(result_row.get('real')):
            try:
                real_val = float(result_row['real'])
                control_mean = float(result_row['control_mean'])
                result_row['selectivity'] = abs(real_val - control_mean)
                
                # Calculate normalized_selectivity
                if control_mean != 0:
                    result_row['normalized_selectivity'] = (abs(real_val - control_mean) / control_mean) * 100
                else:
                    result_row['normalized_selectivity'] = 0
            except (ValueError, TypeError):
                pass
    
    # Add to our results
    results.append(result_row)

# Convert to DataFrame
probe_processed = pd.DataFrame(results)
print(f"Created {len(probe_processed)} processed probe rows")

# Print a diagnosis of tasks in the probe data for debugging
print("\nTask breakdown in processed probe data:")
task_counts = probe_processed['task'].value_counts()
print(task_counts)

# Step 4: Merge all datasets
print("\nMerging datasets...")

# Ensure all DataFrames have all required columns
for df in [original_filtered, probe_processed, finetune_processed]:
    for col in target_columns:
        if col not in df.columns:
            df[col] = np.nan

# Combine all processed data
merged_df = pd.concat(
    [
        original_filtered[target_columns],
        probe_processed[target_columns],
        finetune_processed[target_columns]
    ],
    ignore_index=True
)

# Fill NaN values in the 'split' column with 'test'
merged_df['split'] = merged_df['split'].fillna('test')

# Save the result
output_file = '/home/robin/Research/qtype-eval/visualization/notebooks/merged_ml_results.csv'
merged_df.to_csv(output_file, index=False)

print(f"\nSuccessfully created {output_file}")
print(f"Total rows: {len(merged_df)}")
print(f"  - Original data: {len(original_filtered)}")
print(f"  - Probe data: {len(probe_processed)}")
print(f"  - Finetune data: {len(finetune_processed)}")

# Display some diagnostics about layer values
print("\nLayer value counts:")
layer_counts = merged_df['layer'].value_counts(dropna=False)
print(layer_counts)

# Check each model type for proper layer handling
print("\nChecking layer values by model_type:")
for model_type in merged_df['model_type'].unique():
    model_data = merged_df[merged_df['model_type'] == model_type]
    layer_values = model_data['layer'].value_counts(dropna=False)
    print(f"\n{model_type}:")
    print(layer_values)

Filtering original data...
Processing finetune data...
Selected 231 rows from finetune data where control_index is None
Processing probe data with improved control extraction...
Found 2772 unique parameter combinations in probe data
Created 2012 processed probe rows

Task breakdown in processed probe data:
task
single_submetric    2012
Name: count, dtype: int64

Merging datasets...

Successfully created /home/robin/Research/qtype-eval/visualization/notebooks/merged_ml_results.csv
Total rows: 2411
  - Original data: 168
  - Probe data: 2012
  - Finetune data: 231

Layer value counts:
layer
None       399
layer1     168
layer2     168
layer3     168
layer4     168
layer5     168
layer7     168
layer8     168
layer9     168
layer10    168
layer11    168
layer12    168
layer6     164
Name: count, dtype: int64

Checking layer values by model_type:

DummyRegressor:
layer
None    49
Name: count, dtype: int64

XGBRegressor:
layer
None    49
Name: count, dtype: int64

DummyClassifier:
layer
Non