In [72]:
import os
import shap
import wandb
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import lang2vec.lang2vec as l2v
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
from joblib import Parallel, delayed
from scipy.stats import pearsonr, zscore
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.inspection import PartialDependenceDisplay

In [73]:
# Display names for the language codes used throughout the notebook.
# plot_detailed_coefficients uses this table to relabel `language_code_<code>`
# features. Besides real language codes it contains the two pseudo-codes that
# also appear in `langs`: "same" (labelled "English") and "avg" ("Average").
language_map = {
    "same": "English",
    "avg": "Average",
    "af": "Afrikaans",
    "ar": "Arabic",
    "bg": "Bulgarian",
    "ca": "Catalan",
    "cs": "Czech",
    "da": "Danish",
    "de": "German",
    "el": "Greek",
    "es": "Spanish",
    "fa": "Persian",
    "fi": "Finnish",
    "fr": "French",
    "he": "Hebrew",
    "hi": "Hindi",
    "hu": "Hungarian",
    "it": "Italian",
    "ja": "Japanese",
    "ko": "Korean",
    "lt": "Lithuanian",
    "lv": "Latvian",
    "no": "Norwegian",
    "pl": "Polish",
    "pt": "Portuguese",
    "ro": "Romanian",
    "ru": "Russian",
    "sk": "Slovak",
    "sl": "Slovenian",
    "sv": "Swedish",
    "ta": "Tamil",
    "th": "Thai",
    "tr": "Turkish",
    "uk": "Ukrainian",
    "vi": "Vietnamese",
    "zh": "Chinese",
    # "sw", "ur" and "en" are not part of the `langs` list defined further down.
    "sw": "Swahili",
    "ur": "Urdu",
    "en": "English"
}

In [74]:
# Human-readable labels for the method identifiers found in the `method` column.
# plot_detailed_coefficients uses this table to relabel `method_<id>` features
# and drops the entry labelled 'Fine-tuning Only' (the baseline) from its plot.
method_names = {
    'baseline': 'Fine-tuning Only',
    'before_fastalign': 'Before FastAlign',
    'before_awesome': 'Before Awesome',
    'before_dico': 'Before Dict',
    'during_fastalign': 'During FastAlign',
    'during_awesome': 'During Awesome',
    'during_dico': 'During Dict',
    'freeze_realign_unfreeze_fastalign': 'Before - Front Freeze FastAlign',
    'freeze_realign_unfreeze_awesome': 'Before - Front Freeze Awesome',
    'freeze_realign_unfreeze_dico': 'Before - Front Freeze Dict',
    'freeze_realign_unfreeze_last_6_fastalign': 'Before - Back Freeze FastAlign',
    'freeze_realign_unfreeze_last_6_awesome': 'Before - Back Freeze Awesome',
    'freeze_realign_unfreeze_last_6_dico': 'Before - Back Freeze Dict',
    'during_partial_freeze_front_fastalign': 'During Front Freeze FastAlign',
    'during_partial_freeze_front_awesome': 'During Front Freeze Awesome',
    'during_partial_freeze_front_dico': 'During Front Freeze Dict',
    'during_partial_freeze_back_fastalign': 'During Back Freeze FastAlign',
    'during_partial_freeze_back_awesome': 'During Back Freeze Awesome',
    'during_partial_freeze_back_dico': 'During Back Freeze Dict',
}

In [75]:
# Coarse grouping label for each language code. The values are the keys of
# `specific_family_shapes`. The pseudo-code 'avg' is filed under 'European'
# and 'same' has no entry.
# NOTE(review): not referenced by any function visible in this part of the
# notebook - presumably consumed by a later plotting cell; confirm before editing.
specific_language_families = {
    'ja': 'East Asian',
    'th': 'Southeast Asian',
    'ta': 'South Asian',
    'vi': 'Southeast Asian',
    'ko': 'East Asian',
    'ar': 'Middle Eastern',
    'he': 'Middle Eastern',
    'zh': 'East Asian',
    'hi': 'South Asian',
    'tr': 'Middle Eastern',
    'en': 'European',
    'fa': 'Middle Eastern',
    'lv': 'European',
    'lt': 'European',
    'avg': 'European',
    'el': 'European',
    'sv': 'European',
    'no': 'European',
    'hu': 'European',
    'da': 'European',
    'de': 'European',
    'ca': 'European',
    'bg': 'European',
    'fr': 'European',
    'af': 'European',
    'es': 'European',
    'it': 'European',
    'uk': 'European',
    'sk': 'European',
    'fi': 'European',
    'cs': 'European',
    'pt': 'European',
    'pl': 'European',
    'ru': 'European',
    'ro': 'European',
    'sl': 'European'
}

In [76]:
# One single-character code per grouping label used in `specific_language_families`,
# plus an 'Other' fallback entry.
# NOTE(review): the values look like matplotlib marker symbols - confirm at the
# plotting call site, which is not visible in this part of the notebook.
specific_family_shapes = {
    'East Asian': 'o',  
    'Southeast Asian': '^',  
    'South Asian': 's',  
    'European': 'P',  
    'Middle Eastern': 'v', 
    'Other': 'p' 
}

In [77]:
# Translation from the two-letter codes used in this notebook to the
# three-letter codes handed to `l2v.distance` by get_lang_distances.
# Codes without an entry here (e.g. 'avg', 'en') are passed through unchanged
# by get_lang_distances, which falls back to the raw code.
lang2vec_mapping = {
    "af": "afr",
    "ar": "arb",
    "bg": "bul",
    "ca": "cat",
    "cs": "ces",
    "da": "dan",
    "de": "deu",
    "el": "ell",
    "es": "spa",
    "fa": "fas",
    "fi": "fin",
    "fr": "fra",
    "he": "heb",
    "hi": "hin",
    "hu": "hun",
    "it": "ita",
    "ja": "jpn",
    "ko": "kor",
    "lt": "lit",
    "lv": "lav",
    "no": "nob",
    "pl": "pol",
    "pt": "por",
    "ro": "ron",
    "ru": "rus",
    "sk": "slk",
    "sl": "slv",
    "sv": "swe",
    "ta": "tam",
    "th": "tha",
    "tr": "tur",
    "uk": "ukr",
    "vi": "vie",
    "zh": "cmn"
}


In [78]:
# xlmr_tokens = {
#     'af': 242,
#     'am': 68,
#     'ar': 2869,
#     'as': 5,
#     'az': 783,
#     'be': 362,
#     'bg': 5487,
#     'bn': 525,
#     'br': 16,
#     'bs': 14,
#     'ca': 1752,
#     'cs': 2498,
#     'cy': 141,
#     'da': 7823,
#     'de': 10297,
#     'el': 4285,
#     'en': 55608,
#     'eo': 157,
#     'es': 9374,
#     'et': 843,
#     'eu': 270,
#     'fa': 13259,
#     'fi': 6730,
#     'fr': 9780,
#     'fy': 29,
#     'ga': 86,
#     'gd': 21,
#     'gl': 495,
#     'gu': 140,
#     'ha': 56,
#     'he': 3399,
#     'hi': 1715,
#     'hr': 3297,
#     'hu': 7807,
#     'hy': 421,
#     'id': 22704,
#     'is': 505,
#     'it': 4983,
#     'ja': 530,
#     'jv': 24,
#     'ka': 469,
#     'kk': 476,
#     'km': 36,
#     'kn': 169,
#     'ko': 5644,
#     'ku': 66,
#     'ky': 94,
#     'la': 390,
#     'lt': 1835,
#     'lv': 1198,
#     'mg': 25,
#     'mk': 449,
#     'ml': 313,
#     'mn': 248,
#     'mr': 175,
#     'ms': 1318,
#     'my': 15,
#     'ne': 237,
#     'nl': 5025,
#     'no': 8494,
#     'om': 8,
#     'or': 36,
#     'pa': 68,
#     'pl': 6490,
#     'pt': 8405,
#     'ro': 10354,
#     'ru': 23408,
#     'sa': 17,
#     'sd': 50,
#     'si': 243,
#     'sk': 3525,
#     'sl': 1669,
#     'so': 62,
#     'sq': 918,
#     'sr': 843,
#     'su': 10,
#     'sv': 778,
#     'sw': 275,
#     'ta': 595,
#     'te': 249,
#     'th': 1834,
#     'tl': 556,
#     'tr': 2736,
#     'ug': 27,
#     'uk': 6500,
#     'ur': 730,
#     'uz': 91,
#     'vi': 24757,
#     'xh': 13,
#     'yi': 34,
#     'zh': 259,  # Chinese (Simplified)
#     'zh-Hant': 176  # Chinese (Traditional)
# }

In [79]:
# Weights & Biases client plus the identifiers used to locate the baseline runs.
# fetch_and_process_runs queries `api.runs(f"{workspace}/{source}")`, and the
# wrappers further down pass `finetuning_project_name` as that `source`.
api = wandb.Api()
workspace = "align_freeze"
finetuning_project_name = "3nl_34langs_baseline_udpos"

In [80]:
# Evaluation keys whose `final_eval_<key>_accuracy` summaries are collected.
# "same" and "avg" are pseudo-codes (see `language_map`); the rest are
# two-letter language codes.
langs = ["same", "avg", "af", "ar", "bg", "ca", "cs", "da", "de", "el", "es", "fa", "fi", "fr", "he", "hi",
         "hu", "it", "ja", "ko", "lt", 'lv', 'no', 'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sv', 'ta', 'th', 'tr', 'uk',
         'vi', 'zh']

In [81]:
def fetch_and_process_runs(source, _langs, username="felixgaschi", method_name=None):
    """Collect per-language final accuracies from the W&B runs of one project.

    Args:
        source: Project name; runs are read from ``f"{workspace}/{source}"``.
        _langs: Iterable of language keys. For each key the run summary entry
            ``final_eval_<key>_accuracy`` is looked up.
        username: Only runs whose ``run.user.username`` equals this value are used.
        method_name: Currently unused; kept so existing call sites keep working.

    Returns:
        dict mapping every key of ``_langs`` to the list of accuracies found
        (one entry per matching run that reported a value, in run order).
    """
    # Materialise once so a generator argument can be iterated for every run.
    _langs = list(_langs)
    runs_data = {lang: [] for lang in _langs}

    runs = api.runs(f"{workspace}/{source}")
    for run in runs:
        if run.user.username != username:
            continue
        # Iterate the requested keys, not the module-level `langs`: the result
        # dict only has slots for `_langs`, so using the global list raised
        # KeyError whenever the two differed.
        for lang in _langs:
            accuracy = run.summary.get(f"final_eval_{lang}_accuracy")
            if accuracy is not None:
                runs_data[lang].append(accuracy)
    return runs_data

In [82]:
def load_baseline_csv(csv_directory, model_name):
    """Load the baseline results CSV found in ``csv_directory``.

    Candidate files are the ``.csv`` files in the directory whose name does not
    contain ``'filtered_percentage'``. The alphabetically first candidate is
    read, so the choice no longer depends on the order ``os.listdir`` happens
    to return.

    NOTE(review): load_and_combine_csvs matches the marker ``'filtered_percent_'``
    while this function excludes ``'filtered_percentage'`` - confirm which
    spelling the result files actually use.

    Args:
        csv_directory: Directory to search.
        model_name: Value written to the added ``model`` column.

    Returns:
        DataFrame of the CSV with ``value`` set to 0.0 and ``model`` set to
        ``model_name``.

    Raises:
        FileNotFoundError: If the directory contains no candidate CSV file.
    """
    import warnings  # local import: not provided by the notebook's import cell

    # Restrict to real CSV files so stray entries such as `.ipynb_checkpoints`
    # are never handed to read_csv.
    candidates = sorted(
        os.path.join(csv_directory, file)
        for file in os.listdir(csv_directory)
        if file.endswith('.csv') and 'filtered_percentage' not in file
    )
    if not candidates:
        raise FileNotFoundError(f"No baseline CSV file found in {csv_directory!r}")
    if len(candidates) > 1:
        warnings.warn(
            f"{len(candidates)} candidate baseline CSVs in {csv_directory!r}; "
            f"using {os.path.basename(candidates[0])!r}"
        )

    baseline_df = pd.read_csv(candidates[0])
    baseline_df['value'] = 0.0
    baseline_df['model'] = model_name
    return baseline_df

In [83]:
def load_and_combine_csvs(csv_directory, verbose=True):
    """Read every CSV in ``csv_directory`` and stack them into one DataFrame.

    For each file a threshold is parsed from its name: the number following
    ``filtered_percent_`` or, failing that, ``filtered_``; files with neither
    marker get 0.0. Columns that are entirely empty and rows containing any
    NaN are dropped, and the threshold is stored in a new ``value`` column.

    Frames are collected in a list rather than a dict keyed by threshold, so
    two files that share a threshold are both kept instead of the later one
    silently replacing the earlier one. Files are processed in sorted order so
    the row order of the result is reproducible.

    Args:
        csv_directory: Directory containing the ``.csv`` files.
        verbose: When True (default) print the intermediate frames, as before.

    Returns:
        The concatenated DataFrame with a fresh integer index.

    Raises:
        ValueError: If the directory contains no ``.csv`` file.
    """
    csv_files = sorted(
        os.path.join(csv_directory, file)
        for file in os.listdir(csv_directory)
        if file.endswith('.csv')
    )
    if not csv_files:
        raise ValueError(f"No CSV files to combine in {csv_directory!r}")

    def _report(header, frame):
        # Debug output, kept identical to the historical prints.
        if verbose:
            print(header)
            print(frame)

    frames = []
    for file in csv_files:
        file_name = os.path.basename(file)
        # Check the longer marker first: 'filtered_' alone would leave a
        # non-numeric 'percent_' prefix in front of the number.
        if "filtered_percent_" in file_name:
            value = float(file_name.split('filtered_percent_')[-1].replace('.csv', ''))
        elif "filtered_" in file_name:
            value = float(file_name.split('filtered_')[-1].replace('.csv', ''))
        else:
            value = 0.0

        df = pd.read_csv(file)

        # Drop empty columns
        df = df.dropna(axis=1, how='all')
        _report(f"DataFrame after dropping empty columns from {file_name}:", df.head())

        # Show the rows that are about to be discarded
        _report("Rows with NaN values:", df[df.isna().any(axis=1)])

        # Drop rows with any NaN values
        df = df.dropna(how='any')
        _report(f"DataFrame loaded and cleaned from {file_name}:", df.head())

        df = df.assign(value=value)
        _report(f"DataFrame after adding 'value' column from {file_name}:", df.head())

        frames.append(df)

    combined_df = pd.concat(frames, ignore_index=True)
    _report("Combined DataFrame before further processing:", combined_df.head())

    return combined_df

In [84]:
def get_lang_distances(lang2, mapping):
    """Return the six lang2vec distances between English and ``lang2``.

    Args:
        lang2: Language key. The special key ``'same'`` short-circuits to
            all-zero distances; any other key is translated through
            ``mapping`` (unmapped keys are used as-is).
        mapping: dict from notebook language keys to lang2vec codes.

    Returns:
        dict with the keys 'syntactic', 'geographic', 'phonological',
        'genetic', 'inventory' and 'featural'. If any lookup raises, the error
        is printed and every value is NaN.
    """
    distance_kinds = ('syntactic', 'geographic', 'phonological', 'genetic', 'inventory', 'featural')
    lang1 = 'eng'  # English is always the reference language

    if lang2 == 'same':
        return {kind: 0 for kind in distance_kinds}

    lang2 = mapping.get(lang2, lang2)
    try:
        distances = {}
        for kind in distance_kinds:
            distances[kind] = l2v.distance(kind, lang1, lang2)
    except Exception as e:
        # Best effort: one failing lookup invalidates the whole set.
        print(f"Error calculating distances between {lang1} and {lang2}: {e}")
        distances = dict.fromkeys(distance_kinds, np.nan)
    return distances


In [85]:
def transform_data_for_regression(df, id_vars, include_distance=False, mapping=None):
    """Reshape wide per-language accuracy columns into a long table.

    Every column named ``final_eval_<code>_accuracy`` becomes rows with an
    ``accuracy`` value and a categorical ``language_code`` (the third
    underscore-separated token of the column name). ``method``, ``value`` and
    ``model`` are cast to categoricals when they are listed in ``id_vars``.

    When ``include_distance`` is true, six distance columns are appended using
    get_lang_distances(code, mapping) for each distinct language code.

    NOTE(review): the codes extracted here are the short keys from the column
    names, so the comparison against 'eng' below likely never excludes
    anything - confirm the intent.

    Returns:
        The long-format DataFrame: ``id_vars``, ``accuracy``, ``language_code``
        and, optionally, the distance columns.
    """
    accuracy_columns = [
        name for name in df.columns
        if name.startswith('final_eval_') and name.endswith('_accuracy')
    ]
    long_df = df.melt(
        id_vars=id_vars,
        value_vars=accuracy_columns,
        var_name='language_accuracy',
        value_name='accuracy',
    )

    # Pull the code out of the melted column name, then discard the name.
    source_names = long_df.pop('language_accuracy')
    long_df['language_code'] = source_names.apply(lambda name: name.split('_')[2]).astype('category')

    for factor in ('method', 'value', 'model'):
        if factor in id_vars:
            long_df[factor] = long_df[factor].astype('category')

    if not include_distance:
        return long_df

    english_code = 'eng'  # Assuming 'eng' is the code for English in your dataset
    distance_kinds = ['syntactic', 'geographic', 'phonological', 'genetic', 'inventory', 'featural']

    # One lookup per distinct language, in order of first appearance.
    distance_lookup = {}
    for code in long_df['language_code'].unique():
        if code == english_code:
            continue
        distance_lookup[code] = get_lang_distances(code, mapping)

    for kind in distance_kinds:
        long_df[kind] = long_df['language_code'].map(
            lambda code: distance_lookup.get(code, {}).get(kind, np.nan)
        )

    return long_df


In [86]:
def standardize_xlmr_columns(df):
    """Rename bare language columns to ``final_eval_<code>_accuracy`` in place.

    Columns whose name appears in the module-level ``langs`` list are renamed;
    all other columns keep their name. The same DataFrame object is modified
    and returned.
    """
    renamed = []
    for column in df.columns:
        if column in langs:
            renamed.append(f'final_eval_{column}_accuracy')
        else:
            renamed.append(column)
    df.columns = renamed
    return df

In [87]:
def plot_decision_tree(regression_model, feature_names, save_path=None, tree_index=0, max_depth=3, dpi=100, font_size=10, figsize=(30, 10)):
    """Draw one estimator of a fitted forest with sklearn's ``plot_tree``.

    Args:
        regression_model: Fitted ensemble exposing ``estimators_``.
        feature_names: Names shown on the split nodes.
        save_path: Directory for ``decision_tree_<tree_index>.png``; when
            falsy the figure is shown instead of saved.
        tree_index: Which estimator to draw.
        max_depth: Depth limit passed to ``plot_tree``.
        dpi, font_size, figsize: Figure appearance settings.
    """
    plt.figure(figsize=figsize, dpi=dpi)

    chosen_tree = regression_model.estimators_[tree_index]
    drawing_options = dict(
        feature_names=feature_names,
        filled=True,
        rounded=True,
        max_depth=max_depth,
        fontsize=font_size,
        proportion=True,
    )
    plot_tree(chosen_tree, **drawing_options)
    plt.title(f'Decision Tree {tree_index} (max depth = {max_depth})')

    if not save_path:
        plt.show()
    else:
        target_file = os.path.join(save_path, f'decision_tree_{tree_index}.png')
        plt.savefig(target_file)
    # Always release the figure, whether it was shown or saved.
    plt.close()

In [88]:
def calculate_shap_values(explainer, X, approximate=False):
    """Return ``explainer.shap_values(X, approximate=approximate)``.

    Kept as a module-level function so it can be dispatched through joblib
    (see plot_shap_summary).
    """
    values = explainer.shap_values(X, approximate=approximate)
    return values

In [89]:
def plot_shap_summary(model, X, save_path=None, approximate=False):
    """Compute SHAP values for every row of ``X`` and draw the summary plot.

    A ``shap.TreeExplainer`` is built for ``model``; each row of ``X`` is sent
    as a one-row slice to calculate_shap_values through joblib (loky backend,
    all cores) and the per-row results are stacked with ``np.vstack``.

    Args:
        model: Fitted tree model accepted by ``shap.TreeExplainer``.
        X: DataFrame of features (sliced with ``.iloc``).
        save_path: Directory for ``shap_summary.png``; when falsy the figure is
            shown instead.
        approximate: Forwarded to the SHAP computation.
    """
    explainer = shap.TreeExplainer(model)

    # n_jobs=-1: use all available CPU cores
    worker_pool = Parallel(n_jobs=-1, backend='loky')
    row_tasks = (
        delayed(calculate_shap_values)(explainer, X.iloc[row:row + 1], approximate)
        for row in tqdm(range(X.shape[0]), desc="Calculating SHAP values")
    )
    per_row_values = worker_pool(row_tasks)

    stacked_values = np.vstack(per_row_values)
    plt.figure()
    shap.summary_plot(stacked_values, X, show=False)

    if not save_path:
        plt.show()
    else:
        plt.savefig(os.path.join(save_path, 'shap_summary.png'))
    plt.close()

In [90]:
def plot_average_tree_paths(model, X, sample_index):
    """Plot a histogram of decision-path lengths for one sample.

    ``model.decision_path(X)`` returns different things depending on the model:
    forest ensembles return a tuple ``(indicator, n_nodes_ptr)`` where
    ``n_nodes_ptr`` delimits each tree's block of node columns, while a single
    tree returns only the sparse indicator matrix. The previous code read
    ``.indptr`` directly on the return value, which fails on the tuple, and it
    only ever produced one number. For ensembles the path length is now
    computed per tree, so the histogram shows the distribution across trees.

    Args:
        model: Fitted estimator exposing ``decision_path``.
        X: Samples passed to ``decision_path``.
        sample_index: Row of ``X`` whose paths are analysed.
    """
    path_info = model.decision_path(X)

    if isinstance(path_info, tuple):
        indicator, n_nodes_ptr = path_info
        indicator = indicator.tocsr()
        row_start = indicator.indptr[sample_index]
        row_end = indicator.indptr[sample_index + 1]
        # Column ids of every node this sample visits, across all trees.
        visited_nodes = np.sort(indicator.indices[row_start:row_end])
        # Count the visited nodes that fall inside each tree's column block.
        path_lengths = np.diff(np.searchsorted(visited_nodes, n_nodes_ptr))
    else:
        # Single tree: the row's non-zero count is the path length.
        path_lengths = [path_info.indptr[sample_index + 1] - path_info.indptr[sample_index]]

    plt.figure(figsize=(10, 6))
    plt.hist(path_lengths, bins=20)
    plt.title(f'Average Path Lengths for Sample {sample_index}')
    plt.xlabel('Path Length')
    plt.ylabel('Frequency')
    plt.show()

In [91]:
def plot_partial_dependences(model, X, features):
    """Show partial-dependence curves of ``model`` for ``features``.

    Draws onto a single 12x8 axes via
    ``PartialDependenceDisplay.from_estimator`` with a 50-point grid.
    """
    _, axes = plt.subplots(figsize=(12, 8))
    PartialDependenceDisplay.from_estimator(
        model,
        X,
        features,
        ax=axes,
        grid_resolution=50,
    )
    plt.show()

In [92]:
def plot_loss_curve(errors, save_path=None, start=1, end=30):
    """Plot validation error against the number of trees.

    Args:
        errors: Sequence of error values; ``errors[i]`` belongs to ``i + 1`` trees.
        save_path: Directory for ``loss_curve.png``; when falsy the figure is
            shown instead.
        start: First tree count to plot (1-based).
        end: Last tree count to plot. If fewer than ``end`` errors are
            available, the curve stops at the last available one.
    """
    shown_errors = list(errors[start - 1:end])
    # Derive the x axis from the values actually available: building it from
    # `end` made x and y differ in length (and matplotlib raise) whenever
    # fewer than `end` errors were recorded.
    tree_counts = range(start, start + len(shown_errors))

    plt.figure(figsize=(10, 6))
    plt.plot(tree_counts, shown_errors, marker='o')
    plt.title('Model Loss Over Time')
    plt.xlabel('Number of Trees')
    plt.ylabel('Mean Squared Error')
    plt.grid(True)
    if save_path:
        plt.savefig(os.path.join(save_path, 'loss_curve.png'))
    else:
        plt.show()
    plt.close()

In [93]:
def split_train_val_test(grouped_data):
    """Split rows 60/20/20 into train/validation/test within each group.

    Groups are formed over every column except ``delta_accuracy``. Each group
    is shuffled with a fixed seed (42); the first ``int(0.6 * n)`` rows go to
    training, the next ``int(0.2 * n)`` to validation and the remainder to
    test.

    Returns:
        ``(train_data, val_data, test_data)`` DataFrames, each re-indexed from 0.
    """
    # Dynamically determine grouping columns
    key_columns = [name for name in grouped_data.columns if name != 'delta_accuracy']

    train_parts, val_parts, test_parts = [], [], []
    for _, members in grouped_data.groupby(key_columns):
        shuffled = members.sample(frac=1, random_state=42).reset_index(drop=True)
        n_rows = len(shuffled)
        train_end = int(n_rows * 0.6)            # 60% for training
        val_end = train_end + int(n_rows * 0.2)  # next 20% for validation

        train_parts.append(shuffled.iloc[:train_end])
        val_parts.append(shuffled.iloc[train_end:val_end])
        test_parts.append(shuffled.iloc[val_end:])

    train_data = pd.concat(train_parts).reset_index(drop=True)
    val_data = pd.concat(val_parts).reset_index(drop=True)
    test_data = pd.concat(test_parts).reset_index(drop=True)
    return train_data, val_data, test_data

In [94]:
def perform_random_forest_regression(grouped_data, id_vars, save_path=None, n_estimators=30):
    """Fit a random forest on ``delta_accuracy`` and produce diagnostic plots.

    Rows without ``delta_accuracy`` are dropped, the data is split with
    split_train_val_test, and ``id_vars`` plus ``language_code`` are one-hot
    encoded. The forest is grown one tree at a time (``warm_start``) so the
    validation MSE can be recorded after each added tree. The first tree, the
    loss curve and a SHAP summary of the training set are plotted, and the
    test MSE is printed.

    Args:
        grouped_data: Long-format DataFrame containing ``delta_accuracy``,
            ``language_code`` and the ``id_vars`` columns.
        id_vars: Categorical columns to one-hot encode besides ``language_code``.
        save_path: Directory for the plots; when falsy the plots are shown.
        n_estimators: Number of trees to grow.

    Returns:
        ``(coefficients, columns)``: a Series of feature importances indexed by
        the training feature names, and the column index of the encoded
        training frame (which still includes ``delta_accuracy``).
    """
    grouped_data = grouped_data.dropna(subset=['delta_accuracy'])

    train_data, val_data, test_data = split_train_val_test(grouped_data)

    categorical_columns = list(id_vars) + ['language_code']
    encoded_train_df = pd.get_dummies(train_data, columns=categorical_columns, drop_first=False)
    encoded_val_df = pd.get_dummies(val_data, columns=categorical_columns, drop_first=False)
    encoded_test_df = pd.get_dummies(test_data, columns=categorical_columns, drop_first=False)

    # Ensure the encoded validation and test sets have the same columns as the training set
    encoded_val_df = encoded_val_df.reindex(columns=encoded_train_df.columns, fill_value=0)
    encoded_test_df = encoded_test_df.reindex(columns=encoded_train_df.columns, fill_value=0)

    X_train = encoded_train_df.drop('delta_accuracy', axis=1)
    y_train = encoded_train_df['delta_accuracy']
    X_val = encoded_val_df.drop('delta_accuracy', axis=1)
    y_val = encoded_val_df['delta_accuracy']
    X_test = encoded_test_df.drop('delta_accuracy', axis=1)
    y_test = encoded_test_df['delta_accuracy']

    # Initialize the model; warm_start lets each fit() add one tree to the existing forest
    regression_model = RandomForestRegressor(n_estimators=1, warm_start=True, random_state=42)

    # Track the validation mean squared error after each added tree
    errors = []

    for i in tqdm(range(1, n_estimators + 1)):
        regression_model.n_estimators = i
        regression_model.fit(X_train, y_train)
        y_pred = regression_model.predict(X_val)
        mse = mean_squared_error(y_val, y_pred)
        errors.append(mse)

    importances = regression_model.feature_importances_
    coefficients = pd.Series(importances, index=X_train.columns)

    # Plot the first decision tree
    plot_decision_tree(regression_model, X_train.columns, tree_index=0, save_path=save_path)

    # Plot the loss curve over every tree that was grown. Relying on the
    # plotting default (30) broke for n_estimators < 30 and hid the tail for
    # n_estimators > 30.
    plot_loss_curve(errors, save_path=save_path, end=len(errors))

    # Perform final evaluation on the test set
    y_test_pred = regression_model.predict(X_test)
    test_mse = mean_squared_error(y_test, y_test_pred)
    print("\nFinal Test MSE:", test_mse)

    # Plot SHAP summary
    plot_shap_summary(regression_model, X_train, save_path=save_path)

    return coefficients, encoded_train_df.columns

In [95]:
# def group_by_subset(df, subset):
#     # Drop rows with NaN values in the specified subset columns and 'accuracy' column
#     df = df.dropna(subset=subset + ['accuracy'])

#     # Remove duplicate rows
#     df = df.drop_duplicates()

#     # Ensure columns are of correct data types
#     for col in subset:
#         df[col] = df[col].astype('category')
#     df['accuracy'] = df['accuracy'].astype(float)

#     # Check for duplicates again
#     duplicates = df.duplicated(subset=subset, keep=False)
#     print(f"Number of duplicate rows: {duplicates.sum()}")

#     # Check for unique combinations
#     unique_combinations = df.drop_duplicates(subset=subset)
#     print(f"Unique combinations shape: {unique_combinations.shape}")

#     # Display unique combinations to ensure correctness
#     print(unique_combinations.head())

#     # Pivot the DataFrame to compute the mean accuracy for each combination of the subset columns
#     grouped_df = df.pivot_table(index=subset, values='accuracy', aggfunc='mean').reset_index()

#     print(grouped_df.head())
#     print(f"Grouped DF Shape: {grouped_df.shape}")

#     return grouped_df

In [96]:
def plot_aggregated_coefficients(coefficients, id_vars, feature_names, title, y_labels, include_distance=False, save_path=None):
    """Plot feature importances summed per factor as a horizontal bar chart.

    Features are grouped by name prefix: ``method``, ``language_code`` and
    each entry of ``id_vars``. Group names are title-cased for display. With
    ``include_distance`` the six distance features are summed into an extra
    'Distance' bar.

    Args:
        coefficients: Series of importances indexed by feature name.
        id_vars: Additional factor prefixes to aggregate.
        feature_names: Iterable of feature names to search for prefixes.
        title: Plot title.
        y_labels: dict translating aggregated group names into tick labels.
        include_distance: Whether to add the 'Distance' bar.
        save_path: Directory for ``aggregated_feature_importance.png``; when
            falsy the figure is shown instead.
    """
    def _names_with_prefix(prefix):
        return [name for name in feature_names if name.startswith(prefix)]

    factor_names = {
        'Method': _names_with_prefix('method'),
        'Language Code': _names_with_prefix('language_code'),
    }
    for var in id_vars:
        if var not in factor_names:
            factor_names[var] = _names_with_prefix(var)

    # Keys are title-cased, so e.g. 'method' and 'Method' collapse into one entry.
    aggregated_coefficients = {}
    for factor, members in factor_names.items():
        aggregated_coefficients[factor.title()] = coefficients[members].sum()

    if include_distance:
        distance_kinds = ['syntactic', 'geographic', 'phonological', 'genetic', 'inventory', 'featural']
        distance_cols = [col for col in feature_names if col in distance_kinds]
        aggregated_coefficients['Distance'] = coefficients[distance_cols].sum().sum()

    ranked = pd.Series(aggregated_coefficients).sort_values(ascending=False)

    plt.figure(figsize=(12, 8))
    sns.barplot(x=ranked.values, y=ranked.index, palette='viridis', edgecolor='black', hue=ranked.index)
    plt.title(title, fontsize=16)
    plt.xlabel('Importance', fontsize=14)
    plt.ylabel('Feature', fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

    # Set custom y-tick labels, falling back to the group name itself
    tick_text = [y_labels.get(group, group) for group in ranked.index]
    plt.gca().set_yticklabels(tick_text, fontsize=12)

    plt.grid(axis='x', linestyle='--', alpha=0.7)
    plt.tight_layout()
    if not save_path:
        plt.show()
    else:
        plt.savefig(os.path.join(save_path, 'aggregated_feature_importance.png'))
    plt.close()

In [97]:
def plot_detailed_coefficients(coefficients, id_vars, title, custom_y_label=None, include_distance=False, save_path=None):
    """Draw one importance bar chart per factor.

    Charts, in order: languages (``language_code_*`` features, relabelled via
    ``language_map``), methods (``method_*`` features, relabelled via
    ``method_names``, without 'Fine-tuning Only'), one chart for each other
    entry of ``id_vars``, and - with ``include_distance`` - the six distance
    features.

    Args:
        coefficients: Series of importances indexed by feature name.
        id_vars: Factor names; entries other than 'language_code' and 'method'
            get their own chart.
        title: Prefix for every chart title.
        custom_y_label: Label used for the extra ``id_vars`` charts instead of
            the capitalised variable name.
        include_distance: Whether to draw the distance chart.
        save_path: Directory for the ``detailed_*.png`` files; when falsy the
            figures are shown instead.
    """
    def _bar_chart(series, bar_labels, figsize, heading, axis_label, file_name):
        # Shared layout for every chart produced by this function.
        plt.figure(figsize=figsize)
        sns.barplot(x=series.values, y=bar_labels, palette='coolwarm', edgecolor='black', hue=series.index, legend=False)
        plt.title(heading)
        plt.xlabel('Importance')
        plt.ylabel(axis_label)
        plt.grid(axis='x', linestyle='--', alpha=0.7)
        plt.tight_layout()
        if not save_path:
            plt.show()
        else:
            plt.savefig(os.path.join(save_path, file_name))
        plt.close()

    # --- Languages ---
    language_coefficients = (
        coefficients.filter(like='language_code_')
        .rename(index=lambda x: language_map.get(x.replace('language_code_', ''), x))
        .sort_values(ascending=False)
    )
    _bar_chart(language_coefficients, language_coefficients.index, (14, 14),
               f'{title} - Languages', 'Languages', 'detailed_languages.png')

    # --- Methods ---
    # Unknown method ids keep their raw name, minus the 'method_' prefix.
    method_coefficients = coefficients.filter(like='method_').rename(
        index=lambda x: method_names.get(x.replace('method_', ''), x).replace('method_', '')
    )
    # Filter out the 'fine-tuning only' method
    keep_rows = method_coefficients.index != 'Fine-tuning Only'
    method_coefficients = method_coefficients[keep_rows].sort_values(ascending=False)
    _bar_chart(method_coefficients, method_coefficients.index, (14, 10),
               f'{title} - Methods', 'Method', 'detailed_methods.png')

    # --- Remaining id_vars ---
    for var in id_vars:
        if var in ('language_code', 'method'):
            continue
        prefix = f'{var}_'
        other_coefficients = (
            coefficients.filter(like=prefix)
            .rename(index=lambda x: x.replace(prefix, ''))
            .sort_values(ascending=False)
        )
        label = custom_y_label if custom_y_label else var.capitalize()
        _bar_chart(other_coefficients, other_coefficients.index, (8, 6),
                   f'{title} - {label}', label, f'detailed_{var}.png')

    # --- Distances: all on a single plot ---
    if include_distance:
        distance_kinds = ['syntactic', 'geographic', 'phonological', 'genetic', 'inventory', 'featural']
        distance_cols = [col for col in coefficients.index if col in distance_kinds]
        distance_coefficients = coefficients.filter(items=distance_cols).sort_values(ascending=False)

        # Titleize the y-axis labels
        titleized_distance_labels = [label.title() for label in distance_coefficients.index]
        _bar_chart(distance_coefficients, titleized_distance_labels, (14, 10),
                   f'{title} - Distances', 'Distance', 'detailed_distances.png')

In [98]:
def calculate_delta_accuracy(group):
    """Add ``delta_accuracy`` (method accuracy minus baseline accuracy) to a group.

    Rows whose ``method`` is 'baseline' supply the reference accuracies. For
    every other method the i-th row is paired with the i-th baseline row (a
    single baseline row is broadcast to all rows). Baseline rows themselves
    receive no delta, so the column is NaN for them after concatenation.

    If a method has more rows than a multi-row baseline, the surplus rows
    cannot be paired; they get NaN and a warning is issued, where previously
    the subtraction raised a shape-mismatch ValueError.

    Args:
        group: DataFrame with at least ``method`` and ``accuracy`` columns.
            It is not modified.

    Returns:
        DataFrame with the baseline rows first, followed by the method rows.
        ``delta_accuracy`` is only present when the group has baseline rows
        and at least one other method.
    """
    import warnings  # local import: not provided by the notebook's import cell

    # Work on copies: assigning into a filtered slice of `group` triggers
    # SettingWithCopyWarning and is not guaranteed to write where intended.
    baseline = group[group['method'] == 'baseline'].copy()
    methods = group[group['method'] != 'baseline'].copy()

    if len(baseline) > 0:
        baseline_accuracy = baseline['accuracy'].values

        # Iterate through each method and perform element-wise subtraction
        for method in methods['method'].unique():
            method_rows = methods['method'] == method
            n_rows = int(method_rows.sum())
            method_accuracy = methods.loc[method_rows, 'accuracy'].values

            if 1 < len(baseline_accuracy) < n_rows:
                warnings.warn(
                    f"method {method!r} has {n_rows} rows but only "
                    f"{len(baseline_accuracy)} baseline rows; unpaired rows get NaN"
                )
                reference = np.full(n_rows, np.nan)
                reference[:len(baseline_accuracy)] = baseline_accuracy
            else:
                reference = baseline_accuracy[:n_rows]

            methods.loc[method_rows, 'delta_accuracy'] = method_accuracy - reference

    return pd.concat([baseline, methods])

def process_dataframe(df, group_by_columns):
    """Apply calculate_delta_accuracy to each group of ``df``.

    Groups are formed over ``group_by_columns`` (without adding group keys to
    the result) and the combined output is re-indexed from 0.
    """
    per_group = df.groupby(group_by_columns, group_keys=False)
    with_deltas = per_group.apply(calculate_delta_accuracy)
    return with_deltas.reset_index(drop=True)

In [99]:
def process_and_plot(
    data_loading_func,
    data_loading_args,
    id_vars,
    y_labels,
    group_by_columns,
    agg_title,
    detail_title,
    group_name,
    version_name,
    method_filter=None,
    include_distance=True
):
    """Run the full pipeline: load, reshape, fit the forest and plot importances.

    Args:
        data_loading_func: Callable returning the wide results DataFrame.
        data_loading_args: Positional arguments unpacked into ``data_loading_func``.
        id_vars: Factor columns kept while melting and later one-hot encoded.
        y_labels: Tick-label translations for the aggregated plot.
        group_by_columns: Columns defining the groups within which deltas to
            the baseline are computed.
        agg_title: Title of the aggregated importance plot.
        detail_title: Title prefix of the detailed plots.
        group_name, version_name: Sub-directories of ``regression_plots`` where
            all figures are saved (created if missing).
        method_filter: Optional pattern; only rows whose ``method`` contains it
            are kept.
        include_distance: Whether lang2vec distance features are added.
    """
    # Create the save directory structure
    output_dir = os.path.join('regression_plots', group_name, version_name)
    os.makedirs(output_dir, exist_ok=True)

    # Load the data using the provided data loading function and arguments
    raw_df = data_loading_func(*data_loading_args)

    # Apply method filter if provided
    if method_filter:
        matches_filter = raw_df['method'].str.contains(method_filter)
        raw_df = raw_df[matches_filter]

    # Transform data for regression
    long_df = transform_data_for_regression(
        raw_df, id_vars, include_distance=include_distance, mapping=lang2vec_mapping
    )

    # Group by specified columns and compute the delta to the baseline
    with_deltas = process_dataframe(long_df, group_by_columns)

    # The raw accuracy is no longer needed; keep only rows without any NA value
    features = with_deltas.drop(columns='accuracy')
    features = features[features.notna().all(axis=1)]

    # Perform random forest regression
    rf_coefficients, feature_names = perform_random_forest_regression(features, id_vars, save_path=output_dir)

    # Plot aggregated coefficients
    plot_aggregated_coefficients(
        rf_coefficients, id_vars, feature_names, agg_title, y_labels,
        include_distance=include_distance, save_path=output_dir,
    )

    # Plot detailed coefficients
    plot_detailed_coefficients(
        rf_coefficients, id_vars, detail_title, 'QE Threshold',
        include_distance=include_distance, save_path=output_dir,
    )

# Example data loading functions
def load_csvs(csv_directory):
    """Data-loading adapter for process_and_plot: combine the CSVs of a directory."""
    combined = load_and_combine_csvs(csv_directory)
    return combined

def fetch_and_process_runs_func():
    """Data-loading adapter: baseline W&B accuracies as a DataFrame.

    Uses the module-level ``finetuning_project_name`` and ``langs``; the result
    has one column per language key.
    """
    accuracies_by_lang = fetch_and_process_runs(finetuning_project_name, langs)
    return pd.DataFrame(accuracies_by_lang)

In [100]:
def load_combined_data(finetuning_project_name, langs, csv_directory_distilMBERT):
    """Build one frame holding the XLM-R baseline runs and the distilMBERT baseline CSV.

    Args:
        finetuning_project_name: Forwarded to ``fetch_and_process_runs``.
        langs: Forwarded to ``fetch_and_process_runs``.
        csv_directory_distilMBERT: Directory handed to ``load_baseline_csv``.

    Returns:
        The row-wise concatenation of both frames, with every column that
        contains at least one missing value removed.

    Raises:
        ValueError: If ``fetch_and_process_runs`` returns ``None``.
    """
    raw_runs = fetch_and_process_runs(finetuning_project_name, langs)
    if raw_runs is None:
        raise ValueError("Failed to fetch and process runs data.")

    # Tag the fetched runs with constant columns before standardizing them.
    xlmr_frame = pd.DataFrame(raw_runs)
    for column, constant in (('method', 'baseline'), ('value', 0.0), ('model', 'XLM-R')):
        xlmr_frame[column] = constant
    xlmr_frame = standardize_xlmr_columns(xlmr_frame)

    distil_frame = load_baseline_csv(csv_directory_distilMBERT, 'distilMBERT')
    stacked = pd.concat([distil_frame, xlmr_frame], ignore_index=True)

    # Only fully populated columns survive the concatenation.
    return stacked.dropna(axis=1, how='any')


In [53]:
# XLM-R / PoS: regression over every method found in the threshold CSVs.
# `group_name` stays a notebook-level variable because the following XLM-R
# cells pass it on unchanged.
group_name = 'xlmr_pos'

process_and_plot(
    group_name=group_name,
    version_name='aggregate',
    data_loading_func=load_csvs,
    data_loading_args=['./xlmr_threshold_data'],
    id_vars=['method', 'value'],
    group_by_columns=['language_code', 'value'],
    y_labels={'Method': 'Method', 'Language Code': 'Languages', 'Value': 'QE Threshold'},
    agg_title='Aggregated Feature Importance for PoS using XLM-R',
    detail_title='Feature Importance for PoS Using XLM-R',
)

DataFrame after dropping empty columns from opus100_filtered_0.4.csv:
   seed             model   task    method  finetuning_steps  \
0    31  xlm-roberta-base  udpos  baseline              1970   
1    42  xlm-roberta-base  udpos  baseline              1970   
2    66  xlm-roberta-base  udpos  baseline              1970   
3    23  xlm-roberta-base  udpos  baseline              1970   
4    17  xlm-roberta-base  udpos  baseline              1970   

   realignment_steps  distinct_realignment_samples  \
0                  0                             0   
1                  0                             0   
2                  0                             0   
3                  0                             0   
4                  0                             0   

   repeated_realignment_samples  train_loss  realignment_loss  ...  \
0                             0    0.105559               0.0  ...   
1                             0    0.097366               0.0  ...   
2         

KeyboardInterrupt: 

In [None]:
# XLM-R / PoS, vanilla realignment only: keep the methods whose name does not
# contain 'freeze'. `group_name` is the notebook-level variable assigned in an
# earlier cell.
process_and_plot(
    data_loading_func=load_csvs,
    data_loading_args=['./xlmr_threshold_data'],
    id_vars=['method', 'value'],
    y_labels={
        'Method': 'Method',
        'Language Code': 'Languages',
        'Value': 'QE Threshold'
    },
    group_by_columns=['language_code', 'value'],
    agg_title='Aggregated Feature Importance for PoS using XLM-R - Vanilla Realignment',
    detail_title='Feature Importance for PoS Using XLM-R - Vanilla Realignment',
    group_name=group_name,
    version_name='vanilla',
    # Non-capturing group: matches the same strings as '^((?!freeze).)*$' but
    # does not trigger the "has match groups" UserWarning that pandas raises
    # from Series.str.contains for patterns with capturing groups.
    method_filter='^(?:(?!freeze).)*$'
)

DataFrame after dropping empty columns from opus100_filtered_0.4.csv:
   seed             model   task    method  finetuning_steps  \
0    31  xlm-roberta-base  udpos  baseline              1970   
1    42  xlm-roberta-base  udpos  baseline              1970   
2    66  xlm-roberta-base  udpos  baseline              1970   
3    23  xlm-roberta-base  udpos  baseline              1970   
4    17  xlm-roberta-base  udpos  baseline              1970   

   realignment_steps  distinct_realignment_samples  \
0                  0                             0   
1                  0                             0   
2                  0                             0   
3                  0                             0   
4                  0                             0   

   repeated_realignment_samples  train_loss  realignment_loss  ...  \
0                             0    0.105559               0.0  ...   
1                             0    0.097366               0.0  ...   
2         

  df = df[df['method'].str.contains(method_filter)]


Error calculating distances between eng and avg: Unknown language avg (or maybe we don't have precomputed distances for this one).


  result = df.groupby(group_by_columns, group_keys=False).apply(calculate_delta_accuracy)
  result = df.groupby(group_by_columns, group_keys=False).apply(calculate_delta_accuracy)
  for _, group in grouped_data.groupby(group_cols):
100%|██████████| 30/30 [00:00<00:00, 118.52it/s]



Final Test MSE: 0.0001644951234523304


Calculating SHAP values: 100%|██████████| 3150/3150 [00:12<00:00, 251.71it/s]
  plt.gca().set_yticklabels(y_tick_labels, fontsize=12)


In [45]:
# XLM-R / PoS, partially frozen realignment: keep methods whose name contains
# 'freeze' or 'baseline'. `group_name` comes from an earlier cell.
process_and_plot(
    group_name=group_name,
    version_name='frozen',
    method_filter='freeze|baseline',
    data_loading_func=load_csvs,
    data_loading_args=['./xlmr_threshold_data'],
    id_vars=['method', 'value'],
    group_by_columns=['language_code', 'value'],
    y_labels={'Method': 'Method', 'Language Code': 'Languages', 'Value': 'QE Threshold'},
    agg_title='Aggregated Feature Importance for PoS using XLM-R - Partially Frozen Realignment',
    detail_title='Feature Importance for PoS Using XLM-R - Partially Frozen Realignment',
)

DataFrame after dropping empty columns from opus100_filtered_0.4.csv:
   seed             model   task    method  finetuning_steps  \
0    31  xlm-roberta-base  udpos  baseline              1970   
1    42  xlm-roberta-base  udpos  baseline              1970   
2    66  xlm-roberta-base  udpos  baseline              1970   
3    23  xlm-roberta-base  udpos  baseline              1970   
4    17  xlm-roberta-base  udpos  baseline              1970   

   realignment_steps  distinct_realignment_samples  \
0                  0                             0   
1                  0                             0   
2                  0                             0   
3                  0                             0   
4                  0                             0   

   repeated_realignment_samples  train_loss  realignment_loss  ...  \
0                             0    0.105559               0.0  ...   
1                             0    0.097366               0.0  ...   
2         

  result = df.groupby(group_by_columns, group_keys=False).apply(calculate_delta_accuracy)
  result = df.groupby(group_by_columns, group_keys=False).apply(calculate_delta_accuracy)
  for _, group in grouped_data.groupby(group_cols):
100%|██████████| 30/30 [00:00<00:00, 57.03it/s]



Final Test MSE: 0.00012023341487611355


Calculating SHAP values: 100%|██████████| 6300/6300 [00:51<00:00, 123.11it/s]
  plt.gca().set_yticklabels(y_tick_labels, fontsize=12)


In [67]:
# XLM-R vs distilMBERT / PoS: regression over the combined baseline data,
# with 'model' as an identifier column instead of 'value'.
# `group_name` stays a notebook-level variable; the next cells pass it on.
group_name = 'xlmr_vs_distilmbert_pos'

process_and_plot(
    group_name=group_name,
    version_name='aggregate',
    data_loading_func=load_combined_data,
    data_loading_args=[finetuning_project_name, langs, './distilMBERT_percentile_data'],
    id_vars=['method', 'model'],
    group_by_columns=['language_code', 'model'],
    y_labels={'Method': 'Method', 'Language Code': 'Languages'},
    agg_title='Aggregated Feature Importance for PoS',
    detail_title='Feature Importance for PoS',
    include_distance=True,
)

Error calculating distances between eng and avg: Unknown language avg (or maybe we don't have precomputed distances for this one).


  result = df.groupby(group_by_columns, group_keys=False).apply(calculate_delta_accuracy)
  result = df.groupby(group_by_columns, group_keys=False).apply(calculate_delta_accuracy)
  for _, group in grouped_data.groupby(group_cols):
100%|██████████| 30/30 [00:00<00:00, 158.51it/s]



Final Test MSE: 0.0004374483593333071


Calculating SHAP values: 100%|██████████| 1890/1890 [00:13<00:00, 142.80it/s]
  plt.gca().set_yticklabels(y_tick_labels, fontsize=12)


In [69]:
# XLM-R vs distilMBERT / PoS, vanilla realignment only: keep the methods whose
# name does not contain 'freeze'. `group_name` comes from an earlier cell.
process_and_plot(
    data_loading_func=load_combined_data,
    # Literal path (the same one the aggregate cell of this group uses) so the
    # cell does not depend on a `csv_directory_distilMBERT` variable.
    # NOTE(review): the only assignments of that variable visible in this part
    # of the notebook are commented out - confirm none exists further up.
    data_loading_args=[finetuning_project_name, langs, './distilMBERT_percentile_data'],
    id_vars=['method', 'model'],
    y_labels={
        'Method': 'Method',
        'Language Code': 'Languages',
    },
    group_by_columns=['language_code', 'model'],
    agg_title='Aggregated Feature Importance for PoS - Vanilla Realignment',
    detail_title='Feature Importance for PoS - Vanilla Realignment',
    group_name=group_name,
    version_name='vanilla',
    # Non-capturing group: matches the same strings as '^((?!freeze).)*$' but
    # does not trigger the "has match groups" UserWarning that pandas raises
    # from Series.str.contains for patterns with capturing groups.
    method_filter='^(?:(?!freeze).)*$',
    include_distance=True
)

  df = df[df['method'].str.contains(method_filter)]
  result = df.groupby(group_by_columns, group_keys=False).apply(calculate_delta_accuracy)
  result = df.groupby(group_by_columns, group_keys=False).apply(calculate_delta_accuracy)
  for _, group in grouped_data.groupby(group_cols):


Error calculating distances between eng and avg: Unknown language avg (or maybe we don't have precomputed distances for this one).


100%|██████████| 30/30 [00:00<00:00, 303.63it/s]



Final Test MSE: 0.00040011704973697904


Calculating SHAP values: 100%|██████████| 630/630 [00:00<00:00, 1193.39it/s]
  plt.gca().set_yticklabels(y_tick_labels, fontsize=12)


In [70]:
# XLM-R vs distilMBERT / PoS, partially frozen realignment: keep methods whose
# name contains 'freeze' or 'baseline'. `group_name` comes from an earlier cell.
process_and_plot(
    data_loading_func=load_combined_data,
    # Literal path (the same one the aggregate cell of this group uses) so the
    # cell does not depend on a `csv_directory_distilMBERT` variable.
    # NOTE(review): the only assignments of that variable visible in this part
    # of the notebook are commented out - confirm none exists further up.
    data_loading_args=[finetuning_project_name, langs, './distilMBERT_percentile_data'],
    id_vars=['method', 'model'],
    y_labels={
        'Method': 'Method',
        'Language Code': 'Languages',
    },
    group_by_columns=['language_code', 'model'],
    # These titles previously said "using XLM-R", although this cell plots the
    # combined XLM-R + distilMBERT data; they now follow the naming of the
    # other two cells in this group.
    agg_title='Aggregated Feature Importance for PoS - Partially Frozen Realignment',
    detail_title='Feature Importance for PoS - Partially Frozen Realignment',
    group_name=group_name,
    version_name='frozen',
    method_filter='freeze|baseline',  # Regex to include 'freeze' or 'baseline'
    include_distance=True
)

Error calculating distances between eng and avg: Unknown language avg (or maybe we don't have precomputed distances for this one).


  result = df.groupby(group_by_columns, group_keys=False).apply(calculate_delta_accuracy)
  result = df.groupby(group_by_columns, group_keys=False).apply(calculate_delta_accuracy)
  for _, group in grouped_data.groupby(group_cols):
100%|██████████| 30/30 [00:00<00:00, 210.38it/s]



Final Test MSE: 0.0004505448031792457


Calculating SHAP values: 100%|██████████| 1260/1260 [00:02<00:00, 503.26it/s]
  plt.gca().set_yticklabels(y_tick_labels, fontsize=12)


In [103]:
# distilMBERT / PoS: regression over every method found in the percentile CSVs.
# `group_name` stays a notebook-level variable; the next cells pass it on.
group_name = 'distilmbert_pos'

process_and_plot(
    group_name=group_name,
    version_name='aggregate',
    data_loading_func=load_and_combine_csvs,
    data_loading_args=['./distilMBERT_percentile_data'],
    id_vars=['method', 'value'],
    group_by_columns=['language_code', 'value'],
    y_labels={'Method': 'Method', 'Language Code': 'Languages', 'Value': 'QE Percentile'},
    agg_title='Aggregated Feature Importance for PoS using distilMBERT',
    detail_title='Feature Importance for PoS Using distilMBERT',
    include_distance=True,
)

DataFrame after dropping empty columns from distilbert-base-multilingual-cased__opus100.csv:
   seed                               model   task            method  \
0    31  distilbert-base-multilingual-cased  udpos  before_fastalign   
1    42  distilbert-base-multilingual-cased  udpos  before_fastalign   
2    66  distilbert-base-multilingual-cased  udpos  before_fastalign   
3    23  distilbert-base-multilingual-cased  udpos  before_fastalign   
4    17  distilbert-base-multilingual-cased  udpos  before_fastalign   

   finetuning_steps  realignment_steps  distinct_realignment_samples  \
0              1965               1965                         31440   
1              1965               1965                         31440   
2              1965               1965                         31440   
3              1965               1965                         31440   
4              1965               1965                         31440   

   repeated_realignment_samples  train_lo

  result = df.groupby(group_by_columns, group_keys=False).apply(calculate_delta_accuracy)
  result = df.groupby(group_by_columns, group_keys=False).apply(calculate_delta_accuracy)
  for _, group in grouped_data.groupby(group_cols):
100%|██████████| 30/30 [00:00<00:00, 31.04it/s]



Final Test MSE: 0.000421270823263575


Calculating SHAP values: 100%|██████████| 11340/11340 [04:02<00:00, 46.75it/s]
  plt.gca().set_yticklabels(y_tick_labels, fontsize=12)


In [104]:
# distilMBERT / PoS, vanilla realignment only: keep the methods whose name does
# not contain 'freeze'. `group_name` comes from an earlier cell.
process_and_plot(
    data_loading_func=load_and_combine_csvs,
    data_loading_args=['./distilMBERT_percentile_data'],
    id_vars=['method', 'value'],
    y_labels={
        'Method': 'Method',
        'Language Code': 'Languages',
        'Value': 'QE Percentile'
    },
    group_by_columns=['language_code', 'value'],
    agg_title='Aggregated Feature Importance for PoS using distilMBERT - Vanilla Realignment',
    detail_title='Feature Importance for PoS Using distilMBERT - Vanilla Realignment',
    group_name=group_name,
    version_name='vanilla',
    # Non-capturing group: matches the same strings as '^((?!freeze).)*$' but
    # does not trigger the "has match groups" UserWarning that pandas raises
    # from Series.str.contains for patterns with capturing groups.
    method_filter='^(?:(?!freeze).)*$',
    include_distance=True
)

DataFrame after dropping empty columns from distilbert-base-multilingual-cased__opus100.csv:
   seed                               model   task            method  \
0    31  distilbert-base-multilingual-cased  udpos  before_fastalign   
1    42  distilbert-base-multilingual-cased  udpos  before_fastalign   
2    66  distilbert-base-multilingual-cased  udpos  before_fastalign   
3    23  distilbert-base-multilingual-cased  udpos  before_fastalign   
4    17  distilbert-base-multilingual-cased  udpos  before_fastalign   

   finetuning_steps  realignment_steps  distinct_realignment_samples  \
0              1965               1965                         31440   
1              1965               1965                         31440   
2              1965               1965                         31440   
3              1965               1965                         31440   
4              1965               1965                         31440   

   repeated_realignment_samples  train_lo

  df = df[df['method'].str.contains(method_filter)]


Error calculating distances between eng and avg: Unknown language avg (or maybe we don't have precomputed distances for this one).


  result = df.groupby(group_by_columns, group_keys=False).apply(calculate_delta_accuracy)
  result = df.groupby(group_by_columns, group_keys=False).apply(calculate_delta_accuracy)
  for _, group in grouped_data.groupby(group_cols):
100%|██████████| 30/30 [00:00<00:00, 104.00it/s]



Final Test MSE: 0.0004159896044259229


Calculating SHAP values: 100%|██████████| 3780/3780 [00:18<00:00, 208.17it/s]
  plt.gca().set_yticklabels(y_tick_labels, fontsize=12)


In [105]:
# distilMBERT / PoS, partially frozen realignment: keep methods whose name
# contains 'freeze' or 'baseline'. `group_name` comes from an earlier cell.
process_and_plot(
    group_name=group_name,
    version_name='frozen',
    method_filter='freeze|baseline',
    data_loading_func=load_and_combine_csvs,
    data_loading_args=['./distilMBERT_percentile_data'],
    id_vars=['method', 'value'],
    group_by_columns=['language_code', 'value'],
    y_labels={'Method': 'Method', 'Language Code': 'Languages', 'Value': 'QE Percentile'},
    agg_title='Aggregated Feature Importance for PoS using distilMBERT - Partially Frozen Realignment',
    detail_title='Feature Importance for PoS Using distilMBERT - Partially Frozen Realignment',
    include_distance=True,
)

DataFrame after dropping empty columns from distilbert-base-multilingual-cased__opus100.csv:
   seed                               model   task            method  \
0    31  distilbert-base-multilingual-cased  udpos  before_fastalign   
1    42  distilbert-base-multilingual-cased  udpos  before_fastalign   
2    66  distilbert-base-multilingual-cased  udpos  before_fastalign   
3    23  distilbert-base-multilingual-cased  udpos  before_fastalign   
4    17  distilbert-base-multilingual-cased  udpos  before_fastalign   

   finetuning_steps  realignment_steps  distinct_realignment_samples  \
0              1965               1965                         31440   
1              1965               1965                         31440   
2              1965               1965                         31440   
3              1965               1965                         31440   
4              1965               1965                         31440   

   repeated_realignment_samples  train_lo

  result = df.groupby(group_by_columns, group_keys=False).apply(calculate_delta_accuracy)
  result = df.groupby(group_by_columns, group_keys=False).apply(calculate_delta_accuracy)
  for _, group in grouped_data.groupby(group_cols):
100%|██████████| 30/30 [00:00<00:00, 52.94it/s]



Final Test MSE: 0.0004168005355375482


Calculating SHAP values: 100%|██████████| 7560/7560 [00:57<00:00, 132.52it/s]
  plt.gca().set_yticklabels(y_tick_labels, fontsize=12)


In [106]:
# distilMBERT / NLI: regression over every method found in the NLI CSVs.
# `group_name` stays a notebook-level variable; the next cells pass it on.
group_name = 'distilmbert_nli'

process_and_plot(
    group_name=group_name,
    version_name='aggregate',
    data_loading_func=load_and_combine_csvs,
    data_loading_args=['./distilMBERT_nli_data'],
    id_vars=['method', 'value'],
    group_by_columns=['language_code', 'value'],
    y_labels={'Method': 'Method', 'Language Code': 'Languages', 'Value': 'QE Percentile'},
    agg_title='Aggregated Feature Importance for NLI using distilMBERT',
    detail_title='Feature Importance for NLI Using distilMBERT',
    include_distance=True,
)

DataFrame after dropping empty columns from distilbert-base-multilingual-cased__xnli__opus100.csv:
   seed                               model  task            method  \
0    31  distilbert-base-multilingual-cased  xnli  before_fastalign   
1    42  distilbert-base-multilingual-cased  xnli  before_fastalign   
2    66  distilbert-base-multilingual-cased  xnli  before_fastalign   
3    23  distilbert-base-multilingual-cased  xnli  before_fastalign   
4    17  distilbert-base-multilingual-cased  xnli  before_fastalign   

   finetuning_steps  realignment_steps  distinct_realignment_samples  \
0             24544              24544                        392704   
1             24544              24544                        392704   
2             24544              24544                        392704   
3             24544              24544                        392704   
4             24544              24544                        392704   

   repeated_realignment_samples  train_lo

  result = df.groupby(group_by_columns, group_keys=False).apply(calculate_delta_accuracy)
  result = df.groupby(group_by_columns, group_keys=False).apply(calculate_delta_accuracy)
  for _, group in grouped_data.groupby(group_cols):


Error calculating distances between eng and avg: Unknown language avg (or maybe we don't have precomputed distances for this one).


100%|██████████| 30/30 [00:00<00:00, 349.03it/s]



Final Test MSE: 8.47055612662108e-05


Calculating SHAP values: 100%|██████████| 702/702 [00:00<00:00, 2067.09it/s]
  plt.gca().set_yticklabels(y_tick_labels, fontsize=12)


In [107]:
# distilMBERT / NLI, vanilla realignment only: keep the methods whose name does
# not contain 'freeze'. `group_name` comes from an earlier cell.
process_and_plot(
    data_loading_func=load_and_combine_csvs,
    data_loading_args=['./distilMBERT_nli_data'],
    id_vars=['method', 'value'],
    y_labels={
        'Method': 'Method',
        'Language Code': 'Languages',
        'Value': 'QE Percentile'
    },
    group_by_columns=['language_code', 'value'],
    agg_title='Aggregated Feature Importance for NLI using distilMBERT - Vanilla Realignment',
    detail_title='Feature Importance for NLI Using distilMBERT - Vanilla Realignment',
    group_name=group_name,
    version_name='vanilla',
    # Non-capturing group: matches the same strings as '^((?!freeze).)*$' but
    # does not trigger the "has match groups" UserWarning that pandas raises
    # from Series.str.contains for patterns with capturing groups.
    method_filter='^(?:(?!freeze).)*$',
    include_distance=True
)

DataFrame after dropping empty columns from distilbert-base-multilingual-cased__xnli__opus100.csv:
   seed                               model  task            method  \
0    31  distilbert-base-multilingual-cased  xnli  before_fastalign   
1    42  distilbert-base-multilingual-cased  xnli  before_fastalign   
2    66  distilbert-base-multilingual-cased  xnli  before_fastalign   
3    23  distilbert-base-multilingual-cased  xnli  before_fastalign   
4    17  distilbert-base-multilingual-cased  xnli  before_fastalign   

   finetuning_steps  realignment_steps  distinct_realignment_samples  \
0             24544              24544                        392704   
1             24544              24544                        392704   
2             24544              24544                        392704   
3             24544              24544                        392704   
4             24544              24544                        392704   

   repeated_realignment_samples  train_lo

  df = df[df['method'].str.contains(method_filter)]
  result = df.groupby(group_by_columns, group_keys=False).apply(calculate_delta_accuracy)
  result = df.groupby(group_by_columns, group_keys=False).apply(calculate_delta_accuracy)
  for _, group in grouped_data.groupby(group_cols):


Error calculating distances between eng and avg: Unknown language avg (or maybe we don't have precomputed distances for this one).


100%|██████████| 30/30 [00:00<00:00, 461.14it/s]



Final Test MSE: 9.605550228466765e-05


Calculating SHAP values: 100%|██████████| 234/234 [00:00<00:00, 729.49it/s]
  plt.gca().set_yticklabels(y_tick_labels, fontsize=12)


In [108]:
# distilMBERT / NLI, partially frozen realignment: keep methods whose name
# contains 'freeze' or 'baseline'. `group_name` comes from an earlier cell.
process_and_plot(
    group_name=group_name,
    version_name='frozen',
    method_filter='freeze|baseline',
    data_loading_func=load_and_combine_csvs,
    data_loading_args=['./distilMBERT_nli_data'],
    id_vars=['method', 'value'],
    group_by_columns=['language_code', 'value'],
    y_labels={'Method': 'Method', 'Language Code': 'Languages', 'Value': 'QE Percentile'},
    agg_title='Aggregated Feature Importance for NLI using distilMBERT - Partially Frozen Realignment',
    detail_title='Feature Importance for NLI Using distilMBERT - Partially Frozen Realignment',
    include_distance=True,
)

DataFrame after dropping empty columns from distilbert-base-multilingual-cased__xnli__opus100.csv:
   seed                               model  task            method  \
0    31  distilbert-base-multilingual-cased  xnli  before_fastalign   
1    42  distilbert-base-multilingual-cased  xnli  before_fastalign   
2    66  distilbert-base-multilingual-cased  xnli  before_fastalign   
3    23  distilbert-base-multilingual-cased  xnli  before_fastalign   
4    17  distilbert-base-multilingual-cased  xnli  before_fastalign   

   finetuning_steps  realignment_steps  distinct_realignment_samples  \
0             24544              24544                        392704   
1             24544              24544                        392704   
2             24544              24544                        392704   
3             24544              24544                        392704   
4             24544              24544                        392704   

   repeated_realignment_samples  train_lo

  result = df.groupby(group_by_columns, group_keys=False).apply(calculate_delta_accuracy)
  result = df.groupby(group_by_columns, group_keys=False).apply(calculate_delta_accuracy)
  for _, group in grouped_data.groupby(group_cols):


Error calculating distances between eng and avg: Unknown language avg (or maybe we don't have precomputed distances for this one).


100%|██████████| 30/30 [00:00<00:00, 397.23it/s]



Final Test MSE: 7.781479490968067e-05


Calculating SHAP values: 100%|██████████| 468/468 [00:00<00:00, 1599.03it/s]
  plt.gca().set_yticklabels(y_tick_labels, fontsize=12)


In [None]:
# csv_directory_xlmr = './xlmr_threshold_data'
# xlmr_df = load_and_combine_csvs(csv_directory_xlmr)

# id_vars = ['method', 'value',]
# y_labels = {
#     'Method': 'Method',
#     'Language Code': 'Languages',
#     'Value': 'QE Threshold'
# }

# melted_df = transform_data_for_regression(xlmr_df, id_vars, include_distance=True, mapping=lang2vec_mapping)

# # Group by 'language_code' and 'value' and apply the calculation
# group_by_columns = ['language_code', 'value']
# melted_df = process_dataframe(melted_df, group_by_columns)

# melted_df.drop('accuracy', axis=1, inplace=True)
# melted_df = melted_df[~melted_df.isna().any(axis=1)]

# rf_coefficients, feature_names = perform_random_forest_regression(melted_df, id_vars)
# plot_aggregated_coefficients(rf_coefficients, id_vars, feature_names, 'Aggregated Feature Importance for PoS using XLM-R', y_labels, include_distance=True)
# plot_detailed_coefficients(rf_coefficients, id_vars, 'Feature Importance for PoS Using XLM-R', 'QE Threshold', include_distance=True)

In [None]:
# csv_directory_xlmr = './xlmr_threshold_data'
# xlmr_df = load_and_combine_csvs(csv_directory_xlmr)

# id_vars = ['method', 'value',]
# y_labels = {
#     'Method': 'Method',
#     'Language Code': 'Languages',
#     'Value': 'QE Threshold'
# }

# xlmr_df_vanilla = xlmr_df[~xlmr_df['method'].str.contains('freeze')]

# melted_df_vanilla = transform_data_for_regression(xlmr_df_vanilla, id_vars, include_distance=True, mapping=lang2vec_mapping)

# # Group by 'language_code' and 'value' and apply the calculation
# group_by_columns = ['language_code', 'value']
# melted_df_vanilla = process_dataframe(melted_df_vanilla, group_by_columns)

# melted_df_vanilla.drop('accuracy', axis=1, inplace=True)
# melted_df_vanilla = melted_df_vanilla[~melted_df_vanilla.isna().any(axis=1)]

# rf_coefficients, feature_names = perform_random_forest_regression(melted_df_vanilla, id_vars)
# plot_aggregated_coefficients(rf_coefficients, id_vars, feature_names, 'Aggregated Feature Importance for PoS using XLM-R - Vanilla Realignment', y_labels, include_distance=True)
# plot_detailed_coefficients(rf_coefficients, id_vars, 'Feature Importance for PoS Using XLM-R - Vanilla Realignment', 'QE Threshold', include_distance=True)

In [None]:
# csv_directory_xlmr = './xlmr_threshold_data'
# xlmr_df = load_and_combine_csvs(csv_directory_xlmr)

# id_vars = ['method', 'value',]
# y_labels = {
#     'Method': 'Method',
#     'Language Code': 'Languages',
#     'Value': 'QE Threshold'
# }

# xlmr_df_freeze = xlmr_df[(xlmr_df['method'].str.contains('freeze') | xlmr_df['method'].str.contains('baseline'))]

# melted_df_freeze = transform_data_for_regression(xlmr_df_freeze, id_vars, include_distance=True, mapping=lang2vec_mapping)

# # Group by 'language_code' and 'value' and apply the calculation
# group_by_columns = ['language_code', 'value']
# melted_df_freeze = process_dataframe(melted_df_freeze, group_by_columns)

# melted_df_freeze.drop('accuracy', axis=1, inplace=True)
# melted_df_freeze = melted_df_freeze[~melted_df_freeze.isna().any(axis=1)]

# rf_coefficients, feature_names = perform_random_forest_regression(melted_df_freeze, id_vars)
# plot_aggregated_coefficients(rf_coefficients, id_vars, feature_names, 'Aggregated Feature Importance for PoS using XLM-R - Partially Frozen Realignment', y_labels, include_distance=True)
# plot_detailed_coefficients(rf_coefficients, id_vars, 'Feature Importance for PoS Using XLM-R - Partially Frozen Realignment', 'QE Threshold', include_distance=True)

In [None]:
# xlmr_baseline_data = fetch_and_process_runs(finetuning_project_name, langs)
# xlmr_baseline_df = pd.DataFrame(xlmr_baseline_data)
# xlmr_baseline_df['method'] = 'baseline'
# xlmr_baseline_df['value'] = 0.0
# xlmr_baseline_df['model'] = 'XLM-R'

# xlmr_baseline_df = standardize_xlmr_columns(xlmr_baseline_df)

# csv_directory_distilMBERT = './distilMBERT_percentile_data'
# distilMBERT_baseline_df = load_baseline_csv(csv_directory_distilMBERT, 'distilMBERT')
# combined_df = pd.concat([distilMBERT_baseline_df, xlmr_baseline_df], ignore_index=True)

# combined_df = combined_df.dropna(axis=1, how='any')

# id_vars = ['method', 'model']
# y_labels = {
#     'Method': 'Method',
#     'Language Code': 'Languages',
# }
# melted_df = transform_data_for_regression(combined_df, id_vars, include_distance=True, mapping=lang2vec_mapping)

# # Group by 'language_code' and 'model' and apply the calculation
# group_by_columns = ['language_code', 'model']
# melted_df = process_dataframe(melted_df, group_by_columns)

# melted_df.drop('accuracy', axis=1, inplace=True)
# melted_df = melted_df[~melted_df.isna().any(axis=1)]

# # subset_columns = ['method', 'model', 'language_code']
# # grouped_data = group_by_subset(melted_df, subset_columns)

# rf_coefficients, feature_names = perform_random_forest_regression(melted_df, id_vars)

# plot_aggregated_coefficients(rf_coefficients, id_vars, feature_names, 'Aggregated Feature Importance for PoS', y_labels, include_distance=True)
# plot_detailed_coefficients(rf_coefficients, id_vars, 'Feature Importance for PoS', include_distance=True)


In [None]:
# xlmr_baseline_data = fetch_and_process_runs(finetuning_project_name, langs)
# xlmr_baseline_df = pd.DataFrame(xlmr_baseline_data)
# xlmr_baseline_df['method'] = 'baseline'
# xlmr_baseline_df['value'] = 0.0
# xlmr_baseline_df['model'] = 'XLM-R'

# xlmr_baseline_df = standardize_xlmr_columns(xlmr_baseline_df)

# csv_directory_distilMBERT = './distilMBERT_percentile_data'
# distilMBERT_baseline_df = load_baseline_csv(csv_directory_distilMBERT, 'distilMBERT')
# combined_df = pd.concat([distilMBERT_baseline_df, xlmr_baseline_df], ignore_index=True)

# combined_df = combined_df.dropna(axis=1, how='any')

# combined_df_vanilla = combined_df[~combined_df['method'].str.contains('freeze')]

# id_vars = ['method', 'model']
# y_labels = {
#     'Method': 'Method',
#     'Language Code': 'Languages',
# }
# melted_df_vanilla = transform_data_for_regression(combined_df_vanilla, id_vars, include_distance=True, mapping=lang2vec_mapping)

# # Group by 'language_code' and 'model' and apply the calculation
# group_by_columns = ['language_code', 'model']
# melted_df_vanilla = process_dataframe(melted_df_vanilla, group_by_columns)

# melted_df_vanilla.drop('accuracy', axis=1, inplace=True)
# melted_df_vanilla = melted_df_vanilla[~melted_df_vanilla.isna().any(axis=1)]

# # subset_columns = ['method', 'model', 'language_code']
# # grouped_data = group_by_subset(melted_df, subset_columns)

# rf_coefficients, feature_names = perform_random_forest_regression(melted_df_vanilla, id_vars)

# plot_aggregated_coefficients(rf_coefficients, id_vars, feature_names, 'Aggregated Feature Importance for PoS - Vanilla Realignment', y_labels, include_distance=True)
# plot_detailed_coefficients(rf_coefficients, id_vars, 'Feature Importance for PoS - Vanilla Realignment', include_distance=True)


In [102]:
# xlmr_baseline_data = fetch_and_process_runs(finetuning_project_name, langs)
# xlmr_baseline_df = pd.DataFrame(xlmr_baseline_data)
# xlmr_baseline_df['method'] = 'baseline'
# xlmr_baseline_df['value'] = 0.0
# xlmr_baseline_df['model'] = 'XLM-R'

# xlmr_baseline_df = standardize_xlmr_columns(xlmr_baseline_df)

# csv_directory_distilMBERT = './distilMBERT_percentile_data'
# distilMBERT_baseline_df = load_baseline_csv(csv_directory_distilMBERT, 'distilMBERT')
# combined_df = pd.concat([distilMBERT_baseline_df, xlmr_baseline_df], ignore_index=True)

# combined_df = combined_df.dropna(axis=1, how='any')

# combined_df_freeze = combined_df[(combined_df['method'].str.contains('freeze') | combined_df['method'].str.contains('baseline'))]

# id_vars = ['method', 'model']
# y_labels = {
#     'Method': 'Method',
#     'Language Code': 'Languages',
# }
# melted_df_freeze = transform_data_for_regression(combined_df_freeze, id_vars, include_distance=True, mapping=lang2vec_mapping)

# # Group by 'language_code' and 'model' and apply the calculation
# group_by_columns = ['language_code', 'model']
# melted_df_freeze = process_dataframe(melted_df_freeze, group_by_columns)

# melted_df_freeze.drop('accuracy', axis=1, inplace=True)
# melted_df_freeze = melted_df_freeze[~melted_df_freeze.isna().any(axis=1)]

# # subset_columns = ['method', 'model', 'language_code']
# # grouped_data = group_by_subset(melted_df, subset_columns)

# rf_coefficients, feature_names = perform_random_forest_regression(melted_df_freeze, id_vars)

# plot_aggregated_coefficients(rf_coefficients, id_vars, feature_names, 'Aggregated Feature Importance for PoS - Partially Frozen Realignment', y_labels, include_distance=True)
# plot_detailed_coefficients(rf_coefficients, id_vars, 'Feature Importance for PoS - Partially Frozen Realignment', include_distance=True)


In [None]:
# csv_directory_distilMBERT = './distilMBERT_percentile_data'
# distilMBERT_baseline_df = load_and_combine_csvs(csv_directory_distilMBERT)

# id_vars = ['method', 'value']
# y_labels = {
#     'Method': 'Method',
#     'Language Code': 'Languages',
#     'Value': 'QE Percentile'
# }
# melted_df = transform_data_for_regression(distilMBERT_baseline_df, id_vars, include_distance=True, mapping=lang2vec_mapping)

# # Group by 'language_code' and 'value' and apply the calculation
# group_by_columns = ['language_code', 'value']
# melted_df = process_dataframe(melted_df, group_by_columns)

# melted_df.drop('accuracy', axis=1, inplace=True)
# melted_df = melted_df[~melted_df.isna().any(axis=1)]

# rf_coefficients, feature_names = perform_random_forest_regression(melted_df, id_vars)

# plot_aggregated_coefficients(rf_coefficients, id_vars, feature_names, 'Aggregated Feature Importance for PoS using distilMBERT', y_labels, include_distance=True)
# plot_detailed_coefficients(rf_coefficients, id_vars, 'Feature Importance for PoS Using distilMBERT', custom_y_label='QE Percentile', include_distance=True)

In [None]:
# Stand-alone NLI analysis for distilMBERT, written out step by step.
# Every intermediate is kept as a notebook-level name, and no save path is
# passed to the regression or plotting helpers.
csv_directory_distilMBERT_NLI = './distilMBERT_nli_data'
distilMBERT_NLI_df = load_and_combine_csvs(csv_directory_distilMBERT_NLI)

id_vars = ['method', 'value']
y_labels = {'Method': 'Method', 'Language Code': 'Languages', 'Value': 'QE Percentile'}
group_by_columns = ['language_code', 'value']

# (Disabled alternative kept for reference:)
# subset_columns = ['method', 'language_code', 'value']
# grouped_data = group_by_subset(melted_df, subset_columns)

# Reshape for regression, then apply the per-group calculation
# over (language_code, value).
melted_df = transform_data_for_regression(
    distilMBERT_NLI_df,
    id_vars,
    include_distance=True,
    mapping=lang2vec_mapping,
)
melted_df = process_dataframe(melted_df, group_by_columns)

# Remove the 'accuracy' column, then discard rows with any missing value.
melted_df = melted_df.drop(columns='accuracy')
melted_df = melted_df.dropna()

rf_coefficients, feature_names = perform_random_forest_regression(melted_df, id_vars)

plot_aggregated_coefficients(
    rf_coefficients,
    id_vars,
    feature_names,
    'Aggregated Feature Importance for NLI',
    y_labels,
    include_distance=True,
)
plot_detailed_coefficients(
    rf_coefficients,
    id_vars,
    'Feature Importance for NLI',
    custom_y_label='QE Percentile',
    include_distance=True,
)

In [None]:
def plot_performance_vs_feature_importance(df, rf_coefficients, title=None):
    """Scatter each language's mean accuracy change over the baseline against its feature importance.

    For every code in ``df['language_code']``:

    * the accuracy of the ``'Fine-tuning Only'`` rows is averaged (baseline),
    * the other methods are averaged per method and then across methods,
    * the difference (others - baseline) is plotted against
      ``rf_coefficients.get('language_code_<code>', 0)``.

    The rows displayed as ``'Average'`` are removed, the remaining languages are
    split into two halves (in order of first appearance) and each half is drawn
    in its own figure with a combined language/family legend.

    Relies on the notebook-level ``language_map``, ``specific_language_families``
    and ``specific_family_shapes`` mappings.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the 'language_code', 'method' and 'accuracy' columns.
    rf_coefficients : mapping-like
        Any object with ``.get(key, default)``; looked up with keys of the form
        ``'language_code_<code>'``.
    title : str, optional
        Base title for both figures. The group suffix ("(Group 1)" /
        "(Group 2)") is appended so the two figures stay distinguishable.
        When omitted, a default base title is used.

    Returns
    -------
    None
        The figures are shown with ``plt.show()``.
    """
    results = []

    for language in df['language_code'].unique():
        language_df = df[df['language_code'] == language]

        # Mean accuracy of the baseline rows for this language
        baseline_avg = language_df[language_df['method'] == 'Fine-tuning Only']['accuracy'].mean()

        # Mean over methods of the per-method mean accuracy (baseline excluded)
        avg_performance = (
            language_df[language_df['method'] != 'Fine-tuning Only']
            .groupby('method', observed=False)['accuracy']
            .mean()
            .mean()
        )

        specific_family = specific_language_families.get(language, 'Other')

        results.append({
            'language': language_map.get(language, language),
            'performance_change': avg_performance - baseline_avg,
            'importance': rf_coefficients.get(f'language_code_{language}', 0),
            'family': specific_family,
            'shape': specific_family_shapes.get(specific_family, 'x'),
        })

    results_df = pd.DataFrame(results)

    # Drop the 'avg' pseudo-language (displayed as 'Average'); all other codes are kept.
    results_df = results_df[~results_df['language'].isin(['Average'])]

    # Two figures keep the legends readable
    midpoint = len(results_df) // 2
    results_df_1 = results_df.iloc[:midpoint]
    results_df_2 = results_df.iloc[midpoint:]

    base_title = 'Performance Change vs. Feature Importance' if title is None else title

    def plot_group(data, title_suffix):
        fig, ax = plt.subplots(figsize=(14, 8))
        sns.scatterplot(
            data=data, x='importance', y='performance_change', hue='language', style='family',
            markers=specific_family_shapes, s=200, palette='tab20', legend='full', ax=ax
        )

        ax.set_title(f'{base_title} {title_suffix}')
        ax.set_xlabel('Feature Importance')
        ax.set_ylabel('Performance Change (relative to Baseline)')
        ax.grid(True)

        # Split seaborn's legend entries into family (marker shape) entries and
        # everything else (hue entries plus seaborn's section headings).
        # NOTE: get_color()/set_markersize() assume the handles are Line2D artists.
        handles, labels = ax.get_legend_handles_labels()
        language_entries = {}  # display label -> diamond handle; first occurrence wins
        family_handles = []
        family_labels = []

        for handle, label in zip(handles, labels):
            if label in specific_family_shapes:
                handle.set_markersize(10)  # enlarge the shape markers in the legend
                family_handles.append(handle)
                family_labels.append(label)
                continue

            display_label = label.title()
            # De-duplicate label and handle together so the two lists cannot drift apart
            if display_label not in language_entries:
                language_entries[display_label] = plt.Line2D(
                    [], [], marker='D', color=handle.get_color(), linestyle='', markersize=10
                )

        new_handles = list(language_entries.values()) + family_handles
        new_labels = list(language_entries.keys()) + family_labels

        # Position the legend outside of the plot
        ax.legend(handles=new_handles, labels=new_labels, title='', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=12)
        fig.tight_layout()
        plt.show()

    plot_group(results_df_1, "(Group 1)")
    plot_group(results_df_2, "(Group 2)")

In [None]:
def plot_performance_variability_vs_feature_importance(df, rf_coefficients, title=None):
    """Scatter each language's accuracy range (max - min) against its feature importance.

    For every code in ``df['language_code']`` the spread of ``df['accuracy']``
    (maximum minus minimum over all rows of that language) is plotted against
    ``rf_coefficients.get('language_code_<code>', 0)``. The Pearson correlation
    and a least-squares trend line are computed once over all plotted languages
    and shown in both figures, so the annotation and the dashed line always
    describe the same sample.

    The languages are split into two halves (in order of first appearance), each
    drawn in its own figure with a combined language/family legend.

    Relies on the notebook-level ``language_map``, ``specific_language_families``
    and ``specific_family_shapes`` mappings.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the 'language_code' and 'accuracy' columns.
    rf_coefficients : mapping-like
        Any object with ``.get(key, default)``; looked up with keys of the form
        ``'language_code_<code>'``.
    title : str, optional
        Base title for both figures. The group suffix ("(Group 1)" /
        "(Group 2)") is appended so the two figures stay distinguishable.
        When omitted, a default base title is used.

    Returns
    -------
    None
        The figures are shown with ``plt.show()``.
    """
    results = []

    for language in df['language_code'].unique():
        accuracy = df.loc[df['language_code'] == language, 'accuracy']
        specific_family = specific_language_families.get(language, 'Other')

        results.append({
            'language': language_map.get(language, language),
            'accuracy_delta': accuracy.max() - accuracy.min(),
            'importance': rf_coefficients.get(f'language_code_{language}', 0),
            'family': specific_family,
            'shape': specific_family_shapes.get(specific_family, 'x'),
        })

    results_df = pd.DataFrame(results)

    # Remove the 'avg' pseudo-language (displayed as 'Average').
    # NOTE(review): with the language_map defined at the top of the notebook,
    # 'same' is displayed as 'English', so the 'Same' entry below matches
    # nothing. Confirm whether 'same' should be excluded by code instead.
    results_df = results_df[~results_df['language'].isin(['Average', 'Same'])]

    # Correlation and trend line over ALL plotted languages (rows with a NaN are
    # ignored; at least two points are needed for either statistic).
    complete = results_df[['importance', 'accuracy_delta']].dropna()
    if len(complete) >= 2:
        x_all = complete['importance'].to_numpy(dtype=float)
        y_all = complete['accuracy_delta'].to_numpy(dtype=float)
        correlation, _ = pearsonr(x_all, y_all)
        trend = tuple(np.polyfit(x_all, y_all, 1))  # (slope, intercept)
        correlation_text = f'Pearson Correlation: {correlation:.2f}'
    else:
        trend = None
        correlation_text = 'Pearson Correlation: n/a'

    # Two figures keep the legends readable
    midpoint = len(results_df) // 2
    results_df_1 = results_df.iloc[:midpoint]
    results_df_2 = results_df.iloc[midpoint:]

    base_title = 'Accuracy Variability vs. Feature Importance' if title is None else title

    def plot_group(data, title_suffix):
        fig, ax = plt.subplots(figsize=(14, 8))
        sns.scatterplot(
            data=data, x='importance', y='accuracy_delta', hue='language', style='family',
            markers=specific_family_shapes, s=200, palette='tab20', legend='full', ax=ax
        )

        ax.set_title(f'{base_title} {title_suffix}')
        ax.set_xlabel('Feature Importance')
        ax.set_ylabel('Accuracy Delta (Max - Min)')
        ax.grid(True)

        # Correlation annotation in the top-left corner (axes coordinates)
        ax.text(0.05, 0.95, correlation_text, ha='left', va='top', transform=ax.transAxes, fontsize=12)

        # Dashed trend line, drawn across this group's x-range only so the
        # axis limits stay driven by the group's own points
        if trend is not None and not data.empty:
            slope, intercept = trend
            x_span = np.array([data['importance'].min(), data['importance'].max()], dtype=float)
            ax.plot(x_span, slope * x_span + intercept, linestyle='--', color='gray')

        # Split seaborn's legend entries into family (marker shape) entries and
        # everything else (hue entries plus seaborn's section headings).
        # NOTE: get_color()/set_markersize() assume the handles are Line2D artists.
        handles, labels = ax.get_legend_handles_labels()
        language_entries = {}  # display label -> diamond handle; first occurrence wins
        family_handles = []
        family_labels = []

        for handle, label in zip(handles, labels):
            if label in specific_family_shapes:
                handle.set_markersize(10)  # enlarge the shape markers in the legend
                family_handles.append(handle)
                family_labels.append(label)
                continue

            display_label = label.title()
            # De-duplicate label and handle together so the two lists cannot drift apart
            if display_label not in language_entries:
                language_entries[display_label] = plt.Line2D(
                    [], [], marker='D', color=handle.get_color(), linestyle='', markersize=10
                )

        new_handles = list(language_entries.values()) + family_handles
        new_labels = list(language_entries.keys()) + family_labels

        # Position the legend outside of the plot
        ax.legend(handles=new_handles, labels=new_labels, title='', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=12)
        fig.tight_layout()
        plt.show()

    plot_group(results_df_1, "(Group 1)")
    plot_group(results_df_2, "(Group 2)")

In [None]:
# NLI: per-language accuracy change over the baseline vs. feature importance.
# The frame passed as `df` must still carry its 'method' and 'accuracy' columns.
plot_performance_vs_feature_importance(
    df=melted_df,
    rf_coefficients=rf_coefficients,
    title='Performance Change vs. Feature Importance - NLI',
)

In [None]:
# PoS (distilMBERT): rebuild the melted frame from the percentile CSV directory.
csv_directory_distilMBERT = './distilMBERT_percentile_data'
distilMBERT_baseline_df = load_and_combine_csvs(csv_directory_distilMBERT)

id_vars = ['method', 'value']
melted_df = transform_data_for_regression(
    distilMBERT_baseline_df,
    id_vars,
    include_distance=True,
    mapping=lang2vec_mapping,
)

# Fit the random forest directly on the melted frame (no grouping step here).
rf_coefficients, feature_names = perform_random_forest_regression(melted_df, id_vars)

# Per-language accuracy change over the baseline vs. feature importance.
plot_performance_vs_feature_importance(
    df=melted_df,
    rf_coefficients=rf_coefficients,
    title='Performance Change vs. Feature Importance - PoS - distilMBERT',
)

In [None]:
# Summary statistics of the accuracy values recorded for Japanese ('ja').
melted_df.loc[melted_df['language_code'] == 'ja', 'accuracy'].describe()

In [None]:
# The 25 lowest-accuracy rows for Japanese ('ja').
(
    melted_df
    .loc[melted_df['language_code'] == 'ja']
    .sort_values(by='accuracy')
    .head(25)
)

In [None]:
# Baseline ('Fine-tuning Only') rows for Japanese ('ja'), lowest accuracy first.
# Both conditions are combined into one mask: applying a mask built on the full
# frame to an already-filtered frame makes pandas reindex it and emit
# "Boolean Series key will be reindexed to match DataFrame index".
melted_df.loc[
    (melted_df['language_code'] == 'ja') & (melted_df['method'] == 'Fine-tuning Only')
].sort_values('accuracy')

In [None]:
# Number of rows per method in the current melted frame.
melted_df['method'].value_counts()

In [None]:
# PoS (distilMBERT): per-language accuracy range (max - min) vs. feature importance.
plot_performance_variability_vs_feature_importance(
    df=melted_df,
    rf_coefficients=rf_coefficients,
    title='Performance Variability vs. Feature Importance - PoS - distilMBERT',
)

In [None]:
# All rows for Thai ('th'), lowest accuracy first.
melted_df.loc[melted_df['language_code'] == 'th'].sort_values(by='accuracy')