In [1]:
import pandas as pd
import numpy as np
import glob
import os

def merge_and_clean_results(input_pattern, output_file):
    """
    Merge multiple CSV files and enforce exactly 60 parameter combinations.

    The canonical grid is 5 rcond values × 3 measures × 4 top@k values. The
    function:
    1. Loads and concatenates every CSV matching ``input_pattern`` (an existing
       ``output_file`` that happens to match the pattern is skipped so that a
       re-run does not feed the previous result back into the merge).
    2. Averages ``overall_accuracy`` per (rcond, measure, top@k) combination.
    3. Emits, for each canonical combination, its rows with the averaged
       accuracy written into ``overall_accuracy``.

    The input files must provide the columns ``rcond``, ``measure``, ``top@k``,
    ``overall_accuracy`` and the question key columns ``word1``, ``word2``,
    ``word3``, ``true_word``, ``category``, ``category_type``.

    Args:
        input_pattern: Glob pattern to match input files (e.g., "*.csv")
        output_file: Path to save the merged and cleaned file

    Returns:
        The merged DataFrame that was written to ``output_file``, or ``None``
        when no input file matches ``input_pattern``.
    """
    print(f"Merging files matching pattern: {input_pattern}")

    # Step 1: Load all CSV files
    # A previous output matching the glob must not be treated as an input,
    # otherwise every re-run would duplicate the already merged rows.
    output_path = os.path.abspath(output_file)
    matched_files = sorted(glob.glob(input_pattern))
    all_files = [f for f in matched_files if os.path.abspath(f) != output_path]
    if len(all_files) != len(matched_files):
        print(f"Note: Ignoring existing output file {output_file} matched by the input pattern")
    if not all_files:
        print(f"Error: No files found matching {input_pattern}")
        return None

    print(f"Found {len(all_files)} files:")
    for file in all_files:
        print(f"  • Found {os.path.basename(file)}")

    # Load each file
    all_dfs = [pd.read_csv(file) for file in all_files]

    # Step 2: Concatenate all files
    print("\nConcatenating all files...")
    full_df = pd.concat(all_dfs, ignore_index=True)
    print(f"  • Combined data: {len(full_df)} rows")

    # Step 3: Identify the canonical 60 combinations
    # rcond values are taken from the data; measures and top@k are fixed.
    rcond_values = sorted(full_df["rcond"].unique())
    measure_values = ["naive_cosine", "mahalanobis_cosine", "mahalanobis_shifted_cosine"]
    topk_values = [1, 3, 5, 10]

    # Ensure we have exactly 5 rcond values
    if len(rcond_values) != 5:
        print(f"Warning: Found {len(rcond_values)} rcond values instead of 5")
        # Keep (at most) the 5 most common rcond values; rows with any other
        # rcond are left out of the result. Sorted so the output is ordered
        # the same way as in the regular 5-value case.
        rcond_counts = full_df["rcond"].value_counts()
        rcond_values = sorted(rcond_counts.head(5).index)

    # Create the canonical combinations (rcond-major, then measure, then top@k)
    canonical_combos = [
        (rcond, measure, topk)
        for rcond in rcond_values
        for measure in measure_values
        for topk in topk_values
    ]

    print(f"\nCreated {len(canonical_combos)} canonical parameter combinations")

    # Step 4: Group by the key parameters and compute average accuracy
    group_cols = ["rcond", "measure", "top@k"]
    grouped = full_df.groupby(group_cols)
    avg_accuracy = grouped["overall_accuracy"].mean().reset_index()

    print("\nAnalyzing existing combinations:")
    unique_combos = grouped.size().reset_index(name="count")
    print(f"  • Found {len(unique_combos)} unique combinations")
    combo_counts = unique_combos["count"].value_counts().to_dict()
    print(f"  • Distribution of combination frequencies: {combo_counts}")

    # Step 5: Count the distinct questions (identified by their key columns)
    question_key_cols = ["word1", "word2", "word3", "true_word", "category", "category_type"]
    unique_questions = full_df[question_key_cols].drop_duplicates()
    print(f"  • Found {len(unique_questions)} unique questions")

    # Step 6: Build the final dataset with exactly 60 combinations
    print("\nBuilding clean dataset...")

    # For each canonical combination, get all matching questions with updated accuracy
    result_parts = []
    for rcond, measure, topk in canonical_combos:
        # Get the accuracy for this combination
        acc_match = avg_accuracy[
            (avg_accuracy["rcond"] == rcond) &
            (avg_accuracy["measure"] == measure) &
            (avg_accuracy["top@k"] == topk)
        ]

        if len(acc_match) == 0:
            print(f"  • Warning: No data found for combination: rcond={rcond}, measure={measure}, top@k={topk}")
            # No measurement exists: substitute the mean of the per-combination
            # averages for this measure, or the global mean if the measure is
            # absent altogether. The resulting value is imputed, not measured.
            measure_avg = avg_accuracy[avg_accuracy["measure"] == measure]["overall_accuracy"].mean()
            accuracy = measure_avg if not np.isnan(measure_avg) else avg_accuracy["overall_accuracy"].mean()
        else:
            accuracy = acc_match.iloc[0]["overall_accuracy"]

        # Find all questions for this combination.
        # NOTE(review): if the same combination occurs in several input files,
        # its rows from every file are kept (no de-duplication on this path).
        combo_data = full_df[
            (full_df["rcond"] == rcond) &
            (full_df["measure"] == measure) &
            (full_df["top@k"] == topk)
        ]

        if len(combo_data) == 0:
            print(f"  • Warning: No question data found for combination: rcond={rcond}, measure={measure}, top@k={topk}")
            # Borrow rows from the same measure/top@k at another rcond, or from
            # the whole frame as a last resort. Only rcond, measure, top@k and
            # overall_accuracy are overwritten below; every other column is
            # copied from the donor rows unchanged.
            any_combo_data = full_df[
                (full_df["measure"] == measure) &
                (full_df["top@k"] == topk)
            ]
            if len(any_combo_data) > 0:
                combo_data = any_combo_data.copy()
            else:
                combo_data = full_df.copy()

            # Take only unique questions
            combo_data = combo_data.drop_duplicates(subset=question_key_cols)

        # Update the parameters and accuracy (on a copy, so full_df is untouched)
        combo_data = combo_data.copy()
        combo_data["rcond"] = rcond
        combo_data["measure"] = measure
        combo_data["top@k"] = topk
        combo_data["overall_accuracy"] = accuracy

        # Add to results
        result_parts.append(combo_data)

    # Combine all parts
    clean_df = pd.concat(result_parts, ignore_index=True)

    # Step 7: Verify we have exactly 60 combinations
    final_combos = clean_df.groupby(group_cols).size().reset_index(name="count")
    print(f"\nFinal check: {len(final_combos)} unique combinations")

    if len(final_combos) == 60:
        print("✓ Success! Exactly 60 unique combinations as expected.")
    else:
        print(f"Warning: Expected 60 combinations but found {len(final_combos)}.")

    # Step 8: Save the clean dataset
    clean_df.to_csv(output_file, index=False)
    file_size_mb = os.path.getsize(output_file) / (1024 * 1024)
    print(f"\nSuccessfully saved merged data to: {output_file}")
    print(f"  • Output file size: {file_size_mb:.2f} MB")
    print("  • Merge operation completed successfully")

    return clean_df

# Run the merge for the gram6 nationality-adjective section: the numbered
# per-run CSVs are combined into a single un-numbered results file.
if __name__ == "__main__":
    output_file = "../Outer_Correlation/new_outer_correlation_results_per_section/gram6_nationality_adjective_results.csv"
    input_pattern = "../Outer_Correlation/new_outer_correlation_results_per_section/gram6_nationality_adjective_results-*.csv"
    merge_and_clean_results(input_pattern=input_pattern, output_file=output_file)

  from pandas.core.computation.check import NUMEXPR_INSTALLED


Merging files matching pattern: ../Outer_Correlation/new_outer_correlation_results_per_section/gram6_nationality_adjective_results-*.csv
Found 3 files:
  • Found gram6_nationality_adjective_results-1.csv
  • Found gram6_nationality_adjective_results-2.csv
  • Found gram6_nationality_adjective_results-3.csv

Concatenating all files...
  • Combined data: 73740 rows

Created 60 canonical parameter combinations

Analyzing existing combinations:
  • Found 60 unique combinations
  • Distribution of combination frequencies: {1229: 60}
  • Found 1229 unique questions

Building clean dataset...

Final check: 60 unique combinations
✓ Success! Exactly 60 unique combinations as expected.

Successfully saved merged data to: ../Outer_Correlation/new_outer_correlation_results_per_section/gram6_nationality_adjective_results.csv
  • Output file size: 15.42 MB
  • Merge operation completed successfully


In [2]:
# Reload the merged CSV from disk (the same path the merge cell wrote to) so the
# checks below inspect what was actually saved rather than an in-memory frame.
df = pd.read_csv("../Outer_Correlation/new_outer_correlation_results_per_section/gram6_nationality_adjective_results.csv")

In [3]:
# Display the reloaded frame; pandas truncates the repr of a frame this large.
df

Unnamed: 0,word1,word2,word3,true_word,category,category_type,candidate_1,candidate_2,candidate_3,candidate_4,...,candidate_7,candidate_8,candidate_9,candidate_10,freq_subset,quantile,rcond,measure,top@k,overall_accuracy
0,albania,albanian,australia,australian,gram6-nationality-adjective,syntactic,Australian,Indian,British,Canadian,...,UK,American,Iraqi,England,30000,0.01,0.053319,naive_cosine,1,0.969081
1,albania,albanian,austria,austrian,gram6-nationality-adjective,syntactic,Austrian,German,Polish,Czech,...,Italian,Turkish,Ukrainian,Vienna,30000,0.01,0.053319,naive_cosine,1,0.969081
2,albania,albanian,brazil,brazilian,gram6-nationality-adjective,syntactic,Brazilian,Mexican,Italian,Spanish,...,Argentina,French,Chinese,Turkish,30000,0.01,0.053319,naive_cosine,1,0.969081
3,albania,albanian,bulgaria,bulgarian,gram6-nationality-adjective,syntactic,Bulgarian,Serbian,Ukrainian,Polish,...,Czech,Greek,Italian,Georgian,30000,0.01,0.053319,naive_cosine,1,0.969081
4,albania,albanian,cambodia,cambodian,gram6-nationality-adjective,syntactic,Cambodian,Thai,Vietnamese,Indonesian,...,Tamil,Chinese,Ethiopian,Serbian,30000,0.01,0.053319,naive_cosine,1,0.969081
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73735,ukraine,ukrainian,portugal,portuguese,gram6-nationality-adjective,syntactic,Portuguese,Brazilian,Italian,Spain,...,Dutch,Greek,German,Brazil,30000,0.50,0.086893,mahalanobis_shifted_cosine,10,0.978031
73736,ukraine,ukrainian,russia,russian,gram6-nationality-adjective,syntactic,Russian,Chinese,French,Israeli,...,British,China,Iran,foreign,30000,0.50,0.086893,mahalanobis_shifted_cosine,10,0.978031
73737,ukraine,ukrainian,spain,spanish,gram6-nationality-adjective,syntactic,Spanish,Italian,French,German,...,European,France,Russian,Japanese,30000,0.50,0.086893,mahalanobis_shifted_cosine,10,0.978031
73738,ukraine,ukrainian,sweden,swedish,gram6-nationality-adjective,syntactic,Swedish,German,Dutch,Swiss,...,Switzerland,Italian,Netherlands,British,30000,0.50,0.086893,mahalanobis_shifted_cosine,10,0.978031


In [4]:
# Distinct overall_accuracy values present in the merged file.
df['overall_accuracy'].unique()

array([0.96908055, 0.98616762, 0.9918633 , 0.99837266, 0.95117982,
       0.96175753, 0.96663954, 0.97640358, 0.94955248, 0.96419854,
       0.97884459, 0.97803092, 0.96745321, 0.96338487, 0.96989422])

In [5]:
# Number of rows carrying each overall_accuracy value, largest count first.
df['overall_accuracy'].value_counts().sort_values(ascending=False)

overall_accuracy
0.969081    11061
0.951180     7374
0.986168     6145
0.991863     6145
0.998373     6145
0.961758     4916
0.976404     4916
0.949552     4916
0.964199     4916
0.966640     3687
0.978845     3687
0.978031     3687
0.963385     2458
0.969894     2458
0.967453     1229
Name: count, dtype: int64

In [6]:
# Count of distinct overall_accuracy values.
# NOTE(review): the recorded output of this cell (19) disagrees with the 15
# distinct values listed by unique() and value_counts() in the cells above;
# the outputs may stem from different runs. Re-run the notebook top to bottom
# to confirm.
df['overall_accuracy'].nunique()

19