In [1]:
import pandas as pd
import numpy as np
import glob
import os

def merge_and_clean_results(input_pattern, output_file):
    """
    Merge multiple CSV result files into a single dataset that contains the
    canonical 60 parameter combinations (5 rcond × 3 measures × 4 top@k).

    Processing steps:
    1. Load and concatenate every CSV matching ``input_pattern``.
    2. Build the canonical combinations from the rcond values found in the
       data and the fixed lists of measures and top@k values.
    3. Average ``overall_accuracy`` per (rcond, measure, top@k) over all
       loaded rows.
    4. For each canonical combination, emit one row per unique question and
       overwrite ``overall_accuracy`` with the averaged value.
    5. Write the result to ``output_file`` as CSV (without the index).

    The input files must provide the columns ``rcond``, ``measure``,
    ``top@k``, ``overall_accuracy``, ``word1``, ``word2``, ``word3``,
    ``true_word``, ``category`` and ``category_type``; a missing column
    surfaces as a pandas ``KeyError``.

    Args:
        input_pattern: Glob pattern to match input files (e.g., "*.csv")
        output_file: Path to save the merged and cleaned file

    Returns:
        The merged ``pandas.DataFrame``, or ``None`` when no file matches
        ``input_pattern`` (nothing is written in that case).
    """
    print(f"Merging files matching pattern: {input_pattern}")

    # Step 1: Load all CSV files (sorted so the merge order is deterministic)
    all_files = sorted(glob.glob(input_pattern))
    if not all_files:
        print(f"Error: No files found matching {input_pattern}")
        return None

    print(f"Found {len(all_files)} files:")
    for file in all_files:
        print(f"  • Found {os.path.basename(file)}")

    all_dfs = [pd.read_csv(file) for file in all_files]

    # Step 2: Concatenate all files
    print("\nConcatenating all files...")
    full_df = pd.concat(all_dfs, ignore_index=True)
    print(f"  • Combined data: {len(full_df)} rows")

    # Step 3: Identify the canonical 60 combinations
    rcond_values = sorted(full_df["rcond"].unique())
    measure_values = ["naive_cosine", "mahalanobis_cosine", "mahalanobis_shifted_cosine"]
    topk_values = [1, 3, 5, 10]

    # Ensure we have exactly 5 rcond values
    if len(rcond_values) != 5:
        print(f"Warning: Found {len(rcond_values)} rcond values instead of 5")
        # Keep the (up to) 5 most common rcond values; sort them so the
        # output order matches the normal path instead of frequency order.
        rcond_counts = full_df["rcond"].value_counts()
        rcond_values = sorted(rcond_counts.head(5).index)

    # Create the canonical combinations in (rcond, measure, top@k) order
    canonical_combos = [
        (rcond, measure, topk)
        for rcond in rcond_values
        for measure in measure_values
        for topk in topk_values
    ]

    print(f"\nCreated {len(canonical_combos)} canonical parameter combinations")

    # Step 4: Group by the key parameters and compute average accuracy
    group_cols = ["rcond", "measure", "top@k"]
    avg_accuracy = full_df.groupby(group_cols)["overall_accuracy"].mean().reset_index()

    print("\nAnalyzing existing combinations:")
    unique_combos = full_df.groupby(group_cols).size().reset_index(name="count")
    print(f"  • Found {len(unique_combos)} unique combinations")
    combo_counts = unique_combos["count"].value_counts().to_dict()
    print(f"  • Distribution of combination frequencies: {combo_counts}")

    # Step 5: Columns that identify a question independently of the parameters
    question_key_cols = ["word1", "word2", "word3", "true_word", "category", "category_type"]
    unique_questions = full_df[question_key_cols].drop_duplicates()
    print(f"  • Found {len(unique_questions)} unique questions")

    # Step 6: Build the final dataset with exactly the canonical combinations
    print("\nBuilding clean dataset...")

    result_parts = []
    repeated_rows_dropped = 0
    for rcond, measure, topk in canonical_combos:
        # Averaged accuracy for this combination
        acc_match = avg_accuracy[
            (avg_accuracy["rcond"] == rcond)
            & (avg_accuracy["measure"] == measure)
            & (avg_accuracy["top@k"] == topk)
        ]

        if acc_match.empty:
            print(f"  • Warning: No data found for combination: rcond={rcond}, measure={measure}, top@k={topk}")
            # Fall back to the mean over this measure, then to the global mean
            measure_avg = avg_accuracy[avg_accuracy["measure"] == measure]["overall_accuracy"].mean()
            accuracy = measure_avg if not np.isnan(measure_avg) else avg_accuracy["overall_accuracy"].mean()
        else:
            accuracy = acc_match["overall_accuracy"].iloc[0]

        # All question rows recorded for this combination
        combo_data = full_df[
            (full_df["rcond"] == rcond)
            & (full_df["measure"] == measure)
            & (full_df["top@k"] == topk)
        ]

        borrowed = combo_data.empty
        if borrowed:
            print(f"  • Warning: No question data found for combination: rcond={rcond}, measure={measure}, top@k={topk}")
            # Borrow question rows from the same measure/top@k at another
            # rcond, or from the whole dataset as a last resort. Every column
            # other than the four overwritten below is copied from those
            # donor rows unchanged.
            donor_rows = full_df[
                (full_df["measure"] == measure)
                & (full_df["top@k"] == topk)
            ]
            combo_data = donor_rows if not donor_rows.empty else full_df

        # Keep one row per question. The accuracy is already averaged across
        # files, so when several input files contain the same combination the
        # repeated question rows would otherwise be duplicated in the output.
        rows_before = len(combo_data)
        combo_data = combo_data.drop_duplicates(subset=question_key_cols).copy()
        if not borrowed:
            repeated_rows_dropped += rows_before - len(combo_data)

        # Update the parameters and accuracy
        combo_data["rcond"] = rcond
        combo_data["measure"] = measure
        combo_data["top@k"] = topk
        combo_data["overall_accuracy"] = accuracy

        result_parts.append(combo_data)

    if repeated_rows_dropped:
        print(f"  • Dropped {repeated_rows_dropped} repeated question rows from overlapping files")

    # Combine all parts
    clean_df = pd.concat(result_parts, ignore_index=True)

    # Step 7: Verify we have exactly 60 combinations
    final_combos = clean_df.groupby(group_cols).size().reset_index(name="count")
    print(f"\nFinal check: {len(final_combos)} unique combinations")

    if len(final_combos) == 60:
        print("✓ Success! Exactly 60 unique combinations as expected.")
    else:
        print(f"Warning: Expected 60 combinations but found {len(final_combos)}.")

    # Step 8: Save the clean dataset
    clean_df.to_csv(output_file, index=False)
    file_size_mb = os.path.getsize(output_file) / (1024 * 1024)
    print(f"\nSuccessfully saved merged data to: {output_file}")
    print(f"  • Output file size: {file_size_mb:.2f} MB")
    print("  • Merge operation completed successfully")

    return clean_df

# Run the merge: combine the numbered "-*.csv" result files into one output CSV
if __name__ == "__main__":
    output_file = "../Outer_Correlation/new_outer_correlation_results_per_section/gram3_comparative_results.csv"
    input_pattern = "../Outer_Correlation/new_outer_correlation_results_per_section/gram3_comparative_results-*.csv"
    merge_and_clean_results(input_pattern=input_pattern, output_file=output_file)

  from pandas.core.computation.check import NUMEXPR_INSTALLED


Merging files matching pattern: ../Outer_Correlation/new_outer_correlation_results_per_section/gram3_comparative_results-*.csv
Found 2 files:
  • Found gram3_comparative_results-1.csv
  • Found gram3_comparative_results-2.csv

Concatenating all files...
  • Combined data: 79920 rows

Created 60 canonical parameter combinations

Analyzing existing combinations:
  • Found 60 unique combinations
  • Distribution of combination frequencies: {1332: 60}
  • Found 1332 unique questions

Building clean dataset...

Final check: 60 unique combinations
✓ Success! Exactly 60 unique combinations as expected.

Successfully saved merged data to: ../Outer_Correlation/new_outer_correlation_results_per_section/gram3_comparative_results.csv
  • Output file size: 14.82 MB
  • Merge operation completed successfully


In [2]:
# Reload the merged CSV from disk (same path the merge cell wrote to) for inspection
df = pd.read_csv("../Outer_Correlation/new_outer_correlation_results_per_section/gram3_comparative_results.csv")

In [3]:
# Show the reloaded frame (pandas truncates the display to the first/last rows)
df

Unnamed: 0,word1,word2,word3,true_word,category,category_type,candidate_1,candidate_2,candidate_3,candidate_4,...,candidate_7,candidate_8,candidate_9,candidate_10,freq_subset,quantile,rcond,measure,top@k,overall_accuracy
0,bad,worse,big,bigger,gram3-comparative,syntactic,bigger,larger,biggest,smaller,...,major,better,closer,more,30000,0.01,0.053319,naive_cosine,1,0.939940
1,bad,worse,bright,brighter,gram3-comparative,syntactic,brighter,bleak,better,happier,...,blue,horizon,dire,cleaner,30000,0.01,0.053319,naive_cosine,1,0.939940
2,bad,worse,cheap,cheaper,gram3-comparative,syntactic,cheaper,expensive,less,affordable,...,even,harder,better,easier,30000,0.01,0.053319,naive_cosine,1,0.939940
3,bad,worse,cold,colder,gram3-comparative,syntactic,colder,warmer,chilly,cooler,...,icy,chill,winter,wet,30000,0.01,0.053319,naive_cosine,1,0.939940
4,bad,worse,cool,cooler,gram3-comparative,syntactic,cooler,cooling,better,more,...,calm,heat,harder,warm,30000,0.01,0.053319,naive_cosine,1,0.939940
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79915,young,younger,tight,tighter,gram3-comparative,syntactic,tighter,comfortable,shorter,tightening,...,tougher,relaxed,tough,stronger,30000,0.50,0.086893,mahalanobis_shifted_cosine,10,0.978228
79916,young,younger,tough,tougher,gram3-comparative,syntactic,tougher,harder,difficult,better,...,hard,easier,aggressive,rough,30000,0.50,0.086893,mahalanobis_shifted_cosine,10,0.978228
79917,young,younger,warm,warmer,gram3-comparative,syntactic,warmer,cooler,warming,cool,...,pleasant,relaxing,wet,relax,30000,0.50,0.086893,mahalanobis_shifted_cosine,10,0.978228
79918,young,younger,weak,weaker,gram3-comparative,syntactic,weaker,weakness,stronger,lower,...,negative,strong,favorable,decline,30000,0.50,0.086893,mahalanobis_shifted_cosine,10,0.978228


In [4]:
# List the distinct accuracy values present in the merged data
df['overall_accuracy'].unique()

array([0.93993994, 0.97972973, 0.98873874, 0.99624625, 0.89864865,
       0.94369369, 0.95570571, 0.96996997, 0.89714715, 0.94444444,
       0.95345345, 0.96846847, 0.95645646, 0.9527027 , 0.96921922,
       0.90015015, 0.97072072, 0.8978979 , 0.90465465, 0.9466967 ,
       0.95795796, 0.97522523, 0.9466967 , 0.97372372, 0.90765766,
       0.96471471, 0.97822823, 0.90690691, 0.9512012 , 0.96171171])

In [5]:
# Count how many rows carry each accuracy value, most frequent first
df['overall_accuracy'].value_counts().sort_values(ascending=False)

overall_accuracy
0.939940    6660
0.996246    6660
0.979730    6660
0.944444    6660
0.988739    6660
0.956456    3996
0.969970    2664
0.897147    2664
0.898649    2664
0.900150    2664
0.952703    2664
0.969219    2664
0.978228    2664
0.953453    2664
0.975225    1332
0.955706    1332
0.943694    1332
0.970721    1332
0.897898    1332
0.904655    1332
0.946697    1332
0.946697    1332
0.957958    1332
0.973724    1332
0.907658    1332
0.964715    1332
0.906907    1332
0.951201    1332
0.968468    1332
0.961712    1332
Name: count, dtype: int64

In [6]:
# Number of distinct accuracy values across all rows
df['overall_accuracy'].nunique()

30