In [1]:
import pandas as pd
import numpy as np
import glob
import os

def merge_and_clean_results(input_pattern, output_file):
    """
    Merge multiple result CSV files onto a canonical grid of parameter combinations.

    The grid is 5 rcond values x 3 measures x 4 top@k values (60 combinations).
    For every combination on the grid:

    1. The rows of that combination found in the inputs are kept, and their
       ``overall_accuracy`` is overwritten with the mean ``overall_accuracy``
       of all input rows sharing the combination.
    2. If the inputs hold no row for the combination, rows are borrowed from
       the same (measure, top@k) under other rcond values (falling back to
       the whole input), reduced to one row per question and relabelled.
       The accuracy is then estimated from the closest available average:
       same (measure, top@k) first, then same measure, then everything.

    Rows whose rcond is not on the grid are dropped from the output.

    Args:
        input_pattern: Glob pattern to match input files (e.g., "*.csv")
        output_file: Path to save the merged and cleaned file

    Returns:
        The merged DataFrame that was written to ``output_file``, or None when
        no file matches ``input_pattern`` or no rcond value is available.

    Raises:
        KeyError: If the combined input lacks any of the required columns.
    """
    print(f"Merging files matching pattern: {input_pattern}")

    # Step 1: Load all CSV files (sorted so the merge order is reproducible)
    all_files = sorted(glob.glob(input_pattern))
    if not all_files:
        print(f"Error: No files found matching {input_pattern}")
        return

    print(f"Found {len(all_files)} files:")
    for file in all_files:
        print(f"  • Found {os.path.basename(file)}")

    all_dfs = [pd.read_csv(file) for file in all_files]

    # Step 2: Concatenate all files
    print("\nConcatenating all files...")
    full_df = pd.concat(all_dfs, ignore_index=True)
    print(f"  • Combined data: {len(full_df)} rows")

    group_cols = ["rcond", "measure", "top@k"]
    question_key_cols = ["word1", "word2", "word3", "true_word", "category", "category_type"]

    # Fail early with one readable message instead of a KeyError deep in the merge.
    required_cols = group_cols + ["overall_accuracy"] + question_key_cols
    missing_cols = [col for col in required_cols if col not in full_df.columns]
    if missing_cols:
        raise KeyError(f"Input files are missing required columns: {missing_cols}")

    # Step 3: Identify the canonical 60 combinations
    # NaN rcond values can never be matched with ==, so keep them off the grid
    # (value_counts below ignores them as well).
    rcond_values = sorted(full_df["rcond"].dropna().unique())
    measure_values = ["naive_cosine", "mahalanobis_cosine", "mahalanobis_shifted_cosine"]
    topk_values = [1, 3, 5, 10]

    # Ensure we have exactly 5 rcond values
    if len(rcond_values) != 5:
        print(f"Warning: Found {len(rcond_values)} rcond values instead of 5")
        # Take the 5 most common rcond values, sorted like the regular path so
        # the output ordering does not depend on row frequencies.
        rcond_counts = full_df["rcond"].value_counts()
        rcond_values = sorted(rcond_counts.head(5).index)

    canonical_combos = [
        (rcond, measure, topk)
        for rcond in rcond_values
        for measure in measure_values
        for topk in topk_values
    ]

    print(f"\nCreated {len(canonical_combos)} canonical parameter combinations")

    if not canonical_combos:
        # Nothing to build; pd.concat on an empty list would raise otherwise.
        print("Error: No usable rcond values found in the input data")
        return

    # Step 4: Group by the key parameters and compute average accuracy
    avg_accuracy = full_df.groupby(group_cols)["overall_accuracy"].mean().reset_index()

    print("\nAnalyzing existing combinations:")
    unique_combos = full_df.groupby(group_cols).size().reset_index(name="count")
    print(f"  • Found {len(unique_combos)} unique combinations")
    combo_counts = unique_combos["count"].value_counts().to_dict()
    print(f"  • Distribution of combination frequencies: {combo_counts}")

    # Step 5: Count the distinct questions across all parameters
    unique_questions = full_df[question_key_cols].drop_duplicates()
    print(f"  • Found {len(unique_questions)} unique questions")

    # Step 6: Build the final dataset with exactly the canonical combinations
    print("\nBuilding clean dataset...")

    result_parts = []
    for rcond, measure, topk in canonical_combos:
        same_measure = avg_accuracy["measure"] == measure
        same_topk = avg_accuracy["top@k"] == topk
        acc_match = avg_accuracy[(avg_accuracy["rcond"] == rcond) & same_measure & same_topk]

        if len(acc_match) == 0:
            print(f"  • Warning: No data found for combination: rcond={rcond}, measure={measure}, top@k={topk}")
            # Estimate from the closest available averages. Matching top@k first
            # avoids mixing e.g. top@1 with top@10 accuracy, and mirrors the row
            # fallback below, which also borrows from the same (measure, top@k).
            fallback_pools = (
                avg_accuracy.loc[same_measure & same_topk, "overall_accuracy"],
                avg_accuracy.loc[same_measure, "overall_accuracy"],
                avg_accuracy["overall_accuracy"],
            )
            accuracy = np.nan
            for pool in fallback_pools:
                accuracy = pool.mean()
                if not np.isnan(accuracy):
                    break
        else:
            accuracy = acc_match.iloc[0]["overall_accuracy"]

        # Find all questions for this combination
        combo_data = full_df[
            (full_df["rcond"] == rcond) &
            (full_df["measure"] == measure) &
            (full_df["top@k"] == topk)
        ]

        if len(combo_data) == 0:
            print(f"  • Warning: No question data found for combination: rcond={rcond}, measure={measure}, top@k={topk}")
            # Borrow rows from the same (measure, top@k) under any rcond;
            # if there are none, borrow from the whole input.
            any_combo_data = full_df[
                (full_df["measure"] == measure) &
                (full_df["top@k"] == topk)
            ]
            combo_data = any_combo_data if len(any_combo_data) > 0 else full_df

            # Keep one row per question
            combo_data = combo_data.drop_duplicates(subset=question_key_cols)

        # Work on a copy so full_df is never modified, then stamp the
        # combination's parameters and its accuracy on every row.
        combo_data = combo_data.copy()
        combo_data["rcond"] = rcond
        combo_data["measure"] = measure
        combo_data["top@k"] = topk
        combo_data["overall_accuracy"] = accuracy

        result_parts.append(combo_data)

    # Combine all parts
    clean_df = pd.concat(result_parts, ignore_index=True)

    # Step 7: Verify we have exactly 60 combinations
    final_combos = clean_df.groupby(group_cols).size().reset_index(name="count")
    print(f"\nFinal check: {len(final_combos)} unique combinations")

    if len(final_combos) == 60:
        print("✓ Success! Exactly 60 unique combinations as expected.")
    else:
        print(f"Warning: Expected 60 combinations but found {len(final_combos)}.")

    # Step 8: Save the clean dataset
    clean_df.to_csv(output_file, index=False)
    file_size_mb = os.path.getsize(output_file) / (1024 * 1024)
    print(f"\nSuccessfully saved merged data to: {output_file}")
    print(f"  • Output file size: {file_size_mb:.2f} MB")
    print("  • Merge operation completed successfully")

    return clean_df

# Run the merge for the city-in-state result parts when this cell/script is executed directly
if __name__ == "__main__":
    # Numbered part files to merge (city_in_state_results-<n>.csv)
    input_pattern = "../Outer_Correlation/new_outer_correlation_results_per_section/city_in_state_results-*.csv"
    # Single merged file written next to the parts
    output_file = "../Outer_Correlation/new_outer_correlation_results_per_section/city_in_state_results.csv"
    merge_and_clean_results(input_pattern=input_pattern, output_file=output_file)

  from pandas.core.computation.check import NUMEXPR_INSTALLED


Merging files matching pattern: ../Outer_Correlation/new_outer_correlation_results_per_section/city_in_state_results-*.csv
Found 4 files:
  • Found city_in_state_results-1.csv
  • Found city_in_state_results-2.csv
  • Found city_in_state_results-3.csv
  • Found city_in_state_results-4.csv

Concatenating all files...
  • Combined data: 139800 rows

Created 60 canonical parameter combinations

Analyzing existing combinations:
  • Found 108 unique combinations
  • Distribution of combination frequencies: {569: 48, 1761: 48, 2330: 12}
  • Found 2330 unique questions

Building clean dataset...

Final check: 60 unique combinations
✓ Success! Exactly 60 unique combinations as expected.

Successfully saved merged data to: ../Outer_Correlation/new_outer_correlation_results_per_section/city_in_state_results.csv
  • Output file size: 22.44 MB
  • Merge operation completed successfully


In [2]:
# Reload the merged CSV from disk (the same path the merge step wrote) to inspect what was saved.
df = pd.read_csv("../Outer_Correlation/new_outer_correlation_results_per_section/city_in_state_results.csv")

In [3]:
# Display the reloaded frame (the notebook truncates the middle rows and columns).
df

Unnamed: 0,word1,word2,word3,true_word,category,category_type,candidate_1,candidate_2,candidate_3,candidate_4,...,candidate_7,candidate_8,candidate_9,candidate_10,freq_subset,quantile,rcond,measure,top@k,overall_accuracy
0,chicago,illinois,houston,texas,city-in-state,semantic,Texas,State,state,nation,...,County,Washington,country,federal,30000,0.25,0.071726,naive_cosine,1,0.956652
1,chicago,illinois,philadelphia,pennsylvania,city-in-state,semantic,Pennsylvania,New_Jersey,Indiana,Ohio,...,Tennessee,Michigan,Carolina,Iowa,30000,0.25,0.071726,naive_cosine,1,0.956652
2,chicago,illinois,phoenix,arizona,city-in-state,semantic,Arizona,Ohio,Michigan,California,...,State,Mexico,states,state,30000,0.25,0.071726,naive_cosine,1,0.956652
3,chicago,illinois,dallas,texas,city-in-state,semantic,Texas,state,State,County,...,America,federal,Washington,Iraq,30000,0.25,0.071726,naive_cosine,1,0.956652
4,chicago,illinois,jacksonville,florida,city-in-state,semantic,Florida,State,Texas,state,...,nation,South,Smith,Department,30000,0.25,0.071726,naive_cosine,1,0.956652
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112483,glendale,arizona,louisville,kentucky,city-in-state,semantic,Kentucky,Tennessee,North_Carolina,Oklahoma,...,Indiana,Iowa,Oregon,Colorado,30000,0.50,0.086893,mahalanobis_shifted_cosine,10,0.964225
112484,glendale,arizona,milwaukee,wisconsin,city-in-state,semantic,Minnesota,Wisconsin,Iowa,Indiana,...,North_Carolina,Seattle,Michigan,Carolina,30000,0.50,0.086893,mahalanobis_shifted_cosine,10,0.964225
112485,glendale,arizona,portland,oregon,city-in-state,semantic,Oregon,Seattle,North_Carolina,Minnesota,...,Iowa,Carolina,Texas,Florida,30000,0.50,0.086893,mahalanobis_shifted_cosine,10,0.964225
112486,glendale,arizona,fresno,california,city-in-state,semantic,Texas,Florida,California,Washington,...,Bush,America,American,nation,30000,0.50,0.086893,mahalanobis_shifted_cosine,10,0.964225


In [4]:
# List the distinct overall_accuracy values present in the merged file.
df['overall_accuracy'].unique()

array([0.95665236, 0.97854077, 0.98369099, 0.99012876, 0.87811159,
       0.95150215, 0.95922747, 0.972103  , 0.87467811, 0.94420601,
       0.95407725, 0.96523605, 0.95911414, 0.97614991, 0.98012493,
       0.98693924, 0.88756388, 0.95116411, 0.95627484, 0.97103918,
       0.88302101, 0.94548552, 0.96195344, 0.88813174, 0.89835321,
       0.95173197, 0.95741056, 0.89210676, 0.94605338, 0.95343555,
       0.96422487])

In [5]:
# Count how many rows carry each overall_accuracy value, largest count first.
df['overall_accuracy'].value_counts().sort_values(ascending=False)

overall_accuracy
0.951164    10566
0.959114     7044
0.971039     7044
0.980125     7044
0.976150     7044
0.986939     7044
0.956275     5283
0.883021     5283
0.945486     5283
0.961953     5283
0.887564     3522
0.878112     2330
0.944206     2330
0.874678     2330
0.972103     2330
0.959227     2330
0.983691     2330
0.954077     2330
0.990129     2330
0.951502     2330
0.965236     2330
0.956652     2330
0.978541     2330
0.888132     1761
0.898353     1761
0.951732     1761
0.957411     1761
0.892107     1761
0.946053     1761
0.953436     1761
0.964225     1761
Name: count, dtype: int64

In [6]:
# Number of distinct overall_accuracy values in the merged file.
df['overall_accuracy'].nunique()

31