# In [1]:
import os
import pandas as pd
from scipy.stats import wasserstein_distance
import ast

def unpack_csv_files(folder_path):
    """Unpack experimental CSV files and extract frequency histograms.

    Every file in ``folder_path`` whose name ends with ``.csv`` is scanned
    for a line that reads exactly ``Degree,Frequency``. Each non-blank line
    after that header is parsed as a ``degree,frequency`` pair of integers.

    Args:
        folder_path: Directory containing the experimental CSV files.

    Returns:
        A dict mapping each CSV file name to a list of
        ``(degree, frequency)`` tuples in file order, with degrees 0 and 4
        left out. A file without a ``Degree,Frequency`` header maps to an
        empty list.

    Raises:
        ValueError: If a non-blank line after the header is not exactly two
            comma-separated integers.
    """
    ignored_degrees = {0, 4}  # Ignore 0 and 4 nearest neighbors
    results = {}
    for file in os.listdir(folder_path):
        if not file.endswith(".csv"):
            continue

        file_path = os.path.join(folder_path, file)
        frequency_data = []
        is_frequency_section = False
        with open(file_path, "r") as f:
            for line in f:
                line = line.strip()
                if line == "Degree,Frequency":
                    is_frequency_section = True
                    continue
                # Blank lines (typically a trailing newline at the end of the
                # file) carry no data; splitting them would yield a single
                # field and break the two-value unpacking below.
                if not is_frequency_section or not line:
                    continue
                degree, frequency = line.split(",")
                degree = int(degree)
                if degree not in ignored_degrees:
                    frequency_data.append((degree, int(frequency)))
        results[file] = frequency_data
    return results

def compare_histograms(simulated_file, experimental_folder):
    """Compare simulated histograms with experimental histograms.

    Every row of the simulated CSV is compared against every experimental
    histogram found in ``experimental_folder`` using the 1-D Wasserstein
    distance, with neighbour degrees as the support and frequencies as the
    weights.

    Args:
        simulated_file: Path to a CSV with the columns
            ``"Averaged Neighbour Frequency"`` (a dict literal mapping
            degree to frequency), ``"Diffusion Energy"``,
            ``"Rotation Energy"`` and ``"Coupling Energy"``.
        experimental_folder: Directory passed to ``unpack_csv_files``.

    Returns:
        A ``pandas.DataFrame`` with one row per (simulated row, experimental
        file) pair and the columns ``"Diffusion Energy"``,
        ``"Rotation Energy"``, ``"Coupling Energy"``,
        ``"Experimental File"`` and ``"Wasserstein Distance"``. The distance
        is NaN when either histogram has no positive total frequency.
    """
    # Load simulated data
    simulated_data = pd.read_csv(simulated_file)

    # Split every experimental histogram into its degree values and its
    # frequencies once, instead of redoing it for each simulated row.
    experimental_histograms = {
        exp_file: (
            [degree for degree, _ in exp_data],
            [freq for _, freq in exp_data],
        )
        for exp_file, exp_data in unpack_csv_files(experimental_folder).items()
    }

    # Store comparison results
    comparison_results = []

    for _, row in simulated_data.iterrows():
        # Parse the Averaged Neighbour Frequency as a dictionary
        sim_histogram = ast.literal_eval(row["Averaged Neighbour Frequency"])

        # Dense simulated histogram over degrees 1..max_degree; an empty
        # dictionary yields an empty histogram instead of a max() error.
        # NOTE(review): unpack_csv_files drops degree 4 from the experimental
        # data while degree 4 is kept here - confirm this asymmetry is wanted.
        max_degree = max(sim_histogram, default=0)
        sim_degrees = list(range(1, max_degree + 1))
        sim_weights = [sim_histogram.get(degree, 0) for degree in sim_degrees]

        # Compare with each experimental histogram
        for exp_file, (exp_degrees, exp_weights) in experimental_histograms.items():
            if sum(sim_weights) > 0 and sum(exp_weights) > 0:
                # Use the real degree values as the support so frequencies
                # stay attached to their degree even when the experimental
                # data has gaps (filtered or missing degrees). For gap-free
                # data starting at degree 1 this matches positional indexing,
                # because the distance is translation invariant.
                distance = wasserstein_distance(
                    sim_degrees, exp_degrees, sim_weights, exp_weights
                )
            else:
                # wasserstein_distance rejects empty or zero-weight
                # distributions; record NaN so one unusable histogram does
                # not abort the whole batch.
                distance = float("nan")

            # Save results
            comparison_results.append({
                "Diffusion Energy": row["Diffusion Energy"],
                "Rotation Energy": row["Rotation Energy"],
                "Coupling Energy": row["Coupling Energy"],
                "Experimental File": exp_file,
                "Wasserstein Distance": distance,
            })

    # Convert results to a DataFrame
    return pd.DataFrame(comparison_results)

# Example usage: compare every simulated parameter set against every
# experimental histogram and write the distances to a CSV file.
simulated_csv = r"C:\Users\User\Desktop\2D_KMC\data\cluster_results.csv"  # Replace with your simulated data file
experimental_folder = r"C:\Users\User\Desktop\research-updates\NetworkQualityAnalysis"  # Replace with your folder path
output_csv = "wasserstein-distance-experiment-vs-simulations.csv"

# Perform the comparison
comparison_results = compare_histograms(simulated_csv, experimental_folder)

# Save results; the message reports the file that was actually written
comparison_results.to_csv(output_csv, index=False)
print(f"Comparison results saved to '{output_csv}'")


# Notebook cell output (warning text, truncated in the export):
#   from pandas.core import (
#
# Notebook cell output:
# Comparison results saved to 'comparison_results.csv'
