In [19]:
import pandas as pd
import os
from typing import List, Dict

In [20]:
def read_csv_files(file_paths: List[str]) -> Dict[str, pd.DataFrame]:
    """
    Reads multiple CSV files and returns them as a dictionary of DataFrames.
    """
    dataframes = {}
    for file_path in file_paths:
        try:
            dataframes[file_path] = pd.read_csv(file_path)
        except Exception as e:
            print(f"Error reading file {file_path}: {e}")
    return dataframes

In [21]:
def compare_entries(dataframes: Dict[str, pd.DataFrame]) -> pd.DataFrame:
    """
    Compares the number of entries across DataFrames.
    """
    comparison = {
        "File Name": [],
        "Number of Entries": []
    }
    
    for file_name, df in dataframes.items():
        comparison["File Name"].append(file_name)
        comparison["Number of Entries"].append(len(df))
    
    return pd.DataFrame(comparison)

In [22]:
def compare_feature_counts(dataframes: Dict[str, pd.DataFrame], feature: str) -> pd.DataFrame:
    """
    Compares feature-based counts across DataFrames.
    """
    comparison = {}
    
    for file_name, df in dataframes.items():
        if feature in df.columns:
            feature_counts = df[feature].value_counts()
            comparison[file_name] = feature_counts
        else:
            print(f"Feature '{feature}' not found in {file_name}.")
    
    # Combine counts into a single DataFrame
    result = pd.concat(comparison, axis=1).fillna(0)
    result.columns = [f"{col} Count" for col in result.columns]
    return result

In [23]:
# Define file paths (update paths accordingly in Jupyter Notebook)
folder_path = "E:/FAU/Thesis/Code/fl_client_splitting_shuvanon/fl_base_code/experiments/Baseline_feature_basednonenone_2025_01_07_04_39_32/data/"  # Change this to your folder containing CSV files
file_paths = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.csv')]

# Read the files
dataframes = read_csv_files(file_paths)

In [24]:
entries_comparison = compare_entries(dataframes)
print("Number of Entries Comparison:")
print(entries_comparison)

Number of Entries Comparison:
                                           File Name  Number of Entries
0  E:/FAU/Thesis/Code/fl_client_splitting_shuvano...               5772
1  E:/FAU/Thesis/Code/fl_client_splitting_shuvano...               5772
2  E:/FAU/Thesis/Code/fl_client_splitting_shuvano...               5772
3  E:/FAU/Thesis/Code/fl_client_splitting_shuvano...               5772
4  E:/FAU/Thesis/Code/fl_client_splitting_shuvano...               5771


In [25]:
# Compare feature counts for a specific feature
feature = "is_PseudoFehler"  # Example feature to compare
feature_comparison = compare_feature_counts(dataframes, feature)
print(f"\nFeature Count Comparison for '{feature}':")
print(feature_comparison)


Feature Count Comparison for 'is_PseudoFehler':
                 E:/FAU/Thesis/Code/fl_client_splitting_shuvanon/fl_base_code/experiments/Baseline_feature_basednonenone_2025_01_07_04_39_32/data/data0.csv Count  \
is_PseudoFehler                                                                                                                                                     
0                                                             2974                                                                                                  
1                                                             2798                                                                                                  

                 E:/FAU/Thesis/Code/fl_client_splitting_shuvanon/fl_base_code/experiments/Baseline_feature_basednonenone_2025_01_07_04_39_32/data/data1.csv Count  \
is_PseudoFehler                                                                                                              

In [26]:
def generate_feature_tables(dataframes: Dict[str, pd.DataFrame]) -> Dict[str, pd.DataFrame]:
    """
    Generates a table for all files and all features, showing counts for each unique value.
    """
    tables = {}

    for feature in dataframes[list(dataframes.keys())[0]].columns:  # Get features from the first dataframe
        feature_table = {}
        for file_name, df in dataframes.items():
            if feature in df.columns:
                feature_counts = df[feature].value_counts()
                feature_table[file_name] = feature_counts
            else:
                print(f"Feature '{feature}' not found in {file_name}.")
        
        # Combine counts into a single DataFrame
        combined_table = pd.concat(feature_table, axis=1).fillna(0)
        combined_table.columns = [f"{file} Count" for file in combined_table.columns.get_level_values(0)]
        tables[feature] = combined_table

    return tables


In [27]:
feature_tables = generate_feature_tables(dataframes)

In [28]:
# Display tables for each feature
for feature, table in feature_tables.items():
    print(f"\nFeature Table for '{feature}':")
    print(table)


Feature Table for 'imageIndex':
            E:/FAU/Thesis/Code/fl_client_splitting_shuvanon/fl_base_code/experiments/Baseline_feature_basednonenone_2025_01_07_04_39_32/data/data0.csv Count  \
imageIndex                                                                                                                                                     
17556                                                     1.0                                                                                                  
15292                                                     1.0                                                                                                  
6403                                                      1.0                                                                                                  
2528                                                      1.0                                                                                                  
28235  