In [14]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis, entropy
from sklearn.metrics.pairwise import euclidean_distances
from statsmodels.stats.stattools import medcouple

In [15]:
# Calculate KL divergence
def kl_divergence(p, q):
    return entropy(p, q)

def kl_divergence_column_wise(original, generated):
    kl_divs = []
    for col in original.columns:
        bins = np.histogram(np.hstack((original[col], generated[col])), bins=10)[1]
        p_hist, _ = np.histogram(original[col], bins=bins, density=True)
        q_hist, _ = np.histogram(generated[col], bins=bins, density=True)
        kl_div = entropy(p_hist, q_hist)
        kl_divs.append(kl_div)
    return np.mean(kl_divs)

# def scuffed_average_kl_divergence(df_original, df_generated):
    #     kl_divergences = []
    #     for column in df_original.columns:
    #         # Normalize the columns to get the PDFs
    #         p_pdf = df_original[column] #/ df_original[column].sum()
    #         q_pdf = df_generated[column] #/ df_generated[column].sum()
    #         # Add a small constant to avoid division by zero or log(0)
    #         epsilon = 1e-10
    #         p_pdf += epsilon
    #         q_pdf += epsilon

    #         # Calculate KL divergence and append to the list
    #         kl_div = kl_divergence(p_pdf, q_pdf)
    #         kl_divergences.append(kl_div)

    #     # Calculate the average KL divergence
    #     average_kl = np.mean(kl_divergences)
    #     return average_kl

In [16]:
def mc_metrics(original, generated):
    lmc_diff_list = []
    mc_diff_list = []
    rmc_diff_list = []
    
    for col in original.columns:
        mc_diff = abs(medcouple(original[col])-medcouple(generated[col]))
        mc_diff_list.append(mc_diff)
        
        lmc_original = medcouple(original[col][original[col] <= np.median(original[col])])
        lmc_generated = medcouple(generated[col][generated[col] <= np.median(generated[col])])
        lmc_diff = abs(lmc_original-lmc_generated)
        lmc_diff_list.append(lmc_diff)
        
        rmc_original = medcouple(original[col][original[col] >= np.median(original[col])])
        rmc_generated = medcouple(generated[col][generated[col] >= np.median(generated[col])])
        rmc_diff = abs(rmc_original-rmc_generated)
        rmc_diff_list.append(rmc_diff)
    
    return np.mean(lmc_diff_list), np.mean(mc_diff_list), np.mean(rmc_diff_list)

In [17]:
def hellinger_distance(p, q):
    """Calculate the Hellinger distance between two probability distributions"""
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)) / np.sqrt(2)

def is_discrete(column):
    """Check if a column in a DataFrame is discrete."""
    return pd.api.types.is_integer_dtype(column) or pd.api.types.is_categorical_dtype(column)

def hellinger_distance_column_wise(original, generated):
    hd_diff_list = []
    for col in original.columns:
        # Check if the column is discrete before calculating Hellinger distance
        if is_discrete(original[col]):
            # Normalize the columns to get the probability distributions
            p_hist = original[col].value_counts(normalize=True).sort_index()
            q_hist = generated[col].value_counts(normalize=True).sort_index()

            # If the generated data has new categories, align both series on the same index
            if not p_hist.index.equals(q_hist.index):
                p_hist, q_hist = p_hist.align(q_hist, fill_value=0)

            # Calculate Hellinger distance and append to the list
            hd = hellinger_distance(p_hist, q_hist)
            hd_diff_list.append(hd)

    # Calculate the average of Hellinger distances
    avg_hd_diff = np.mean(hd_diff_list) if hd_diff_list else np.nan
    return avg_hd_diff

# Example usage
# Assuming df_original and df_generated are two pandas DataFrames containing the original and generated data


In [18]:
dataset = "compas" #,"cc", "german"
sampler_list = ["perturbation", "DropoutVAE", "RBF", "forest", "CTGAN"]

for sampler in sampler_list:
    orig_path = rf"C:\Users\shrey\Desktop\DSC 261\DSC-261-FINAL\Data\{dataset}_RBF_train.csv"
    if sampler == "CTGAN":
        generated_path = rf"C:\Users\shrey\Desktop\DSC 261\DSC-261-FINAL\Data\{dataset}_adversarial_train_{sampler}_10000.csv"
    else:
        generated_path = rf"C:\Users\shrey\Desktop\DSC 261\DSC-261-FINAL\Data\{dataset}_adversarial_train_{sampler}.csv"
    data_original = pd.read_csv(orig_path)
    data_generated = pd.read_csv(generated_path)

    # Convert to DataFrame
    df_original = pd.get_dummies(data_original)
    df_original.drop(columns=['response'], inplace=True)
    df_generated = pd.get_dummies(data_generated)

    #mean and std
    avg_abs_diff_mean = abs(df_generated.mean() - df_original.mean()).mean()
    avg_abs_diff_std = abs(df_generated.std() - df_original.std()).mean()

    # Calculate skewness and kurtosis
    skew_original = df_original.skew()
    kurtosis_original = df_original.kurtosis()
    skew_generated = df_generated.skew()
    kurtosis_generated = df_generated.kurtosis()
    # Calculate the average absolute difference of skewness and kurtosis
    avg_abs_diff_skew = np.mean(np.abs(skew_original - skew_generated))
    avg_abs_diff_kurtosis = np.mean(np.abs(kurtosis_original - kurtosis_generated))
    
    # Discretize the data for KL divergence calculation
    bins = np.histogram(np.hstack((df_original.values, df_generated.values)), bins=10)[1]
    p_hist, _ = np.histogram(df_original.values, bins=bins, density=True)
    q_hist, _ = np.histogram(df_generated.values, bins=bins, density=True)
    kl_div = kl_divergence(p_hist, q_hist)
    avg_kl_div = kl_divergence_column_wise(df_original, df_generated)
    lmc, mc, rmc = mc_metrics(df_original, df_generated)
    # scuffed_kl = scuffed_average_kl_divergence(df_original, df_generated)
    avg_hd_diff = hellinger_distance_column_wise(df_original, df_generated)
    


    # Print results
    print("------------------------------------------------------------")
    print(f"----------------{dataset} - {sampler}-----------------------")
    print("Average Absolute Difference in Mean:", avg_abs_diff_mean)
    print("Average Absolute Difference in Standard Deviation:", avg_abs_diff_std)
    print("Average Absolute Difference in Skewness:", avg_abs_diff_skew)
    print("Average Absolute Difference in Kurtosis:", avg_abs_diff_kurtosis)
    print("KL Divergence:", kl_div)
    print("Average KL Divergence:", avg_kl_div)
    # print("Scuffed Average KL Divergence:", scuffed_kl)
    print("LMC:", lmc)
    print("MC:", mc)
    print("RMC:", rmc)
    print(f"Average Hellinger distance over all discrete attributes: {avg_hd_diff}")
    print('')

------------------------------------------------------------
----------------compas - perturbation-----------------------
Average Absolute Difference in Mean: 5.133666298235823
Average Absolute Difference in Standard Deviation: 5.724423320829299
Average Absolute Difference in Skewness: 1.2782660273695632
Average Absolute Difference in Kurtosis: 6.664798982878144
KL Divergence: inf
Average KL Divergence: inf
LMC: 0.6111786722586104
MC: 0.8951914176467276
RMC: 0.4538519186498961
Average Hellinger distance over all discrete attributes: 1.0

------------------------------------------------------------
----------------compas - DropoutVAE-----------------------
Average Absolute Difference in Mean: 1.2291878089501425
Average Absolute Difference in Standard Deviation: 3.8591671080326106
Average Absolute Difference in Skewness: 1.2608807549502552
Average Absolute Difference in Kurtosis: 6.113835481063533
KL Divergence: inf
Average KL Divergence: inf
LMC: 0.29797979797979796
MC: 0.51769442678533

In [7]:
dataset = "cc"
sampler_list = ["perturbation", "DropoutVAE", "RBF", "forest", "CTGAN"]

for sampler in sampler_list:
    orig_path = rf"C:\Users\shrey\Desktop\DSC 261\DSC-261-FINAL\Data\{dataset}_RBF_train.csv"
    if sampler == "CTGAN":
        generated_path = rf"C:\Users\shrey\Desktop\DSC 261\DSC-261-FINAL\Data\{dataset}_adversarial_train_{sampler}_10000.csv"
    else:
        generated_path = rf"C:\Users\shrey\Desktop\DSC 261\DSC-261-FINAL\Data\{dataset}_adversarial_train_{sampler}.csv"
    data_original = pd.read_csv(orig_path)
    data_generated = pd.read_csv(generated_path)

    # Convert to DataFrame
    df_original = pd.get_dummies(data_original)
    df_original.drop(columns=['response'], inplace=True)
    df_generated = pd.get_dummies(data_generated)

    #mean and std
    avg_abs_diff_mean = abs(df_generated.mean() - df_original.mean()).mean()
    avg_abs_diff_std = abs(df_generated.std() - df_original.std()).mean()

    # Calculate skewness and kurtosis
    skew_original = df_original.skew()
    kurtosis_original = df_original.kurtosis()
    skew_generated = df_generated.skew()
    kurtosis_generated = df_generated.kurtosis()
    # Calculate the average absolute difference of skewness and kurtosis
    avg_abs_diff_skew = np.mean(np.abs(skew_original - skew_generated))
    avg_abs_diff_kurtosis = np.mean(np.abs(kurtosis_original - kurtosis_generated))
    
    # Discretize the data for KL divergence calculation
    bins = np.histogram(np.hstack((df_original.values, df_generated.values)), bins=10)[1]
    p_hist, _ = np.histogram(df_original.values, bins=bins, density=True)
    q_hist, _ = np.histogram(df_generated.values, bins=bins, density=True)
    kl_div = kl_divergence(p_hist, q_hist)
    avg_kl_div = kl_divergence_column_wise(df_original, df_generated)
    lmc, mc, rmc = mc_metrics(df_original, df_generated)
    # scuffed_kl = scuffed_average_kl_divergence(df_original, df_generated)


    # Print results
    print("------------------------------------------------------------")
    print(f"----------------{dataset} - {sampler}-----------------------")
    print("Average Absolute Difference in Mean:", avg_abs_diff_mean)
    print("Average Absolute Difference in Standard Deviation:", avg_abs_diff_std)
    print("Average Absolute Difference in Skewness:", avg_abs_diff_skew)
    print("Average Absolute Difference in Kurtosis:", avg_abs_diff_kurtosis)
    print("KL Divergence:", kl_div)
    print("Average KL Divergence:", avg_kl_div)
    # print("Scuffed Average KL Divergence:", scuffed_kl)
    print("LMC:", lmc)
    print("MC:", mc)
    print("RMC:", rmc)
    print('')

------------------------------------------------------------
----------------cc - perturbation-----------------------
Average Absolute Difference in Mean: 6773.018880734391
Average Absolute Difference in Standard Deviation: 8949.496287663773
Average Absolute Difference in Skewness: 3.919930696203487
Average Absolute Difference in Kurtosis: 85.95730477708764
KL Divergence: inf
Average KL Divergence: inf
LMC: 0.19283458745153184
MC: 0.2938267439366068
RMC: 0.1807564578564402

------------------------------------------------------------
----------------cc - DropoutVAE-----------------------
Average Absolute Difference in Mean: 2456.68848301528
Average Absolute Difference in Standard Deviation: 3975.8317393207703
Average Absolute Difference in Skewness: 3.3833548830578923
Average Absolute Difference in Kurtosis: 83.93776883078444
KL Divergence: inf
Average KL Divergence: inf
LMC: 0.304129655500767
MC: 0.32498086112364444
RMC: 0.2825323747520234

--------------------------------------------

In [6]:
dataset = "german"
sampler_list = ["perturbation", "DropoutVAE", "RBF", "forest", "CTGAN"]

for sampler in sampler_list:
    orig_path = rf"C:\Users\shrey\Desktop\DSC 261\DSC-261-FINAL\Data\{dataset}_RBF_train.csv"
    if sampler == "CTGAN":
        generated_path = rf"C:\Users\shrey\Desktop\DSC 261\DSC-261-FINAL\Data\{dataset}_adversarial_train_{sampler}_10000.csv"
    else:
        generated_path = rf"C:\Users\shrey\Desktop\DSC 261\DSC-261-FINAL\Data\{dataset}_adversarial_train_{sampler}.csv"
    data_original = pd.read_csv(orig_path)
    data_generated = pd.read_csv(generated_path)

    # Convert to DataFrame
    df_original = pd.get_dummies(data_original)
    df_original.drop(columns=['response'], inplace=True)
    df_generated = pd.get_dummies(data_generated)

    #mean and std
    avg_abs_diff_mean = abs(df_generated.mean() - df_original.mean()).mean()
    avg_abs_diff_std = abs(df_generated.std() - df_original.std()).mean()

    # Calculate skewness and kurtosis
    skew_original = df_original.skew()
    kurtosis_original = df_original.kurtosis()
    skew_generated = df_generated.skew()
    kurtosis_generated = df_generated.kurtosis()
    # Calculate the average absolute difference of skewness and kurtosis
    avg_abs_diff_skew = np.mean(np.abs(skew_original - skew_generated))
    avg_abs_diff_kurtosis = np.mean(np.abs(kurtosis_original - kurtosis_generated))
    
    # Discretize the data for KL divergence calculation
    bins = np.histogram(np.hstack((df_original.values, df_generated.values)), bins=10)[1]
    p_hist, _ = np.histogram(df_original.values, bins=bins, density=True)
    q_hist, _ = np.histogram(df_generated.values, bins=bins, density=True)
    kl_div = kl_divergence(p_hist, q_hist)
    avg_kl_div = kl_divergence_column_wise(df_original, df_generated)
    lmc, mc, rmc = mc_metrics(df_original, df_generated)
    # scuffed_kl = scuffed_average_kl_divergence(df_original, df_generated)


    # Print results
    print("------------------------------------------------------------")
    print(f"----------------{dataset} - {sampler}-----------------------")
    print("Average Absolute Difference in Mean:", avg_abs_diff_mean)
    print("Average Absolute Difference in Standard Deviation:", avg_abs_diff_std)
    print("Average Absolute Difference in Skewness:", avg_abs_diff_skew)
    print("Average Absolute Difference in Kurtosis:", avg_abs_diff_kurtosis)
    print("KL Divergence:", kl_div)
    print("Average KL Divergence:", avg_kl_div)
    # print("Scuffed Average KL Divergence:", scuffed_kl)
    print("LMC:", lmc)
    print("MC:", mc)
    print("RMC:", rmc)
    print('')

------------------------------------------------------------
----------------german - perturbation-----------------------
Average Absolute Difference in Mean: 107.2327113020077
Average Absolute Difference in Standard Deviation: 91.73495791640458
Average Absolute Difference in Skewness: 1.7681751114471316
Average Absolute Difference in Kurtosis: 4.564795716115089
KL Divergence: inf
Average KL Divergence: inf
LMC: 0.3329221362022862
MC: 0.8352047793897254
RMC: 0.609383959977203

------------------------------------------------------------
----------------german - DropoutVAE-----------------------
Average Absolute Difference in Mean: 23.044910394265234
Average Absolute Difference in Standard Deviation: 46.156809756611935
Average Absolute Difference in Skewness: 4.121523597110049
Average Absolute Difference in Kurtosis: 55.45032495910259
KL Divergence: inf
Average KL Divergence: inf
LMC: 0.24268127806936687
MC: 0.8326644742840824
RMC: 0.6826748523279492

-----------------------------------

In [13]:
dataset = "german"
sampler_list = ["perturbation", "DropoutVAE", "RBF", "forest", "CTGAN"]

for sampler in sampler_list:
    orig_path = rf"C:\Users\shrey\Desktop\DSC 261\DSC-261-FINAL\Data\shap_{dataset}_RBF_train.csv"
    generated_path = rf"C:\Users\shrey\Desktop\DSC 261\DSC-261-FINAL\Data\shap_{dataset}_adversarial_train_{sampler}.csv"
    data_original = pd.read_csv(orig_path)
    data_generated = pd.read_csv(generated_path)

    # Convert to DataFrame
    df_original = pd.get_dummies(data_original)
    df_original.drop(columns=['response'], inplace=True)
    df_original = df_original.sample(n=100, random_state=42)
    df_generated = pd.get_dummies(data_generated)
    if sampler == "perturbation":
        df_generated = df_generated.sample(n=100, random_state=42)
    

    #mean and std
    avg_abs_diff_mean = abs(df_generated.mean() - df_original.mean()).mean()
    avg_abs_diff_std = abs(df_generated.std() - df_original.std()).mean()

    # Calculate skewness and kurtosis
    skew_original = df_original.skew()
    kurtosis_original = df_original.kurtosis()
    skew_generated = df_generated.skew()
    kurtosis_generated = df_generated.kurtosis()
    # Calculate the average absolute difference of skewness and kurtosis
    avg_abs_diff_skew = np.mean(np.abs(skew_original - skew_generated))
    avg_abs_diff_kurtosis = np.mean(np.abs(kurtosis_original - kurtosis_generated))
    
    # Discretize the data for KL divergence calculation
    bins = np.histogram(np.hstack((df_original.values, df_generated.values)), bins=10)[1]
    p_hist, _ = np.histogram(df_original.values, bins=bins, density=True)
    q_hist, _ = np.histogram(df_generated.values, bins=bins, density=True)
    kl_div = kl_divergence(p_hist, q_hist)
    avg_kl_div = kl_divergence_column_wise(df_original, df_generated)
    lmc, mc, rmc = mc_metrics(df_original, df_generated)
    # scuffed_kl = scuffed_average_kl_divergence(df_original, df_generated)


    # Print results
    print("------------------------------------------------------------")
    print(f"----------------{dataset} - {sampler}-----------------------")
    print("Average Absolute Difference in Mean:", avg_abs_diff_mean)
    print("Average Absolute Difference in Standard Deviation:", avg_abs_diff_std)
    print("Average Absolute Difference in Skewness:", avg_abs_diff_skew)
    print("Average Absolute Difference in Kurtosis:", avg_abs_diff_kurtosis)
    print("KL Divergence:", kl_div)
    print("Average KL Divergence:", avg_kl_div)
    # print("Scuffed Average KL Divergence:", scuffed_kl)
    print("LMC:", lmc)
    print("MC:", mc)
    print("RMC:", rmc)
    print('')

------------------------------------------------------------
----------------german - perturbation-----------------------
Average Absolute Difference in Mean: 80.41806451612905
Average Absolute Difference in Standard Deviation: 74.2928565635607
Average Absolute Difference in Skewness: 1.6691132123801522
Average Absolute Difference in Kurtosis: 14.368515549307283
KL Divergence: inf
Average KL Divergence: inf
LMC: 0.14693382318928397
MC: 0.2221062896855537
RMC: 0.2812498927543176

------------------------------------------------------------
----------------german - DropoutVAE-----------------------
Average Absolute Difference in Mean: 27.276129032258073
Average Absolute Difference in Standard Deviation: 25.264379212123497
Average Absolute Difference in Skewness: 1.9512612489124639
Average Absolute Difference in Kurtosis: 13.83960814061914
KL Divergence: inf
Average KL Divergence: inf
LMC: 0.29850591611533045
MC: 0.5279652359301037
RMC: 0.42257680204303927

-------------------------------