In [16]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis, entropy
from sklearn.metrics.pairwise import euclidean_distances
from statsmodels.stats.stattools import medcouple

In [17]:
# Calculate KL divergence
def kl_divergence(p, q):
    return entropy(p, q)

def kl_divergence_column_wise(original, generated):
    kl_divs = []
    for col in original.columns:
        bins = np.histogram(np.hstack((original[col], generated[col])), bins=10)[1]
        p_hist, _ = np.histogram(original[col], bins=bins, density=True)
        q_hist, _ = np.histogram(generated[col], bins=bins, density=True)
        kl_div = entropy(p_hist, q_hist)
        kl_divs.append(kl_div)
    return np.mean(kl_divs)

# def scuffed_average_kl_divergence(df_original, df_generated):
    #     kl_divergences = []
    #     for column in df_original.columns:
    #         # Normalize the columns to get the PDFs
    #         p_pdf = df_original[column] #/ df_original[column].sum()
    #         q_pdf = df_generated[column] #/ df_generated[column].sum()
    #         # Add a small constant to avoid division by zero or log(0)
    #         epsilon = 1e-10
    #         p_pdf += epsilon
    #         q_pdf += epsilon

    #         # Calculate KL divergence and append to the list
    #         kl_div = kl_divergence(p_pdf, q_pdf)
    #         kl_divergences.append(kl_div)

    #     # Calculate the average KL divergence
    #     average_kl = np.mean(kl_divergences)
    #     return average_kl

In [18]:
def mc_metrics(original, generated):
    lmc_diff_list = []
    mc_diff_list = []
    rmc_diff_list = []
    
    for col in original.columns:
        mc_diff = abs(medcouple(original[col])-medcouple(generated[col]))
        mc_diff_list.append(mc_diff)
        
        lmc_original = medcouple(original[col][original[col] <= np.median(original[col])])
        lmc_generated = medcouple(generated[col][generated[col] <= np.median(generated[col])])
        lmc_diff = abs(lmc_original-lmc_generated)
        lmc_diff_list.append(lmc_diff)
        
        rmc_original = medcouple(original[col][original[col] >= np.median(original[col])])
        rmc_generated = medcouple(generated[col][generated[col] >= np.median(generated[col])])
        rmc_diff = abs(rmc_original-rmc_generated)
        rmc_diff_list.append(rmc_diff)
    
    return np.mean(lmc_diff_list), np.mean(mc_diff_list), np.mean(rmc_diff_list)

In [19]:
dataset = "compas" #,"cc", "german"
sampler_list = ["perturbation", "DropoutVAE", "RBF", "forest", "CTGAN"]

for sampler in sampler_list:
    orig_path = rf"C:\Users\shrey\Desktop\DSC 261\DSC-261-FINAL\Data\{dataset}_RBF_train.csv"
    generated_path = rf"C:\Users\shrey\Desktop\DSC 261\DSC-261-FINAL\Data\{dataset}_adversarial_train_{sampler}.csv"
    data_original = pd.read_csv(orig_path)
    data_generated = pd.read_csv(generated_path)

    # Convert to DataFrame
    df_original = pd.get_dummies(data_original)
    df_original.drop(columns=['response'], inplace=True)
    df_generated = pd.get_dummies(data_generated)

    #mean and std
    avg_abs_diff_mean = abs(df_generated.mean() - df_original.mean()).mean()
    avg_abs_diff_std = abs(df_generated.std() - df_original.std()).mean()

    # Calculate skewness and kurtosis
    skew_original = df_original.skew()
    kurtosis_original = df_original.kurtosis()
    skew_generated = df_generated.skew()
    kurtosis_generated = df_generated.kurtosis()
    # Calculate the average absolute difference of skewness and kurtosis
    avg_abs_diff_skew = np.mean(np.abs(skew_original - skew_generated))
    avg_abs_diff_kurtosis = np.mean(np.abs(kurtosis_original - kurtosis_generated))
    
    # Discretize the data for KL divergence calculation
    bins = np.histogram(np.hstack((df_original.values, df_generated.values)), bins=10)[1]
    p_hist, _ = np.histogram(df_original.values, bins=bins, density=True)
    q_hist, _ = np.histogram(df_generated.values, bins=bins, density=True)
    kl_div = kl_divergence(p_hist, q_hist)
    avg_kl_div = kl_divergence_column_wise(df_original, df_generated)
    lmc, mc, rmc = mc_metrics(df_original, df_generated)
    # scuffed_kl = scuffed_average_kl_divergence(df_original, df_generated)


    # Print results
    print("------------------------------------------------------------")
    print(f"----------------{dataset} - {sampler}-----------------------")
    print("Average Absolute Difference in Mean:", avg_abs_diff_mean)
    print("Average Absolute Difference in Standard Deviation:", avg_abs_diff_std)
    print("Average Absolute Difference in Skewness:", avg_abs_diff_skew)
    print("Average Absolute Difference in Kurtosis:", avg_abs_diff_kurtosis)
    print("KL Divergence:", kl_div)
    print("Average KL Divergence:", avg_kl_div)
    # print("Scuffed Average KL Divergence:", scuffed_kl)
    print("LMC:", lmc)
    print("MC:", mc)
    print("RMC:", rmc)
    print('')

------------------------------------------------------------
----------------compas - perturbation-----------------------
Average Absolute Difference in Mean: 5.133666298235823
Average Absolute Difference in Standard Deviation: 5.724423320829299
Average Absolute Difference in Skewness: 1.2782660273695632
Average Absolute Difference in Kurtosis: 6.664798982878144
KL Divergence: inf
Average KL Divergence: inf
LMC: 0.6111786722586104
MC: 0.8951914176467276
RMC: 0.4538519186498961

------------------------------------------------------------
----------------compas - DropoutVAE-----------------------
Average Absolute Difference in Mean: 1.254607653779422
Average Absolute Difference in Standard Deviation: 3.8597245643750537
Average Absolute Difference in Skewness: 1.2730919479135403
Average Absolute Difference in Kurtosis: 6.1490212569598945
KL Divergence: inf
Average KL Divergence: inf
LMC: 0.29797979797979796
MC: 0.5163170163170162
RMC: 0.30707070707070705

--------------------------------

In [28]:
dataset = "cc"
sampler_list = ["perturbation", "DropoutVAE", "RBF", "forest", "CTGAN"]

for sampler in sampler_list:
    orig_path = rf"C:\Users\shrey\Desktop\DSC 261\DSC-261-FINAL\Data\{dataset}_RBF_train.csv"
    generated_path = rf"C:\Users\shrey\Desktop\DSC 261\DSC-261-FINAL\Data\{dataset}_adversarial_train_{sampler}.csv"
    data_original = pd.read_csv(orig_path)
    data_generated = pd.read_csv(generated_path)

    # Convert to DataFrame
    df_original = pd.get_dummies(data_original)
    df_original.drop(columns=['response'], inplace=True)
    df_generated = pd.get_dummies(data_generated)

    #mean and std
    avg_abs_diff_mean = abs(df_generated.mean() - df_original.mean()).mean()
    avg_abs_diff_std = abs(df_generated.std() - df_original.std()).mean()

    # Calculate skewness and kurtosis
    skew_original = df_original.skew()
    kurtosis_original = df_original.kurtosis()
    skew_generated = df_generated.skew()
    kurtosis_generated = df_generated.kurtosis()
    # Calculate the average absolute difference of skewness and kurtosis
    avg_abs_diff_skew = np.mean(np.abs(skew_original - skew_generated))
    avg_abs_diff_kurtosis = np.mean(np.abs(kurtosis_original - kurtosis_generated))
    
    # Discretize the data for KL divergence calculation
    bins = np.histogram(np.hstack((df_original.values, df_generated.values)), bins=10)[1]
    p_hist, _ = np.histogram(df_original.values, bins=bins, density=True)
    q_hist, _ = np.histogram(df_generated.values, bins=bins, density=True)
    kl_div = kl_divergence(p_hist, q_hist)
    avg_kl_div = kl_divergence_column_wise(df_original, df_generated)
    lmc, mc, rmc = mc_metrics(df_original, df_generated)
    # scuffed_kl = scuffed_average_kl_divergence(df_original, df_generated)


    # Print results
    print("------------------------------------------------------------")
    print(f"----------------{dataset} - {sampler}-----------------------")
    print("Average Absolute Difference in Mean:", avg_abs_diff_mean)
    print("Average Absolute Difference in Standard Deviation:", avg_abs_diff_std)
    print("Average Absolute Difference in Skewness:", avg_abs_diff_skew)
    print("Average Absolute Difference in Kurtosis:", avg_abs_diff_kurtosis)
    print("KL Divergence:", kl_div)
    print("Average KL Divergence:", avg_kl_div)
    # print("Scuffed Average KL Divergence:", scuffed_kl)
    print("LMC:", lmc)
    print("MC:", mc)
    print("RMC:", rmc)
    print('')

------------------------------------------------------------
----------------cc - perturbation-----------------------
Average Absolute Difference in Mean: 6773.018880734391
Average Absolute Difference in Standard Deviation: 8949.496287663773
Average Absolute Difference in Skewness: 3.919930696203487
Average Absolute Difference in Kurtosis: 85.95730477708764
KL Divergence: inf
Average KL Divergence: inf
LMC: 0.19283458745153184
MC: 0.2938267439366068
RMC: 0.1807564578564402

------------------------------------------------------------
----------------cc - DropoutVAE-----------------------
Average Absolute Difference in Mean: 1792.1605492709903
Average Absolute Difference in Standard Deviation: 2451.164210139004
Average Absolute Difference in Skewness: 3.4826599154731404
Average Absolute Difference in Kurtosis: 81.64572481440766
KL Divergence: inf
Average KL Divergence: inf
LMC: 0.35326110020263213
MC: 0.37215151909004857
RMC: 0.3071958809067015

-----------------------------------------

In [21]:
dataset = "german"
sampler_list = ["perturbation", "DropoutVAE", "RBF", "forest", "CTGAN"]

for sampler in sampler_list:
    orig_path = rf"C:\Users\shrey\Desktop\DSC 261\DSC-261-FINAL\Data\{dataset}_RBF_train.csv"
    generated_path = rf"C:\Users\shrey\Desktop\DSC 261\DSC-261-FINAL\Data\{dataset}_adversarial_train_{sampler}.csv"
    data_original = pd.read_csv(orig_path)
    data_generated = pd.read_csv(generated_path)

    # Convert to DataFrame
    df_original = pd.get_dummies(data_original)
    df_original.drop(columns=['response'], inplace=True)
    df_generated = pd.get_dummies(data_generated)

    #mean and std
    avg_abs_diff_mean = abs(df_generated.mean() - df_original.mean()).mean()
    avg_abs_diff_std = abs(df_generated.std() - df_original.std()).mean()

    # Calculate skewness and kurtosis
    skew_original = df_original.skew()
    kurtosis_original = df_original.kurtosis()
    skew_generated = df_generated.skew()
    kurtosis_generated = df_generated.kurtosis()
    # Calculate the average absolute difference of skewness and kurtosis
    avg_abs_diff_skew = np.mean(np.abs(skew_original - skew_generated))
    avg_abs_diff_kurtosis = np.mean(np.abs(kurtosis_original - kurtosis_generated))
    
    # Discretize the data for KL divergence calculation
    bins = np.histogram(np.hstack((df_original.values, df_generated.values)), bins=10)[1]
    p_hist, _ = np.histogram(df_original.values, bins=bins, density=True)
    q_hist, _ = np.histogram(df_generated.values, bins=bins, density=True)
    kl_div = kl_divergence(p_hist, q_hist)
    avg_kl_div = kl_divergence_column_wise(df_original, df_generated)
    lmc, mc, rmc = mc_metrics(df_original, df_generated)
    # scuffed_kl = scuffed_average_kl_divergence(df_original, df_generated)


    # Print results
    print("------------------------------------------------------------")
    print(f"----------------{dataset} - {sampler}-----------------------")
    print("Average Absolute Difference in Mean:", avg_abs_diff_mean)
    print("Average Absolute Difference in Standard Deviation:", avg_abs_diff_std)
    print("Average Absolute Difference in Skewness:", avg_abs_diff_skew)
    print("Average Absolute Difference in Kurtosis:", avg_abs_diff_kurtosis)
    print("KL Divergence:", kl_div)
    print("Average KL Divergence:", avg_kl_div)
    # print("Scuffed Average KL Divergence:", scuffed_kl)
    print("LMC:", lmc)
    print("MC:", mc)
    print("RMC:", rmc)
    print('')

------------------------------------------------------------
----------------german - perturbation-----------------------
Average Absolute Difference in Mean: 107.14102671419408
Average Absolute Difference in Standard Deviation: 91.13934982728641
Average Absolute Difference in Skewness: 1.7579706408394304
Average Absolute Difference in Kurtosis: 4.504175786901902
KL Divergence: inf
Average KL Divergence: inf
LMC: 0.33298013455886133
MC: 0.83477259128008
RMC: 0.6100072722802227

------------------------------------------------------------
----------------german - DropoutVAE-----------------------
Average Absolute Difference in Mean: 13.168028673835124
Average Absolute Difference in Standard Deviation: 49.43139869310851
Average Absolute Difference in Skewness: 4.301660938883748
Average Absolute Difference in Kurtosis: 48.75511757548001
KL Divergence: inf
Average KL Divergence: inf
LMC: 0.1501927297714691
MC: 0.5350675487561307
RMC: 0.4760925488217826

------------------------------------