In [None]:
import pandas as pd
from scipy.stats import ttest_ind
from scipy.stats import anderson
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import mannwhitneyu
from scipy.stats import norm
import numpy as np

In [None]:
# Descriptive statistics table for the Scorecard metrics.
#
# For every metric column, compute min / max / mean / std separately for the
# two datasets (noMD and haveMD) and show them side by side in one table.
no_md_df = pd.read_csv('dataset/noMD_ssf.csv')
have_md_df = pd.read_csv('dataset/haveMD_ssf.csv')

# The first column is skipped; every remaining column is treated as a metric.
# NOTE(review): presumably the first column is an identifier -- confirm.
metrics_columns = no_md_df.columns[1:]

# (column-name suffix, source frame) for each group, in output column order.
summary_groups = (("noMD", no_md_df), ("haveMD", have_md_df))
# (column-name prefix, pandas Series method) for each statistic, in output order.
summary_stats = (("Min", "min"), ("Max", "max"), ("Mean", "mean"), ("Std", "std"))

metrics_summary = {"Metric": list(metrics_columns)}
for group_suffix, group_df in summary_groups:
    # Drop missing values once per metric, then reuse for all four statistics.
    cleaned_by_metric = {name: group_df[name].dropna() for name in metrics_columns}
    for stat_prefix, stat_method in summary_stats:
        metrics_summary[f"{stat_prefix}_{group_suffix}"] = [
            getattr(cleaned_by_metric[name], stat_method)()
            for name in metrics_columns
        ]

metrics_summary_df = pd.DataFrame(metrics_summary)
display(metrics_summary_df)


In [None]:
# Visualization of distributions: empirical CDF of each metric for both
# datasets, overlaid with the CDF of a normal distribution fitted to the same
# sample (dashed). A large gap between solid and dashed curves indicates a
# departure from normality.
df_haveMD = pd.read_csv('dataset/haveMD_ssf.csv')
df_noMD = pd.read_csv('dataset/noMD_ssf.csv')

# The first column is skipped; every remaining column gets its own panel.
practice_columns = df_haveMD.columns[1:]

N_PLOT_COLS = 4        # panels per figure row
N_GRID_POINTS = 300    # x-resolution of the fitted normal CDF curves


def _plot_fitted_normal_cdf(ax, data, color):
    """Draw the CDF of a normal distribution fitted to ``data`` on ``ax``.

    The normal is parameterised by the sample mean and the sample standard
    deviation of ``data`` (pandas default, ddof=1) and is evaluated on an
    evenly spaced grid spanning the observed range of the sample.

    Nothing is drawn when a normal cannot be fitted: fewer than two
    observations, or a zero / non-finite standard deviation (constant column).

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    data : pandas Series of observations with missing values already removed.
    color : line colour passed through to ``ax.plot``.
    """
    if len(data) < 2:
        return
    sample_mean, sample_std = data.mean(), data.std()
    if not np.isfinite(sample_std) or sample_std <= 0:
        return
    x_vals = np.linspace(data.min(), data.max(), N_GRID_POINTS)
    # Fit parameters come from the data itself, not from the plotting grid.
    cdf_vals = norm.cdf(x_vals, loc=sample_mean, scale=sample_std)
    ax.plot(x_vals, cdf_vals, linestyle="--", color=color)


# Number of rows for subplots (ceil division, at least one row so that
# plt.subplots never receives a zero-sized grid).
num_columns = len(practice_columns)
num_rows = max(1, -(-num_columns // N_PLOT_COLS))

# Create figure; squeeze=False always yields a 2-D array of Axes to flatten.
fig, axes = plt.subplots(
    num_rows, N_PLOT_COLS, figsize=(20, 5 * num_rows), squeeze=False
)
axes = axes.flatten()

# Define labels
label_ecdf_haveMD = "ECDF (With Security Policy)"
label_ecdf_noMD = "ECDF (Without Security Policy)"
label_cdf_haveMD = "Normal CDF (With Security Policy)"
label_cdf_noMD = "Normal CDF (Without Security Policy)"

for ax, col in zip(axes, practice_columns):
    data_haveMD = df_haveMD[col].dropna()
    data_noMD = df_noMD[col].dropna()

    # ECDF plots (solid lines)
    sns.ecdfplot(data_haveMD, ax=ax, linestyle="-", linewidth=2.5, color="cornflowerblue")
    sns.ecdfplot(data_noMD, ax=ax, linestyle="-", linewidth=2.5, color="orange")

    # Fitted normal CDF plots (dashed lines, same colour as the matching ECDF)
    _plot_fitted_normal_cdf(ax, data_haveMD, "cornflowerblue")
    _plot_fitted_normal_cdf(ax, data_noMD, "orange")

    ax.set_title(col, fontsize=24)
    ax.set_xlabel("Score", fontsize=20)
    ax.set_ylabel("Probability", fontsize=20)
    ax.tick_params(axis='both', which='major', labelsize=18)

# Remove the unused panels at the end of the grid
for unused_ax in axes[num_columns:]:
    fig.delaxes(unused_ax)

plt.tight_layout()

# One shared legend below the panels, built from proxy lines because the
# per-panel artists are drawn without labels.
fig.legend(
    handles=[
        plt.Line2D([0], [0], linestyle="-", linewidth=2.5, color="cornflowerblue", label=label_ecdf_haveMD),
        plt.Line2D([0], [0], linestyle="-", linewidth=2.5, color="orange", label=label_ecdf_noMD),
        plt.Line2D([0], [0], linestyle="--", color="cornflowerblue", label=label_cdf_haveMD),
        plt.Line2D([0], [0], linestyle="--", color="orange", label=label_cdf_noMD),
    ],
    loc="lower center",
    bbox_to_anchor=(0.5, -0.01),
    ncol=2,
    fontsize=22,
)

# Applied after tight_layout so the bottom margin reserved for the legend sticks.
plt.subplots_adjust(hspace=0.3, bottom=0.065)
plt.show()


In [None]:
# Anderson-Darling normality test (per group) and two-sided Mann-Whitney U
# test (noMD vs haveMD) for every metric column.
no_md_df = pd.read_csv('dataset/noMD_ssf.csv')
have_md_df = pd.read_csv('dataset/haveMD_ssf.csv')

# The first column is skipped; every remaining column is treated as a metric.
metrics_columns = no_md_df.columns[1:]

# One (noMD sample, haveMD sample) pair per metric with missing values
# removed; both tests below run on exactly these samples.
paired_samples = [
    (no_md_df[name].dropna(), have_md_df[name].dropna())
    for name in metrics_columns
]

# Anderson-Darling test against the normal distribution, one result per metric.
ad_no_md = [anderson(sample_no_md, dist='norm') for sample_no_md, _ in paired_samples]
ad_have_md = [anderson(sample_have_md, dist='norm') for _, sample_have_md in paired_samples]

anderson_results = {
    "Metric": list(metrics_columns),
    "Statistic_noMD": [res.statistic for res in ad_no_md],
    "Critical_Values_noMD": [res.critical_values for res in ad_no_md],
    "Significance_noMD": [res.significance_level for res in ad_no_md],
    "Statistic_haveMD": [res.statistic for res in ad_have_md],
    "Critical_Values_haveMD": [res.critical_values for res in ad_have_md],
    "Significance_haveMD": [res.significance_level for res in ad_have_md],
}
anderson_results_df = pd.DataFrame(anderson_results)

# Mann-Whitney U Test (two-sided), noMD sample first.
mw_outcomes = [
    mannwhitneyu(sample_no_md, sample_have_md, alternative='two-sided')
    for sample_no_md, sample_have_md in paired_samples
]

mann_whitney_results = {
    "Metric": list(metrics_columns),
    "U_Statistic": [u_stat for u_stat, _ in mw_outcomes],
    # p-values are rounded to 4 decimals, so anything below 0.00005 shows as 0.0.
    "P_Value": [round(p_val, 4) for _, p_val in mw_outcomes],
}
mann_whitney_results_df = pd.DataFrame(mann_whitney_results)

display(anderson_results_df)
display(mann_whitney_results_df)
