In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import seaborn as sns

import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.preprocessing import power_transform, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


from paretoset import paretoset
from lifelines import WeibullAFTFitter, LogNormalAFTFitter, LogLogisticAFTFitter


import argparse
from pathlib import Path
import logging

logger = logging.getLogger(__name__)

In [2]:
FOLDER = Path("output/plots/")
csv_file = FOLDER / "data.csv"
data = pd.read_csv(csv_file, index_col=0)
data.head()

Unnamed: 0,stage,attack,data,files,kwargs,model,name,scorers,train_time,train_time_per_sample,...,model_layers,def_param,def_value,atk_param,atk_value,failure_rate,adv_failure_rate,training_time_per_failure,training_time_per_adv_failure,adv_training_time_per_failure
0,attack,0009cf1504e3689e79934f78fcb00c7c,0009cf1504e3689e79934f78fcb00c7c,0009cf1504e3689e79934f78fcb00c7c,0009cf1504e3689e79934f78fcb00c7c,0009cf1504e3689e79934f78fcb00c7c,0009cf1504e3689e79934f78fcb00c7c,0009cf1504e3689e79934f78fcb00c7c,206.672264,0.003691,...,34,cutoff,1.0,eps,0.3,276634.9,22947.190553,1.334096e-08,1.608291e-07,1.608291e-07
5,attack,0022118d46ac791faba90c6f907516f0,0022118d46ac791faba90c6f907516f0,0022118d46ac791faba90c6f907516f0,0022118d46ac791faba90c6f907516f0,0022118d46ac791faba90c6f907516f0,0022118d46ac791faba90c6f907516f0,0022118d46ac791faba90c6f907516f0,203.195971,0.003628,...,34,,,eps,0.123905,1435355.0,147157.749244,2.527946e-09,2.465721e-08,2.465721e-08
6,attack,0036b069268b81ebc05519c3372608b9,0036b069268b81ebc05519c3372608b9,0036b069268b81ebc05519c3372608b9,0036b069268b81ebc05519c3372608b9,0036b069268b81ebc05519c3372608b9,0036b069268b81ebc05519c3372608b9,0036b069268b81ebc05519c3372608b9,46.047791,0.000822,...,34,,,nb_grads,1.0,6608925.0,6524.344491,1.244199e-10,1.260329e-07,1.260329e-07
8,attack,0059d34a86c5a8e25c3bb06da1cd6555,0059d34a86c5a8e25c3bb06da1cd6555,0059d34a86c5a8e25c3bb06da1cd6555,0059d34a86c5a8e25c3bb06da1cd6555,0059d34a86c5a8e25c3bb06da1cd6555,0059d34a86c5a8e25c3bb06da1cd6555,0059d34a86c5a8e25c3bb06da1cd6555,25.424851,0.000454,...,18,,,nb_grads,1.0,94884.74,10296.684193,4.784913e-09,4.409334e-08,4.409334e-08
9,attack,00603349e13a42bef3170206db60dea6,00603349e13a42bef3170206db60dea6,00603349e13a42bef3170206db60dea6,00603349e13a42bef3170206db60dea6,00603349e13a42bef3170206db60dea6,00603349e13a42bef3170206db60dea6,00603349e13a42bef3170206db60dea6,207.58749,0.003707,...,34,cutoff,0.224719,max_iter,0.210526,31840.56,73.692387,1.164213e-07,5.030261e-05,5.030261e-05


In [3]:
def plot_aft(
    data,
    file,
    event_col,
    duration_col,
    title,
    mtype,
    xlabel="$\log(\eta)$ - 95% CI",
    ylabel="Covariate",
    replacement_dict={},
    **kwargs,
):
    if mtype == "weibull":
        aft = WeibullAFTFitter(**kwargs)
    elif mtype == "log_normal":
        aft = LogNormalAFTFitter(**kwargs)
    elif mtype == "log_logistic":
        aft = LogLogisticAFTFitter(**kwargs)
    df, test = train_test_split(data, test_size=0.2, random_state=42)
    assert (
        duration_col in df.columns
    ), f"Column {duration_col} not in dataframe with columns {df.columns}"
    assert (
        event_col in df.columns
    ), f"Column {event_col} not in dataframe with columns {df.columns}"
    aft.fit(df, duration_col=duration_col, event_col=event_col)
    aft.fit(df, duration_col=duration_col, event_col=event_col)
    ax = aft.plot()
    ax.set_ylabel(ylabel)
    labels = ax.get_yticklabels()
    labels = [label.get_text() for label in labels]
    for k, v in replacement_dict.items():
        labels = [label.replace(k, v) for label in labels]
    ax.set_yticklabels(labels)
    ax.set_xlabel(xlabel)
    ax.set_title(title)
    ax.get_figure().tight_layout()
    ax.get_figure().savefig(FOLDER / file)
    logger.info(f"Saved graph to {FOLDER / file}")
    return ax, aft


def clean_data_for_aft(
    data, kwarg_list, standard_scaling=True, target="adv_failure_rate"
):
    subset = data.copy()
    y = subset[target].copy(deep=True)
    cleaned = pd.DataFrame()
    if target in kwarg_list:
        kwarg_list.remove(target)
    for kwarg in kwarg_list:
        cleaned = pd.concat([cleaned, subset[kwarg]], axis=1)
    cols = cleaned.columns
    if standard_scaling is True:
        scaler = StandardScaler()
        scaler = scaler.fit(cleaned)
        cleaned_numeric = pd.DataFrame(scaler.transform(cleaned), columns=cols)
    else:
        cleaned_numeric = cleaned

    cleaned_numeric = pd.DataFrame(subset, columns=cols)
    cleaned_numeric.def_value.fillna(0, inplace=True)
    # replace 0 with 1e-6
    # cleaned_numeric = cleaned_numeric.replace(0, replace_0)
    return cleaned_numeric, y

In [4]:
kwarg_list = [
    "accuracy",
    "train_time",
    "atk_value",
    "def_value",
    "adv_fit_time",
    "random_state",
    "adv_failure_rate",
    "predict_time",
    # "adv_fit_time",
    # "adv_accuracy",
    # "adv_fit_time_per_sample",
    # "adv_accuracy",
]


cleaned, y = clean_data_for_aft(data, kwarg_list, standard_scaling=True)
cleaned.dropna(axis=0, how="any", subset=kwarg_list.remove("def_value"), inplace=True)
cleaned["adv_failure_rate"] = y

KeyError: 'random_state'

In [None]:
weibull_dict = {
    "Intercept: rho_": "$\\rho$",
    "Intercept: lambda_": "$\lambda$",
    "random_state: lambda_": "Random State",
    "def_value: lambda_": "Defence Strength",
    "atk_value: lambda_": "Attack Strength",
    "train_time: lambda_": "Training Time",
    "predict_time: lambda_": "Inference Time",
    "accuracy: lambda_": "Ben. Accuracy",
}

weibull_graph, wft = plot_aft(
    cleaned,
    "weibull_aft.pdf",
    "adv_failure_rate",
    "adv_fit_time",
    "Weibull AFT Model",
    "weibull",
    replacement_dict=weibull_dict,
)

In [None]:
log_normal_dict = {
    "Intercept: sigma_": "$\sigma$",
    "Intercept: mu_": "$\mu$",
    "random_state: mu_": "Random State",
    "def_value: mu_": "Defence Strength",
    "atk_value: mu_": "Attack Strength",
    "train_time: mu_": "Training Time",
    "predict_time: mu_": "Inference Time",
    "accuracy: mu_": "Ben. Accuracy",
    "adv_fit_time: mu_": "Adv. Fit Time",
}

log_normal_graph, lnt = plot_aft(
    cleaned,
    "log_normal_aft.pdf",
    "adv_failure_rate",
    "adv_fit_time",
    "Log Normal AFT Model",
    "log_normal",
    replacement_dict=log_normal_dict,
)

In [None]:
log_logistic_dict = {
    "Intercept: beta_": "$\\beta$",
    "Intercept: alpha_": "$\\alpha$",
    "random_state: alpha_": "Random State",
    "def_value: alpha_": "Defence Strength",
    "atk_value: alpha_": "Attack Strength",
    "train_time: alpha_": "Training Time",
    "predict_time: alpha_": "Inference Time",
    "accuracy: alpha_": "Ben. Accuracy",
    "adv_fit_time: alpha_": "Adv. Fit Time",
}

log_logistic_graph, llt = plot_aft(
    cleaned,
    "log_logistic_aft.pdf",
    "adv_failure_rate",
    "adv_fit_time",
    "Log Logistic AFT Model",
    "log_logistic",
    replacement_dict=log_logistic_dict,
)

NameError: name 'cleaned' is not defined

In [None]:
aft_dict = {
    "Weibull": wft,
    "LogNormal": lnt,
    "LogLogistic": llt,
}
aft_data = pd.DataFrame()
aft_data.index.name = "Model"
aft_data.index = aft_dict.keys()
aft_data["AIC"] = [x.AIC_ for x in aft_dict.values()]
aft_data["LogLikelihood"] = [x.log_likelihood_ for x in aft_dict.values()]
aft_data["Concordance Score"] = [x.concordance_index_ for x in aft_dict.values()]
aft_data["BIC"] = [x.BIC_ for x in aft_dict.values()]
aft_data = aft_data.round(2)
aft_data.to_csv(FOLDER / "aft_comparison.csv")
logger.info(f"Saved AFT comparison to {FOLDER / 'aft_comparison.csv'}")

NameError: name 'wft' is not defined