In [1]:
%load_ext  autoreload
%autoreload 2

In [8]:
import sys
sys.path.append('..')

# Ignore warnings.
import warnings
warnings.filterwarnings('ignore')

# Handle library imports.
import numpy as np
import pandas as pd

from trickster.adversarial_helper import *
from trickster.expansion import *
from sklearn.linear_model import LogisticRegressionCV

In [3]:
# Handle global variables.
# SEED is the single source of randomness for the notebook: it seeds NumPy's
# global RNG here and is passed as `random_state` to the classifier fit and to
# the experiment wrapper further down.
SEED = 2018
np.random.seed(seed=SEED)

In [4]:
# Define experiment helper functions.
def load_transform_data_fn(data_file, bins, **kwargs):
    '''Load the credit CSV and convert it to a one-hot encoded numeric dataset.

    Steps, in order: read the CSV, discard its first column, quantize the
    numeric columns 'Credit amount', 'Duration' and 'Age' into quantile bins,
    one-hot encode every categorical column (including 'Job'), drop the
    redundant 'Sex_male' and 'Risk_bad' indicator columns, and split the
    result into features (all columns but the last) and target (last column).

    Parameters
    ----------
    data_file : str or path-like
        CSV file to read. Must contain the columns 'Credit amount',
        'Duration', 'Age' and 'Job', and must produce 'Sex_male' and
        'Risk_bad' columns once one-hot encoded.
    bins : int
        Number of quantile bins requested from ``pd.qcut``. Duplicate bin
        edges are dropped, so a column may end up with fewer bins.
    **kwargs
        Accepted and ignored, so the caller can pass extra keyword arguments.

    Returns
    -------
    X : numpy.ndarray of float
        Feature matrix, one column per entry of ``feature_names``.
    y : numpy.ndarray of float
        Values of the last encoded column, used as the target.
    feature_names : pandas.Index
        Column labels of ``X``.
    '''
    # Load the file.
    df = pd.read_csv(data_file)

    # Remove the first column (the row index saved with the data).
    df = df.drop(df.columns[0], axis=1)

    # Quantize credit amount, duration and age.
    # The column is replaced with plain ``df[col] = ...`` on purpose:
    # ``df.loc[:, col] = ...`` writes into the existing column in place on
    # recent pandas versions and tries to keep the old numeric dtype, which
    # is not what we want for interval categories.
    for feat in ('Credit amount', 'Duration', 'Age'):
        df[feat] = pd.qcut(df[feat], bins, duplicates='drop')

    # Set Job type to object so that get_dummies one-hot encodes it.
    # With ``df.loc[:, 'Job'] = ...`` the in-place write keeps the integer
    # dtype on recent pandas and 'Job' would silently stay a single numeric
    # column; whole-column assignment really changes the dtype.
    df['Job'] = df['Job'].astype(object)

    # Perform one-hot encoding.
    df = pd.get_dummies(df)

    # Drop one indicator of each binary feature; its complement remains.
    df = df.drop(columns=['Sex_male', 'Risk_bad'])

    # Separate features from targets: the last column is the target.
    df_X = df.iloc[:, :-1]
    df_y = df.iloc[:, -1]

    # Convert to numpy (indicator columns may be bool or uint8 depending on
    # the pandas version; both become floats here).
    X = df_X.values.astype('float')
    y = df_y.values.astype('float')

    return X, y, df_X.columns

def clf_fit_fn(X_train, y_train, **kwargs):
    '''Train a cross-validated logistic regression and return the fitted model.

    The regularisation parameter ``C`` is selected by ``LogisticRegressionCV``
    from a grid running from 0.1 up to (but excluding) 2 in steps of 0.025,
    using 5-fold cross validation scored with F1. Extra keyword arguments are
    accepted and ignored.
    '''
    search_settings = dict(
        Cs=np.arange(0.1, 2, 0.025),
        cv=5,
        n_jobs=-1,            # use every available core
        penalty='l2',
        scoring='f1',         # alternatives: accuracy or roc_auc
        class_weight=None,    # alternative: balanced
        random_state=SEED,
    )

    model = LogisticRegressionCV(**search_settings)
    model.fit(X_train, y_train)
    return model

def get_expansions_fn(features, expand_quantized_fn, **kwargs):
    '''Build the expansion spec for the features the search may transform.

    The transformable features are those matched by 'Credit amount',
    'Duration' and 'Purpose'. Their positions are first located in the full
    feature list; the list is then cut down to just those features and the
    positions are looked up again, so the returned expansions index into the
    reduced feature space.

    Parameters
    ----------
    features
        Feature names; must support indexing with a list of positions
        (presumably the pandas Index returned by the data loader - confirm
        against the caller).
    expand_quantized_fn : callable
        Expansion function paired with the two quantized features.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    expansions : list of (indexes, expansion function) pairs
        Credit amount and duration use ``expand_quantized_fn``; purpose uses
        ``expand_categorical``.
    transformable_feature_idxs : list
        Sorted positions of the transformable features in the original space.
    '''
    groups = ('Credit amount', 'Duration', 'Purpose')

    # Positions in the original feature space.
    credit, duration, purpose = (
        find_substring_occurences(features, group) for group in groups
    )
    transformable_feature_idxs = sorted(credit + duration + purpose)

    # Positions of the same groups once only transformable features remain.
    reduced_features = features[transformable_feature_idxs]
    credit, duration, purpose = (
        find_substring_occurences(reduced_features, group) for group in groups
    )

    expansions = [
        (credit, expand_quantized_fn),
        (duration, expand_quantized_fn),
        (purpose, expand_categorical),
    ]
    return expansions, transformable_feature_idxs

def benchmark_search_fn(**kwargs):
    '''Run the adversarial search with a constant-zero heuristic as baseline.

    A heuristic that always returns 0 gives the search no guidance; the
    notebook uses this run as the BFS benchmark that the A* search is
    compared against. All keyword arguments are forwarded unchanged to
    ``adversarial_search`` and its result is returned as-is.
    '''
    def zero_heuristic(x, clf, epsilon, zero_to_one, q_norm):
        # Same parameter names as the heuristic signature used in the
        # original lambda, so keyword calls keep working.
        return 0

    return adversarial_search(heuristic_fn=zero_heuristic, **kwargs)

In [30]:
def run_experiment(bins, p_norm, q_norm):
    '''Run one A*-versus-benchmark experiment on the German credit dataset.

    Parameters
    ----------
    bins
        Number of quantization bins, forwarded to the data loader.
    p_norm, q_norm
        Norm settings forwarded unchanged to ``experiment_wrapper``.

    Returns
    -------
    list
        ``[(ast_expanded, bfs_expanded), [(ast_runtime, bfs_runtime)]]``:
        the 'nodes_expanded' and 'runtime' entries of the search results and
        of the benchmark results.
        NOTE(review): the runtime pair is wrapped in an extra one-element
        list, unlike the expanded pair. It looks unintentional but is kept
        because callers may already index into it - confirm before changing.
    '''
    outcome = experiment_wrapper(
        load_transform_data_fn=load_transform_data_fn,
        data_file='../data/german_credit_data.csv',  # dataset location
        bins=bins,
        p_norm=p_norm,
        q_norm=q_norm,
        clf_fit_fn=clf_fit_fn,
        get_expansions_fn=get_expansions_fn,
        expand_quantized_fn=expand_quantized,
        benchmark_search_fn=benchmark_search_fn,
        target_confidence=0.5,
        zero_to_one=True,
        random_state=SEED,
    )

    # Pull the same two metrics out of both result sets.
    search, benchmark = outcome['search_results'], outcome['benchmark_results']
    expanded_pair = (search['nodes_expanded'], benchmark['nodes_expanded'])
    runtime_pair = (search['runtime'], benchmark['runtime'])

    return [expanded_pair, [runtime_pair]]

In [31]:
# Define experiment parameters.
# Bin counts to sweep: 5, 10, ..., 100 (the stop value 101 is exclusive).
bin_counts = np.arange(5, 101, 5)
# Norm settings passed unchanged to run_experiment for every bin count.
p_norm = 1
q_norm = np.inf

# Collects one (bins, expanded_stats, runtime_stats) tuple per bin count,
# in sweep order.
results = []

for bins in bin_counts:
    print('Performing experiments for {} bins.'.format(bins))
    expanded_stats, runtime_stats = run_experiment(bins=bins, p_norm=p_norm, q_norm=q_norm)
    results.append((bins, expanded_stats, runtime_stats))

Performing experiments for 5 bins.


100%|##########| 135/135 [00:02<00:00, 66.09it/s]
100%|##########| 135/135 [00:01<00:00, 81.56it/s]


Performing experiments for 10 bins.


100%|##########| 122/122 [00:02<00:00, 43.73it/s]
100%|##########| 122/122 [00:02<00:00, 47.88it/s]


Performing experiments for 15 bins.


100%|##########| 126/126 [00:02<00:00, 50.80it/s]
100%|##########| 126/126 [00:02<00:00, 60.84it/s]


Performing experiments for 20 bins.


100%|##########| 137/137 [00:03<00:00, 43.27it/s]
100%|##########| 137/137 [00:02<00:00, 46.63it/s]


Performing experiments for 25 bins.


100%|##########| 131/131 [00:02<00:00, 47.20it/s]
100%|##########| 131/131 [00:02<00:00, 55.64it/s]


Performing experiments for 30 bins.


100%|##########| 122/122 [00:02<00:00, 44.34it/s]
100%|##########| 122/122 [00:02<00:00, 49.18it/s]


Performing experiments for 35 bins.


100%|##########| 124/124 [00:02<00:00, 54.12it/s]
100%|##########| 124/124 [00:01<00:00, 64.12it/s]


Performing experiments for 40 bins.


100%|##########| 123/123 [00:02<00:00, 51.38it/s]
100%|##########| 123/123 [00:02<00:00, 58.02it/s]


Performing experiments for 45 bins.


100%|##########| 122/122 [00:02<00:00, 45.64it/s]
100%|##########| 122/122 [00:02<00:00, 51.88it/s]


Performing experiments for 50 bins.


100%|##########| 152/152 [00:03<00:00, 47.72it/s]
100%|##########| 152/152 [00:02<00:00, 55.84it/s]


Performing experiments for 55 bins.


100%|##########| 121/121 [00:02<00:00, 45.14it/s]
100%|##########| 121/121 [00:02<00:00, 52.43it/s]


Performing experiments for 60 bins.


100%|##########| 122/122 [00:02<00:00, 45.20it/s]
100%|##########| 122/122 [00:02<00:00, 52.10it/s]


Performing experiments for 65 bins.


100%|##########| 124/124 [00:02<00:00, 46.76it/s]
100%|##########| 124/124 [00:02<00:00, 53.77it/s]


Performing experiments for 70 bins.


100%|##########| 120/120 [00:02<00:00, 43.15it/s]
100%|##########| 120/120 [00:02<00:00, 50.40it/s]


Performing experiments for 75 bins.


100%|##########| 168/168 [00:04<00:00, 40.58it/s]
100%|##########| 168/168 [00:03<00:00, 49.14it/s]


Performing experiments for 80 bins.


100%|##########| 121/121 [00:02<00:00, 44.74it/s]
100%|##########| 121/121 [00:02<00:00, 52.88it/s]


Performing experiments for 85 bins.


100%|##########| 119/119 [00:02<00:00, 40.96it/s]
100%|##########| 119/119 [00:02<00:00, 48.91it/s]


Performing experiments for 90 bins.


100%|##########| 121/121 [00:03<00:00, 37.11it/s]
100%|##########| 121/121 [00:02<00:00, 42.43it/s]


Performing experiments for 95 bins.


100%|##########| 123/123 [00:03<00:00, 37.65it/s]
100%|##########| 123/123 [00:02<00:00, 44.33it/s]


Performing experiments for 100 bins.


100%|##########| 118/118 [00:02<00:00, 42.57it/s]
100%|##########| 118/118 [00:02<00:00, 50.40it/s]


In [34]:
def percentage_change(new, old):
    '''Return the change from ``old`` to ``new`` as a percentage of ``old``.

    Positive when ``new`` is larger than ``old``, negative when it is smaller.
    Works element-wise on anything that supports arithmetic (scalars, NumPy
    arrays, pandas objects).
    '''
    delta = new - old
    relative = delta / old
    return relative * 100

def performance_change(new, old):
    '''Return how much ``old`` exceeds ``new``, as a percentage of ``new``.

    Unlike ``percentage_change`` the difference is taken as ``old - new`` and
    divided by ``new``, so a smaller ``new`` value gives a positive result.
    Works element-wise on anything that supports arithmetic.
    '''
    saving = old - new
    relative = saving / new
    return relative * 100

# Report, for every bin count, how many fewer nodes A* expanded than the
# benchmark search, averaged over the per-example values.
for result in results:
    bins, expanded_stats, runtime_stats = result
    # expanded_stats is the (ast_expanded, bfs_expanded) pair returned by
    # run_experiment, so A* is `new` and the benchmark is `old` in both calls.
    # percentage_change is negative when A* expands fewer nodes; negate it to
    # report a decrease.
    percentage = -percentage_change(expanded_stats[0], expanded_stats[1]).mean()
    # Same difference expressed relative to the A* count instead.
    performance = performance_change(expanded_stats[0], expanded_stats[1]).mean()
    
    print('For {} bins percentage decrease is: {:.2f} and performance improvement is: {:.2f}.'
         .format(bins, percentage, performance))

For 5 bins percentage decrease is: 6.04 and performance improvement is: 9.07.
For 10 bins percentage decrease is: 8.81 and performance improvement is: 13.25.
For 15 bins percentage decrease is: 11.18 and performance improvement is: 18.45.
For 20 bins percentage decrease is: 11.12 and performance improvement is: 19.27.
For 25 bins percentage decrease is: 10.58 and performance improvement is: 17.63.
For 30 bins percentage decrease is: 10.01 and performance improvement is: 17.12.
For 35 bins percentage decrease is: 8.66 and performance improvement is: 14.46.
For 40 bins percentage decrease is: 11.19 and performance improvement is: 19.13.
For 45 bins percentage decrease is: 9.98 and performance improvement is: 17.46.
For 50 bins percentage decrease is: 7.92 and performance improvement is: 12.78.
For 55 bins percentage decrease is: 7.19 and performance improvement is: 11.69.
For 60 bins percentage decrease is: 9.34 and performance improvement is: 16.15.
For 65 bins percentage decrease is: 9