In [None]:
"""Analysis script for analyzing standard experiment, adversarial attacks and random attacks.
The experiments must be run first; their result files are saved under the path "results/{}_{}".format(exp_id, date_time).
"""

import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import pickle
from IPython.display import display, HTML
# Raise pandas' display limits (rows, columns, total width) so that large result tables are shown in full.
pd.options.display.max_rows = 1500
pd.options.display.max_columns = 200
pd.options.display.width = 1000

import sys, os
# NOTE: "__file__" is a quoted string literal here, so os.path.dirname returns "" and the two entries appended to
# sys.path are ".." and "../.." relative to the current working directory (presumably so that the `GIB` package
# imports below resolve when the notebook is run from its own folder -- confirm against the repository layout).
sys.path.append(os.path.join(os.path.dirname("__file__"), '..'))
sys.path.append(os.path.join(os.path.dirname("__file__"), '..', '..'))
from GIB.experiments.GIB_node_model import get_evasive_dict, get_data, remove_edge_random, add_random_edge
from GIB.pytorch_net.util import to_np_array, to_Variable, filter_filename, make_dir
from GIB.util import parse_filename, GIB_PATH

## Helper functions:

In [None]:
def get_dfs(exp_ids, date_time, is_adversarial, include=None):
    """Load the results of several experiments and combine them into a single DataFrame.

    The results of each exp_id are read from the folder GIB_PATH + "/{}_{}/".format(exp_id, date_time).

    Args:
        exp_ids: a list of exp_id set for the experiments.
        date_time: month and date of conducting the experiment.
        is_adversarial: if True, the experiment is adversarial attack experiment (run by GIB_node_attack_exp)
            and the folder is read with get_df_adversarial.
            If False, the experiment is standard experiment (run by GIB_node_exp) and the folder is read
            with get_df_standard.
        include: strings that must be included in the names of the result files selected for analysis.
            Default None for no filtering.

    Returns:
        df: a Pandas DataFrame containing all the results (row indices of the individual DataFrames are kept
            as they are), or None if exp_ids is empty.
    """
    # Collect one DataFrame per exp_id and concatenate once at the end, instead of re-copying the
    # accumulated frame on every iteration.
    dfs = []
    for exp_id in exp_ids:
        dirname = GIB_PATH + "/{}_{}/".format(exp_id, date_time)
        load_df = get_df_adversarial if is_adversarial else get_df_standard
        dfs.append(load_df(dirname, include=include))
    if not dfs:
        return None
    return pd.concat(dfs)


def get_df_adversarial(dirname, include=None, num_attacked=None):
    """Collect the attack-experiment results stored in one results folder into a DataFrame.

    Every filename returned by filter_filename(dirname, include=include) that ends with ".p" is unpickled and
    contributes one row. Files that cannot be loaded are reported with a printed message and skipped.

    Args:
        dirname: folder holding the pickled result files. It is concatenated directly with each filename,
            so it must end with a path separator.
        include: strings passed to filter_filename to select the result files. Default None is treated as [].
        num_attacked: if given, only the first num_attacked per-node records of each file are averaged.
            Default None uses all of them.

    Returns:
        df: a Pandas DataFrame with one row per successfully loaded file, containing the fields returned by
            parse_filename, the three averaged classification-margin metrics and "num_attacked".
    """
    if include is None:
        include = []
    # Top-level entries of a result file that are bookkeeping rather than per-node attack records.
    bookkeeping_keys = ("params", "models_before", "model_dict", "best_model_dict", "num_layers", "best_epoch")
    metrics = ['classification_margins_clean_best', 'classification_margins_evasive_best', 'classification_margins_attacked_best']

    df_dict_list = []
    for filename in filter_filename(dirname, include=include):
        if not filename.endswith(".p"):
            continue
        df_dict = {}
        # The models using DeepRobust should set baseline=True
        baseline = "RGCN" in filename or "GCNJaccard" in filename
        df_dict.update(parse_filename(filename, is_adversarial=True, baseline=baseline))
        try:
            # NOTE: unpickling can execute arbitrary code; only load result files that you produced yourself.
            with open(dirname + filename, "rb") as f:
                data_record = pickle.load(f)
        except Exception:
            print("Unable to load {}".format(filename))
            continue

        # Drop the bookkeeping entries first, so that num_attacked limits the number of actual node records
        # instead of also counting bookkeeping keys against the limit.
        node_records = [item for key, item in data_record.items() if key not in bookkeeping_keys]
        if num_attacked is not None:
            node_records = node_records[:max(num_attacked, 0)]

        for metric in metrics:
            # For baseline models by DeepRobust, the models saved are the best validation model,
            # so their records store the metric without the trailing "_best".
            metric_core = metric[:-5] if baseline else metric
            # Per record: fraction of margin values that are positive; NaN if the record lacks the metric.
            # A single NaN makes the file-level mean NaN as well.
            metric_list = [
                np.mean(item[metric_core] > 0) if metric_core in item else np.nan
                for item in node_records
            ]
            df_dict[metric] = np.mean(metric_list)
        # NOTE(review): without num_attacked this reports the number of all top-level entries of the file,
        # which includes any bookkeeping keys that are present -- confirm that this is intended.
        df_dict["num_attacked"] = len(data_record) if num_attacked is None else num_attacked
        df_dict_list.append(df_dict)
    df = pd.DataFrame(df_dict_list)
    return df


def get_df_standard(dirname, include=None):
    """Collect the standard-experiment results stored in one results folder into a DataFrame.

    Every filename returned by filter_filename(dirname, include=include) that ends with ".p" is unpickled and
    contributes one row. Files that cannot be loaded are reported with a printed message and skipped.

    Args:
        dirname: folder holding the pickled result files. It is concatenated directly with each filename,
            so it must end with a path separator.
        include: strings passed to filter_filename to select the result files. Default None is treated as [].

    Returns:
        df: a Pandas DataFrame with one row per successfully loaded file, containing the fields returned by
            parse_filename plus the last element of every recorded entry of the file.
    """
    if include is None:
        include = []
    # Entries of a result file that are not copied into the DataFrame.
    skipped_keys = ("model_dict", "best_model_dict", "num_layers", "best_epoch")

    df_dict_list = []
    for filename in filter_filename(dirname, include=include):
        if not filename.endswith(".p"):
            continue
        df_dict = {}
        df_dict.update(parse_filename(filename, is_adversarial=False))
        try:
            # NOTE: unpickling can execute arbitrary code; only load result files that you produced yourself.
            with open(dirname + filename, "rb") as f:
                data_record = pickle.load(f)
        except Exception:
            print("{} not loaded.".format(filename))
            # Skip this file: otherwise data_record would be undefined, or would still hold the content of
            # the previously loaded file and be attributed to this file's settings.
            continue
        for key, item in data_record.items():
            if key not in skipped_keys:
                # Keep only the last element of each record (presumably the value at the final recorded step).
                df_dict[key] = item[-1]
        df_dict_list.append(df_dict)
    df = pd.DataFrame(df_dict_list)
    return df

## 1. Analyze standard experiment:

In [None]:
exp_ids = ["node1.0"]   # Here pass in the list of exp_id that you want to analyze. This exp_id corresponds to the "exp_id" set in "GIB_node_exp" (the standard experiment).
date_time = "4-6"       # Month and date that you conduct the experiment. Will access the result files under GIB_PATH + "/{}_{}/".format(exp_id, date_time)
df_standard = get_dfs(exp_ids, date_time, is_adversarial=False)  # Obtain combined DataFrame of the standard experiments

In [None]:
# Count, per experimental setting, how many runs have a non-null value in each of the two reported columns,
# then move the innermost grouping level (struct_dropout_mode) from the index into the columns.
setting_columns = ["data_type", "model_type", "beta1", "beta2", "num_layers", "struct_dropout_mode"]
counted_columns = ["b_test_f1_micro", "seed"]
dff = df_standard.groupby(by=setting_columns).count()[counted_columns]
dff.unstack(-1)

## 2. Analyze adversarial attacks:

In [None]:
exp_ids = ["Cora-GIB-Bern"]   # Here pass in the list of exp_id that you want to analyze. This exp_id corresponds to the "exp_id" set in "GIB_node_attack_exp"
date_time = "4-6"             # Month and date that you conduct the experiment. Will access the result files under GIB_PATH + "/{}_{}/".format(exp_id, date_time)
df = get_dfs(exp_ids, date_time, is_adversarial=True)  # Obtain combined DataFrame of the adversarial attack experiments

In [None]:
# Group the dataframe by experimental settings and average the metrics over the runs of each setting:
metrics = ['classification_margins_evasive_best', 'classification_margins_attacked_best', 'classification_margins_clean_best', 'num_attacked']
# Select the metric columns *before* aggregating. Calling .mean() on the whole grouped frame also tries to average
# every other column, which raises a TypeError on pandas >= 2.0 as soon as one of them is non-numeric.
dff = df.groupby(by=["data_type", "direct_attack", "model_type", "n_perturbations"])[metrics].mean()
# Move the innermost grouping level (n_perturbations) from the index into the columns.
dff.unstack(level=-1)

## 3. Perform evasive random attacks and analyze:

In [None]:
import torch  # Required for torch.cuda.is_available() below; the import cell at the top of the notebook does not import it.

exp_id = "node1.0"
date_time = "4-6"
dirname = GIB_PATH + "/{}_{}/".format(exp_id, date_time)
# Only the result files whose name contains "ed_0.0" are attacked; sorting makes the processing order deterministic.
filenames = sorted(filter_filename(dirname, include=["ed_0.0"]))

# For each file specified in filenames, perform evaluation on the randomly corrupted (feature with additive Gaussian noise) graph with varying corruption intensity.
# get_evasive_dict returns a list of result records per file; they are accumulated into one flat list.
df_dict_GIBD_feature = []
for i, filename in enumerate(filenames):
    print("Attacking the {}th model".format(i))
    df_dict_GIBD_feature += get_evasive_dict(
        dirname,
        filename,
        perturb_mode="feature",   # Corrupt type. Here since it is feature additive noise, use "feature".
        feature_perturb_target=[0.5, 1., 1.5],   # Feature noise ratios
        n_repeats=5,              # Number of feature attacks per seed. Larger n_repeats results in less variation in the mean.
        device_name="cuda:0" if torch.cuda.is_available() else "cpu",   # Use the first GPU when available, otherwise the CPU.
        verbose=False,
    )

In [None]:
# One row per record returned by the evasive feature attacks.
df_GIBD_features = pd.DataFrame(df_dict_GIBD_feature)
# Average the evasive test F1-micro over all runs that share the same setting and feature noise ratio.
# The metric column is selected *before* aggregating: calling .mean() on the whole grouped frame also tries to
# average every other column, which raises a TypeError on pandas >= 2.0 as soon as one of them is non-numeric.
dff = df_GIBD_features.groupby(by=["data_type","model_type","struct_dropout_mode","beta1", "beta2","reparam_mode","feature_noise_ratio_evasive"])[["test_f1_micro_evasive_best"]].mean()
# Put the noise ratios (innermost grouping level) into the columns and scale the rounded values by 100.
dff_GIBD_features = dff.unstack(-1).round(3) * 100
dff_GIBD_features