In [3]:
import pandas as pd
from pathlib import Path
import numpy as np
import random

from sklearn.preprocessing import StandardScaler

In [4]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import torch

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device}")

Using cuda


In [7]:
from captum.attr import IntegratedGradients, DeepLift, GradientShap, NoiseTunnel, FeatureAblation, FeaturePermutation, GuidedBackprop, Saliency, InputXGradient, ShapleyValueSampling

from captum.metrics import sensitivity_max, infidelity

In [8]:
def annotate_attributions(attributions, columns):
    return pd.DataFrame(attributions.cpu().detach().numpy(), columns = columns)

In [9]:
def calculate_avg_attributions(attributions, axis=0):
    return attributions.mean(axis=axis).sort_values(ascending=False).to_frame(name="avg_attribution_coeff")

In [10]:
# Rank features by their absolute coefficient value to identify the most important ones
def rank_features_by_importance(summarized_attributions_df):
    summarized_attributions_df["abs_avg_attribution_coeff"] = summarized_attributions_df["avg_attribution_coeff"].abs()
    summarized_attributions_df["feature_importance_rank"] = summarized_attributions_df["abs_avg_attribution_coeff"].rank(ascending=False)
    summarized_attributions_df.drop(["abs_avg_attribution_coeff"], axis=1, inplace=True)

    return summarized_attributions_df

In [11]:
#For each feature, show its relative order (1 - most positive influence, n - least positive influence)
def add_relative_order_of_features(summarized_attributions_df):
    summarized_attributions_df = summarized_attributions_df.sort_values(by="avg_attribution_coeff", ascending=False)
    summarized_attributions_df["relative_order"] = summarized_attributions_df.rank(ascending=False)

    return summarized_attributions_df

### Download model artifact from a previous run

In [16]:
MODEL_FILENAME = "SmoothL1Loss_fixed_Adamax_fewer_neurons_0.2_testSize_new_StandardScaler_2048_batch_0.008_lr_norm_4_16_48_64_144_240_nprocs_0.05_dropout_pytorch_v1.11.0.tar"
MODEL_DIR = rf"/home/thes1067/models/blue_waters"
MODEL_PATH = Path(MODEL_DIR, MODEL_FILENAME)

In [17]:
DATASET_DIR = r"/home/thes1067/data/claix_dataset"
DATASET_NAME = "claix_posix_npb_4_16_64_nprocs_Ciao_C_1288.csv"
DATASET_PATH = Path(DATASET_DIR, DATASET_NAME)

In [19]:
MODEL_PATH.is_file()

True

## Load the model

In [20]:
bw_model = nn.Sequential(
    nn.Linear(97, 2048),
    nn.Dropout(p=0.05),
    nn.ReLU(),
    nn.Linear(2048, 512),
    nn.Dropout(p=0.05),
    nn.ReLU(),
    nn.Linear(512, 128),
    nn.Dropout(p=0.05),
    nn.ReLU(),
    nn.Linear(128, 1),
).to(device)

In [21]:
print("Loading pretrained model...")

checkpoint = torch.load(MODEL_PATH, map_location=torch.device(device))
bw_model.load_state_dict(checkpoint['model_state_dict'])
model_epoch = checkpoint['epoch']

print(f"Current epoch: {model_epoch}")

Loading pretrained model...
Current epoch: 599


In [22]:
bw_model.eval()

Sequential(
  (0): Linear(in_features=97, out_features=2048, bias=True)
  (1): Dropout(p=0.05, inplace=False)
  (2): ReLU()
  (3): Linear(in_features=2048, out_features=512, bias=True)
  (4): Dropout(p=0.05, inplace=False)
  (5): ReLU()
  (6): Linear(in_features=512, out_features=128, bias=True)
  (7): Dropout(p=0.05, inplace=False)
  (8): ReLU()
  (9): Linear(in_features=128, out_features=1, bias=True)
)

### Fix the seeds


In [23]:
random_seed = 1234
split_seed = 42

random.seed(random_seed)
np.random.seed(random_seed)

torch.manual_seed(random_seed)
torch.cuda.manual_seed_all(random_seed)

# Set PyTorch to use deterministic algorithms if possible
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## Load the data

In [24]:
df_claix_posix = pd.read_csv(DATASET_PATH)

In [25]:
len(df_claix_posix)

1163

In [26]:
df_claix_posix.head()

Unnamed: 0,POSIX_OPENS,POSIX_FILENOS,POSIX_DUPS,POSIX_READS,POSIX_WRITES,POSIX_SEEKS,POSIX_STATS,POSIX_MMAPS,POSIX_FSYNCS,POSIX_FDSYNCS,...,POSIX_F_VARIANCE_RANK_TIME,POSIX_F_VARIANCE_RANK_BYTES,uid,start_time,end_time,nprocs,jobid,lib_ver,hints,bandwidth
0,32,0,0,440,440,78,0,-1,0,0,...,2.520036,1.084529e+19,38849,2022-07-26 20:10:07,2022-07-26 20:11:18,16,28860382,3.3.1,romio_no_indep_rw=true;cb_nodes=4,1971.700985
1,32,0,0,440,440,78,0,-1,0,0,...,2.567556,1.084529e+19,38849,2022-07-25 23:45:34,2022-07-25 23:46:49,16,28845229,3.3.1,romio_no_indep_rw=true;cb_nodes=4,1952.503983
2,32,0,0,440,440,78,0,-1,0,0,...,2.458673,1.084529e+19,38849,2022-07-26 20:52:45,2022-07-26 20:53:56,16,28860382,3.3.1,romio_no_indep_rw=true;cb_nodes=4,1996.763339
3,32,0,0,440,440,78,0,-1,0,0,...,2.409477,1.084529e+19,38849,2022-07-26 21:41:19,2022-07-26 21:42:29,16,28860382,3.3.1,romio_no_indep_rw=true;cb_nodes=4,2017.040999
4,32,0,0,440,440,78,0,-1,0,0,...,2.466827,1.084529e+19,38849,2022-07-26 18:02:28,2022-07-26 18:03:39,16,28860381,3.3.1,romio_no_indep_rw=true;cb_nodes=4,1992.35133


### Drop the non-invariant columns

In [27]:
df_claix_posix = df_claix_posix.drop(['uid', 'jobid', 'hints', 'start_time', 'end_time', 'lib_ver'],
                                                               axis=1)

### Drop columns to match the Blue Waters dataset on which the model was trained

In [28]:
df_claix_posix = df_claix_posix.drop(['POSIX_FDSYNCS',
                                        'POSIX_RENAMED_FROM',
                                        'POSIX_F_VARIANCE_RANK_TIME',
                                        'POSIX_F_VARIANCE_RANK_BYTES'],
                                        axis=1)			

### Separate bandwidth from input features

In [29]:
df_bandwidth = df_claix_posix.pop('bandwidth')
df_claix_posix

Unnamed: 0,POSIX_OPENS,POSIX_FILENOS,POSIX_DUPS,POSIX_READS,POSIX_WRITES,POSIX_SEEKS,POSIX_STATS,POSIX_MMAPS,POSIX_FSYNCS,POSIX_RENAME_SOURCES,...,rank,POSIX_F_READ_TIME,POSIX_F_WRITE_TIME,POSIX_F_META_TIME,POSIX_TOTAL_TIME,POSIX_F_MAX_READ_TIME,POSIX_F_MAX_WRITE_TIME,POSIX_F_FASTEST_RANK_TIME,POSIX_F_SLOWEST_RANK_TIME,nprocs
0,32,0,0,440,440,78,0,-1,0,0,...,-1,1.396742,5.161620,0.022065,6.580427,0.005862,0.155172,0.000530,6.559490,16
1,32,0,0,440,440,78,0,-1,0,0,...,-1,1.388736,5.230402,0.025987,6.645125,0.005642,0.016469,0.000806,6.621231,16
2,32,0,0,440,440,78,0,-1,0,0,...,-1,1.374670,5.102114,0.021048,6.497833,0.006568,0.130111,0.000555,6.479013,16
3,32,0,0,440,440,78,0,-1,0,0,...,-1,1.382730,5.028884,0.020894,6.432509,0.006548,0.017362,0.000570,6.413866,16
4,32,0,0,440,440,78,0,-1,0,0,...,-1,1.392669,5.095290,0.024262,6.512222,0.006047,0.083048,0.000641,6.489974,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1158,8,0,0,440,440,78,0,-1,0,0,...,-1,1.408159,4.991706,0.003280,6.403145,0.005342,0.089964,0.000726,6.400820,4
1159,8,0,0,440,440,78,0,-1,0,0,...,-1,1.421173,5.007220,0.003163,6.431556,0.005848,0.090479,0.000674,6.429183,4
1160,3248,2542,0,107198,9041,78248,397,-43,0,0,...,-1,284.566834,198.561634,0.594587,483.723055,1.709868,0.548601,0.000000,14.079545,48
1161,16112,12718,0,389198,35189,236638,1933,-43,0,0,...,-1,2329.200567,100.632465,3.125118,2432.958150,1.931765,0.226163,0.000000,17.771790,240


In [30]:
df_bandwidth.describe()

count    1163.000000
mean     1814.994168
std       393.298187
min         0.084217
25%      1953.342184
50%      1986.288164
75%      2007.708468
max      2070.166839
Name: bandwidth, dtype: float64

### Split the Claix data into the train/test pair

In [34]:
X = df_claix_posix.to_numpy()

### Scale the input features

In [36]:
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [37]:
tensor_X = torch.Tensor(X_scaled).to(device)
tensor_y = torch.Tensor(df_bandwidth.to_numpy()).view(-1, 1).to(device) 

## Use Integrated Gradients to attribute the importance to the features

In [43]:
baseline = torch.zeros(len(tensor_X), 97).to(device)
baseline

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')

In [44]:
bw_ig = IntegratedGradients(bw_model)
bw_ig_attributions, bw_ig_delta = bw_ig.attribute(tensor_X, baseline, n_steps=300, return_convergence_delta=True)
bw_ig_attributions_df = annotate_attributions(bw_ig_attributions, columns = list(df_claix_posix.columns))
print('Delta Avg:', torch.mean(bw_ig_delta))

Delta Avg: tensor(-0.2472, device='cuda:0', dtype=torch.float64)


## Other feature attribution methods

In [45]:
ig_nt_bw = NoiseTunnel(bw_ig)
dl_bw = DeepLift(bw_model)
gs_bw = GradientShap(bw_model)
fa_bw = FeatureAblation(bw_model)
fp_bw = FeaturePermutation(bw_model)
sal_bw = Saliency(bw_model)
input_x_grad_bw = InputXGradient(bw_model)
guided_back_prop_bw = GuidedBackprop(bw_model)
shap_sampling_bw = ShapleyValueSampling(bw_model)

In [46]:
def perturb_fn(inputs):
    noise = torch.tensor(np.random.normal(0, 0.003, inputs.shape)).float().to(device)
    return noise, inputs - noise

In [48]:
sal_attr_bw = sal_bw.attribute(tensor_X)
sal_attr_df_bw = annotate_attributions(sal_attr_bw, columns=list(df_claix_posix.columns))



In [49]:
sens = sensitivity_max(sal_bw.attribute, tensor_X)
torch.mean(sens)



tensor(0.0356, device='cuda:0')

In [50]:
infid = infidelity(bw_model, perturb_fn, tensor_X, sal_attr_bw)
torch.mean(infid)

tensor(2481037.7500, device='cuda:0')

In [51]:
fp_attr_bw = fp_bw.attribute(tensor_X)
fp_attr_df_bw = annotate_attributions(fp_attr_bw, columns=list(df_claix_posix.columns))

In [52]:
sens = sensitivity_max(fp_bw.attribute, tensor_X)
torch.mean(sens)

tensor(10.0365, device='cuda:0')

In [53]:
infid = infidelity(bw_model, perturb_fn, tensor_X, fp_attr_bw)
torch.mean(infid)

tensor(638123.1875, device='cuda:0')

In [54]:
input_x_grad_attr_bw = input_x_grad_bw.attribute(tensor_X)
input_x_grad_attr_df_bw = annotate_attributions(input_x_grad_attr_bw, columns=list(df_claix_posix.columns))



In [55]:
sens = sensitivity_max(input_x_grad_bw.attribute, tensor_X)
torch.mean(sens)



tensor(0.1989, device='cuda:0')

In [56]:
infid = infidelity(bw_model, perturb_fn, tensor_X, input_x_grad_attr_bw)
torch.mean(infid)

tensor(735415.2500, device='cuda:0')

In [57]:
guided_back_prop_attr_bw = guided_back_prop_bw.attribute(tensor_X)
guided_back_prop_attr_df_bw = annotate_attributions(guided_back_prop_attr_bw, columns=list(df_claix_posix.columns))



In [58]:
sens = sensitivity_max(guided_back_prop_bw.attribute, tensor_X)
torch.mean(sens)



tensor(0.0604, device='cuda:0')

In [59]:
infid = infidelity(bw_model, perturb_fn, tensor_X, guided_back_prop_attr_bw)
torch.mean(infid)

tensor(9909.1270, device='cuda:0')

In [60]:
shap_sampling_attr_bw = shap_sampling_bw.attribute(tensor_X)
shap_sampling_attr_df_bw = annotate_attributions(shap_sampling_attr_bw, columns=list(df_claix_posix.columns))

In [61]:
sens = sensitivity_max(shap_sampling_bw.attribute, tensor_X)
torch.mean(sens)

tensor(0.2753, device='cuda:0')

In [62]:
infid = infidelity(bw_model, perturb_fn, tensor_X, shap_sampling_attr_bw)
torch.mean(infid)

tensor(691033.1875, device='cuda:0')

In [152]:
ig_nt_attr_bw, ig_nt_delta_bw = ig_nt_bw.attribute(tensor_X, return_convergence_delta=True)
print("Delta Avg: ", torch.mean(ig_nt_delta_bw))
ig_nt_attr_df_bw = annotate_attributions(ig_nt_attr_bw, columns=list(df_claix_posix.columns))

Delta Avg:  tensor(0.5005, device='cuda:0', dtype=torch.float64)


In [64]:
dl_attr_bw, dl_delta_bw = dl_bw.attribute(tensor_X, return_convergence_delta=True)
print("Delta Avg: ", torch.mean(dl_delta_bw))
dl_attr_df_bw = annotate_attributions(dl_attr_bw, columns=list(df_claix_posix.columns))

Delta Avg:  tensor(0.0003, device='cuda:0')


               activations. The hooks and attributes will be removed
            after the attribution is finished


In [65]:
fa_attr_bw = fa_bw.attribute(tensor_X)
fa_attr_df_bw = annotate_attributions(fa_attr_bw, columns=list(df_claix_posix.columns))

In [66]:
sens = sensitivity_max(fa_bw.attribute, tensor_X)
torch.mean(sens)

tensor(0.2343, device='cuda:0')

In [67]:
infid = infidelity(bw_model, perturb_fn, tensor_X, fa_attr_bw)
torch.mean(infid)

tensor(721449.8750, device='cuda:0')

In [70]:
def calc_norm_sum(attributions_df):
    attributions_sum = attributions_df.sum(0)
    return (attributions_sum / np.linalg.norm(attributions_sum, ord=1)).to_frame(name="norm_attr_coeff") # Just division returns a Series, so cast it to a DataFrame

In [71]:
ig_attr_norm_sum_bw = calc_norm_sum(bw_ig_attributions_df)
ig_attr_norm_sum_bw

Unnamed: 0,norm_attr_coeff
POSIX_OPENS,-0.000538
POSIX_FILENOS,-0.000727
POSIX_DUPS,0.000000
POSIX_READS,0.003232
POSIX_WRITES,-0.004742
...,...
POSIX_F_MAX_READ_TIME,0.007788
POSIX_F_MAX_WRITE_TIME,0.013595
POSIX_F_FASTEST_RANK_TIME,0.000392
POSIX_F_SLOWEST_RANK_TIME,0.014220


In [153]:
ig_nt_attr_test_norm_sum_bw = calc_norm_sum(ig_nt_attr_df_bw)

In [73]:
dl_attr_test_norm_sum_bw = calc_norm_sum(dl_attr_df_bw)

In [75]:
fa_attr_test_norm_sum_bw =calc_norm_sum(fa_attr_df_bw)

In [76]:
sal_norm_sum_bw = calc_norm_sum(sal_attr_df_bw)

In [77]:
input_x_grad_norm_sum_bw = calc_norm_sum(input_x_grad_attr_df_bw)

In [78]:
guided_back_prop_norm_sum_bw = calc_norm_sum(guided_back_prop_attr_df_bw)

In [79]:
fp_attr_norm_sum_bw = calc_norm_sum(fp_attr_df_bw)

In [80]:
shap_sampling_attr_norm_sum_bw = calc_norm_sum(shap_sampling_attr_df_bw)

In [154]:
aggregated_feature_attributions_normalized_df_bw = pd.concat([ig_attr_norm_sum_bw, ig_nt_attr_test_norm_sum_bw, dl_attr_test_norm_sum_bw,
                                                            fa_attr_test_norm_sum_bw, shap_sampling_attr_norm_sum_bw, guided_back_prop_norm_sum_bw, fp_attr_norm_sum_bw, input_x_grad_norm_sum_bw, sal_norm_sum_bw],
                                                            axis=1, join="inner")
aggregated_feature_attributions_normalized_df_bw.columns = ["ig", "ig_noise_tunnel", "deeplift",
                                                            "feature_ablation", "shap_sampling", "guided_backprop", "feature_permutation", "input_x_grad", "saliency"]
aggregated_feature_attributions_normalized_df_bw

Unnamed: 0,ig,ig_noise_tunnel,deeplift,feature_ablation,shap_sampling,guided_backprop,feature_permutation,input_x_grad,saliency
POSIX_OPENS,-0.000538,-0.000177,-0.000529,-0.000435,-0.000522,0.000372,0.003408,-0.000471,0.000304
POSIX_FILENOS,-0.000727,-0.000074,-0.000760,-0.000588,-0.000705,0.000311,0.001900,-0.000600,0.000422
POSIX_DUPS,0.000000,-0.000021,0.000000,0.000000,0.000000,-0.000352,0.000000,0.000000,0.000314
POSIX_READS,0.003232,0.000514,0.003167,0.003287,0.002943,-0.002275,0.014992,0.003259,0.002137
POSIX_WRITES,-0.004742,-0.000694,-0.006035,-0.003019,-0.002542,-0.000226,-0.004328,-0.003117,0.002083
...,...,...,...,...,...,...,...,...,...
POSIX_F_MAX_READ_TIME,0.007788,0.002614,0.006396,0.010632,0.009513,-0.010306,0.018056,0.010624,0.008158
POSIX_F_MAX_WRITE_TIME,0.013595,0.006002,0.012175,0.013013,0.015313,-0.011995,0.012253,0.014386,0.010243
POSIX_F_FASTEST_RANK_TIME,0.000392,-0.000483,0.000409,-0.000350,0.000191,-0.004890,-0.000797,0.000317,0.004271
POSIX_F_SLOWEST_RANK_TIME,0.014220,0.002793,0.013344,0.013356,0.012447,-0.014954,0.022987,0.013230,0.012866


In [155]:
summary_feature_attributions_normalized_df_bw = calculate_avg_attributions(aggregated_feature_attributions_normalized_df_bw, axis=1)

# For each feature, show its relative order (1 - most positive influence, n - least positive influence)
# Rank features by their absolute coefficient value to identify the most important ones
summary_feature_attributions_normalized_df_bw = rank_features_by_importance(add_relative_order_of_features(summary_feature_attributions_normalized_df_bw))
summary_feature_attributions_normalized_df_bw

Unnamed: 0,avg_attribution_coeff,relative_order,feature_importance_rank
POSIX_F_READ_TIME,0.241142,1.0,1.0
POSIX_TOTAL_TIME,0.104137,2.0,2.0
POSIX_F_WRITE_TIME,0.092017,3.0,3.0
POSIX_F_META_TIME,0.069713,4.0,4.0
POSIX_ACCESS2_ACCESS,0.010361,5.0,6.0
...,...,...,...
POSIX_SIZE_WRITE_0_100,-0.004580,93.0,16.0
POSIX_MEM_NOT_ALIGNED,-0.008242,94.0,11.0
READ_1G_PLUS,-0.009140,95.0,10.0
POSIX_SIZE_READ_1G_PLUS,-0.009203,96.0,9.0


In [156]:
summary_feature_attributions_normalized_df_bw.loc["POSIX_BYTES_READ"]

avg_attribution_coeff      -0.021065
relative_order             97.000000
feature_importance_rank     5.000000
Name: POSIX_BYTES_READ, dtype: float64

# Evaluate the model fine-tuned on the Claix dataset

### Load the pre-trained model

In [100]:
claix_model = nn.Sequential(
    nn.Linear(97, 2048),
    nn.Dropout(p=0.05),
    nn.ReLU(),
    nn.Linear(2048, 512),
    nn.Dropout(p=0.05),
    nn.ReLU(),
    nn.Linear(512, 128),
    nn.Dropout(p=0.05),
    nn.ReLU(),
    nn.Linear(128, 1),
).to(device)

In [101]:
claix_model.modules

<bound method Module.modules of Sequential(
  (0): Linear(in_features=97, out_features=2048, bias=True)
  (1): Dropout(p=0.05, inplace=False)
  (2): ReLU()
  (3): Linear(in_features=2048, out_features=512, bias=True)
  (4): Dropout(p=0.05, inplace=False)
  (5): ReLU()
  (6): Linear(in_features=512, out_features=128, bias=True)
  (7): Dropout(p=0.05, inplace=False)
  (8): ReLU()
  (9): Linear(in_features=128, out_features=1, bias=True)
)>

In [None]:
CLAIX_MODEL_DIR = "/home/thes1067/models/claix"
CLAIX_MODEL_NAME = "SmoothL1Loss_0.2_testSize_StandardScaler_0.001_lr_4_16_48_64_144_240_nprocs_0.05_dropout_pytorch_v1.12.0"
CLAIX_MODEL_PATH = Path(CLAIX_MODEL_DIR, CLAIX_MODEL_NAME)
CLAIX_MODEL_PATH

In [104]:
CLAIX_MODEL_PATH.exists()

True

In [105]:
print("Loading pretrained model...")

checkpoint = torch.load(CLAIX_MODEL_PATH, map_location=torch.device(device))
claix_model.load_state_dict(checkpoint['model_state_dict'])
model_epoch = checkpoint['epoch']

print(f"Current epoch: {model_epoch}")

Loading pretrained model...
Current epoch: 1199


In [106]:
ig_claix = IntegratedGradients(claix_model)
ig_attributions_claix, ig_delta_claix = ig_claix.attribute(tensor_X, baseline, n_steps=300, return_convergence_delta=True)
ig_attributions_df_claix = annotate_attributions(ig_attributions_claix, columns = list(df_claix_posix.columns))
print('Delta Avg:', torch.mean(ig_delta_claix))

Delta Avg: tensor(-2.2503, device='cuda:0', dtype=torch.float64)


In [107]:
ig_nt_claix = NoiseTunnel(ig_claix)
dl_claix = DeepLift(claix_model)
gs_claix = GradientShap(claix_model)
fa_claix = FeatureAblation(claix_model)
fp_claix = FeaturePermutation(claix_model)
sal_claix = Saliency(claix_model)
input_x_grad_claix = InputXGradient(claix_model)
guided_back_prop_claix = GuidedBackprop(claix_model)
shap_sampling_claix = ShapleyValueSampling(claix_model)

In [108]:
def perturb_fn(inputs):
    noise = torch.tensor(np.random.normal(0, 0.003, inputs.shape)).float().to(device)
    return noise, inputs - noise

In [110]:
sal_attr_claix = sal_claix.attribute(tensor_X)
sal_attr_df_claix = annotate_attributions(sal_attr_claix, columns=list(df_claix_posix.columns))



In [111]:
sens = sensitivity_max(sal_claix.attribute, tensor_X)
torch.mean(sens)



tensor(0.2232, device='cuda:0')

In [112]:
infid = infidelity(claix_model, perturb_fn, tensor_X, sal_attr_claix)
torch.mean(infid)

tensor(55610.0039, device='cuda:0')

In [113]:
fp_attr_claix = fp_claix.attribute(tensor_X)
fp_attr_df_claix = annotate_attributions(fp_attr_claix, columns=list(df_claix_posix.columns))

In [114]:
sens = sensitivity_max(fp_claix.attribute, tensor_X)
torch.mean(sens)

tensor(2.2639, device='cuda:0')

In [115]:
infid = infidelity(claix_model, perturb_fn, tensor_X, fp_attr_claix)
torch.mean(infid)

tensor(49360.8672, device='cuda:0')

In [116]:
input_x_grad_attr_claix = input_x_grad_claix.attribute(tensor_X)
input_x_grad_attr_df_claix = annotate_attributions(input_x_grad_attr_claix, columns=list(df_claix_posix.columns))



In [117]:
sens = sensitivity_max(input_x_grad_claix.attribute, tensor_X)
torch.mean(sens)



tensor(0.3036, device='cuda:0')

In [118]:
infid = infidelity(claix_model, perturb_fn, tensor_X, input_x_grad_attr_claix)
torch.mean(infid)

tensor(48032.1992, device='cuda:0')

In [119]:
guided_back_prop_attr_claix = guided_back_prop_claix.attribute(tensor_X)
guided_back_prop_attr_df_claix = annotate_attributions(guided_back_prop_attr_claix, columns=list(df_claix_posix.columns))



In [120]:
sens = sensitivity_max(guided_back_prop_claix.attribute, tensor_X)
torch.mean(sens)



tensor(0.1195, device='cuda:0')

In [121]:
infid = infidelity(claix_model, perturb_fn, tensor_X, guided_back_prop_attr_claix)
torch.mean(infid)

tensor(46559.8945, device='cuda:0')

In [122]:
shap_sampling_attr_claix = shap_sampling_claix.attribute(tensor_X)
shap_sampling_attr_df_claix = annotate_attributions(shap_sampling_attr_claix, columns=list(df_claix_posix.columns))

In [123]:
sens = sensitivity_max(shap_sampling_claix.attribute, tensor_X)
torch.mean(sens)

tensor(0.4336, device='cuda:0')

In [124]:
infid = infidelity(claix_model, perturb_fn, tensor_X, shap_sampling_attr_claix)
torch.mean(infid)

tensor(48765.1211, device='cuda:0')

In [125]:
ig_nt_attr_claix, ig_nt_delta_claix = ig_nt_claix.attribute(tensor_X, return_convergence_delta=True)
print("Delta Avg: ", torch.mean(ig_nt_delta_claix))
ig_nt_attr_df_claix = annotate_attributions(ig_nt_attr_claix, columns=list(df_claix_posix.columns))

Delta Avg:  tensor(0.2106, device='cuda:0', dtype=torch.float64)


In [126]:
dl_attr_claix, dl_delta_claix = dl_claix.attribute(tensor_X, return_convergence_delta=True)
print("Delta Avg: ", torch.mean(dl_delta_claix))
dl_attr_df_claix = annotate_attributions(dl_attr_claix, columns=list(df_claix_posix.columns))

Delta Avg:  tensor(29.1444, device='cuda:0')


               activations. The hooks and attributes will be removed
            after the attribution is finished


In [127]:
fa_attr_claix = fa_claix.attribute(tensor_X)
fa_attr_df_claix = annotate_attributions(fa_attr_claix, columns=list(df_claix_posix.columns))

In [128]:
sens = sensitivity_max(fa_claix.attribute, tensor_X)
torch.mean(sens)

tensor(1.8132, device='cuda:0')

In [129]:
infid = infidelity(claix_model, perturb_fn, tensor_X, fa_attr_claix)
torch.mean(infid)

tensor(47416.7852, device='cuda:0')

In [132]:
def calc_norm_sum(attributions_df):
    attributions_sum = attributions_df.sum(0)
    return (attributions_sum / np.linalg.norm(attributions_sum, ord=1)).to_frame(name="norm_attr_coeff") # Just division returns a Series, so cast it to a DataFrame

In [133]:
ig_attr_norm_sum_claix = calc_norm_sum(ig_attributions_df_claix)
ig_attr_norm_sum_claix

Unnamed: 0,norm_attr_coeff
POSIX_OPENS,0.000506
POSIX_FILENOS,0.000238
POSIX_DUPS,0.000000
POSIX_READS,0.003827
POSIX_WRITES,-0.002928
...,...
POSIX_F_MAX_READ_TIME,0.006486
POSIX_F_MAX_WRITE_TIME,0.007066
POSIX_F_FASTEST_RANK_TIME,-0.000308
POSIX_F_SLOWEST_RANK_TIME,0.017646


In [134]:
ig_nt_attr_test_norm_sum_claix = calc_norm_sum(ig_nt_attr_df_claix)

In [135]:
dl_attr_test_norm_sum_claix = calc_norm_sum(dl_attr_df_claix)

In [137]:
fa_attr_test_norm_sum_claix =calc_norm_sum(fa_attr_df_claix)

In [138]:
sal_norm_sum_claix = calc_norm_sum(sal_attr_df_claix)

In [139]:
input_x_grad_norm_sum_claix = calc_norm_sum(input_x_grad_attr_df_claix)

In [140]:
guided_back_prop_norm_sum_claix = calc_norm_sum(guided_back_prop_attr_df_claix)

In [141]:
fp_attr_norm_sum_claix = calc_norm_sum(fp_attr_df_claix)

In [142]:
shap_sampling_attr_norm_sum_claix = calc_norm_sum(shap_sampling_attr_df_claix)

In [143]:
aggregated_feature_attributions_normalized_df_claix = pd.concat([ig_attr_norm_sum_claix, ig_nt_attr_test_norm_sum_claix, dl_attr_test_norm_sum_claix,
                                                                fa_attr_test_norm_sum_claix, shap_sampling_attr_norm_sum_claix, guided_back_prop_norm_sum_claix, fp_attr_norm_sum_claix, input_x_grad_norm_sum_claix, sal_norm_sum_claix],
                                                                axis=1, join="inner")
aggregated_feature_attributions_normalized_df_claix.columns = ["ig", "ig_noise_tunnel", "deeplift",
                                                                "feature_ablation", "shap_sampling", "guided_backprop", "feature_permutation", "input_x_grad", "saliency"]
aggregated_feature_attributions_normalized_df_claix

Unnamed: 0,ig,ig_noise_tunnel,deeplift,feature_ablation,shap_sampling,guided_backprop,feature_permutation,input_x_grad,saliency
POSIX_OPENS,0.000506,0.000116,0.000516,-0.000363,0.000217,-0.000304,0.006641,0.000631,0.000495
POSIX_FILENOS,0.000238,0.000230,0.000239,-0.002010,0.000790,-0.000547,0.000575,0.000632,0.000427
POSIX_DUPS,0.000000,0.000002,0.000000,0.003602,0.000768,0.000040,0.009781,0.000000,0.000030
POSIX_READS,0.003827,0.000834,0.003840,0.001296,0.002608,-0.002762,0.016784,0.003819,0.002519
POSIX_WRITES,-0.002928,-0.000274,-0.003579,-0.001705,-0.001774,-0.000870,0.000634,-0.001377,0.000975
...,...,...,...,...,...,...,...,...,...
POSIX_F_MAX_READ_TIME,0.006486,0.003712,0.005942,0.010220,0.008393,-0.009642,0.016982,0.008835,0.006893
POSIX_F_MAX_WRITE_TIME,0.007066,0.004321,0.006671,0.001578,0.005058,-0.004206,0.010123,0.004350,0.001668
POSIX_F_FASTEST_RANK_TIME,-0.000308,-0.000125,-0.000364,-0.000211,-0.000047,-0.000691,0.009754,-0.000416,0.000222
POSIX_F_SLOWEST_RANK_TIME,0.017646,0.004403,0.017033,0.013705,0.015667,-0.016577,0.029480,0.016897,0.015303


In [144]:
summary_feature_attributions_normalized_df_claix = calculate_avg_attributions(aggregated_feature_attributions_normalized_df_claix, axis=1)

# For each feature, show its relative order (1 - most positive influence, n - least positive influence)
# Rank features by their absolute coefficient value to identify the most important ones
summary_feature_attributions_normalized_df_claix = rank_features_by_importance(add_relative_order_of_features(summary_feature_attributions_normalized_df_claix))
summary_feature_attributions_normalized_df_claix

Unnamed: 0,avg_attribution_coeff,relative_order,feature_importance_rank
POSIX_F_READ_TIME,0.246626,1.0,1.0
POSIX_TOTAL_TIME,0.110199,2.0,2.0
POSIX_F_WRITE_TIME,0.096342,3.0,3.0
POSIX_F_META_TIME,0.068759,4.0,4.0
POSIX_F_SLOWEST_RANK_TIME,0.012618,5.0,6.0
...,...,...,...
WRITE_0_100,-0.003555,93.0,19.0
POSIX_MEM_NOT_ALIGNED,-0.005309,94.0,13.0
POSIX_SIZE_READ_1G_PLUS,-0.006814,95.0,9.0
READ_1G_PLUS,-0.007429,96.0,8.0


In [145]:
summary_feature_attributions_normalized_df_claix.loc["POSIX_BYTES_READ"]

avg_attribution_coeff      -0.017711
relative_order             97.000000
feature_importance_rank     5.000000
Name: POSIX_BYTES_READ, dtype: float64

In [157]:
features_importance_comparison_df = pd.merge(summary_feature_attributions_normalized_df_bw, summary_feature_attributions_normalized_df_claix, left_index=True, right_index=True, suffixes=["_bw", "_claix"])
features_importance_comparison_df

Unnamed: 0,avg_attribution_coeff_bw,relative_order_bw,feature_importance_rank_bw,avg_attribution_coeff_claix,relative_order_claix,feature_importance_rank_claix
POSIX_F_READ_TIME,0.241142,1.0,1.0,0.246626,1.0,1.0
POSIX_TOTAL_TIME,0.104137,2.0,2.0,0.110199,2.0,2.0
POSIX_F_WRITE_TIME,0.092017,3.0,3.0,0.096342,3.0,3.0
POSIX_F_META_TIME,0.069713,4.0,4.0,0.068759,4.0,4.0
POSIX_ACCESS2_ACCESS,0.010361,5.0,6.0,0.011412,6.0,7.0
...,...,...,...,...,...,...
POSIX_SIZE_WRITE_0_100,-0.004580,93.0,16.0,-0.002226,87.0,38.0
POSIX_MEM_NOT_ALIGNED,-0.008242,94.0,11.0,-0.005309,94.0,13.0
READ_1G_PLUS,-0.009140,95.0,10.0,-0.007429,96.0,8.0
POSIX_SIZE_READ_1G_PLUS,-0.009203,96.0,9.0,-0.006814,95.0,9.0


In [158]:
features_importance_comparison_df.loc[features_importance_comparison_df["relative_order_bw"]!=features_importance_comparison_df["relative_order_claix"]]

Unnamed: 0,avg_attribution_coeff_bw,relative_order_bw,feature_importance_rank_bw,avg_attribution_coeff_claix,relative_order_claix,feature_importance_rank_claix
POSIX_ACCESS2_ACCESS,0.010361,5.0,6.0,0.011412,6.0,7.0
POSIX_F_SLOWEST_RANK_TIME,0.010032,6.0,7.0,0.012618,5.0,6.0
POSIX_F_MAX_WRITE_TIME,0.009443,7.0,8.0,0.004070,13.0,17.0
POSIX_F_MAX_READ_TIME,0.007053,8.0,12.0,0.006425,7.0,10.0
READ_100_1K,0.005458,9.0,13.0,0.006259,8.0,11.0
...,...,...,...,...,...,...
POSIX_SIZE_READ_1M_4M,-0.003905,91.0,18.0,-0.002390,88.0,35.0
WRITE_0_100,-0.004347,92.0,17.0,-0.003555,93.0,19.0
POSIX_SIZE_WRITE_0_100,-0.004580,93.0,16.0,-0.002226,87.0,38.0
READ_1G_PLUS,-0.009140,95.0,10.0,-0.007429,96.0,8.0


In [159]:
features_importance_comparison_df.loc[features_importance_comparison_df["feature_importance_rank_bw"]!=features_importance_comparison_df["feature_importance_rank_claix"]]

Unnamed: 0,avg_attribution_coeff_bw,relative_order_bw,feature_importance_rank_bw,avg_attribution_coeff_claix,relative_order_claix,feature_importance_rank_claix
POSIX_ACCESS2_ACCESS,0.010361,5.0,6.0,0.011412,6.0,7.0
POSIX_F_SLOWEST_RANK_TIME,0.010032,6.0,7.0,0.012618,5.0,6.0
POSIX_F_MAX_WRITE_TIME,0.009443,7.0,8.0,0.004070,13.0,17.0
POSIX_F_MAX_READ_TIME,0.007053,8.0,12.0,0.006425,7.0,10.0
READ_100_1K,0.005458,9.0,13.0,0.006259,8.0,11.0
...,...,...,...,...,...,...
POSIX_SIZE_READ_1M_4M,-0.003905,91.0,18.0,-0.002390,88.0,35.0
WRITE_0_100,-0.004347,92.0,17.0,-0.003555,93.0,19.0
POSIX_SIZE_WRITE_0_100,-0.004580,93.0,16.0,-0.002226,87.0,38.0
POSIX_MEM_NOT_ALIGNED,-0.008242,94.0,11.0,-0.005309,94.0,13.0


In [160]:
features_importance_comparison_df.to_csv(r"/home/eo080593/Projects/2021-dmytro-povaliaiev/sourcecode/visualization/data/Feature_Attributions_Filtered_Nprocs_vs_Fine-tuned.csv")

In [287]:
aggregated_feature_attributions_avg = pd.concat([avg_attributions_bw_df, ig_nt_avg_attr_bw, dl_attr_avg_bw, gs_attr_avg, fa_attr_avg, shap_sampling_attr_avg, guided_back_prop_attr_avg, fp_attr_avg, input_x_grad_attr_avg, sal_avg_attr], axis=1, join="inner")
aggregated_feature_attributions_avg.columns = ["ig", "ig_noise_tunnel", "deeplift", "gradientshap", "feature_ablation", "shap_sampling", "guided_backprop", "feature_permutation", "input_x_grad", "saliency"]
aggregated_feature_attributions_avg

Unnamed: 0,ig,ig_noise_tunnel,deeplift,gradientshap,feature_ablation,shap_sampling,guided_backprop,feature_permutation,input_x_grad,saliency
POSIX_F_READ_TIME,22555.235103,61291.281147,25156.373047,-708.246582,29200.218750,24147.093750,-296217.812500,326.970581,29735.417969,291629.375000
POSIX_TOTAL_TIME,16660.750364,86396.627293,18804.513672,-231.063446,21874.421875,17994.226562,-333323.250000,316.392670,22085.056641,333218.781250
POSIX_F_META_TIME,12806.998625,148915.586186,14409.471680,3.067998,18685.117188,13581.669922,-391480.093750,46.429176,18564.048828,561296.125000
POSIX_F_SLOWEST_RANK_TIME,4987.124063,3232.381576,5369.414062,617.464478,7213.912598,4636.626953,-34776.613281,1844.427246,7124.248535,46195.160156
POSIX_F_MAX_READ_TIME,3691.950321,6658.701544,3876.315674,-46.512398,4957.162598,3767.761719,-55492.292969,70.694702,4996.842773,47867.132812
...,...,...,...,...,...,...,...,...,...,...
POSIX_BYTES_WRITTEN,-2169.990030,-2383.134204,-2302.347168,15.510158,-3827.800781,-2104.962646,15386.246094,135.771759,-4375.528809,39114.734375
WRITE_10K_100K,-2189.946022,-3865.961135,-2164.196533,-20.773529,-4777.125488,-2761.839111,6103.601562,130.342880,-5055.902344,40884.074219
READ_100K_1M,-2195.874262,-4023.120175,-2255.310059,22.440395,-4341.409668,-3003.796631,-1124.387817,167.452576,-5313.726074,50255.136719
POSIX_MODE,-2575.584399,-2689.931016,-2768.075195,-25.652847,-3885.096680,-2185.000977,9348.550781,189.910950,-4753.264160,36103.546875


In [288]:
aggregated_feature_attributions_avg.mean(axis=1).to_frame("avg_attribution_coeff")

Unnamed: 0,avg_attribution_coeff
POSIX_F_READ_TIME,18711.590626
POSIX_TOTAL_TIME,18379.645688
POSIX_F_META_TIME,39682.842085
POSIX_F_SLOWEST_RANK_TIME,4644.414639
POSIX_F_MAX_READ_TIME,2034.775678
...,...
POSIX_BYTES_WRITTEN,3748.849875
WRITE_10K_100K,2628.227450
READ_100K_1M,2818.740500
POSIX_MODE,2675.940333


In [289]:
aggregated_feature_attributions_avg = rank_features_by_importance(add_relative_order_of_features(aggregated_feature_attributions_avg.mean(axis=1).to_frame("avg_attribution_coeff")))
aggregated_feature_attributions_avg

Unnamed: 0,avg_attribution_coeff,relative_order,feature_importance_rank
POSIX_F_META_TIME,39682.842085,1.0,1.0
POSIX_F_READ_TIME,18711.590626,2.0,2.0
POSIX_TOTAL_TIME,18379.645688,3.0,3.0
POSIX_MAX_WRITE_TIME_SIZE,8744.312691,4.0,4.0
POSIX_ACCESS2_ACCESS,6479.626682,5.0,5.0
...,...,...,...
READ_1K_10K,-578.440659,93.0,63.0
POSIX_SIZE_READ_1K_10K,-615.256061,94.0,62.0
POSIX_SLOWEST_RANK,-970.510207,95.0,43.0
rank,-1369.661555,96.0,31.0
