In [1]:
#imports
import os
import pickle
from tqdm import tqdm
import utils
import gzip
import numpy as np

from dataset import Dataset
from feature_extractor import FeatureExtractor

from sklearn.metrics import matthews_corrcoef as mcc
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, classification_report

2022-08-31 14:39:56.939273: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-08-31 14:39:56.939287: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
def load_OOD_data(id_dataset, num_classes_id, threat_type, variation, severity, compl_path,
                  num_instances=None, root_path='../PRDC_2021_Data_profile_module/data'):
    """Load the 'test' split of a threat dataset and shift OOD labels back onto ID class ids.

    Args:
        id_dataset: name of the in-distribution dataset, forwarded to ``utils.load_dataset``.
        num_classes_id: number of in-distribution classes; every label ``>= num_classes_id``
            has this value subtracted from it.
        threat_type: threat category, forwarded to ``utils.load_dataset``.
        variation: not used by this function; kept so existing callers keep working.
        severity: not used by this function; kept so existing callers keep working.
        compl_path: dataset sub-path, forwarded to ``utils.load_dataset``.
        num_instances: optional cap on how many leading instances are returned.
            ``None`` (the default) keeps ``len(y_test)`` instances, as before.
        root_path: data root handed to ``utils.load_dataset``.

    Returns:
        Tuple ``(X_test, y_test)``: ``X_test`` cast to float32 and ``y_test`` as a list of
        corrected labels, both cut to the first ``num_instances`` entries.

    Raises:
        ValueError: if ``num_instances`` is negative (a negative slice bound would silently
            drop instances from the end instead of limiting the count).
    """
    if num_instances is not None and num_instances < 0:
        raise ValueError('num_instances must be non-negative, got {}'.format(num_instances))

    #loading test set
    X_test, y_test = utils.load_dataset(
        threat_type, compl_path, id_dataset, 'test', root_path=root_path)
    #correcting classes (for ood) from threat generator
    y_test = [y - num_classes_id if y >= num_classes_id else y for y in y_test]
    X_test = X_test.astype('float32')
    print(np.shape(X_test), np.shape(y_test))

    #number of instances to be used in the test (defaults to all of them)
    if num_instances is None:
        num_instances = len(y_test)

    return X_test[:num_instances], y_test[:num_instances]

In [3]:
# Returns the updated dictionary containing the monitors' results.
def test_monitor(arr_monitor_results, monitor, X_test, y_test, pred_test, features_test, logits_test, softmax_test):
    
    for X, y, pred, features, logits, softmax in tqdm(zip(X_test, y_test, pred_test, 
                                             features_test, logits_test, softmax_test)): 

        m_score, m_pred = None, None
        
        if monitor.name == 'react':
            m_score = monitor.predict(np.array([features]))[0]

        elif monitor.name == 'maxlogit' or monitor.name == 'energy':
            m_score = monitor.predict(np.array([logits]))[0]

        elif monitor.name == 'msp':
            m_score = monitor.predict(np.array([softmax]))[0]

        elif monitor.name == 'mahalanobis':
            m_score = monitor.predict(np.array([features]), [pred])[0]

        #all above methods are based on scores    
        if m_score is not None: m_pred = m_score<=monitor.arr_thresholds[pred]

        if monitor.name == 'oob':
            m_pred = monitor.predict([features], [pred])[0]

        elif monitor.name == 'shine':
            avg_sim_TP, logprob = monitor.predict(np.array([X]), features, pred)

            if avg_sim_TP >= monitor.arr_thresholds['S'][pred]['TP']:
                m_pred = False
            elif avg_sim_TP <= monitor.arr_thresholds['S'][pred]['FN']:
                m_pred = True
            else:
                m_pred = np.exp(logprob) <= monitor.arr_thresholds['HINE'][pred]

        if m_pred is not None:
            #print('{} reacted? ... {}'.format(monitor.name, m_pred))
            try:
                arr_monitor_results[monitor.name].append(m_pred)
            except:
                arr_monitor_results.update({monitor.name: [m_pred]})
        else:
            print('unknown monitor')
            return None

    return arr_monitor_results

In [4]:
### testing monitors
# For every (variation, severity) pair: load the shifted test set, extract features with the
# model, run every stored monitor on it and pickle the per-monitor decisions.

#model params
batch_size = 10
model = "resnet"

#monitor params
# ReLU layer id(s) handed to the FeatureExtractor, for monitors that use activation-layer info
layer_relu_ids = [32]
# index applied to the features returned by get_features before they reach the monitors
id_layer_monitored = -1
# NOTE(review): the two settings below are not referenced by any cell visible here
additional_transform = None
adversarial_attack = None

#dataset params
id_dataset = "cifar10"
num_classes_id = 10
threat_type = 'distributional_shift'
variations = ['snow', 'fog', 'brightness', 'saturate', 'contrast']
severities = [1, 2, 3, 4, 5]

#loading monitors
monitor_names = ['oob','react', 'msp', 'maxlogit', 'energy', 'mahalanobis','shine']
arr_monitors = []

for name in monitor_names:
    monitor_filename = os.path.join('Monitors', id_dataset, '{}.sav'.format(name))
    # NOTE: unpickling runs arbitrary code; only load monitor files from a trusted source.
    with open(monitor_filename, 'rb') as monitor_file:
        monitor = pickle.load(monitor_file)
    arr_monitors.append(monitor)

# make sure the output folder exists before the (long) experiment loop starts
results_dir = os.path.join('results', id_dataset, threat_type)
os.makedirs(results_dir, exist_ok=True)

for variation in variations:
    for severity in severities:
        compl_path = '{}_severity_{}'.format(variation, severity)
        
        #loading OOD data
        X_test, y_test = load_OOD_data(id_dataset, num_classes_id, threat_type, variation, severity, compl_path)
        
        #extracting and storing features from test data using an ML model trained on ID data
        dataset_test = Dataset(compl_path, "test", model, (X_test, y_test), batch_size=batch_size)
        feature_extractor = FeatureExtractor(model, id_dataset, layer_relu_ids)
        features_test, logits_test, softmax_test, pred_test, lab_test = feature_extractor.get_features(dataset_test)
        
        arr_monitor_results = {}
        
        for monitor in arr_monitors:
            print('processing', monitor.name, compl_path)
            updated_results = test_monitor(arr_monitor_results, monitor, X_test, y_test, pred_test, 
                         features_test[id_layer_monitored], logits_test, softmax_test)
            # test_monitor returns None for a monitor it does not recognise; do not let that
            # wipe out the results already collected for the other monitors
            if updated_results is not None:
                arr_monitor_results = updated_results

        #saving results
        filename = os.path.join(results_dir, '{}.sav'.format(compl_path))
        with open(filename, 'wb') as results_file:
            pickle.dump(arr_monitor_results, results_file)

loading data from ../PRDC_2021_Data_profile_module/data/benchmark_dataset/distributional_shift/cifar10/snow_severity_1
(70000, 32, 32, 3) (70000,)


70000it [00:02, 28152.00it/s]
70000it [00:04, 17387.15it/s]
70000it [00:00, 124116.65it/s]
70000it [00:00, 135543.36it/s]
70000it [00:02, 28947.82it/s]
70000it [00:01, 35890.55it/s]
70000it [02:24, 484.97it/s]


loading data from ../PRDC_2021_Data_profile_module/data/benchmark_dataset/distributional_shift/cifar10/snow_severity_2
(70000, 32, 32, 3) (70000,)
Extracting layers: 'layer4.2.relu_1'


100%|███████████████████████████████████████| 7000/7000 [01:41<00:00, 69.21it/s]
70000it [00:02, 24007.33it/s]
70000it [00:04, 16882.66it/s]
70000it [00:00, 136228.81it/s]
70000it [00:00, 131364.81it/s]
70000it [00:02, 27935.29it/s]
70000it [00:01, 35917.61it/s]
70000it [02:22, 489.77it/s]


loading data from ../PRDC_2021_Data_profile_module/data/benchmark_dataset/distributional_shift/cifar10/snow_severity_3
(70000, 32, 32, 3) (70000,)


70000it [00:02, 25563.10it/s]
70000it [00:04, 17398.08it/s]
70000it [00:00, 137775.78it/s]
70000it [00:00, 137821.76it/s]
70000it [00:02, 28521.03it/s]
70000it [00:02, 31500.40it/s]
70000it [02:21, 493.74it/s]


loading data from ../PRDC_2021_Data_profile_module/data/benchmark_dataset/distributional_shift/cifar10/snow_severity_4
(70000, 32, 32, 3) (70000,)
Extracting layers: 'layer4.2.relu_1'


100%|███████████████████████████████████████| 7000/7000 [01:39<00:00, 70.27it/s]
70000it [00:02, 24483.84it/s]
70000it [00:04, 17301.49it/s]
70000it [00:00, 135899.21it/s]
70000it [00:00, 127165.69it/s]
70000it [00:02, 28821.20it/s]
70000it [00:01, 35609.42it/s]
70000it [02:22, 491.59it/s]


loading data from ../PRDC_2021_Data_profile_module/data/benchmark_dataset/distributional_shift/cifar10/snow_severity_5
(70000, 32, 32, 3) (70000,)
Extracting layers: 'layer4.2.relu_1'


100%|███████████████████████████████████████| 7000/7000 [01:40<00:00, 69.49it/s]
70000it [00:02, 25535.54it/s]
70000it [00:04, 17164.08it/s]
70000it [00:00, 137175.51it/s]
70000it [00:00, 151745.57it/s]
70000it [00:02, 29264.05it/s]
70000it [00:01, 38006.45it/s]
70000it [02:22, 490.94it/s]


loading data from ../PRDC_2021_Data_profile_module/data/benchmark_dataset/distributional_shift/cifar10/fog_severity_1
(70000, 32, 32, 3) (70000,)
Extracting layers: 'layer4.2.relu_1'


100%|███████████████████████████████████████| 7000/7000 [01:40<00:00, 69.52it/s]
70000it [00:01, 43207.27it/s]
70000it [00:04, 17307.17it/s]
70000it [00:00, 124624.04it/s]
70000it [00:00, 126389.66it/s]
70000it [00:02, 28193.40it/s]
70000it [00:01, 36504.96it/s]
70000it [02:23, 487.48it/s]


loading data from ../PRDC_2021_Data_profile_module/data/benchmark_dataset/distributional_shift/cifar10/fog_severity_2
(70000, 32, 32, 3) (70000,)
Extracting layers: 'layer4.2.relu_1'


100%|███████████████████████████████████████| 7000/7000 [01:41<00:00, 69.25it/s]
70000it [00:01, 38385.21it/s]
70000it [00:04, 17125.33it/s]
70000it [00:00, 138341.02it/s]
70000it [00:00, 137963.39it/s]
70000it [00:02, 28706.21it/s]
70000it [00:01, 37905.54it/s]
70000it [02:21, 493.08it/s]


loading data from ../PRDC_2021_Data_profile_module/data/benchmark_dataset/distributional_shift/cifar10/fog_severity_3
(70000, 32, 32, 3) (70000,)
Extracting layers: 'layer4.2.relu_1'


 32%|████████████▍                          | 2224/7000 [00:32<01:08, 69.35it/s]

KeyboardInterrupt



In [None]:
#Evaluation
## Ex: loading results for distributional shift snow severity 4
eval_variation, eval_severity = 'snow', 4
eval_compl_path = '{}_severity_{}'.format(eval_variation, eval_severity)

file = os.path.join('results', id_dataset, threat_type, '{}.sav'.format(eval_compl_path))
# NOTE: unpickling runs arbitrary code; only load result files produced by this notebook.
with open(file, 'rb') as results_file:
    all_results = pickle.load(results_file)

# Recompute labels and predictions for the split being evaluated. Reusing pred_test / y_test /
# lab_test from the experiment loop would compare these stored monitor results against whichever
# (variation, severity) the loop processed last. This repeats the feature extraction for one split.
X_eval, y_eval = load_OOD_data(id_dataset, num_classes_id, threat_type,
                               eval_variation, eval_severity, eval_compl_path)
dataset_eval = Dataset(eval_compl_path, "test", model, (X_eval, y_eval), batch_size=batch_size)
feature_extractor_eval = FeatureExtractor(model, id_dataset, layer_relu_ids)
features_eval, logits_eval, softmax_eval, pred_eval, lab_eval = feature_extractor_eval.get_features(dataset_eval)

## ground truth
#if pred == y then monitor does not need to activate (False)
#or monitor should activate otherwise (True)
m_true = [bool(pred != y) for pred, y in zip(pred_eval, y_eval)]

print('accuracy of the ML model alone')
print('classification_report', classification_report(lab_eval, pred_eval))
print('mcc', mcc(lab_eval, pred_eval))
print('balanced_accuracy_score', balanced_accuracy_score(lab_eval, pred_eval))
print('\n\n')

# one report per monitor: its recorded decisions against the ground truth built above
for k,v in all_results.items():
    print(k, 'monitor')
    print('classification_report', classification_report(m_true, v))
    print('mcc', mcc(m_true, v))
    print('balanced_accuracy_score', balanced_accuracy_score(m_true, v))
    print('\n\n')

In [None]:
# Scratch notes kept as a bare string literal; nothing in this cell runs as logic.
# Contents: figures labelled "snow threshold 0.9", a pasted threshold-selection snippet,
# and a classification report labelled "SHINE new 2".
# NOTE(review): the two bare numbers after each entry are presumably MCC and balanced
# accuracy (the order in which the evaluation cell prints them) — confirm.
'''
snow threshold 0.9
0.3713363713102381
0.640411057439756


id_threshold = int(len_scores*threshold_shine)
min_threshold_sim = sorted_scores[-id_threshold]
max_threshold_sim = sorted_scores[id_threshold]

SHINE new 2
              precision    recall  f1-score   support

       False       0.62      0.86      0.72      9367
        True       0.94      0.82      0.87     26665

    accuracy                           0.83     36032
   macro avg       0.78      0.84      0.80     36032
weighted avg       0.86      0.83      0.83     36032

0.6156203195994662
0.8364962945766474
'''

In [None]:
# Scratch notes kept as a bare string literal: pasted classification reports for the
# React, Max Softmax probability, Max Logits, Energy and Mahalanobis monitors.
# NOTE(review): the dataset/severity these were measured on is not recorded here; the two
# numbers under each report are presumably MCC and balanced accuracy — confirm.
'''
React
              precision    recall  f1-score   support

       False       0.98      0.02      0.04     25818
        True       0.64      1.00      0.78     44182

    accuracy                           0.64     70000
   macro avg       0.81      0.51      0.41     70000
weighted avg       0.76      0.64      0.50     70000

0.11052419184412475
0.5098443945123656

Max Softmax probability
              precision    recall  f1-score   support

       False       0.00      0.00      0.00     25818
        True       0.63      1.00      0.77     44182

    accuracy                           0.63     70000
   macro avg       0.32      0.50      0.39     70000
weighted avg       0.40      0.63      0.49     70000

0.0
0.5

Max Logits
              precision    recall  f1-score   support

       False       0.37      1.00      0.54     25818
        True       0.87      0.00      0.01     44182

    accuracy                           0.37     70000
   macro avg       0.62      0.50      0.27     70000
weighted avg       0.69      0.37      0.20     70000

0.026908422094739797
0.5015141376479284

Energy
              precision    recall  f1-score   support

       False       0.58      0.65      0.62     25818
        True       0.78      0.73      0.75     44182

    accuracy                           0.70     70000
   macro avg       0.68      0.69      0.69     70000
weighted avg       0.71      0.70      0.70     70000

0.37349132101691174
0.6905844936380467

Mahalanobis
              precision    recall  f1-score   support

       False       1.00      0.02      0.03     25818
        True       0.64      1.00      0.78     44182

    accuracy                           0.64     70000
   macro avg       0.82      0.51      0.41     70000
weighted avg       0.77      0.64      0.50     70000

0.10475331158410182
0.5086373847703153
'''