In [60]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import os
from tqdm import tqdm
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' #for no tensorflow warnings"

# Try Clustering Neurons with similar Gradients

## build model

In [7]:
# load dataset
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

x_train = x_train.reshape(60000, 28, 28).astype("float32") / 255
x_test = x_test.reshape(10000, 28, 28).astype("float32") / 255

y_train = y_train.astype("float32")
y_test = y_test.astype("float32")
#y_train = tf.keras.utils.to_categorical(y_train, num_classes = 10)
#y_test = tf.keras.utils.to_categorical(y_test, num_classes = 10)

# Reserve 10,000 samples for validation
x_val = x_train[-10000:]
y_val = y_train[-10000:]
x_train = x_train[:-10000]
y_train = y_train[:-10000]

In [18]:
# calculate jacobian -- single input 
def get_jacobian(inp):
    outputs = []
    with tf.GradientTape(persistent=True) as tape:
        tape.watch(inp)
        outputs.append(inp)
        out = inp
        for layer in model.layers:
            out = layer(out)
            tape.watch(out)
            outputs.append(out)
        ret = out
        print([var.name for var in tape.watched_variables()])

    jacobian_list = []
    for o in outputs:
        grad = tape.jacobian(ret, o)# works dret/do
        assert grad.shape[0] == 1
        if grad.shape[3] == 28:#TODO: hard-coded -- find way to only squeeze dim with shape=1
            grad = tf.reshape(grad, shape=(grad.shape[1], grad.shape[3], grad.shape[4]))
        else:
            grad = tf.reshape(grad, shape=(grad.shape[1], grad.shape[3]))        
        jacobian_list.append(tf.transpose(grad)) # i-th row corresponds to da_i/dy for activation neuron a_i
        #jacobian_list.append(grad)
    return jacobian_list

# calculate jacobian -- multiple inputs
def calc_avg_jacobian(inputs):
    jac_total = None
    for i in range(inputs.shape[0]):
        inp_test = inputs[i]
        inp_test = tf.expand_dims(inp_test, axis=0)
        jac_list = get_jacobian(inp_test)
        if not jac_total:
            jac_total = jac_list
        else:
            for i in range(len(jac_total)):
                jac_total[i] += jac_list[i]
    return jac_total


In [62]:
num_samples=30
x_test_labeli = tf.squeeze(x_test[tf.where(y_test == float(0))])
inpt = x_test_labeli[0:num_samples]
jac_dict = calc_avg_jacobian(inpt)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:22<00:00,  1.34it/s]


In [9]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=(28,28,1)))
model.add(tf.keras.layers.Dense(100, activation="relu"))
model.add(tf.keras.layers.Dense(100, activation="relu"))
model.add(tf.keras.layers.Dense(100, activation="relu"))
model.add(tf.keras.layers.Dense(100, activation="relu"))
model.add(tf.keras.layers.Dense(100, activation="relu"))
model.add(tf.keras.layers.Dense(10))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 784)               0         
_________________________________________________________________
dense (Dense)                (None, 100)               78500     
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_3 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_4 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_5 (Dense)              (None, 10)                1

In [10]:
def softmax_loss():        
    def loss(y_true, y_pred):
        y_soft = tf.keras.activations.softmax(y_pred, axis=1)
        loss_soft=keras.losses.SparseCategoricalCrossentropy()(y_true, y_soft)
        return loss_soft
    return loss

In [11]:
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=3e-4),
    loss=softmax_loss(), 
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
)

In [None]:
history = model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=20,
    validation_data=(x_val, y_val),
    verbose=2,    
)

In [55]:
test_loss, test_acc = model.evaluate(x_test,  y_test, verbose=1)



### calculate Jacobian for same-class inputs

In [16]:
num_samples = 50
for i in range(0, 10):    
    x_test_labeli = tf.squeeze(x_test[tf.where(y_test == float(i))])
    inpt = x_test_labeli[0:num_samples]
    jac_total = calc_avg_jacobian(inpt)
    np.save("gradients_class_{0}.npy".format(i), jac_total[2:-1], allow_pickle=True)

None


AttributeError: 'NoneType' object has no attribute 'shape'

## save and load Jacobians

In [41]:
#np.save("gradients_class_{0}.npy".format(i), jac_total[2:-1], allow_pickle=True)

In [3]:
jac_per_class = {}
for i in range(0, 10):
    jac_per_class[i] = np.load("gradients_class_{0}.npy".format(i), allow_pickle=True)

## normalize Jacobians

In [4]:
tf.executing_eagerly()

True

In [5]:
def pairwise_euclidean_dist(A):
    r = tf.reduce_sum(A*A, 1)
    r = tf.reshape(r, [-1, 1])    # turn r into column vector
    D = r - 2*tf.matmul(A, tf.transpose(A)) + tf.transpose(r)
    return tf.math.abs(D)

In [182]:
import sklearn
def pairwise_cosine_dist(A):
    D = sklearn.metrics.pairwise.cosine_distances(A)
    return D

In [6]:
def jac_l2normalized(jac_total):
    # every row-vector (here: da_i/dy for one a_i-neuron) has length 1
    jac_total_l2normalized = []
    jac_l2norm_dists = []
    for i in range(len(jac_total)):
        j = jac_total[i]
        a = tf.math.l2_normalize(j, axis=1)
        jac_total_l2normalized.append(a)
        d = pairwise_euclidean_dist(a)
        jac_l2norm_dists.append(d)
    return jac_l2norm_dists

In [126]:
def jac_unnormalized(jac_total):
    jac_l2norm_dists = []
    for i in range(len(jac_total)):
        j = jac_total[i]
        #d = pairwise_euclidean_dist(j)
        d = pairwise_cosine_dist(j)
        jac_l2norm_dists.append(d)
    return jac_l2norm_dists

In [8]:
def jac_normalized(jac_total):
    # all elements lie between 0 and 1
    jac_total_normalized = []
    jac_norm_dists = []
    for i in range(len(jac_total)):
        j = jac_total[i]
        mean = tf.expand_dims(tf.math.reduce_mean(j, axis=1),axis=1)
        stdv =  tf.expand_dims(tf.math.reduce_std(j, axis=1),axis=1)
        a = (j - mean)/stdv
        jac_total_normalized.append(a)
        d = pairwise_euclidean_dist(a)
        jac_norm_dists.append(d)
    return jac_norm_dists

In [127]:
# compute da_i/dy da_j/dy distances for each label-class, then average over distances
#dists = jac_l2normalized(jac_per_class[0])
dist_fn = jac_unnormalized
dists = dist_fn(jac_per_class[0])

dists_tensor = tf.expand_dims(tf.convert_to_tensor(jac_l2normalized(jac_per_class[0])), axis=1)
for class_ in range(1, 10):
    dists2 = dist_fn(jac_per_class[class_])
    dists2_tensor = tf.expand_dims(tf.convert_to_tensor(dists2), axis=1)
    dists_tensor = tf.concat([dists_tensor, dists2_tensor], axis=1)
    for j in range(len(dists2)):
        dists[j] += dists2[j]
for j in range(len(dists)):
    dists[j] = dists[j]/10
    
print(dists_tensor.shape)
stdvs = tf.math.reduce_std(dists_tensor, axis=1)
print(tf.math.reduce_max(stdvs, axis=(1, 2)))

(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(5, 10, 100, 100)
tf.Tensor([0.9130637 0.9090496 0.9069576 0.7158661 0.5781299], shape=(5,), dtype=float32)


In [183]:
# only use distances over _one_ class
class_idx = 1
dists = jac_unnormalized(jac_per_class[class_idx])

In [79]:
# use only da_i/dy_k of input-class k --> build "jacobian" with less noise
# init new jacs
num_classes = 10
denoised_jacobian = np.zeros((5, 100, num_classes)) # shape = (#layers, #neurons_p_layer, #out_neurons)
for idx in jac_per_class:
    denoised_jacobian[:, :, idx] = jac_per_class[idx][:,:,idx]

dists = jac_normalized(denoised_jacobian) # list of layer-1 dists in idx 1

## save and load model

In [10]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import os
import sys
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' #for no tensorflow warnings"
sys.path.append(os.path.abspath('..'))

In [11]:
from src import *
from models import *
from src.main import Cluster_Class, set_file_model, set_keras_model
from src.models import Keras_Model
from src.cluster_methods import SensitivityAnalysis



In [3]:
#model.save('MNIST_5x100.h5')

In [190]:
def softmax_loss():        
    def loss(y_true, y_pred):
        y_soft = tf.keras.activations.softmax(y_pred, axis=1)
        loss_soft=keras.losses.SparseCategoricalCrossentropy()(y_true, y_soft)
        return loss_soft
    return loss
new_model = tf.keras.models.load_model('MNIST_5x100.h5', custom_objects={'loss': softmax_loss()})

## cluster similar neurons

In [191]:
m = set_keras_model(new_model) # layer printed last does not get clustered

In [187]:
# only use distances over _one_ class
m.update_keras()
m.test_accuracy()
for idx in range(10):
    print(idx, m.test_MNIST_labelacc(idx))

Test set accuracy:  0.9746000170707703
0 0.9867346882820129
1 0.9885462522506714
2 0.9593023061752319
3 0.9613861441612244
4 0.9867616891860962
5 0.9652466177940369
6 0.9780793190002441
7 0.9747081995010376
8 0.9691991806030273
9 0.9742318987846375


In [192]:
SA = SensitivityAnalysis(dists_l1=dists[0],dists_l2=dists[1], dists_l3=dists[2], 
                         dists_l4=dists[3],dists_l5=dists[4], eps=0.2)

In [193]:
# 50-3 tries clustering the third layer, 50-2 tries the second, 50-1 the first
cc = Cluster_Class(m, [1,1,100-3,100-4,100-5], cl_method="gradients", SA=SA)
acc, dic = cc.perform_clustering(verbose=True)
print(acc, dic)

Test set accuracy:  0.9746000170707703
----- [Get Activations] -----
----- [Start Clustering] -----
   - Layer 1
----- [Get Clusters] -----
K= 99
DO NOTHING
[]
   - Layer 2
----- [Get Clusters] -----
K= 99
DO NOTHING
[]
   - Layer 3
----- [Get Clusters] -----
K= 3
12  different clusters found 
num clustered neurons= 57
[(0, array([ 2, 12, 24, 25, 29, 30, 38, 46, 59, 61, 68, 69, 72, 80, 90, 98]), 25, 0, 55.223244), (1, array([ 4, 21, 50]), 21, 0, 23.908281), (2, array([ 5, 43]), 5, 0, 35.683643), (3, array([ 7, 28, 79]), 79, 0, 44.760746), (4, array([10, 15, 16, 17, 32, 41, 44, 74, 78, 85, 97, 99]), 41, 0, 70.96557), (5, array([26, 34, 62, 66, 71, 81]), 62, 0, 55.346405), (6, array([31, 92]), 31, 0, 36.940388), (7, array([36, 37, 95]), 95, 0, 23.067339), (8, array([42, 60, 64, 89]), 64, 0, 28.166096), (9, array([45, 55]), 45, 0, 33.21388), (10, array([48, 52]), 48, 0, 37.31882), (11, array([91, 93]), 91, 0, 27.665648)]
----- [Apply the Clustering to the Network] -----
   - Layer 4
-----

In [194]:
print("curr dist-dependency on label #", class_idx)
for idx in range(10):
    print(idx, m.test_MNIST_labelacc(idx))

curr dist-dependency on label # 1
0 0.6510204076766968
1 0.8757709264755249
2 0.7306201457977295
3 0.9683168530464172
4 0.709775984287262
5 0.8206278085708618
6 0.9874739050865173
7 0.9834630489349365
8 0.9476386308670044
9 0.9167492389678955


In [181]:
# 50-3 tries clustering the third layer, 50-2 tries the second, 50-1 the first
cc = Cluster_Class(m, [1,1,100-46,100-34,100-51], cl_method="random")
cc.perform_clustering(verbose=True)

Test set accuracy:  0.9746000170707703
----- [Get Activations] -----
----- [Start Clustering] -----
   - Layer 1
----- [Get Clusters] -----
K= 99
[]
   - Layer 2
----- [Get Clusters] -----
K= 99
[]
   - Layer 3
----- [Get Clusters] -----
K= 46
sample  100 46
[(0, array([ 2,  8, 18, 20, 30, 62, 68, 80, 82, 94, 96]), 94, 0, 54.99963), (1, array([ 1,  9, 19, 27, 28, 29, 32, 33, 36, 44, 49, 51, 57, 71, 79, 84, 90,
       93]), 90, 0, 67.02696), (2, array([ 7, 11, 22, 35, 42, 47, 59, 63, 65, 67, 69, 70, 73, 77, 95, 98, 99]), 7, 0, 53.4461)]
----- [Apply the Clustering to the Network] -----
   - Layer 4
----- [Get Clusters] -----
K= 34
sample  100 34
[(0, array([ 6, 41, 55, 56, 62, 65, 75, 79, 94]), 6, 0, 77.30345), (1, array([ 0,  1,  5,  7, 45, 51, 53, 54, 73, 74, 77, 93, 95, 96]), 73, 0, 77.68521), (2, array([ 2, 10, 11, 21, 31, 33, 34, 49, 58, 71, 99]), 11, 0, 69.685455)]
----- [Apply the Clustering to the Network] -----
   - Layer 5
----- [Get Clusters] -----
K= 51
sample  100 51
[(0, a

(0.6304000020027161, {'rr': 0.244, 'time': 5.310251951217651, 'rr_rel': 0.244})

## Utils

In [32]:
# test if not clustering changes network
cc = Cluster_Class(m, [10], cl_method="dbscan")
cc.perform_clustering(verbose=True)

Test set accuracy:  0.9760000109672546
----- [Get Activations] -----
----- [Start Clustering] -----
   - Layer 1
----- [Get Clusters] -----
[(0, array([30, 75, 76]), 30, 0, 11.99565)]
----- [Apply the Clustering to the Network] -----
   - Layer 2
----- [Get Clusters] -----
[(0, array([ 4,  7, 14, 15, 26, 45, 72, 97]), 7, 0, 12.251186)]
----- [Apply the Clustering to the Network] -----
   - Layer 3
----- [Get Clusters] -----
[(0, array([ 5, 22, 39, 57, 84, 88, 89, 95, 98]), 5, 0, 10.84604)]
----- [Apply the Clustering to the Network] -----
   - Layer 4
----- [Get Clusters] -----
[(0, array([12, 15, 18, 54, 65, 87]), 12, 0, 7.6198654)]
----- [Apply the Clustering to the Network] -----
   - Layer 5
----- [Get Clusters] -----
[(0, array([ 4,  5,  7,  8, 16, 28, 30, 43, 83, 86, 88, 90, 91, 94, 97]), 4, 0, 11.6834)]
----- [Apply the Clustering to the Network] -----
Overall Time:  3.1475541591644287
Test set accuracy:  0.972100019454956
----- [End Clustering] -----


(0.972100019454956,
 {'rr': 0.07199999999999995, 'time': 5.223412036895752, 'rr_rel': 0.072})

In [6]:
km.update_keras()

<tensorflow.python.keras.engine.sequential.Sequential at 0x7f7c02232c40>

In [5]:
km = set_keras_model(new_model)
km.test_accuracy()
print(km.params.keys())

2021-11-09 23:12:24.210482: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2021-11-09 23:12:24.225949: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2700905000 Hz


Test set accuracy:  0.9232000112533569
dict_keys(['W1', 'b1', 'W2', 'b2', 'W3', 'b3', 'W4', 'b4'])
