Fetch concept vectors from a trained concept model.
They can be used as input for the logic-based classification training/test.

In [4]:
import pickle
import subprocess

from keras.backend import clear_session

from nn.utils import *


In [5]:
# Filesystem layout: everything lives below the project checkout.
project_dir = '/home/rp218/luke-for-roko'
large_data_dir = project_dir + "/Thesis_Data"
model_dir = large_data_dir + "/Models"

# Name of the backbone whose per-video feature vectors are loaded below;
# it is part of the feature directory name.
FEATURE_EXTRACTOR = 'Resnet50V2'
features_dir = large_data_dir + '/Feature_vectors_' + FEATURE_EXTRACTOR

## Load Data

In [6]:
# Read the pickled concept-extraction output and keep only the columns
# needed downstream (sample id, class label, concept vector) as a DataFrame.
path = os.path.join(project_dir, "Extracted_Concepts/final_dict_new_codex.pkl")
luke_output = pd.read_pickle(path)

labels_keys = ['id', 'label', 'concepts']
labels_dict = dict((column, luke_output[column]) for column in labels_keys)
labels_df_filtered = pd.DataFrame.from_dict(labels_dict)

# Working copy, so later cells can drop rows without touching the frame shown here.
labels = labels_df_filtered.copy()
labels_df_filtered


Unnamed: 0,id,label,concepts
0,SP7Y6KCFF2TD,out,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,LMH26GKJFGQW,play,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,HST5K3C5L9WS,ball,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,ZJ5T4M8F9USB,ball,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,257MNU1H3O56,foul,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
1915,ZDEUXDLTP1TL,play,"[0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
1916,863JKGRGLKMG,play,"[0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
1917,619MYTG7OTT0,out,"[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1918,BWJ683S12AE4,play,"[0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."


In [7]:
# Load the stacked feature tensor `X` and the matching `labels` frame.
# Both are cached on disk after the first build; the cache is only used when
# BOTH files exist, so a half-written cache triggers a rebuild instead of
# being silently mixed with fresh data.
labels_cache_path = 'labels_100.pkl'
features_cache_path = 'data.npy'

if os.path.isfile(labels_cache_path) and os.path.isfile(features_cache_path):
    labels = pd.read_pickle(labels_cache_path)
    X = np.load(features_cache_path)

else:
    X = []
    missing_ids = []
    for sample_id in labels_df_filtered['id']:
        feature_path = os.path.join(features_dir, sample_id + '.npy')
        if os.path.isfile(feature_path):
            # Stored arrays are transposed on load before stacking.
            X.append(np.load(feature_path).T)
        else:
            missing_ids.append(sample_id)
            print(f"Id {sample_id} not found")

    # Drop every sample without a feature file in one pass, starting from the
    # unmodified frame so that re-running this cell gives the same result.
    labels = labels_df_filtered[~labels_df_filtered['id'].isin(missing_ids)].reset_index(drop=True)
    labels.to_pickle(labels_cache_path)
    X = np.stack(X, axis=0)
    np.save(features_cache_path, X)

# Later cells slice X and labels with the same indices, so they must line up.
assert len(labels) == X.shape[0], (
    f"features ({X.shape[0]} rows) and labels ({len(labels)} rows) are out of sync; "
    f"delete {labels_cache_path} and {features_cache_path} to rebuild the cache"
)

print(X.shape)
labels

Id Technically not found
(1919, 360, 2048)


Unnamed: 0,id,label,concepts
0,SP7Y6KCFF2TD,out,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,LMH26GKJFGQW,play,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,HST5K3C5L9WS,ball,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,ZJ5T4M8F9USB,ball,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,257MNU1H3O56,foul,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
1914,ZDEUXDLTP1TL,play,"[0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
1915,863JKGRGLKMG,play,"[0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
1916,619MYTG7OTT0,out,"[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1917,BWJ683S12AE4,play,"[0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."


In [8]:
# Human-readable explanation text, looked up later via boolean concept masks.
concepts_text = pd.DataFrame.from_dict({"explanations": luke_output["explanations"]})

classes = ['strike', 'ball', 'play', 'foul', 'out']
num_classes = len(classes)
n_concepts = 78

# Class name <-> integer id lookups; ids follow the order of `classes`.
class_dict = {class_name: class_id for class_id, class_name in enumerate(classes)}
inv_class_dict = dict(zip(class_dict.values(), class_dict.keys()))

# Stack the per-sample concept vectors into one 2-D matrix, drop the columns
# that are zero for every sample, then keep the first `n_concepts` columns.
# NOTE(review): removing columns shifts the remaining column positions; confirm
# that the rows of `concepts_text` still correspond to the kept columns.
concept_matrix = np.stack(labels['concepts'].values, axis=0)
all_zero_columns = (concept_matrix == 0).all(axis=0)
idx = np.argwhere(all_zero_columns)
concept_matrix = np.delete(concept_matrix, idx, axis=1)[:, :n_concepts]
print(concept_matrix.shape)

# Integer class ids and their one-hot encoding.
y = np.array([class_dict[class_name] for class_name in labels['label']])
y_binary = tf.keras.utils.to_categorical(y, num_classes)
print(y_binary.shape)

# Sequential (unshuffled) split: the first `train_test_split_point` rows are
# the training set, the remainder is the test set.
train_test_split_point = 1700

X_train0, X_test0 = X[:train_test_split_point], X[train_test_split_point:]
y_train_binary, y_test_binary = y_binary[:train_test_split_point], y_binary[train_test_split_point:]
concept_train, concept_test = concept_matrix[:train_test_split_point], concept_matrix[train_test_split_point:]

for split_array in (X_train0, y_train_binary, concept_train,
                    X_test0, y_test_binary, concept_test):
    print(split_array.shape)

(1919, 78)
(1919, 5)
(1700, 360, 2048)
(1700, 5)
(1700, 78)
(219, 360, 2048)
(219, 5)
(219, 78)


In [9]:
# Load the best trained concept model; the checkpoint file name encodes the
# number of concepts and the model id.
model_id = "1655200844"
model = load_model(model_dir + f'/best_concept_Conv_attn_{n_concepts}_{model_id}.h5')

# summary() prints the layer table itself and returns None, so it must not be
# wrapped in print() (that only appends a stray "None" to the output).
model.summary()

Model: "Video_concepts"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Input_1 (InputLayer)           [(None, 360, 2048)]  0           []                               
                                                                                                  
 Conv_1 (Conv1D)                (None, 360, 64)      393280      ['Input_1[0][0]']                
                                                                                                  
 Max_pool_1 (MaxPooling1D)      (None, 90, 64)       0           ['Conv_1[0][0]']                 
                                                                                                  
 Bn_1 (BatchNormalization)      (None, 90, 64)       256         ['Max_pool_1[0][0]']             
                                                                                     

2022-06-24 11:58:40.478193: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-06-24 11:58:40.478716: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-06-24 11:58:40.479242: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (rp218-ThinkPad-T15-Gen-1): /proc/driver/nvidia/version does not exist
2022-06-24 11:58:40.484815: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Total params: 515,643
Trainable params: 515,483
Non-trainable params: 160
__________________________________________________________________________________________________
None


In [10]:
# Evaluate the loaded model on the held-out split. Plain classifiers
# ('Conv1D' / 'LSTM') only report label metrics; every other network type is
# evaluated with calculate_concept_metrics, which also returns concept metrics.
network_type = 'concept_Conv_attn'
is_plain_classifier = network_type in ('Conv1D', 'LSTM')

if is_plain_classifier:
    cf_matrix, accuracy, macro_f1, mismatch, y_pred = calculate_metrics(
        model, X_test0, y_test_binary)
else:
    (cf_matrix, accuracy, macro_f1, mismatch, y_pred,
     cf_concepts, accuracy_concepts) = calculate_concept_metrics(
        model, X_test0, y_test_binary, concept_test)

print(f'Accuracy : {accuracy}')
print(f'F1-score : {macro_f1}')
if not is_plain_classifier:
    # Class names in the row/column order of the confusion matrix printed next.
    print(classes)
print(cf_matrix)
if not is_plain_classifier:
    print(cf_concepts)
    print(accuracy_concepts)


2022-06-24 11:58:40.824219: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 645857280 exceeds 10% of free system memory.
2022-06-24 11:58:41.655558: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 94371840 exceeds 10% of free system memory.
2022-06-24 11:58:41.655637: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 94371840 exceeds 10% of free system memory.
2022-06-24 11:58:41.655661: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 94371840 exceeds 10% of free system memory.
2022-06-24 11:58:41.655682: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 94371840 exceeds 10% of free system memory.


Accuracy : 0.9315068493150684
F1-score : 0.9274226740850446
['strike', 'ball', 'play', 'foul', 'out']
[[38  3  1  0  0]
 [ 1 47  0  0  0]
 [ 0  0 61  0  1]
 [ 2  2  0 33  0]
 [ 0  0  5  0 25]]
[[15830   873]
 [  285    94]]
0.9322093431682473


In [11]:
# Pick a single sample and show its ground truth before running the model.
eval_idx = 0

# Boolean mask over concepts that are annotated as active for this sample.
expected_concepts_idx = concept_matrix[eval_idx].astype(bool)
# The model expects a leading batch axis, hence the extra dimension.
X_eval = tf.expand_dims(X[eval_idx], axis=0)
id_ = labels.loc[eval_idx, 'id']
label = labels.loc[eval_idx, 'label']

print(
    "Evaluating:",
    f"Row id {id_}",
    f"With label {label}",
    "And active concept explanations:",
    concepts_text[expected_concepts_idx],
    X_eval.shape,
    sep="\n",
)

Evaluating:
Row id SP7Y6KCFF2TD
With label out
And active concept explanations:
            explanations
27  Who caught the ball.
(1, 360, 2048)


In [12]:
# Run the model on the selected sample; it returns concept scores and label
# scores as two separate outputs.
concept_preds, label_preds = model.predict(X_eval)
print(f"Model returned label values {label_preds}")
pred_label = int(np.argmax(label_preds))
print(f"Predicted label {inv_class_dict[pred_label]}")

concept_preds = concept_preds.flatten()
print(f"Model returned concept values {concept_preds}")

# A concept counts as predicted when its score is strictly above the threshold.
# One comparison replaces the previous in-place 0/1 overwrite + bool cast, which
# left NaN scores untouched and then cast them to True (i.e. "active").
CONCEPT_THRESHOLD = 0.5
concept_preds = concept_preds > CONCEPT_THRESHOLD
print("Model predicted concept explanations:")
print(concepts_text[concept_preds])

print()

Model returned label values [[0.03126197 0.03089079 0.33920693 0.04121688 0.5574235 ]]
Predicted label out
Model returned concept values [9.0890229e-03 6.3680476e-05 2.6383839e-05 9.6521866e-01 1.9639647e-05
 1.3530254e-04 2.8763572e-05 7.2768329e-05 4.2170381e-05 7.9640895e-01
 4.7829121e-02 2.4359256e-01 1.8143654e-04 4.3727319e-05 9.6751332e-01
 9.8797774e-01 8.6506546e-02 8.1204772e-03 4.6365618e-05 6.4697862e-04
 2.9112101e-03 8.7753332e-01 2.1720312e-05 6.1149993e-05 3.0358791e-01
 3.6013126e-04 2.0789886e-01 2.4420738e-01 4.9181202e-05 3.5589933e-04
 9.3104649e-01 4.6411584e-05 7.5052259e-05 8.4578496e-05 1.7411292e-01
 8.8996589e-03 5.7676435e-03 2.6261508e-03 1.0947287e-03 1.9651376e-05
 4.0447712e-04 4.3812394e-04 8.1198812e-03 1.9812584e-04 1.0809143e-06
 4.0526778e-02 9.6351505e-06 1.7902255e-04 1.3799991e-05 3.7279725e-04
 5.1471591e-04 1.0628998e-03 7.5526536e-06 9.4205907e-06 3.2776177e-02
 1.1704191e-04 4.2235242e-06 1.5142560e-04 1.1805678e-04 6.6835863e-05
 5.1721036e

### Get probability vectors from all probability samples

In [13]:
# model_ids = ['1655195211']
# test_id = 0
# explanations = concepts_text['explanations'].values
# print(explanations)
#
# for i, model_id in enumerate(model_ids):
#     model = load_model(model_dir + f'/best_concept_Conv_attn_{n_concepts}_{model_id}.h5')
#     output = attn_prediction(model, X_train0)
#     print(output[0])
#     preds = output[0]
#     preds[preds >= 0.5] = 1
#     preds[preds < 0.5] = 0
#     print(preds)
#     print(np.nonzero(preds))
#

In [14]:
# preds = output[test_id]
# preds[preds >= 0.5] = 1
# preds[preds < 0.5] = 0
# preds = preds.astype(bool)
# print(f"Explanations found {explanations[preds]}")
# X_visualize = X_train0[test_id]
#
# pred, pred_label, pred_concepts, pred_attn = visualize_concepts(X_visualize, model,
#                                                                 explanations, inv_class_dict)
#
# print(output.shape)
# print(output)
#

In [25]:
import os

# For every trained model, predict concept vectors for the train and test
# splits and pickle the result of get_concept_vector_dict to
# for_fastlas_examples_{train,test}_{index}.pkl in the working directory.
clear_session()
model_ids = ['1655195211', '1655195642', '1655196032', '1655196419', '1655196803', '1655197228', '1655197609',
             '1655197990', '1655198373', '1655198758']
# model_ids = ['1655196419', '1655196803', '1655197228', '1655197609',
#              '1655197990', '1655198373', '1655198758']
# Added to the loop index in the output file names; useful when resuming with
# the shortened model list above so earlier files are not overwritten.
offset = 0

# Loop-invariant inputs: explanation texts and the true labels of each split.
concept_explanations = concepts_text["explanations"].values
true_train_labels = labels["label"][:train_test_split_point].values
true_test_labels = labels["label"][train_test_split_point:].values
export_splits = (
    ("train", X_train0, true_train_labels),
    ("test", X_test0, true_test_labels),
)

for i, model_id in enumerate(model_ids):
    with tf.device("/cpu:0"):
        model = load_model(model_dir + f'/best_concept_Conv_attn_{n_concepts}_{model_id}.h5')
        i_prime = i + offset
        print(f"Iteration {i}")

        for split_name, split_features, split_labels in export_splits:
            # The model's second output (label scores) is not needed here.
            concept_preds, _ = model.predict(split_features)
            output = get_concept_vector_dict(concept_explanations, concept_preds, split_labels)
            # Context manager guarantees the file is flushed and closed.
            with open(f"for_fastlas_examples_{split_name}_{i_prime}.pkl", 'wb') as out_file:
                pickle.dump(output, out_file)

        # Disabled variant: use attn_prediction(model, <split>) instead of
        # model.predict(...) and write to for_fastlas_examples_{split}_attn_{i_prime}.pkl.

        # Release the model and Keras graph state before loading the next one.
        del model
        clear_session()


Iteration 0
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
Iteration 8
Iteration 9


In [None]:
# Inspect the first exported training file. The file is written by this
# notebook; never unpickle files from untrusted sources.
with open("for_fastlas_examples_train_0.pkl", 'rb') as in_file:
    fastlas_train_examples_0 = pickle.load(in_file)
fastlas_train_examples_0