In [13]:
# Star imports keep their original relative position (nnclr first, utils after
# the explicit imports) so that name shadowing is unchanged.
from nnclr import *

import os
import time

import matplotlib.pyplot as plt
import numpy as np
import scipy
import sklearn
import tensorflow as tf

from utils import *
from waveform_data import WaveFormDataset
# Seed every RNG the notebook draws from: TensorFlow for the model side and
# NumPy's global generator, which visualize() uses to pick scatter colours.
tf.random.set_seed(0)
np.random.seed(0)

# Load model

In [14]:
# --- Configuration ---------------------------------------------------------
# Paths are assembled with os.path.join so the notebook also runs on systems
# whose path separator is not a backslash; on Windows the resulting strings are
# the same as the previous hard-coded literals.
DSET_PATH = os.path.join("..", "data")
DSET_NAME = "hybrid_static_siprobe_64C_600_S12"
GENERATE_TEMPLATES = True  # gates the template export loop at the end of the notebook

file_id = "ckpt_1654866143"  # name of the checkpoint directory under "checkpoints"

# --- Load the trained encoder ------------------------------------------------
# The custom layer classes are passed explicitly so Keras can rebuild them when
# deserialising the .h5 file (they are presumably provided by the star imports
# at the top of the notebook -- TODO confirm).
encoder = tf.keras.models.load_model(
    os.path.join("checkpoints", file_id, "encoder_high_acc.h5"),
    custom_objects={
        "ResidualBlock": ResidualBlock,
        "LatentLayer": LatentLayer,
        "RandomBrightness": RandomBrightness,
    },
)

# Compiled with default settings only; no optimizer or loss is supplied here.
encoder.compile()



In [3]:

def _flatten_per_sample(batch):
    """Return ``batch`` as a 2-D NumPy array with one row per sample (axis 0 is kept)."""
    arr = np.asarray(batch)
    return arr.reshape(arr.shape[0], int(np.prod(arr.shape[1:])))


def get_encoded_dataset(data_gen, enc, iterate=20, supervised=False):
    """Encode up to ``iterate`` batches and label each sample by its distinct target.

    Every batch is read as ``sample[0]`` (encoder input) and ``sample[1]``
    (target). With ``supervised=True``, ``sample[0]`` is itself a pair: its
    first element is fed to ``enc`` and its second element is collected and
    returned as a third array.

    Args:
        data_gen: Iterable yielding batches as described above.
        enc: Callable applied to the encoder input of each batch; its result
            must be convertible with ``np.asarray``.
        iterate: Maximum number of batches to consume. Iteration stops early,
            with a printed message, when ``data_gen`` is exhausted.
        supervised: Whether batches carry the extra per-sample array.

    Returns:
        ``(main_res, main_labels)`` or, when ``supervised`` is true,
        ``(main_res, main_labels, extra)``:

        * ``main_res`` -- encoder outputs, one flattened row per sample.
        * ``main_labels`` -- float32 array; entry ``k`` is the index of sample
          ``k``'s target among the sorted unique targets (``np.unique`` order).
        * ``extra`` -- the collected second elements of ``sample[0]``.

        All arrays are empty when no batch could be read.
    """
    embedding_batches = []
    target_batches = []
    extra_batches = []

    it = iter(data_gen)
    for i in range(iterate):

        print(f"Iterating ... {i}", end="\r")
        try:
            sample = next(it)
        except StopIteration as e:
            # Only exhaustion ends the loop quietly; any other error raised by
            # the data pipeline propagates instead of being reported as
            # "Maximum data" and silently truncating the result.
            print(f"Maximum data {e}")
            break

        inputs, targets = sample[0], sample[1]

        if supervised:  # X is a tuple (sample, ch)
            extra_batches.append(np.asarray(inputs[1]))
            inputs = inputs[0]

        # Flatten everything but the batch axis. This keeps one row per sample
        # (so rows stay aligned with the labels), does not rely on a
        # module-level ``width``, and -- unlike a blanket squeeze -- cannot
        # drop the batch axis when a batch holds a single sample.
        embedding_batches.append(_flatten_per_sample(enc(inputs)))
        target_batches.append(_flatten_per_sample(targets))

    if not target_batches:
        empty_labels = np.zeros(0, dtype=np.float32)
        if supervised:
            return np.array([]), empty_labels, np.array([])
        return np.array([]), empty_labels

    main_res = np.concatenate(embedding_batches, axis=0)
    target_samples = np.concatenate(target_batches, axis=0)

    # ``return_inverse`` yields, for every row, its index among the sorted
    # unique rows -- the same labels as comparing against each unique row in
    # turn, but in a single pass. reshape(-1) guards against NumPy versions
    # that return the inverse with an extra axis when ``axis`` is given.
    _, inverse = np.unique(target_samples, axis=0, return_inverse=True)
    main_labels = inverse.reshape(-1).astype(np.float32)  # float32 kept for existing callers

    if supervised:
        return main_res, main_labels, np.concatenate(extra_batches, axis=0)

    return main_res, main_labels


In [10]:
%matplotlib qt

def visualize(main_res, main_labels, one_hot=False, use_pca=False, max_data = 1000, perplexity=30):
    """Scatter-plot a 2-D projection of the embeddings, one random colour per label.

    Args:
        main_res: 2-D array of embeddings (one row per sample).
        main_labels: Labels aligned with ``main_res``.
        one_hot: If true, labels are converted with ``argmax`` over the last axis.
        use_pca: If true, project straight to 2-D with PCA; otherwise reduce
            with PCA (at most 10 components) and then embed with t-SNE.
        max_data: Only the first ``max_data`` samples are used.
        perplexity: Forwarded to ``TSNE``.

    Returns:
        None. All labels are drawn on the current figure, which is then shown.
    """
    import matplotlib.pyplot as plt
    from sklearn.decomposition import PCA
    from sklearn.manifold import TSNE

    mres = np.asarray(main_res)[:max_data]
    mlabels = main_labels[:max_data]

    if one_hot:
        mlabels = np.argmax(mlabels, axis=-1)

    if use_pca:
        result = PCA(n_components=2, random_state=0).fit_transform(mres)
    else:
        # PCA cannot return more components than there are samples or
        # features, so the pre-reduction width of 10 is capped accordingly.
        n_components = min(10, mres.shape[0], mres.shape[1])
        reduced = PCA(n_components=n_components, random_state=0).fit_transform(mres)
        tsne = TSNE(n_components=2, learning_rate='auto', init='random', random_state=0, verbose=True, perplexity=perplexity)
        result = tsne.fit_transform(reduced)

    for u in np.unique(mlabels):
        xu = result[mlabels == u]
        rgb = np.random.rand(3,)
        plt.scatter(xu[:, 0], xu[:, 1], color=[rgb])

    # Show once, after every label has been drawn: calling show() inside the
    # loop makes non-interactive backends (e.g. inline) emit one figure per label.
    plt.show()
    
#visualize(main_res, main_labels)

## Check similarity with and without channel-distance correction

In [15]:
%matplotlib inline

# Build the supervised dataset for the configured recording; calling the
# dataset object yields the iterable that is fed to the encoder below.
dset3 = WaveFormDataset(
    [os.path.join(DSET_PATH, DSET_NAME)],
    batch_size=64,
    supervised=True,
    min_snr=0,
    use_cache=False,
)
dset_w_idx = dset3()

# Encode at most 20 batches; `ch` is the extra per-sample array returned in
# supervised mode.
main_res, main_labels, ch = get_encoded_dataset(
    dset_w_idx,
    encoder,
    iterate=20,
    supervised=True,
)

Iterating ... 19

In [6]:
# Labels are used as integer cluster ids from here on.
main_labels = main_labels.astype(np.int32)

# Quick sanity summary: distinct labels, embedding shape, label shape.
for summary in (np.unique(main_labels), main_res.shape, main_labels.shape):
    print(summary)


[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58]
(64000, 32)
(64000,)


AssertionError: 

In [None]:
%matplotlib qt

try:
    # Newer scikit-learn releases expose DistanceMetric from sklearn.metrics.
    from sklearn.metrics import DistanceMetric
except ImportError:
    # Older releases only provide it under sklearn.neighbors.
    from sklearn.neighbors import DistanceMetric

dist = DistanceMetric.get_metric('manhattan')  # euclidean or manhattan


def _to_similarity(distances):
    """Column-wise min-max scale a distance matrix to [0, 1] and return 1 - scaled."""
    shifted = distances - np.min(distances, axis=0)
    return 1 - shifted / np.max(shifted, axis=0)


clusters = []
cluster_means = []
cluster_positions = []

# One mean embedding and one mean `ch` value per cluster label.
# (`ch` is presumably the channel of each sample -- TODO confirm.)
for l in np.unique(main_labels):

    in_cluster = main_labels == l
    means = np.mean(main_res[in_cluster], axis=0)
    pos = np.mean(ch[in_cluster])
    cluster_means.append(means)
    cluster_positions.append(pos)

cluster_means = np.array(cluster_means)
cluster_positions = np.array(cluster_positions)


## Similarity #################

sim = _to_similarity(dist.pairwise(cluster_means))


## Position ####################
# Unlike `sim`, this matrix is scaled with its global min/max, not per column.
# (An exponent, **3, was tried on `pos` at this point and left disabled.)
cluster_positions = np.expand_dims(cluster_positions, 1)
pos = scipy.spatial.distance_matrix(cluster_positions, cluster_positions)
pos = pos - np.min(pos)
pos = pos / np.max(pos)
pos = 1 - pos

## Combination #################
comb = sim * pos
comb = comb / np.max(comb)


plt.matshow(sim)
plt.title("Similarity/Distance matrix - Sim")

plt.matshow(pos)
plt.title("Channel 1-distance normalized - CH")

plt.matshow(comb)
plt.title("Sim matrix combined with distance - Sim * CH")

# Zero the diagonal before taking the maximum so self-similarity is ignored.
max_val = np.max(comb - np.diag(np.diag(comb)))
print(f"Max value (excluding diagonals): {max_val}")


comb2 = np.where(comb < .8, 0, comb)
plt.matshow(comb2)
plt.title("Sim matrix combined with distance - .8 thresholded")

# Non-zero entries minus one per row (the diagonal), halved.
# NOTE(review): the halving treats comb2 as symmetric, but `sim` is scaled per
# column and so need not be symmetric -- confirm this count is what is intended.
remaining_candidates = (np.count_nonzero(comb2) - comb2.shape[0]) / 2

print(f"Remaining candidates: {remaining_candidates}, accuracy could be max: {(comb2.shape[0]-remaining_candidates) / comb2.shape[0]}")

## Getting the templates ################
# Min-max scale every template along its last axis.
templates = dset3._waveforms[0].numpy()
w_min = tf.math.reduce_min(templates, keepdims=True, axis=-1)
w_max = tf.math.reduce_max(templates, keepdims=True, axis=-1)
templates = (templates - w_min) / (w_max - w_min)


## Template similarity (before encoding) ##################

t_sim = _to_similarity(dist.pairwise(templates, templates))

plt.matshow(t_sim)
plt.title("Template 1-distance (normalized) ")


templates = encoder(templates)
## this is not a good solution but for trying out it will do it
# Keep the last `cluster_means.shape[0]` embedded templates and reverse their order.
print(templates.shape)
#templates = templates[:cluster_means.shape[0]]
templates = templates[templates.shape[0]-cluster_means.shape[0]:]
print(templates.shape)
templates = templates[::-1]

## Template embedding similarity ##################

t_sim = _to_similarity(dist.pairwise(templates, templates))

plt.matshow(t_sim)
plt.title("Template embedding 1-distance (normalized) ")

## Mean Embedding - template similarity similarity ################

diff_sim = t_sim * sim
plt.matshow(diff_sim)
plt.title(" Template - Embedding similarity")

## Mean - template distance #############

mt_sim = _to_similarity(dist.pairwise(templates, cluster_means))

plt.matshow(mt_sim)
# pairwise(X, Y) has one row per X entry and one column per Y entry, and
# matshow draws rows along the y-axis: templates are on y, cluster means on x.
plt.xlabel("Cluster mean")
plt.ylabel("Templates")
plt.title("Template and mean cluster 1-distance (normalized) - Mt")


corrected_sim = np.abs(sim - mt_sim)
plt.matshow(corrected_sim)
plt.title("Difference between Sim and Mt - | Sim - mt_sim |")
    

# CALCULATE AUC 

In [None]:
from sklearn.metrics import auc

# Cut-off values 0.00, 0.05, ..., 0.95 at which the score is evaluated.
thresholds = [percent / 100 for percent in range(0, 100, 5)]

def get_score(corr, thresh):
    """Score a matrix by how few entries survive thresholding.

    Entries of ``corr`` below ``thresh`` are zeroed. With ``n = corr.shape[0]``
    and ``k`` the number of non-zero entries left, the result is
    ``max(0, n**2 - (k - n)) / n**2``.

    Args:
        corr: Array whose first dimension gives ``n``.
        thresh: Entries strictly below this value are discarded.

    Returns:
        The score as a float.
    """
    n_rows = corr.shape[0]
    n_cells = n_rows ** 2

    surviving = np.count_nonzero(np.where(corr < thresh, 0, corr))
    # One surviving entry per row is discounted before the count is used.
    beyond_rows = surviving - n_rows

    return max(0, n_cells - beyond_rows) / n_cells

# Evaluate the combined matrix at every threshold ...
accs = [get_score(comb, level) for level in thresholds]

# ... and report the area under the resulting score-vs-threshold curve.
print(f'computed AUC: {auc(thresholds,accs)}')

plt.plot(thresholds, accs)
plt.show()

In [12]:
%matplotlib qt
# Title is set on the current figure before visualize() draws the scatter onto it.
plt.title("YT")
visualize(
    main_res,
    main_labels,
    one_hot=False,
    use_pca=False,
    max_data=20000,
    perplexity=30,
)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 20000 samples in 0.052s...
[t-SNE] Computed neighbors for 20000 samples in 1.619s...
[t-SNE] Computed conditional probabilities for sample 1000 / 20000
[t-SNE] Computed conditional probabilities for sample 2000 / 20000
[t-SNE] Computed conditional probabilities for sample 3000 / 20000
[t-SNE] Computed conditional probabilities for sample 4000 / 20000
[t-SNE] Computed conditional probabilities for sample 5000 / 20000
[t-SNE] Computed conditional probabilities for sample 6000 / 20000
[t-SNE] Computed conditional probabilities for sample 7000 / 20000
[t-SNE] Computed conditional probabilities for sample 8000 / 20000
[t-SNE] Computed conditional probabilities for sample 9000 / 20000
[t-SNE] Computed conditional probabilities for sample 10000 / 20000
[t-SNE] Computed conditional probabilities for sample 11000 / 20000
[t-SNE] Computed conditional probabilities for sample 12000 / 20000
[t-SNE] Computed conditional probabilities for sam

# Generate embeddings for the templates

In [18]:

datasets_list = [
    "hybrid_static_siprobe_64C_600_S11",
    "hybrid_static_siprobe_64C_600_S12",
    "REC_32C_600S_31",
    "1103_1_1",
]
full_datasets = [os.path.join(DSET_PATH, name) for name in datasets_list]

# NOTE(review): these are the waveforms stored on `dset3` as-is; nothing in this
# cell passes them through `encoder`, although they are written to a
# "waveform_embeddings" directory -- confirm that this is intended.
template_res = dset3._waveforms

if GENERATE_TEMPLATES:
    # Template sets are matched to dataset names purely by position, so the
    # two sequences must describe the same datasets in the same order.
    if len(template_res) != len(datasets_list):
        print(
            f"Warning: {len(template_res)} template set(s) but "
            f"{len(datasets_list)} dataset names; sets are matched to names by position."
        )

    for i, tr in enumerate(template_res):
        # The output directory depends only on the dataset, so it is created
        # once per dataset rather than once per waveform.
        path = os.path.join("..", "supervised", "data", datasets_list[i], "waveform_embeddings")
        os.makedirs(path, exist_ok=True)

        for j, we in enumerate(tr):
            np.save(os.path.join(path, str(j) + ".npy"), we)
