In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import StandardScaler
import plotly.graph_objects as go
import os
from pprint import pprint
import tensorflow as tf

# Move to the parent directory so the `utils` package imported below resolves
# from the working directory. A bare relative chdir is not idempotent: every
# re-run of this cell would climb one level further and break both the imports
# and the relative run paths used later. Only move when `utils` is not already
# reachable from the current working directory.
if not os.path.isdir('utils'):
    os.chdir('../')
from utils.custom_utils import read_from_pickle, save_to_pickle, create_nn
from utils.nn_utils import read_params_dict, load_trained_weights
from utils.plot_utils import print_output_log, plot_pearson_corr

2022-12-14 17:25:01.184771: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


### Read pickle files 

In [None]:
# Inspect the list of per-network parameter dictionaries pickled for this run.
save_dir = '../runs/CD_10_nodes_layer_and_sample_dropout_nested'
#file1 = read_from_pickle('X_type_list.pkl', save_dir)
#file2 = read_from_pickle('X_dim_list.pkl', save_dir)
file3 = read_from_pickle('list_of_nn_params_dict.pkl', save_dir)
# `pprint` is imported as the *function* (`from pprint import pprint`), so
# `pprint.PrettyPrinter` does not exist and raised AttributeError. The function
# accepts the same `indent` option directly.
pprint(file3, indent=4)

In [None]:
# Load the pickled scaler list of the run, then show the list itself, its
# length, and the fitted statistics stored on the first scaler.
file = read_from_pickle('scalers_list.pkl', save_dir)
print(file)
print(len(file))
for fitted_attr in ('scale_', 'mean_', 'var_'):
    print(getattr(file[0], fitted_attr))

### Plot PCA 3D space

In [9]:
# Project the saved latents of one trained network onto their first three
# principal components, pickle the projection, print the PCA statistics and
# show an interactive 3D scatter plot (optionally coloured by the first label).

# Select the model directory
run_dir = '../runs/CD_10_nodes_layer_and_sample_dropout_nested'
nn_index = 1

params_dict_list = read_params_dict(run_dir, nn_index)

nn_save_dir = params_dict_list[0]
nn_params_dict = params_dict_list[1]
nn_dataset_dict = params_dict_list[4]

latents = read_from_pickle('nn_latents.pkl', nn_save_dir)

# makedirs creates both <nn_save_dir>/plots and its pca_3D sub-directory and is
# a no-op when they already exist, so the cell can be re-run safely.
plots_dir = nn_save_dir + '/plots'
figure_dir = plots_dir + '/pca_3D'
os.makedirs(figure_dir, exist_ok=True)

# Plot for 3D PCA is only available to view through jupyter notebook
pca_3D = PCA(n_components=3)
latents_transformed_to_3D = pca_3D.fit_transform(latents)

save_to_pickle(latents_transformed_to_3D, 'PCA_3D_data.pkl', figure_dir)
print(' --> Transformed points to 3D')
print(' --> PCA 3D Stats : \n')
# Principal axes in feature space representing direction of max variance
print(f' --> Components : {pca_3D.components_} \n')
print(f' --> Explained variance : {pca_3D.explained_variance_} \n')
print(f' --> Explained variance ratio : {pca_3D.explained_variance_ratio_} \n')
print(f' --> Per feature empirical mean : {pca_3D.mean_} \n')

if nn_params_dict['add_supervision_on_latent']:
    # We only use the first label to color code points in the latent
    y = np.squeeze(read_from_pickle('nn_y_pred.pkl', nn_save_dir)[0].numpy())

    # Check if any preprocessing done on the labels
    if nn_dataset_dict['y_preprocess_scheme']:
        y_preprocessors = params_dict_list[8]
        y_encoder_list = y_preprocessors[0]
        y_scaler_list = y_preprocessors[1]

        y_descr = params_dict_list[7]
        y_dtype_list = y_descr[0]
        y_dim_list = y_descr[1]

        # Undo the label preprocessing so the colour bar shows original values
        if y_dtype_list[0] == 'cat':
            c = np.squeeze(y_encoder_list[0].inverse_transform(y.reshape(-1,1)))
        else:
            c = np.squeeze(y_scaler_list[0].inverse_transform(y.reshape(-1,1)))
    else:
        c = y

    cmin = np.min(c)
    cmax = np.max(c)
    colorscale = 'viridis'
else:
    # plotly does not understand matplotlib's single-letter colour codes ('k'),
    # it needs a CSS colour name / hex / rgb string.
    c = 'black'
    cmin = None
    cmax = None
    colorscale = None

fig_pca_3d = go.Figure(data=[go.Scatter3d(x=latents_transformed_to_3D[:,0],
                                          y=latents_transformed_to_3D[:,1],
                                          z=latents_transformed_to_3D[:,2],
                                          mode='markers',
                                          marker=dict(size=4,
                                                      color=c,
                                                      # was hard-coded to None, which discarded
                                                      # the colour scale chosen above
                                                      colorscale=colorscale,
                                                      cmin=cmin,
                                                      cmax=cmax,
                                                      showscale=True,
                                                      opacity=1))])

fig_pca_3d.show()

 --> Transformed points to 3D
 --> PCA 3D Stats : 

 --> Components : [[ 0.09375285  0.97307974  0.21053849]
 [-0.19244777 -0.18976898  0.9627832 ]
 [ 0.97681844 -0.13078137  0.16947564]] 

 --> Explained variance : [1.8206244 0.9867056 0.4121323] 

 --> Explained variance ratio : [0.56550574 0.3064815  0.12801278] 

 --> Per feature empirical mean : [0.8301959 1.1373472 1.5102644] 



In [None]:
# Sanity check: are the PCA-projected latent components linearly decorrelated?
# Shows the pairwise correlation matrix (pandas' default `corr`) as a heatmap.

import pandas as pd
import seaborn as sns

data = pd.DataFrame(latents_transformed_to_3D)
sns.heatmap(data=data.corr(), cmap='bwr', annot=True)

In [None]:
# Calculating spearman correlation coefficient
from scipy.stats import spearmanr

# With a 2-D input holding more than two columns, spearmanr returns one square
# matrix of coefficients and one of p-values (columns are the variables).
rho, pvalue = spearmanr(latents_transformed_to_3D)

# The cell used to plot only the p-value matrix although it is labelled as the
# correlation coefficient, and `rho` was never shown. Plot both, each titled.
# NOTE: relies on `sns` imported by the previous heatmap cell, as before.
fig_spearman, (ax_rho, ax_pvalue) = plt.subplots(1, 2, figsize=(12, 5))
sns.heatmap(rho, annot=True, cmap='bwr', vmin=-1, vmax=1, ax=ax_rho)
ax_rho.set_title('Spearman correlation coefficient')
sns.heatmap(pvalue, annot=True, cmap='bwr', ax=ax_pvalue)
ax_pvalue.set_title('Spearman p-value')

In [None]:
# Calculating mutual information
# Import the necessary libraries
from sklearn.metrics import mutual_info_score
from scipy.stats import entropy, pearsonr

# Both estimators operate on *discrete* distributions:
#  * mutual_info_score treats its inputs as cluster labels, so feeding raw
#    floats makes (almost) every sample its own class and the score meaningless;
#  * scipy's entropy interprets its argument as (unnormalised) probabilities,
#    which the PCA coordinates are not (they can be negative).
# Bin the continuous components first and work on the resulting counts.
MI_N_BINS = 20  # histogram resolution used for both estimates

# Joint histogram of component 0 vs component 2 -> contingency table
joint_counts = np.histogram2d(latents_transformed_to_3D[:,0],
                              latents_transformed_to_3D[:,2],
                              bins=MI_N_BINS)[0]

# Calculate the mutual information using sklearn (labels are ignored when a
# contingency table is supplied)
mi_sklearn = mutual_info_score(None, None, contingency=joint_counts)

# Histogram of component 1 -> empirical distribution for the entropy estimate
component_1_counts = np.histogram(latents_transformed_to_3D[:,1], bins=MI_N_BINS)[0]

print(mi_sklearn)
print(entropy(component_1_counts))

In [None]:
import dcor

# dcor's distance correlation between the first two columns of the PCA projection.
first_component, second_component = (latents_transformed_to_3D[:,0],
                                     latents_transformed_to_3D[:,1])
hgmcorr = dcor.distance_correlation(first_component, second_component)

print(hgmcorr)

### Plot 3D latent space

In [8]:
# Show the raw (un-projected) latent space of one trained network as an
# interactive 3D scatter plot. Only valid when the latent space has exactly
# three dimensions; points are optionally coloured by the first label.

# Select the model directory
run_dir = '../runs/CD_10_nodes_layer_and_sample_dropout_nested'
nn_index = 1

params_dict_list = read_params_dict(run_dir, nn_index)

nn_save_dir = params_dict_list[0]
nn_params_dict = params_dict_list[1]
nn_dataset_dict = params_dict_list[4]

# Synthetic stand-in for `latents`, kept for reference (it only has two
# columns, so the 3-D assertion below would reject it as written):
# x = np.arange(0, 100, 1)
# y = np.sin(x)
# #z = np.cos(x)
# #w = np.tan(x)
# latents = np.transpose(np.vstack((x,y)))

latents = read_from_pickle('nn_latents.pkl', nn_save_dir)

assert latents.shape[1] == 3, f'Latent space dimension {latents.shape[1]} not equal to 3. Use 3D/2D PCA to visualize'

# makedirs creates both <nn_save_dir>/plots and its latents_3D sub-directory and
# is a no-op when they already exist, so the cell can be re-run safely.
plots_dir = nn_save_dir + '/plots'
figure_dir = plots_dir + '/latents_3D'
os.makedirs(figure_dir, exist_ok=True)

if nn_params_dict['add_supervision_on_latent']:
    # We only use the first label to color code points in the latent
    y = np.squeeze(read_from_pickle('nn_y_pred.pkl', nn_save_dir)[0].numpy())

    # Check if any preprocessing done on the labels
    if nn_dataset_dict['y_preprocess_scheme']:
        y_preprocessors = params_dict_list[8]
        y_encoder_list = y_preprocessors[0]
        y_scaler_list = y_preprocessors[1]

        y_descr = params_dict_list[7]
        y_dtype_list = y_descr[0]
        y_dim_list = y_descr[1]

        # Undo the label preprocessing so the colour bar shows original values
        if y_dtype_list[0] == 'cat':
            c = np.squeeze(y_encoder_list[0].inverse_transform(y.reshape(-1,1)))
        else:
            c = np.squeeze(y_scaler_list[0].inverse_transform(y.reshape(-1,1)))
    else:
        c = y

    cmin = np.min(c)
    cmax = np.max(c)
    colorscale = 'viridis'
else:
    # plotly does not understand matplotlib's single-letter colour codes ('k'),
    # it needs a CSS colour name / hex / rgb string.
    c = 'black'
    cmin = None
    cmax = None
    colorscale = None

fig_pca_3d = go.Figure(data=[go.Scatter3d(x=latents[:,0],
                                          y=latents[:,1],
                                          z=latents[:,2],
                                          mode='markers',
                                          marker=dict(size=4,
                                                      color=c,
                                                      colorscale=colorscale,
                                                      cmin=cmin,
                                                      cmax=cmax,
                                                      showscale=True,
                                                      opacity=1))])

fig_pca_3d.show()

### Sample from 3D latent/PCA space

In [None]:
# Select a point from either 3D latent space or 3D PCA space
x = 2.84
y = 0.99
z = -0.53
sampled_latent_point = np.array([[x, y, z]])
run_dir = '../../runs/carbondots_single_ae_nodes_10_no_latent_dropout'
nn_index = 0
global_seed = 4004
strategy = '3D' # Other strategies would be '3D', 'slerp'... 

In [None]:
# Rebuild the selected network, restore its latest checkpoint, push the sampled
# point through the decoder (and the label head when the network has one) and
# map every output back to its original scale/encoding.
#
# Fills `transformed_reconstructions` and `transformed_predictions`.

# Load the params dictionary
params_dict_list = read_params_dict(run_dir, nn_index)
nn = create_nn(params_dict_list[0], global_seed, params_dict_list[1], params_dict_list[2],
               params_dict_list[3], params_dict_list[5], params_dict_list[7])

# Take the save directory from *this* run's params. The cell previously reused
# whatever `nn_save_dir` an earlier plotting cell had left behind, i.e. it could
# restore weights that belong to a different run.
nn_save_dir = params_dict_list[0]
nn_params_dict = params_dict_list[1]

checkpoint = tf.train.Checkpoint(nn)
latest_checkpoint_path = tf.train.latest_checkpoint(nn_save_dir + '/checkpoints')
print(latest_checkpoint_path)
if latest_checkpoint_path is None:
    # restore(None) would silently keep the freshly initialised weights
    raise FileNotFoundError(f"No checkpoint found in {nn_save_dir + '/checkpoints'}")
checkpoint.restore(latest_checkpoint_path).expect_partial()

# ('slerp' is not implemented; the former `if strategy == 'slerp': pass` branch
# could never be reached past this assertion and was removed.)
assert strategy == 'PCA_3D' or strategy == '3D', 'Sampling strategy not supported'

if strategy == 'PCA_3D':
    assert sampled_latent_point.shape[1] == 3, \
        f'Sampled latent point must have 3 dimensions, got {sampled_latent_point.shape[1]}'
    # Map the PCA coordinates back to the full latent space. `pca_3D` is the
    # PCA object fitted in the "Plot PCA 3D space" cell, which must have been
    # run (on the same network) beforehand. reshape(1, -1) keeps whatever
    # latent size the PCA was fitted on instead of assuming 10.
    latent = pca_3D.inverse_transform(sampled_latent_point).reshape(1, -1)
else:
    # '3D': the sampled point already lives in the latent space
    latent = sampled_latent_point
print(f' --> Latent Coordinate : {latent}')

# Feed the *resolved* latent to the network. Before, the PCA-inverted point was
# computed but the raw sampled point was decoded regardless of the strategy.
y_pred = None
if nn_params_dict['add_supervision_on_latent']:
    y_pred = nn.y_pred_net(latent)
X_hat = nn.decoder(latent)

X_descr = params_dict_list[5]
X_preprocessors = params_dict_list[6]
y_descr = params_dict_list[7]
y_preprocessors = params_dict_list[8]

# Decode each back into original output
X_dtype_list = X_descr[0]
y_dtype_list = y_descr[0]

transformed_reconstructions = []
transformed_predictions = []
num_count = 0
cat_count = 0

print(X_dtype_list)

# Index 1 of the preprocessor lists is used for 'num' outputs and index 0 for
# every other dtype; num_count / cat_count walk through them in order.
for i, X_dtype in enumerate(X_dtype_list):
    if X_dtype == 'num':
        reconsts = X_hat[i][0].numpy()
        # one preprocessor per element of this numeric output
        for reconst in reconsts:
            transformed_reconstructions.append(X_preprocessors[1][num_count].inverse_transform(reconst.reshape(1,-1)))
            num_count += 1

        print(num_count)
    else:
        reconst = X_hat[i][0].numpy()
        print(reconst)
        transformed_reconstructions.append(X_preprocessors[0][cat_count].inverse_transform(reconst.reshape(1,-1)))
        cat_count += 1

num_count = 0
cat_count = 0
# Predictions only exist for networks trained with supervision on the latent;
# without this guard the loop raised NameError on the undefined `y_pred`.
if y_pred is not None:
    for j, y_dtype in enumerate(y_dtype_list):
        if y_dtype == 'num':
            transformed_predictions.append(y_preprocessors[1][num_count].inverse_transform(y_pred[j].numpy()))
            num_count += 1

In [None]:
# Pretty-print the inverse-transformed predictions, then a blank separator,
# then the inverse-transformed reconstructions.
for group_position, decoded_group in enumerate((transformed_predictions, transformed_reconstructions)):
    if group_position:
        print('\n')
    for decoded_item in decoded_group:
        pprint(decoded_item)

In [None]:
# Build the network object for the chosen run and load that run's saved latents.
run_dir = '../../runs/carbondots_single_ae_nodes_10_no_latent_dropout'
nn_index = 0
global_seed = 4004

params_dict_list = read_params_dict(run_dir, nn_index)
# create_nn receives, in order: entry 0, the seed, then entries 1, 2, 3, 5 and 7
nn = create_nn(params_dict_list[0], global_seed,
               *(params_dict_list[entry] for entry in (1, 2, 3, 5, 7)))

# NOTE(review): the weight loading below is disabled, so `nn` presumably keeps
# whatever weights create_nn gives it - confirm before interpreting its weights.
#load_trained_weights(params_dict_list[0], nn)

latents = read_from_pickle('nn_latents.pkl', params_dict_list[0])

In [None]:
import seaborn as sns

# Correlation between latent dimensions. `latents` is laid out as
# (samples, latent dimensions) - the plotting cells index its columns as
# dimensions - whereas np.corrcoef treats each *row* as a variable by default.
# Without rowvar=False this built a samples x samples matrix instead.
sns.heatmap(np.corrcoef(latents, rowvar=False), annot=True, cmap='bwr')

In [None]:
# Grab the array at index 2 of the network's weight list and report the shape
# of the whole array and of its first column.
network_weights = nn.get_weights()
layer2_weights = network_weights[2]
for weight_shape in (layer2_weights.shape, layer2_weights[:,0].shape):
    print(weight_shape)

In [None]:
import seaborn as sns

# Print the weights feeding one node as a rounded column vector, then show the
# whole weight matrix as a heatmap.
node = 0
# reshape(-1, 1) adapts to the layer's input width; the former reshape(20, 1)
# failed for any layer that does not have exactly 20 inputs.
print(np.round(layer2_weights[:, node].reshape(-1, 1), 2))
sns.heatmap(layer2_weights)

In [None]:
# Scratch check: shape of a 1-D integer array and scalar subtraction on it.
a = np.array((1, 2, 3))
shifted = a - 1
print(a.shape)
print(shifted)