In [None]:
import pandas as pd
import numpy as np
import numpy.linalg as la
import scipy as sp 

import matplotlib.pyplot as plt
import seaborn as sns

# Transform matrix experiments on interpretability

JMA 12 Aug 2024

In [None]:

# File dependencies: the parquet file below, addressed relative to this notebook.
MODEL_DATA_PATH = "../understanding_classifiers/model_data_top_1k_descriptors.parquet"
model_data_raw = pd.read_parquet(MODEL_DATA_PATH)
# reset_index() moves the stored index into ordinary column(s) and installs a fresh 0..n-1 index.
model_data = model_data_raw.reset_index()
model_data.head()

## The coefficient vectors create the transform matrix from the semantic embedding space to the interpretable concept space

Each row is one coefficient vector, for one of the label-terms, e.g. MeSH terms, that occupy the semantic embedding space. 

Stacking coefficient vectors in a matrix forms a transform from the embedding space to the concept space, whose dimensions are labeled by the label-terms.

In [None]:
# Stack the per-row `beta_unit_vector` entries into one 2-D array: one matrix row per table row.
coefficient_rows = [np.asarray(vector) for vector in model_data.beta_unit_vector]
concept_transforms = np.vstack(coefficient_rows)
# Row labels, kept in the same order as the rows of the matrix.
transform_names = model_data.name
row_count, column_count = concept_transforms.shape
f'output dimension: {row_count}, input dimension: {column_count}'

In [None]:
# Full singular value decomposition of the transform matrix.
# (The numpy and the scipy implementations return the same results.)
# full_matrices=True (the default) makes both u and vh square.
u, s, vh = np.linalg.svd(concept_transforms, full_matrices=True, compute_uv=True)
(u.shape, vh.shape)


In [None]:
# Wrap u in a DataFrame and label each row with the matching entry of transform_names.
left_singular_frame = pd.DataFrame(u)
u_df = left_singular_frame.set_index(transform_names)
u_df.head()

In [None]:
# Singular value spectrum, plotted in the order returned by the SVD.
fig, ax = plt.subplots()
ax.plot(s)
ax.set_title('singular values magnitudes versus rank.')


### All rows and columns are unit length, and rows and columns are orthogonal, for both matrices

In [None]:
# Pairwise correlation between the rows of u. DataFrame.corr() correlates columns,
# so the frame is transposed first to turn each row of u into a column.
# The frame holding u is `u_df`; no `u_pd` exists in this notebook.
sns.heatmap(u_df.T.corr())
plt.title('concept vector correlation heatmap')

In [None]:
# np.cov treats each row of its argument as one variable, so passing vh.T makes the
# columns of vh the variables being compared.
# NOTE(review): the title says "correlation" but np.cov computes covariance —
# confirm whether np.corrcoef was intended.
vh_column_cov = np.cov(vh.T)
sns.heatmap(vh_column_cov)
plt.title('concept vector correlation transpose heatmap')

In [None]:
# Check u for unit lengths along both axes.
# Every column norm (axis=0) and every row norm (axis=1) must lie within `unit_tol` of 1.
# The deviation is taken in absolute value and *all* vectors must pass, so a single
# non-unit row or column makes the corresponding flag False.
unit_tol = 1e-4
u_column_norms = la.norm(u, axis=0)
u_row_norms = la.norm(u, axis=1)
(
    bool(np.all(np.abs(u_column_norms - 1.0) < unit_tol)),
    bool(np.all(np.abs(u_row_norms - 1.0) < unit_tol)),
)


In [None]:
# Check v for unit lengths along both axes.
# Column norms of vh (axis=0) and row norms of vh (axis=1) must all lie within
# `unit_tol` of 1. Both axes of vh itself are measured; norming vh.T along axis 1
# would only repeat the column check.
unit_tol = 1e-4
vh_column_norms = la.norm(vh, axis=0)
vh_row_norms = la.norm(vh, axis=1)
(
    bool(np.all(np.abs(vh_column_norms - 1.0) < unit_tol)),
    bool(np.all(np.abs(vh_row_norms - 1.0) < unit_tol)),
)

### To use the SVD we constrain the rank of the singular values by just taking the upper left corner of s. 

In [None]:
# Reconstruct the transform from its SVD components.
# la.svd returns the singular values as a 1-D vector of length min(row_count, column_count).
# To multiply them back they are placed on the leading diagonal of a zero matrix of
# shape (row_count, column_count), so that u @ S @ vh has the shape of the original.
singular_count = len(s)
s_reconstruction = np.zeros([row_count, column_count])
# Slice by the number of singular values so this works whichever dimension is larger.
s_reconstruction[:singular_count, :singular_count] = np.diag(s)
sub_concept_transforms = u @ s_reconstruction @ vh

In [None]:
# Sanity check: does the product of the SVD factors give back the original matrix?
# np.allclose is True when every pair of elements agrees within its default tolerances.
reconstruction_matches = np.allclose(concept_transforms, sub_concept_transforms)
reconstruction_matches


In [None]:
# Create a limited-rank approximation to the transform matrix by keeping only the
# first `sub_rank` singular values (la.svd returns them in descending order) together
# with the matching columns of u and rows of vh.
RANK_REDUCTION = 600  # how many of the smallest singular values to discard
sub_rank = column_count - RANK_REDUCTION
# Fail early with a readable message instead of a shape error further down.
assert 0 < sub_rank <= len(s), f'sub_rank must be in 1..{len(s)}, got {sub_rank}'

# np.diag already builds the (sub_rank, sub_rank) diagonal matrix; no zero buffer is needed.
s_reconstruction = np.diag(s[:sub_rank])
sub_concept_transforms = u[:, :sub_rank] @ s_reconstruction @ vh[:sub_rank, :]
sub_concept_transforms.shape


In [None]:
# How good is the low rank approximation?
# Relative error: norm of the residual divided by the norm of the original matrix
# (la.norm defaults to the Frobenius norm for a 2-D array).
residual = concept_transforms - sub_concept_transforms
la.norm(residual) / la.norm(concept_transforms)

In [None]:

# Randomly pick `column_count` distinct row indexes for the transform; every other row
# index is held out. A seeded generator makes the split reproducible across kernel restarts.
SPLIT_SEED = 0
split_rng = np.random.default_rng(SPLIT_SEED)
transfer_sample_indexes = np.sort(split_rng.choice(row_count, size=column_count, replace=False))
sampled_index_list = transfer_sample_indexes.tolist()
# Complement against range(row_count) so the split stays exhaustive for any table size.
# We lose the association with concept names, sigh.
hold_out_indexes = set(range(row_count)).difference(sampled_index_list)
# Check that the two sets are mutually exclusive & exhaustive: expect (row_count, set()).
len(transfer_sample_indexes) + len(hold_out_indexes), hold_out_indexes.intersection(sampled_index_list)

### Distances are not preserved from embedding to concept space by the transform

Make pair-wise comparisons between vectors in the hold out set and compare distances
before transform and after transform.

Look at the names of the hold out vectors, to see if distances between concept vectors
are more meaningful than among embedding vectors. 
hold_out_names = transform_names[list(hold_out_indexes)]  # hold_out_indexes is a set, so names follow its iteration order, which is not guaranteed to be sorted.

In [None]:
# for pairs of held-out vectors, compute their cosine distances in embedding and concept spaces

# Shifted cosine distance.
def cos_d(v1, v2):
    """Return SciPy's cosine distance between v1 and v2, shifted down by 1.

    SciPy defines the cosine distance as 1 - cos(angle between v1 and v2), so the
    value returned here equals -cos(angle): -1 for parallel vectors, 0 for
    orthogonal vectors and +1 for opposite vectors.

    Parameters
    ----------
    v1, v2 : 1-D array-likes of equal length.

    Returns
    -------
    float in [-1, 1]; larger means further apart.
    """
    # Import the function explicitly: `import scipy as sp` alone does not guarantee
    # that the `sp.spatial` submodule is loaded on every SciPy version.
    from scipy.spatial.distance import cosine

    return cosine(v1, v2) - 1

# Rows picked by transfer_sample_indexes make up the transform matrix;
# rows listed in hold_out_indexes become the hold-out vectors.
hold_out_rows = list(hold_out_indexes)
transform = concept_transforms[transfer_sample_indexes]
hold_out_vectors = concept_transforms[hold_out_rows]
    

def before_after_distance(v1, v2, transform_matrix):
    """Compare how the transform changes the cosine distance.

    Returns a (before, after) pair: cos_d of the two vectors as given, then cos_d
    of the same two vectors after each is left-multiplied by transform_matrix.
    """
    distance_before = cos_d(v1, v2)
    images = (transform_matrix @ v1, transform_matrix @ v2)
    distance_after = cos_d(*images)
    return distance_before, distance_after




In [None]:
def _adjacent_pair_distances(vectors, transform_matrix):
    """Before/after distances for every consecutive pair of rows in `vectors`.

    Returns an (n - 1, 2) float array for n input rows: column 0 holds the distance
    between rows k and k+1 as given, column 1 the distance after both rows are
    mapped through `transform_matrix`.
    """
    consecutive_pairs = zip(vectors[:-1], vectors[1:])
    distances = [
        before_after_distance(first, second, transform_matrix)
        for first, second in consecutive_pairs
    ]
    # reshape keeps a (0, 2) shape when there are fewer than two vectors.
    return np.array(distances, dtype=float).reshape(-1, 2)


# Same vector pairs, two transforms: the sampled transform and the low-rank approximation.
ba_distances = _adjacent_pair_distances(hold_out_vectors, transform)
ba_low_dim_distances = _adjacent_pair_distances(hold_out_vectors, sub_concept_transforms)

fig, ax = plt.subplots()
ax.plot(ba_distances[:, 0], ba_distances[:, 1], 'o', label='full transform')
ax.plot(ba_low_dim_distances[:, 0], ba_low_dim_distances[:, 1], 'ro', label='low rank transform')
# Grey diagonal: a point on it has the same distance before and after the transform.
ax.plot([-1, 1], [-1, 1], color='grey')
ax.set_xlabel('distance before transform')
ax.set_ylabel('distance after transform')
ax.set_title('Transformed distances, Blue-full transform, Red-Low rank transform')
ax.legend()

### Conjecture - the concept vectors have lower entropy in the concept space. 

Create a hold-out set of coefficient vectors to test the transfer function.  Use them as test vectors in the embedding space.  

In [None]:
import scipy.stats as ss


def _sorted_scaled_entropy(magnitudes):
    """Entropy of `magnitudes` via ss.entropy, rescaled and sorted ascending.

    ss.entropy is called with its defaults, and its result is divided by the entropy
    of a uniform distribution over len(magnitudes) entries.
    """
    uniform_entropy = ss.entropy(np.ones([len(magnitudes)]))
    return np.sort(ss.entropy(magnitudes) / uniform_entropy)


# Map every hold-out vector through the transform; one transformed vector per row.
tranformed_hold_out_vectors = (transform @ hold_out_vectors.T).T

# NOTE(review): ss.entropy defaults to axis=0, i.e. one entropy per column (taken
# across the hold-out vectors), not one per vector — confirm this is the intended axis.
# NOTE(review): xe and xe1 are sorted independently, so the scatter below compares
# the two sorted distributions rather than matched before/after pairs.
x = np.abs(hold_out_vectors)
xe = _sorted_scaled_entropy(x)
print('mean: ',np.mean(xe))

x1 = np.abs(tranformed_hold_out_vectors)
xe1 = _sorted_scaled_entropy(x1)
print('mean: ',np.mean(xe1))

plt.plot(xe, xe1, 'og')
# Grey reference diagonal over a hand-picked range.
plt.plot([0.92, 0.96], [0.92,0.96], color='grey')

