<a href="https://colab.research.google.com/github/ryan-hayden16/Projects/blob/main/bicluster_algorithm_9_16_(working_EM_code).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Preliminary steps

In [None]:
# imports and installs
!pip install scanpy # tools for scRNA-seq analysis
!pip install matplotlib==3.1.3 # current version produces error w/ scanpy
!pip install sklearn # tools for general data analysis

import pandas as pd
import numpy as np
import scanpy as sc
import sklearn
import matplotlib.pyplot as plt
import math
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from scipy.stats import poisson
import copy


!pip install matplotlib==3.1.3 # reinstall to force old package version

Load data (and metadata, if available)

Quality control of raw data (optional)

Extract count matrix from raw data


Define known variables, ie: C,G,K,L

# how to simulate data

step 1: set pi and rho (to be something like uniform, ie: pi=(1/3, 1/3, 1/3))
then use pi and rho to generate Z and W according to pi and rho
ie: Z ~ multinoulli(1,pi) , etc..
then convert Z_i to a classification row in S
same for T
#arbitrarily fix: S, T ( ?? its set from procedure in above line)
set mu to have some cluster structure: ie: diagonal large, everything else small
ie; K=3, L=4, 1's on diagonal, see screenshot, in each row and column want one value to dominate
#then set pi and rho (?? set in step 1 above)
then generate X given S,T, mu
then try to recover S and T (ie: estimated S and T) using my code on generated X
then convert estimated S and T back into (estimated) Z and W
then you can adjusted rand index to compare accuracy of estimated Z,W with true Z and W from step 1 above 


#this is simulating using our model

In [18]:
# generate fake count data (0 to 100 counts) for testing code
import random

C=20
G=25
K=5
L=3
X=np.zeros((C,G))
for c in range(C):
  for g in range(G):
    X[c,g]=random.randint(0,10)

In [19]:
X

array([[ 2.,  1.,  5.,  1.,  9.,  4.,  9.,  2.,  2.,  4., 10.,  9., 10.,
         8.,  4.,  3.,  9.,  6.,  8.,  0.,  8.,  6.,  9.,  9.,  1.],
       [ 3.,  6.,  1.,  1.,  5.,  3.,  8., 10.,  6.,  0.,  1.,  3.,  0.,
         8., 10.,  6.,  0.,  2.,  8.,  3.,  1.,  1.,  0., 10.,  8.],
       [ 3.,  4.,  6., 10.,  7.,  9.,  3.,  7., 10.,  4.,  4.,  9.,  7.,
         7.,  5., 10.,  5.,  5.,  2.,  1.,  9.,  9.,  0.,  8.,  4.],
       [ 2.,  0.,  2.,  8.,  8.,  3., 10., 10.,  2.,  8.,  8.,  2.,  3.,
         0., 10.,  8.,  7.,  8.,  1.,  6.,  8.,  6.,  9.,  7.,  1.],
       [ 0., 10.,  2.,  7.,  3.,  7.,  4.,  6.,  6.,  5.,  5.,  8.,  4.,
        10.,  9., 10.,  4.,  8.,  5., 10.,  3.,  8.,  6.,  8.,  0.],
       [ 9.,  4.,  5.,  8.,  9.,  0., 10., 10.,  4.,  9.,  1.,  5.,  5.,
         9.,  3.,  8.,  8.,  1.,  2.,  3.,  7.,  9.,  0.,  2.,  3.],
       [ 0.,  0., 10.,  8., 10.,  1.,  7.,  4.,  1.,  0., 10., 10.,  2.,
        10., 10.,  1.,  5.,  1., 10.,  7.,  7.,  2.,  8.,  8.,  1.],
      

Initialize S and T variables

In [20]:
# initialize S
import scipy as sp
from numpy import linalg as LA
from scipy.linalg import sqrtm
from numpy.linalg import inv
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans


A=np.dot(X,np.transpose(X)) #step 1 (affinity matrix, simple version)

D=np.zeros((C,C))
for i in range(C):
  D[i,i] = sum(A[i,:]) #step 2.a (graph laplacian)

E=np.zeros((C,C)) # now take square root of diagonal matrices manually (see below)

for i in range(C):
  E[i,i] = math.sqrt(D[i,i]) # done manually to avoid possible errors with sqrtm function

# note that np.dot(E,E)==D returns a few falses, possibly because of numerical error? should be identical (this is still true even after manually taking square roots)

F=np.zeros((C,C))
for i in range(C):
  F[i,i] = (1/E[i,i]) # just take inverse of diagonal elements to avoid matrix operation related errors

H=np.dot(F,np.dot(A,F)) #step 2.b (graph laplacian)

# note that H==np.transpose(H) returns some falses, ie: not symmetric yet we get all real eigenvalues here

w, v = LA.eig(H) #step 3 (find K largest (orthogonal) eigenvectors of H and form a CxK matrix with them) and normalize the matrix
ordered_eigval=np.argsort(w) # returns indexes of ordered (small to large) of w (eigenvalue list)
k_large_eigval = ordered_eigval[-K:] # returns indexes of K largest eigvals
k_large_eigvec=np.transpose(v[k_large_eigval]) # returns corresponding K largest eigvecs, as a CxK ndarray
complex_k_large_eigvec=np.array(k_large_eigvec, dtype = 'complex_') # convert to complex valued matrix? does not fix error
e=LA.norm(complex_k_large_eigvec, axis=1) #
for n in range(C):
  complex_k_large_eigvec[n,:]=complex_k_large_eigvec[n,:]/e[n]
Y=complex_k_large_eigvec
B=Y.real 


#step 5 (treat each of the C rows of the matrix as a K-dim vector and cluster into K-clusters, via K-means)
kmeans = KMeans(n_clusters=K, random_state=0).fit(B)
cluster_labels = kmeans.labels_

#step 6 (assign/label each of the C cells into the corresponding cluster (ie: labels are 1,2,...,K) from step 5)
S=np.zeros((C,K))
for i in range(C):
  for j in range(K):
    if cluster_labels[i]==j:
      S[i,j]=1
#(each cell now has a label from 1,...,K, so we can then form the CxK classification matrix (ie: matrix S) that we want)


# this process gives us S_0

# THIS CODE IS FINISHED AND HAS BEEN CHECKED TO BE WORKING PROPERLY

In [21]:
S

array([[1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.]])

In [22]:
# initialize T

T=np.zeros((G,K,L))

for i in range(K):
  XT=X
  XT=np.delete(XT,np.where(cluster_labels!=i),0) #now XT should only contain rows with label i
  A=np.dot(np.transpose(XT),XT) #step 1 (affinity matrix, simple version)
  D=np.zeros((G,G))
  for l in range(G):
    D[l,l] = sum(A[:,l]) #step 2.a (graph laplacian)
  E=np.zeros((G,G)) # now take square root of diagonal matrices manually (see below)
  for l in range(G):
    E[l,l] = math.sqrt(D[l,l])
  F=inv(E) # read above, do element wise as opposed to matrix wise operation
  F=np.zeros((G,G))
  for l in range(G):
    F[l,l] = (1/E[l,l]) # just take inverse of diagonal elements to avoid matrix operation related errors
  H=np.dot(F,np.dot(A,F)) 

  # maybe force H to be symmetric

# H should be symmetric, so shouldnt even have complex eigenvectors

  w, v = LA.eig(H) #step 3 (find K largest (orthogonal) eigenvectors of H and form a GxK matrix with them) and normalize the matrix
  ordered_eigval=np.argsort(w) # returns indexes of ordered (small to large) of w (eigenvalue list)
  k_large_eigval = ordered_eigval[-K:] # returns indexes of K largest eigvals
  k_large_eigvec=np.transpose(v[k_large_eigval]) # returns corresponding K largest eigvecs, as a GxK ndarray
  complex_k_large_eigvec=np.array(k_large_eigvec, dtype = 'complex_') # convert to complex valued matrix? does not fix error
  e=LA.norm(complex_k_large_eigvec, axis=1) #
  for n in range(C):
    complex_k_large_eigvec[n,:]=complex_k_large_eigvec[n,:]/e[n]
  Y=complex_k_large_eigvec
  B=Y.real # possible solution: just consider real part (do this in S initialization as well)
  # whatever solution is, apply to S initialization as well in case its eigenvalues are complex
  #try dividing each vector in k_large_eigvec by the corresponding value in LA.norm(k_large_eigvec), ie manually scale to unit vector
  # Y = normalize(complex_k_large_eigvec, axis=1, norm='l2') #normalize rows of the GxK matrix (CAUSES ERROR DUE TO COMPLEX NUMBERS)
  #step 5 (treat each of the C rows of the matrix as a K-dim vector and cluster into K-clusters, via K-means)
  #above solution seemed to work, but no error is in k-means

# ASK YUNPENG HOW TO DEAL WITH K-MEANS IF VECTORS ARE COMPLEX VALUED ??? MAYBE JUST DROP IMAGINARY PART? OR TAKE MAGNITUDE? ITS ONLY AN INITIAL GUESS FOR S

  kmeans = KMeans(n_clusters=L, random_state=0).fit(B)
  Tcluster_labels = kmeans.labels_
  #step 6 (assign/label each of the C cells into the corresponding cluster (ie: labels are 1,2,...,K) from step 5)
  for g in range(G):
    for l in range(L):
      if Tcluster_labels[g]==l:
        T[g,i,l]=1

# ALSO MAKE SURE TO SWAP TRANSPOSE ORDER SO ITS NOW GxG AFFINITY MATRIX, AND SWAP WHEREVER ELSE NECCESARY, IE NOW K-MEANS ON COLUMNS AND NORMALIZE COLUMNS, ETC...


# why is T initialzation code causing error when G is smaller then C ????

In [23]:
T

array([[[1., 0., 0.],
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.],
        [1., 0., 0.]],

       [[0., 0., 1.],
        [0., 1., 0.],
        [1., 0., 0.],
        [0., 0., 1.],
        [0., 0., 1.]],

       [[0., 0., 1.],
        [0., 1., 0.],
        [1., 0., 0.],
        [0., 0., 1.],
        [0., 0., 1.]],

       [[0., 0., 1.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        [0., 0., 1.]],

       [[0., 0., 1.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [1., 0., 0.]],

       [[1., 0., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [1., 0., 0.],
        [1., 0., 0.]],

       [[0., 0., 1.],
        [0., 0., 1.],
        [0., 1., 0.],
        [0., 1., 0.],
        [1., 0., 0.]],

       [[0., 1., 0.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.]],

       [[1., 0., 0.],
        [0., 0., 1.],
        [0., 1., 0.],
        [1., 0., 0.],
        [0., 1.,

#now code seems to work, but clean it up, make sure no logic errors, and change S initialization code so its also able to deal with complex #'s

Initialize theta paramaters

In [24]:
# using S_0 and T_0 and X, we can directly calculate rho_0, pi_0, and mu_0

#rho: use yunpeng line (12) and S_0
rho=np.zeros(K)
for k in range(K):
  rho[k]=(1/C)*sum(S[:,k])

#pi: use yunpeng line (13) and T_0
pi=np.zeros((K,L))
for k in range(K):
  for l in range(L):
    pi[k,l]=(1/G)*sum(T[:,k,l])


#mu: use yunpeng line (14) and S_0, T_0, X 
mu=np.zeros((K,L))
for k in range(K):
  for l in range(L):
    numer=0
    denom=0
    for c in range(C):
      for g in range(G):
        numer=numer+(S[c,k]*T[g,k,l]*X[c,g])
        denom=denom+(S[c,k]*T[g,k,l])
    mu[k,l]=numer/denom

# double check logic of mu loop

In [25]:
rho

array([0.25, 0.15, 0.2 , 0.15, 0.25])

In [26]:
pi

array([[0.28, 0.36, 0.36],
       [0.6 , 0.24, 0.16],
       [0.52, 0.36, 0.12],
       [0.36, 0.2 , 0.44],
       [0.6 , 0.28, 0.12]])

In [27]:
mu

array([[4.91428571, 5.62222222, 4.62222222],
       [5.2       , 4.        , 6.5       ],
       [5.48076923, 5.91666667, 5.41666667],
       [4.48148148, 6.93333333, 5.48484848],
       [5.46666667, 4.88571429, 4.13333333]])

In [28]:
S_initial=copy.deepcopy(S)
T_initial=copy.deepcopy(T)
rho_initial=copy.deepcopy(rho)
pi_initial=copy.deepcopy(pi)
mu_initial=copy.deepcopy(mu)

In [12]:
S_initial=S
T_initial=T

In [16]:
y=1
x=y
x+1
y

1

Iterative maximization scheme

In [29]:
# EM algorithm

S_old = 0 # initalize S and T
S_new = S
# Maybe change copy S,T code
T_old = 0
T_new = T

epsilon_1 = .001 # convergence threshold (what is good epsilon???)
epsilon_2 = .001 

# STEP 0 outer EM loop to update S,T, and theta
while ((np.amax(np.abs(np.subtract(S_new,S_old)))>epsilon_1) and (np.amax(np.abs(np.subtract(T_new,T_old)))>epsilon_2)):


  # STEP 1 inner EM loop to update S, rho, mu
  while ((np.amax(np.abs(np.subtract(S_new,S_old)))>epsilon_1)):

    #S update loop

    #S_old=S_new # save copy of previous S update
    S_old=copy.deepcopy(S_new)
    # this makes S_old point to S_new so, after S_new updated, S_old is too, hence this loop only runs once
    # same issue fo T
    # does deepcopy code work?
    # maybe put counter in to count number of while loops




    for c in range(C):
      k_numers=np.zeros(K) # save length k vector of numer values for each k value
      denom=0 # denom is sum of terms

      for k in range(K):
        numer=0 # numer is sum of logged terms
        numerd=0 # calculate denominator once by summing numerators up for all k values

        for g in range (G):
          for l in range (L):
            numer=numer+math.log(poisson.pmf(int(X[c,g]),mu[k,l]))*(T_new[g,k,l]) # log trick and normalize

        numer=numer+math.log(rho[k]) # final numerator for fixed k and c

        k_numers[k]=numer # save numerator associated with the fixed k for a single loop, and fixed c

      # NOW AT END OF FIRST K LOOP, AND HAVE K_NUMERS, WHICH IS IN LOG FORM

      # NOW CALCULATE MAX VALUE AMONG THE K NUMERATORS (FOR A FIXED C)
      w=np.amax(k_numers)
      for k in range(K):
        k_numers[k]=k_numers[k]-w #"normalize" k_numers

      denom=0
      for k in range(K):
        k_numers[k]=math.exp(k_numers[k])
        denom=denom+k_numers[k]

      for k in range(K):
        S_new[c,k]=k_numers[k]/denom # this loop is done once for each c loop, and within this loop it gives k-value associated numerator
    
    # end S loop

    for k in range(K):
      rho[k]=(1/C)*sum(S_new[:,k]) # update rho, using updated S values

    for k in range(K):
      for l in range(L):
        numer=0 # numer is sum of terms
        denom=0 # denom is sum of terms
        for c in range(C):
          for g in range(G):
            numer=numer+(S_new[c,k]*T_new[g,k,l]*X[c,g])
            denom=denom+(S_new[c,k]*T_new[g,k,l])
        mu[k,l]=numer/denom # update mu for each k and l values, using updated S and previous T values


  # now STEP 1 is finished and S, rho, mu have been updated



  # STEP 2 inner EM loop to update T, pi, mu
  while ((np.amax(np.abs(np.subtract(T_new,T_old)))>epsilon_2)):

    # T update loop

    #T_old=T_new # save copy of previous T update
    T_old=copy.deepcopy(T_new)
    # need to fix this above line that saves previous update

    for g in range(G):
      for k in range(K):
        l_numers=np.zeros(L) #save numerator for each l
        denom=0 # denom is sum of terms, depends on g and k indices

        for l in range(L):
          numer=0 # numer is sum of logged terms
          l_numerd=0 # calculate denominator once by summing numerators up for all l values

          for c in range(C):
            numer=numer+math.log(poisson.pmf(int(X[c,g]),mu[k,l]))*(S_new[c,k]) # log trick and normalize

          numer=numer+math.log(pi[k,l]) # final numer for fixed k, l, and g

          l_numers[l]=numer # save each numer for each k,l pair

        # NOW AT END OF FIRST G,K LOOP, AND HAVE L_NUMERS, WHICH IS IN LOG FORM

        # NOW CALCULATE MAX VALUE AMONG THE L NUMERATORS (FOR A FIXED G,K)
        w=np.amax(l_numers)
        for l in range(L):
          l_numers[l]=l_numers[l]-w #"normalize" k_numers

        denom=0
        for l in range(L):
          l_numers[l]=math.exp(l_numers[l])
          denom=denom+l_numers[l]

        for l in range(L):
          T_new[g,k,l]=l_numers[l]/denom # this loop is done once for each c loop, and within this loop it gives k-value associated numerator

    # end T loop

    for k in range(K):
      for l in range(L):
        pi[k,l]=(1/G)*sum(T_new[:,k,l]) # update pi, using updated T values

    for k in range(K):
      for l in range(L):
        numer=0 # numer is sum of terms
        denom=0 # denom is sum of terms
        for c in range(C):
          for g in range(G):
            numer=numer+(S_new[c,k]*T_new[g,k,l]*X[c,g])
            denom=denom+(S_new[c,k]*T_new[g,k,l])
        mu[k,l]=numer/denom # update mu for each k and l values, using updated T and previously updated S values


  # now STEP 2 is finished and T, pi, mu have been updated


# Now STEP 1 and STEP 2 have finished, NOW S, T, AND THETA HAVE ALL BEEN UPDATED THROUGH ONE ITERATION

# now while loop will check convergence of S and T, if threshold met, loop will end and S and T will stop updating

In [None]:
# after running on simulated data, change to zero inflated poisson, then check simulated again, then run on benchmarking
https://en.wikipedia.org/wiki/Zero-inflated_model#Zero-inflated_Poisson
only need to add one extra parameter, which controls probability of zeros
adding this extra parameter might change possibility of having closed form, so might need to use newton-ralphson
(actually according to wkipedia there is closed form for MLE of zero prob parameter)


always optimize log-likelihood because its concave
define hessian and score, etc...

then work on adding in row sums, column sums covariates, probably going to need to use newton ralphson because no closed form
this is based on yunpengs new paper + roys recent notes

# set S and T to converged values (since "=" works as pointer they should already be the updated values

In [30]:
S_initial

array([[1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.]])

In [31]:
S

array([[2.08433831e-03, 1.97761995e-01, 8.30186087e-02, 6.80035950e-01,
        3.70991083e-02],
       [1.36986686e-04, 3.92295799e-02, 8.75264042e-01, 4.06711098e-03,
        8.13022805e-02],
       [5.58285360e-04, 2.47421555e-02, 9.51222062e-01, 3.13351466e-03,
        2.03439825e-02],
       [4.88942553e-05, 2.10127374e-02, 9.67395605e-01, 9.34718857e-04,
        1.06080441e-02],
       [6.79084035e-03, 1.67673728e-03, 9.89773699e-01, 1.18725732e-03,
        5.71465585e-04],
       [3.47383782e-04, 5.02709677e-01, 3.87475680e-02, 3.74683078e-01,
        8.35122925e-02],
       [2.31397720e-06, 2.26854593e-04, 3.03361593e-02, 9.67800605e-01,
        1.63406725e-03],
       [1.80066048e-04, 1.95588946e-01, 9.28505374e-02, 1.69889666e-02,
        6.94391484e-01],
       [5.73176519e-05, 5.00818320e-02, 1.83821686e-02, 1.66874071e-03,
        9.29809941e-01],
       [3.95302978e-06, 9.06229180e-01, 8.10059294e-02, 1.25712037e-02,
        1.89734348e-04],
       [1.74295206e-04, 9.3567

In [32]:
T_initial

array([[[1., 0., 0.],
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.],
        [1., 0., 0.]],

       [[0., 0., 1.],
        [0., 1., 0.],
        [1., 0., 0.],
        [0., 0., 1.],
        [0., 0., 1.]],

       [[0., 0., 1.],
        [0., 1., 0.],
        [1., 0., 0.],
        [0., 0., 1.],
        [0., 0., 1.]],

       [[0., 0., 1.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        [0., 0., 1.]],

       [[0., 0., 1.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [1., 0., 0.]],

       [[1., 0., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [1., 0., 0.],
        [1., 0., 0.]],

       [[0., 0., 1.],
        [0., 0., 1.],
        [0., 1., 0.],
        [0., 1., 0.],
        [1., 0., 0.]],

       [[0., 1., 0.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.]],

       [[1., 0., 0.],
        [0., 0., 1.],
        [0., 1., 0.],
        [1., 0., 0.],
        [0., 1.,

In [33]:
T

array([[[7.23276038e-02, 1.18334925e-01, 8.09337471e-01],
        [1.21741072e-01, 1.14439717e-11, 8.78258928e-01],
        [2.21382816e-01, 3.10673169e-05, 7.78586117e-01],
        [4.13547724e-01, 1.83956561e-01, 4.02495715e-01],
        [8.00042759e-01, 7.07215440e-02, 1.29235697e-01]],

       [[6.16475955e-02, 3.70388368e-02, 9.01313568e-01],
        [6.58471008e-01, 1.85639059e-06, 3.41527136e-01],
        [9.76285220e-01, 1.16743719e-02, 1.20404080e-02],
        [6.48335960e-01, 1.10302166e-01, 2.41361874e-01],
        [8.30249853e-01, 4.55129726e-02, 1.24237174e-01]],

       [[6.88926982e-02, 3.69660112e-01, 5.61447190e-01],
        [8.74107600e-01, 3.76520341e-04, 1.25515879e-01],
        [9.65143852e-01, 3.15420899e-02, 3.31405811e-03],
        [5.75973277e-05, 3.13846546e-01, 6.86095857e-01],
        [7.38320444e-01, 1.28713799e-01, 1.32965756e-01]],

       [[8.79419987e-05, 9.99819271e-01, 9.27865860e-05],
        [7.99693247e-01, 3.90171600e-05, 2.00267736e-01],
        

In [None]:
# convert S and T into labels Z and W
# check Z (cell-type) labels against true labels from metadata
# check high expression gene-community against known biomarkers for associated cell-type

In [None]:
# figure out way to visualize Z and W labels