<a href="https://colab.research.google.com/github/ryan-hayden16/Projects/blob/main/iterativeMLE_draft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Preliminary steps

In [None]:
# imports and installs
!pip install scanpy # tools for scRNA-seq analysis
!pip install matplotlib==3.1.3 # current version produces error w/ scanpy
!pip install sklearn # tools for general data analysis

import pandas as pd
import numpy as np
import scanpy as sc
import sklearn
import matplotlib.pyplot as plt
import math
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

!pip install matplotlib==3.1.3 # reinstall to force old package version

Load data (and metadata, if available)

In [None]:
# load data matrix (single patient/sample)
x = pd.read_csv("/content/Kidney-counts.csv", index_col=0)
x = np.transpose(x) # transpose into cell by gene format

In [None]:
# load metadata (if available, can be used to check cluster accuracy)
metadata = pd.read_csv("/content/annotations_FACS.csv", index_col=0)
metadata = metadata.loc[metadata['tissue'].isin(['Kidney'])]

In [None]:
# remove raw data with missing labels
cellclass=metadata.cell_ontology_class
cellclass=cellclass.to_frame()
mergedf=x.merge(cellclass, left_index=True, right_index=True)
metadf=mergedf.cell_ontology_class
metadf=metadf.to_frame()
bladf=mergedf.drop(columns=['cell_ontology_class'])

# now raw data and metadata have matching sizes
x=bladf
x_labels=metadf

# create annotated data matrix (ie: anndata) to use with scanpy
adata_raw = sc.AnnData(X = x, obs = x_labels)

  


Quality control of raw data (optional)

In [None]:
# quality control of raw data? (need old matplotlib version to avoid errors)

# quality control
adata_qc=adata_raw # keep copy of the raw data
is_spike_in = {}
for gene_name in adata_qc.var_names:
    if 'ERCC' in gene_name:
        is_spike_in[gene_name] = True # record that we found a spike-in
    else:
        is_spike_in[gene_name] = False # record that this was not a spike-in
adata_qc.var['ERCC'] = pd.Series(is_spike_in) # label the spike ins
qc = sc.pp.calculate_qc_metrics(adata_qc, qc_vars = ['ERCC']) # scanpy function
cell_qc_dataframe = qc[0] # cell quality control
gene_qc_dataframe = qc[1] # gene quality control

# cell filtering and gene filtering
low_ERCC_mask = (cell_qc_dataframe['pct_counts_ERCC'] < 10)
adata_qc = adata_qc[low_ERCC_mask]
sc.pp.filter_cells(adata_qc, min_genes = 750) # filter cells 
sc.pp.filter_genes(adata_qc, min_cells = 2) # filter genes
sc.pp.filter_genes(adata_qc, min_counts = 10)

#run PCA with no labels
sc.pp.pca(adata_qc)
sc.pl.pca_overview(adata_qc) # plot

# run PCA as exploratory measure to check the data out
sc.pp.pca(adata_qc)
sc.pl.pca_overview(adata_qc, color='cell_ontology_class') # plot

# normalize the data 
adata_norm=adata_qc # keep copy of qc data
sc.pp.normalize_per_cell(adata_norm, counts_per_cell_after=1e6)
sc.pp.normalize_total(adata_norm, target_sum=1e6, exclude_highly_expressed=True)

# (OPTIONAL) Remove highly expressed genes distorting the data
not_Rn45s = adata_norm.var.index != 'Rn45s'
adata_no_Rn45s = adata_norm[:, not_Rn45s] # keep copy of normed data
# need to check which genes to remove

# scale the data
adata_scale=adata_no_Rn45s
# adata_scale=adata_norm
sc.pp.log1p(adata_scale)
sc.pp.scale(adata_scale)

#re-run PCA with no labels
sc.pp.pca(adata_scale)
sc.pl.pca_overview(adata_scale) # plot

# re-run PCA, should seperate data better this time
sc.pp.pca(adata_scale)
sc.pl.pca_overview(adata_scale, color='cell_ontology_class') # plot

adata=adata_scale # adata is now quality controlled, normalized, and scaled

Extract count matrix from raw data

In [None]:
# convert data matrix in adata to dataframe
x = pd.DataFrame(adata.X)
x=x.set_index(adata.obs.index)

#NOTE, numpy ndarray is preferable to dataframe because it can be more than 2 dimensions, but tensorflow tensor might be computationally advantageous


Define known variables

In [None]:
# define known values
C,G = x.shape # retrieve number of cells and genes from raw data matrix
K = 5 # predicted number of cell-types
L = 3 # predicted number of gene-communities (ie: high, medium, low expression communities)


# define distribution f (start with Poisson) 
poisson pdf

python notes

In [None]:
#create tensor in python
import random
C=50
G=100
K=5 
L=3 
Q=np.zeros((G,K,L))
for i in range(G):
  for j in range(K):
    for l in range(L):
      Q[i,j,l]=random.uniform(0, 1)

print(Q)
# Q is GxKxL tensor

In [4]:
# generate fake (normalized) count data
X=np.zeros((C,G))
for c in range(C):
  for g in range(G):
    X[c,g]=random.uniform(0,1)

In [5]:
X

array([[0.6753744 , 0.8320612 , 0.49918573, ..., 0.35695974, 0.01775405,
        0.60654286],
       [0.78257583, 0.87906851, 0.97402181, ..., 0.26331163, 0.24914387,
        0.41427641],
       [0.66946349, 0.99919073, 0.54409948, ..., 0.39638758, 0.71148458,
        0.47074642],
       ...,
       [0.11847296, 0.96767791, 0.78908677, ..., 0.9368653 , 0.61783446,
        0.02920085],
       [0.25210393, 0.83065315, 0.8495982 , ..., 0.90617789, 0.21553334,
        0.80976785],
       [0.6560573 , 0.24328608, 0.22514646, ..., 0.50866792, 0.95877456,
        0.70112941]])

Initialize S and T variables

In [133]:
# initialize S
import scipy as sp
from numpy import linalg as LA
from scipy.linalg import sqrtm
from numpy.linalg import inv
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans


A=np.dot(X,np.transpose(X)) #step 1 (affinity matrix, simple version)

D=np.zeros((C,C))
for i in range(C):
  D[i,i] = sum(A[i,:]) #step 2.a (graph laplacian)

E=sqrtm(D) 
F=inv(E) 
H=np.dot(F,np.dot(A,F)) #step 2.b (graph laplacian)


w, v = LA.eig(H) #step 3 (find K largest (orthogonal) eigenvectors of H and form a CxK matrix with them) and normalize the matrix
ordered_eigval=np.argsort(w) # returns indexes of ordered (small to large) of w (eigenvalue list)
k_large_eigval = ordered_eigval[-K:] # returns indexes of K largest eigvals
k_large_eigvec=np.transpose(v[k_large_eigval]) # returns corresponding K largest eigvecs, as a CxK ndarray
Y = normalize(k_large_eigvec, axis=1, norm='l2') #normalize rows of the CxK matrix


#step 5 (treat each of the C rows of the matrix as a K-dim vector and cluster into K-clusters, via K-means)
kmeans = KMeans(n_clusters=K, random_state=0).fit(Y)
cluster_labels = kmeans.labels_

#step 6 (assign/label each of the C cells into the corresponding cluster (ie: labels are 1,2,...,K) from step 5)
S=np.zeros((C,K))
for i in range(C):
  for j in range(K):
    if cluster_labels[i]==j:
      S[i,j]=1
#(each cell now has a label from 1,...,K, so we can then form the CxK classification matrix (ie: matrix S) that we want)


# this process gives us S_0

# THIS CODE IS FINISHED AND HAS BEEN CHECKED TO BE WORKING PROPERLY

In [None]:
S.shape

In [None]:
S

In [138]:
cluster_labels # compare with S printout above to see that it worked

array([3, 2, 4, 3, 0, 2, 3, 4, 1, 4, 2, 4, 3, 3, 2, 2, 2, 0, 1, 1, 3, 4,
       2, 4, 1, 0, 1, 1, 3, 2, 2, 3, 4, 1, 1, 0, 2, 3, 1, 3, 3, 2, 2, 0,
       0, 4, 0, 2, 0, 3], dtype=int32)

In [None]:
# initialize T

#step 0 (using 1,...,K labels from above, turn X matrix into K submatrices, where each submatrix X_1,...,X_K corresponds to a label, and note each has G columns but varying number of columns)
CREATE K EMPTY ARRAYS, AND LOOP THROUGH CLUSTER_LABELS AND APPEND EACH ROW VECTOR FROM X TO THE X_i MATRIX ITS LABEL CORRESPONDS TO
LABEL THESE MATRIXES X_1,...,X_K AND LOOP THROUGH 1 TO K, EACH TIME PERFORMING THE BELOW STEPS

#step 1 (form K affinity matrices A_1,..., A_K, where A_i = (X_i^transpose)*X_i is a GxG matrix)
BESIDES KEEPING TRACK OF X_i VS X_j SHOULD BE ABLE TO JUST COPY ABOVE CODE, 
BUT ALSO MAKE SURE TO SWAP TRANSPOSE ORDER SO ITS NOW GxG AFFINITY MATRIX, AND SWAP WHEREVER ELSE NECCESARY, IE NOW K-MEANS ON COLUMNS AND NORMALIZE COLUMNS, ETC...
THE FINAL RESULT SHOULD BE K SEPERATE GxL CLASSIFICATION MATRICES, YIELDING THE DESIRED GxKxL TENSOR

#now repeat steps 2-7 from above (but now using L clusters/eigenvectors), to obtain K seperate GxL classification matrices, this gives us the GxKxL classification tensor that we want


# this process gives us T_0

Initialize theta paramaters

In [None]:
# using S_0 and T_0 and X, we can directly calculate rho_0, pi_0, and mu_0

rho: use yunpeng line (12) and S_0
pi: use yunpeng line (13) and T_0
mu: use yunpeng line (14) and S_0, T_0, X 

Iterative maximization scheme

In [None]:
# start with S_0, T_0, rho_0, pi_0, mu_0

# loop through steps 1 and 2


#NOTE: be careful with regard to dropouts/blowups, use appropriate numerical techniques


# step 1: update S, rho, mu
fix previous values of S, T, rho, pi, mu 
calculate updated S # E step (define function) 
USE YUNPENG LINE (9)
calculate updated rho, mu # M step (define function)
USE YUNPENG LINES (12 & 14)

# step 2: update T, pi, mu
fix previous values of S, T, rho, pi, mu 
calculate updated T # E step (define function)
USE YUNPENG LINE (10)
calculate updated pi, mu # M step (define function)
USE YUNPENG LINES (13 & 14)

# stop loop if convergence criteria met, ie: S and T both no longer changing values to significant degree
IE: T_(M+1)-T_(M) < epsilon, and similar for S_(M)

In [None]:
# convert S and T into labels Z and W
# check Z (cell-type) labels against true labels from metadata
# check high expression gene-community against known biomarkers for associated cell-type

In [None]:
# figure out way to visualize Z and W labels