### MaxFuse run on the retina dataset

In [1]:
import os
import sys

import anndata as ad
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
from scipy.io import mmread
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# The MaxFuse development modules are imported from a local checkout,
# so the path must be appended before importing them.
sys.path.append("../../MaxFuse_devo/09302022V/")
import match
import metrics
import utils

In [2]:
# Load the metadata table; its 'annotation' column supplies the labels used
# later to score matching accuracy.
meta = pd.read_csv('/atac_bench_nrz/retina/data/meta_20k.csv')
celltype_labels = meta.loc[:, 'annotation'].to_numpy()
# Display the distinct annotation values.
np.unique(celltype_labels)

array(['AIIamacrine', 'Astrocyte', 'Cone', 'GABAamacrine', 'Glyamacrine',
       'Horizontal', 'Microglia', 'Mullerglia', 'OFFconebipolar',
       'ONconebipolar', 'Retinalganglioncell', 'Rod', 'Rodbipolar'],
      dtype=object)

In [3]:
# Read the RNA count matrix from a Matrix Market file and convert it to CSR.
rna = mmread("/atac_bench_nrz/retina/data/rna_20k.txt").tocsr()
# Display the matrix dimensions (rows x columns).
rna.shape

(20000, 36601)

In [4]:
# Read the RNA feature names from the 'names' column; they are assigned as
# var_names of the RNA AnnData further down.
rna_names = pd.read_csv(
    '/atac_bench_nrz/retina/data/rna_names.csv'
)['names'].to_numpy()

In [5]:
# Read the ATAC activity matrix from a Matrix Market file and convert it to CSR.
# It is wrapped as the "GAS" AnnData further down.
atacactivity=mmread("/atac_bench_nrz/retina/data/atac_20k.txt").tocsr()
# Display the matrix dimensions (rows x columns).
atacactivity.shape

(20000, 24919)

In [6]:
# Read the feature names of the ATAC activity matrix from the 'names' column;
# they are assigned as var_names of the activity AnnData further down.
gas_names = pd.read_csv(
    '/atac_bench_nrz/retina/data/atac_names.csv'
)['names'].to_numpy()

In [9]:
# Load the ATAC LSI table and discard its 'X' column
# (presumably a row identifier -- TODO confirm) so only numeric columns remain.
peak_lsi = pd.read_csv(
    '/atac_bench_nrz/retina/data/lsi_49_20k.csv'
).drop(columns='X')
peak_lsi.shape

(20000, 49)

In [10]:
## Wrap the RNA counts in an AnnData object (float32) and label the
## variables with the names read above.
rna_adata = ad.AnnData(
    rna.tocsr(), dtype=np.float32
)
rna_adata.var_names = rna_names
# obs_names are intentionally left at their defaults; no per-cell names are assigned here.
rna_adata

AnnData object with n_obs × n_vars = 20000 × 36601

In [11]:
## Wrap the ATAC activity matrix ("GAS") in an AnnData object (float32) and
## label the variables with the activity names read above.
activity_adata = ad.AnnData(atacactivity, dtype=np.float32)
activity_adata.var_names = gas_names
activity_adata

AnnData object with n_obs × n_vars = 20000 × 24919

In [12]:
# Wrap the LSI table in an AnnData object (float32); its .X is used later as
# the ATAC-side "active" array passed to MaxFuse.
peak_adata=ad.AnnData(peak_lsi, dtype=np.float32)
peak_adata



AnnData object with n_obs × n_vars = 20000 × 49

In [13]:
# Feature names present in both the RNA and the ATAC activity AnnData objects.
shared_genes = np.intersect1d(rna_adata.var_names, activity_adata.var_names)
# Report how many features are shared, reusing the result above instead of
# computing the intersection a second time.
len(shared_genes)

21369

In [14]:
# Restrict both modalities to the shared features and densify.
# NOTE: .todense() yields np.matrix objects, which is why .A1 is used below to
# flatten the boolean mask to a 1-D array.
rna_shared = rna_adata[:, shared_genes].X.todense()
activity_shared = activity_adata[:, shared_genes].X.todense()

# Keep only features whose standard deviation exceeds 0.01 in BOTH modalities.
mask = ((rna_shared.std(axis=0) > 0.01) & (activity_shared.std(axis=0) > 0.01)).A1 # filter out static ones
rna_shared = rna_shared[:, mask]
activity_shared = activity_shared[:, mask]

In [15]:
# Normalize the shared RNA counts: total-count normalization, log1p,
# highly-variable-gene flagging (top 3000), then scaling.
# NOTE: this cell rebinds rna_shared / activity_shared from dense matrices to
# AnnData objects, so re-run the previous cell before re-running this one.
rna_shared = ad.AnnData(rna_shared)
sc.pp.normalize_total(rna_shared)
sc.pp.log1p(rna_shared)
# Only flags genes in rna_shared.var; the subsetting happens in a later cell.
sc.pp.highly_variable_genes(rna_shared, n_top_genes = 3000)
sc.pp.scale(rna_shared)
# rna_shared is kept as an AnnData (not reduced to .X) because .var is needed later.

## Same normalization for the shared ATAC activity features, without the
## highly-variable-gene step.
activity_shared = ad.AnnData(activity_shared)
sc.pp.normalize_total(activity_shared)
sc.pp.log1p(activity_shared)
sc.pp.scale(activity_shared)
# activity_shared is likewise kept as an AnnData so it can be column-indexed later.

In [16]:
# Sanity check: both modalities should have identical shapes after filtering.
print((rna_shared.shape, activity_shared.shape))

((20000, 18911), (20000, 18911))


In [17]:
# Boolean flag per shared feature, set by sc.pp.highly_variable_genes on the RNA side.
vgenes = rna_shared.var['highly_variable']

In [18]:
# Shared features: the RNA-derived highly-variable mask is applied to both
# modalities so the columns stay aligned.
rnaC_shared = rna_shared[:,vgenes].X
atac_shared = activity_shared[:,vgenes].X
# Active features used by MaxFuse.
# NOTE(review): the RNA active array is the SAME highly-variable subset as
# rnaC_shared (not all RNA features); the ATAC active array is the LSI table.
rnaC_active = rna_shared[:,vgenes].X
atac_active = peak_adata.X

In [19]:
# Build the MaxFuse object: arr1 is the RNA side, arr2 is the ATAC side.
# The shared arrays hold the aligned shared features; the active arrays hold
# the per-modality features prepared in the previous cell.
spm = match.MaxFuse(
        shared_arr1=rnaC_shared,
        shared_arr2=atac_shared,
        active_arr1=rnaC_active,
        active_arr2=atac_active,
        method='centroid_shrinkage',
        labels1=None, # if None, then use scanpy clustering pipeline
        labels2=None
    )

In [21]:
# Split both datasets into batches and pair them up
# (the logged output reports the batch-to-batch correspondence).
# NOTE(review): seed=None leaves this step unseeded, so the split is presumably
# not reproducible across runs -- consider fixing a seed.
spm.split_into_batches(
    max_outward_size=5000,
    matching_ratio=5,
    metacell_size=2,
    method = 'binning',
    seed=None,
    verbose=True
)

The first data is split into 2 batches, average batch size is 10000, and max batch size is 10000.
The second data is split into 1 batches, average batch size is 20000, and max batch size is 20000.
Batch to batch correspondence is:
  ['0<->0', '1<->0'].


In [22]:
# Per the logged output, this step aggregates arr1 cells into metacells, then
# builds and clusters neighborhood graphs for each batch of arr1 and arr2.
# NOTE(review): leiden_seed=None leaves the clustering unseeded.
spm.construct_graphs(
    n_neighbors1=15,
    n_neighbors2=15,
    svd_components1=30,
    svd_components2=20,
    resolution1=2,
    resolution2=2,
    randomized_svd=False,  # TODO(@Shuxiao): changed from True to False because the run fails with True; cause not yet understood.
    svd_runs=1,
    resolution_tol=0.1,
    leiden_runs=1,
    leiden_seed=None,
    verbose=True
)

Aggregating cells in arr1 into metacells of average size 2...
Constructing neighborhood graphs for cells in arr1...
Now at batch 0...
Now at batch 1...
Graph construction finished!
Clustering into metacells...
Now at batch 0...
Metacell clustering finished!
Now at batch 1...
Metacell clustering finished!
Constructing neighborhood graphs for cells in arr1...
Now at batch 0...
Now at batch 1...
Graph construction finished!
Clustering the graphs for cells in arr1...
Now at batch 0...
Now at batch 1...
Graph clustering finished!
Constructing neighborhood graphs for cells in arr2...
Now at batch 0...
Graph construction finished!
Clustering the graphs for cells in arr2...
Now at batch 0...
Graph clustering finished!


In [23]:
# Compute the initial matching for every batch pair; the result is read from
# spm._init_matching in the accuracy check that follows.
spm.find_initial_pivots(
    wt1=0.7, wt2=0.7,
    svd_components1=30, svd_components2=30,
    randomized_svd=False, svd_runs=1,
    verbose=True
)

Now at batch 0<->0...
Now at batch 1<->0...
Done!


In [25]:
# Test accuracy of the initial matching (first batch pair only).
# `utils` and `metrics` are imported in the notebook's top import cell.
matching = spm._init_matching[0]
# Per-cell annotations of batch 0 in arr1 are summarized over the metacell
# labels (presumably one label per metacell -- TODO confirm against utils).
labels1 = utils.summarize_clustering(
    spm._metacell_labels1[0], celltype_labels[spm._batch_to_indices1[0]]
)
# arr2 side: the annotations of the cells in batch 0 are used directly.
labels2 = celltype_labels[spm._batch_to_indices2[0]]
metrics.get_matching_acc(matching, labels1, labels2)

0.9821893135881529

In [26]:
# Refine the initial matching for every batch pair; the result is read from
# spm._refined_matching in the accuracy check that follows.
spm.refine_pivots(
    wt1=0.7, wt2=0.7,
    svd_components1=200, svd_components2=None,
    cca_components=24,
    filter_prop=0.,
    n_iters=8,
    randomized_svd=False, 
    svd_runs=1,
    verbose=True
)

Now at batch 0<->0...
Now at batch 1<->0...
Done!


In [27]:
# Accuracy of the refined matching, evaluated on the first batch pair only.
matching = spm._refined_matching[0]
# arr1 side: annotations of batch 0 summarized over its metacell labels.
labels1 = utils.summarize_clustering(
    spm._metacell_labels1[0],
    celltype_labels[spm._batch_to_indices1[0]],
)
# arr2 side: annotations of the cells in batch 0, used as they are.
labels2 = celltype_labels[spm._batch_to_indices2[0]]
metrics.get_matching_acc(matching, labels1, labels2)

0.9847908745247148

In [28]:
# Filter the pivot matches with filter_prop=0.3; the logged output reports how
# many matched pairs remain and how many cells are selected as pivots.
spm.filter_bad_matches(target='pivot', filter_prop=0.3, verbose=True)

Begin filtering...
Now at batch 0<->0...
Now at batch 1<->0...
6992/9988 pairs of matched cells remain after the filtering.
Fitting CCA on pivots...
Scoring matched pairs...
14160/20000 cells in arr1 are selected as pivots.
4898/20000 cells in arr2 are selected as pivots.
Done!


In [29]:
# Propagation step, run per batch pair; its results are the 'propagated'
# target filtered in the next cell.
spm.propagate(
    wt1=0.7, wt2=0.7,
    svd_components1=40, 
    svd_components2=30, 
    randomized_svd=False, 
    svd_runs=1, 
    verbose=True
)

Now at batch 0<->0...
Now at batch 1<->0...
Done!


In [30]:
# Filter the propagated matches. With filter_prop=0. no pair is removed (the
# log shows all pairs remaining); the call still scores the matched pairs.
spm.filter_bad_matches(
    target='propagated',
    filter_prop=0.,
    verbose=True
)

Begin filtering...
Now at batch 0<->0...
Now at batch 1<->0...
36004/36004 pairs of matched cells remain after the filtering.
Scoring matched pairs...
Done!


In [31]:
# Retrieve the matching over the full data and score it against the annotations.
# The same label array is passed for both sides, and the same order=(2, 1) is
# passed to both calls.
# NOTE(review): the exact meaning of order=(2, 1) is defined by MaxFuse -- confirm.
matching = spm.get_matching(order=(2, 1), target='full_data')
metrics.get_matching_acc(matching, celltype_labels, celltype_labels, order = (2, 1))

0.963

In [32]:
# Tabulate the full-data matching row by row, one (idx1, idx2, score) triple
# per matched pair, and write it to disk without the index column.
full = pd.DataFrame(
    list(zip(*(matching[k] for k in range(3)))),
    columns=["idx1", "idx2", "score"],
)
full.to_csv(
    "/atac_bench_nrz/retina/mf/full_idx_21_test.csv",
    index=False,
)

In [33]:
# Get the joint embeddings of both modalities from the active arrays stored on
# the MaxFuse object; refit=False with matching/order left as None.
arr1_cca, arr2_cca = spm.get_embedding(
        active_arr1 = spm.active_arr1,
        active_arr2 = spm.active_arr2,
        refit=False,
        matching=None,
        order=None,
        cca_components=20,
        cca_max_iter=None
    )

In [34]:
# Number of leading embedding columns written to disk.
# 20 are saved here, although the downstream analysis used only the first 15.
out_idx = 20
arr1_df = pd.DataFrame(arr1_cca).iloc[:, :out_idx]
arr2_df = pd.DataFrame(arr2_cca).iloc[:, :out_idx]
# Write one CSV per modality, without the index column.
arr1_df.to_csv("/atac_bench_nrz/retina/mf/full_embed_x0_test.csv", index=False)
arr2_df.to_csv("/atac_bench_nrz/retina/mf/full_embed_y0_test.csv", index=False)