Try MaxFuse method to integrate CODEX and scRNA-seq from mouse ICT dataset
======

In [None]:
import warnings

In [None]:
warnings.simplefilter(action='ignore',)

In [None]:
warnings.simplefilter(action='ignore',)

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import seaborn as sns
import maxfuse as mf
import matplotlib.pyplot as plt

from scipy.io import mmread
from scipy import sparse
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# setup and load datasets (only run once)

## CODEX

In [None]:
! ls unprocessed_data

In [None]:
codex_meta = pd.read_csv('unprocessed_data/codex_meta.csv', index_col=0)
codex_genes = pd.read_csv('unprocessed_data/codex_genes.csv', index_col=0).values.flatten()
codex_counts = pd.read_csv('unprocessed_data/codex_counts_matrix.csv') # counts matrix
codex_data = pd.read_csv('unprocessed_data/codex_data_matrix.csv') # normalized data matrix (see I/O Rmd)

In [None]:
# The CSV is features x cells: label the columns with the cell IDs from the
# metadata, flip to cells x features, and keep the result as a sparse float matrix.
features_by_cells = codex_counts.set_index("Unnamed: 0")
features_by_cells.columns = codex_meta.index
codex_counts = sparse.csr_matrix(features_by_cells.T.to_numpy(dtype=float))
del features_by_cells
codex_counts

In [None]:
# Same reshaping as for the counts: features x cells CSV -> sparse cells x features.
features_by_cells = codex_data.set_index("Unnamed: 0")
features_by_cells.columns = codex_meta.index
codex_data = sparse.csr_matrix(features_by_cells.T.to_numpy(dtype=float))
del features_by_cells
codex_data

In [None]:
codex_genes = pd.DataFrame(codex_genes, columns=['gene'], index = codex_genes)

In [None]:
codex = ad.AnnData(X = codex_data, obs = codex_meta, var = codex_genes, layers={'counts':codex_counts,'normalized_data':codex_data})
codex

In [None]:
codex.obsm['spatial'] = np.array(codex.obs[['x_um','y_um']]) # to easily visualize spatial image

In [None]:
# double check orientation is correct and layers have raw counts and normalized, respectively
sc.pl.embedding(codex[codex.obs['Image'] == 'cntrl_n130_d10'], color='pdL1', layer='counts',
                basis = 'spatial', title='cntrl_n130_d10', use_raw = False, vmax='p90')
sc.pl.embedding(codex[codex.obs['Image'] == 'cntrl_n130_d10'], color='pdL1', layer='normalized_data',
                basis = 'spatial', title='cntrl_n130_d10', use_raw = False, vmax='p90')

In [None]:
# codex.write_h5ad('codex.h5ad')

## scRNAseq

In [None]:
! ls unprocessed_data

In [None]:
rna_meta = pd.read_csv('unprocessed_data/rna_meta.csv', index_col=0)
rna_genes = pd.read_csv('unprocessed_data/rna_genes.csv', index_col=0).values.flatten()
rna_counts = pd.read_csv('unprocessed_data/rna_counts_matrix.csv') # counts matrix
rna_data = pd.read_csv('unprocessed_data/rna_data_matrix.csv') # normalized data matrix (see I/O Rmd)

In [None]:
# The CSV is genes x cells: label the columns with the cell IDs from the
# metadata, flip to cells x genes, and keep the result as a sparse float matrix.
genes_by_cells = rna_counts.set_index("Unnamed: 0")
genes_by_cells.columns = rna_meta.index
rna_counts = sparse.csr_matrix(genes_by_cells.T.to_numpy(dtype=float))
del genes_by_cells
rna_counts

In [None]:
# Same reshaping as for the counts: genes x cells CSV -> sparse cells x genes.
genes_by_cells = rna_data.set_index("Unnamed: 0")
genes_by_cells.columns = rna_meta.index
rna_data = sparse.csr_matrix(genes_by_cells.T.to_numpy(dtype=float))
del genes_by_cells
rna_data

In [None]:
(rna_counts != rna_data).nnz==0 # checking the matrices are the same, which they should be

In [None]:
rna_genes = pd.DataFrame(rna_genes, columns=['gene'], index = rna_genes)

In [None]:
rna = ad.AnnData(X = rna_counts, obs = rna_meta, var = rna_genes, layers={'counts':rna_counts})
rna

In [None]:
rna.obsm['tSNE'] = np.array(rna.obs[['tSNE_1','tSNE_2']]) # transfer embeddings

In [None]:
# find genes whose name starts with 'Cd' (e.g., Cd274, the gene paired with the PD-L1 protein)
def starts_with_p(row):
    """Return True if any value in ``row``, converted to str, begins with 'Cd'."""
    for entry in row:
        if str(entry).startswith('Cd'):
            return True
    return False

filtered_df = rna.var[rna.var.apply(starts_with_p, axis=1)]
# sorted(filtered_df.values.flatten())

In [None]:
sc.pl.embedding(rna, color='Cd274', basis = 'tSNE', title='scRNA-seq', use_raw = False, vmax='p90')

In [None]:
# rna.write_h5ad('rna.h5ad')

# initial processing and clustering of scRNA-seq

## QC

In [None]:
rna = sc.read_h5ad('rna.h5ad')

In [None]:
rna.obs.head()

In [None]:
print(rna.obs["new_annotation"].value_counts())

In [None]:
# find genes whose name starts with 'Hb' (hemoglobin genes); change the prefix to 'Rp' to list Rps/Rpl ribosomal genes instead
def starts_with_p(row):
    """Return True if any value in ``row``, converted to str, begins with 'Hb'."""
    for entry in row:
        if str(entry).startswith('Hb'):
            return True
    return False

filtered_df = rna.var[rna.var.apply(starts_with_p, axis=1)]
# sorted(filtered_df.values.flatten())

In [None]:
# Flag gene families by name so that per-family QC metrics can be computed.
# NOTE(review): mouse mitochondrial genes are usually prefixed "mt-" (lowercase);
# confirm that "Mt-" matches this dataset's naming -- "mt" is not among the
# qc_vars passed to calculate_qc_metrics further down.
rna.var["mt"] = rna.var_names.str.startswith("Mt-") # mitochondrial genes
rna.var["ribo"] = rna.var_names.str.startswith(("Rps", "Rpl")) # ribosomal genes
# "[^(P)]" is a character class: a name matches when the character after "Hb"
# is anything other than "(", "P" or ")".
# NOTE(review): the class is case-sensitive, so a lowercase "p" is not excluded -- confirm intent.
rna.var["hb"] = rna.var_names.str.contains("^Hb[^(P)]") # hemoglobin genes

In [None]:
print(rna.var[["mt","ribo","hb"]].value_counts())

In [None]:
sc.pp.calculate_qc_metrics(rna, qc_vars=["ribo", "hb"], inplace=True, log1p=True)

In [None]:
sc.pl.violin(rna, ["n_genes_by_counts", "total_counts", "pct_counts_ribo", "pct_counts_hb"],
    jitter=0.4, multi_panel=True, )

In [None]:
sc.pl.scatter(rna, "total_counts", "n_genes_by_counts", color="pct_counts_ribo")

In [None]:
print(rna.shape)
sc.pp.filter_genes(rna, min_cells=3)
print(rna.shape)

In [None]:
rna.layers["counts"] = rna.X.copy()

In [None]:
sc.pp.normalize_total(rna) # normalize to median total counts
sc.pp.log1p(rna) # logarithmize data

In [None]:
# ! pip install --user scikit-misc

In [None]:
# NOTE(review): 8000 HVGs are kept here although the dataset was described as only ~5K cells
# (the original note said 2K) -- confirm the intended number.
sc.pp.highly_variable_genes(rna, n_top_genes=8000, batch_key=None, flavor='seurat_v3', layer='counts') # HVGs are computed from the 'counts' layer

In [None]:
sc.pl.highly_variable_genes(rna)

## PP

In [None]:
sc.pp.pca(rna, n_comps=30)

In [None]:
sc.pl.pca_variance_ratio(rna, n_pcs=30, log=True)

In [None]:
list(rna.obs.columns)

In [None]:
sc.pl.pca(rna,
    color=["new_annotation", "new_annotation", "Sample", "Sample"],
    dimensions=[(0, 1), (2, 3), (0, 1), (2, 3)], ncols=2)

In [None]:
sc.pp.neighbors(rna, n_neighbors=15, n_pcs=20, method='umap') # method='umap' or method='gauss'

In [None]:
sc.tl.umap(rna)

In [None]:
sc.tl.leiden(rna, resolution=1.0)

In [None]:
sc.pl.umap(rna, color=['Sample'])
sc.pl.umap(rna, color=['new_annotation'])
sc.pl.umap(rna, color=['leiden'], legend_loc='on data')

In [None]:
rna.obs['Cluster'] = rna.obs['Cluster'].astype('category')
sc.pl.embedding(rna, basis = 'tSNE', color=['Sample'])
sc.pl.embedding(rna, basis = 'tSNE', color=['new_annotation'])
sc.pl.embedding(rna, basis = 'tSNE', color=['Cluster'], legend_loc='on data')

In [None]:
rna.write_h5ad('rna_umap.h5ad') # save this to avoid having to re-process

# initial processing and clustering of CODEX

In [None]:
codex = sc.read_h5ad('codex.h5ad')

In [None]:
sc.pp.pca(codex, n_comps=25)

In [None]:
sc.pl.pca_variance_ratio(codex, n_pcs=25, log=True) # choose 20 like in Seurat tutorial? Or 10 where elbow is

In [None]:
codex.obs.head()

In [None]:
sc.pl.pca(codex,
    color=["Image", "Image", "cell_type", "cell_type", "condition", "condition"],
    dimensions=[(0, 1), (2, 3), (0, 1), (2, 3), (0, 1), (2, 3)],
    ncols=2, size=2,)

In [None]:
sc.pp.neighbors(codex, n_neighbors=15, n_pcs=20, method='gauss') # method='umap' or method='gauss'

In [None]:
sc.tl.umap(codex)

In [None]:
sc.pl.umap(codex, color=['Image'])
sc.pl.umap(codex, color=['cell_type'])
sc.pl.umap(codex, color=['condition'], legend_loc='on data')

In [None]:
sc.tl.leiden(codex, resolution=0.4, n_iterations=2) # use igraph implementation and fixed iters to speed things up

In [None]:
sc.pl.umap(codex, color=['leiden'], legend_loc='on data')

In [None]:
codex.write_h5ad('codex_umap.h5ad') # save this since this integrated preprocessing took a while

# Naive scVI run on CODEX

https://docs.scvi-tools.org/en/stable/tutorials/notebooks/quick_start/api_overview.html

## fit models

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import seaborn as sns
import torch
import scvi

In [None]:
codex = sc.read_h5ad('codex_umap.h5ad')
# rna = sc.read_h5ad('rna_umap.h5ad')

In [None]:
codex.obs.head()

In [None]:
scvi.model.SCVI.setup_anndata(
    codex, layer='counts', 
    categorical_covariate_keys=['Image'], # 'condition'
)

In [None]:
model = scvi.model.SCVI(codex) # 10 latent layers by default

In [None]:
model.train()

In [None]:
codex_copy = codex.copy()
scvi.model.SCVI.setup_anndata(codex_copy, layer='counts', categorical_covariate_keys=['condition'],)
model_condition = scvi.model.SCVI(codex_copy)
model_condition.train()

In [None]:
codex.obsm['X_scVI'] = model.get_latent_representation()
codex_copy.obsm['X_scVI'] = model_condition.get_latent_representation()

In [None]:
codex.write_h5ad('codex_scvi_image.h5ad')
codex_copy.write_h5ad('codex_scvi_condition.h5ad')

## load and process (time consuming)

In [None]:
codex = sc.read_h5ad('codex_scvi_image.h5ad')
codex_copy = sc.read_h5ad('codex_scvi_condition.h5ad')

In [None]:
# subsample 10% for now
sc.pp.subsample(codex, .10)
sc.pp.subsample(codex_copy, .10)

In [None]:
sc.pp.neighbors(codex, use_rep='X_scVI')
sc.tl.umap(codex, min_dist=0.3)

In [None]:
# One UMAP per metadata column for the Image-covariate scVI embedding.
for obs_key in ('Image', 'cell_type', 'condition'):
    sc.pl.umap(codex, color=obs_key)

In [None]:
sc.pp.neighbors(codex_copy, use_rep='X_scVI')
sc.tl.umap(codex_copy, min_dist=0.3)

In [None]:
# One UMAP per metadata column for the condition-covariate scVI embedding.
for obs_key in ('Image', 'cell_type', 'condition'):
    sc.pl.umap(codex_copy, color=obs_key)

# Calculate SCI to prune CellChat results

In [None]:
codex = sc.read_h5ad('codex_umap.h5ad')
rna = sc.read_h5ad('rna_umap.h5ad')

In [None]:
codex

In [None]:
plt.subplots(figsize=(12,6))
ax = sns.histplot(codex.obs, x='CN', hue='cell_type', multiple='fill')
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
plt.xticks(rotation=315);

In [None]:
import matplotlib as mpl

mpl.rcParams['figure.figsize'] = (6,6)
ax = sc.pl.embedding(codex[codex.obs['Image']=='ict_n212_d10'], basis='spatial', color='CN', show=False)
ax.set_ylim(-100,9300)
ax.set_xlim(-100,9300)

In [None]:
def get_SCI(W, X, Y):
    """Compute a weighted cross-correlation index between ``X`` and ``Y``.

    The returned value is::

        N / (2 * sum(W))
          * sum_ij (X_i - mean(X)) * W_ij * (Y_j - mean(Y))
          / (||X - mean(X)|| * ||Y - mean(Y)||)

    where ``N`` is the number of rows of ``W``.

    Parameters
    ----------
    W : array-like, 2-D
        Dense pairwise weight matrix (NumPy array, DataFrame or nested list);
        row ``i`` / column ``j`` weights the pair ``(X_i, Y_j)``.
    X : array-like, 1-D
        Values of the first variable, one per row of ``W``.
    Y : array-like, 1-D
        Values of the second variable, one per column of ``W``.

    Returns
    -------
    float
        The index. When ``X`` or ``Y`` is constant, or ``W`` sums to zero, the
        division yields ``nan``/``inf`` (NumPy semantics) instead of raising.
    """
    weights = np.asarray(W, dtype=float)
    x_dev = np.asarray(X, dtype=float).ravel()
    y_dev = np.asarray(Y, dtype=float).ravel()
    x_dev = x_dev - x_dev.mean()
    y_dev = y_dev - y_dev.mean()

    scale = weights.shape[0] / (2 * weights.sum())

    # sum_ij dx_i * W_ij * dy_j is the bilinear form dx^T W dy. Evaluating it
    # directly avoids materialising two N x N diagonal matrices and the
    # O(N^3) matrix products that diag(dx) @ W @ diag(dy) would need.
    cross = x_dev @ weights @ y_dev

    norm = np.sqrt((x_dev ** 2).sum()) * np.sqrt((y_dev ** 2).sum())

    return scale * cross / norm

# MaxFuse

https://github.com/shuxiaoc/maxfuse?tab=readme-ov-file

In [None]:
codex = sc.read_h5ad('codex_umap.h5ad')
rna = sc.read_h5ad('rna_umap.h5ad')

## add CN metadata to codex

In [None]:
adata_obs = sc.read_h5ad('codex.h5ad').obs
adata_obs.head()

In [None]:
sns.histplot(adata_obs, x='condition', hue='cell_type', multiple='stack')

In [None]:
# map neighborhood information (indices should match)
codex.obs['CN'] = adata_obs['neighborhood']

In [None]:
sns.histplot(adata_obs, x='condition', hue='neighborhood', multiple='stack')

In [None]:
# FIX THIS
codex.obs['CN'] = codex.obs['CN'].replace({
    1: 'CN1 Tumor Boundary',
    2: 'CN2 Tumor Bulk',
    3: 'CN3 Neutrophils + Dead cells',
    4: 'CN4 CX3CR1+ Macrophage',
    5: 'CN5 Dead Cells Center',
    6: 'CN6 Lymphoid Rich',
    7: 'CN7 INOS+ and IFN-g Actv Macs',
}).astype('category')

In [None]:
sns.histplot(codex.obs, x='condition', hue='CN', multiple='stack', legend=False).set_title('CNs across conditions');

In [None]:
sns.histplot(codex.obs, x='cell_type', hue='CN', multiple='stack', legend=False).set_title('CNs across cell types');
plt.xticks(rotation=90);

In [None]:
sc.pl.umap(codex, color='CN')

In [None]:
for im in codex.obs['Image'].unique():
    sc.pl.embedding(codex[codex.obs['Image'] == im], color='CN', basis = 'spatial', title=im)

In [None]:
# full data with tumor for reference
adata = sc.read_h5ad('codex.h5ad')
adata.obs['neighborhood'] = adata.obs['neighborhood'].astype('category')
for im in codex.obs['Image'].unique():
    sc.pl.embedding(adata[adata.obs['Image'] == im], color='neighborhood', basis = 'spatial', title=im)

In [None]:
# codex.write_h5ad('codex_umap.h5ad')

## do scRNA-seq feature selection

Re-compute the top 2,000 HVGs, take the top 100 ranked genes for each cell type, and add both sets, together with the RNA genes paired with CODEX markers, to the MaxFuse feature set (`mf_features`).

In [None]:
rna.var['mf_features'] = sc.pp.highly_variable_genes(rna, n_top_genes=2000, batch_key=None, flavor='seurat_v3', layer='counts', inplace=False)['highly_variable']

In [None]:
sc.tl.rank_genes_groups(rna, groupby='new_annotation', method='t-test')

In [None]:
# Extend the MaxFuse feature set with top-ranked genes of every annotated cell type;
# the two prints show the number of selected features before and after.
print(np.sum(rna.var['mf_features']))
for ct in rna.obs['new_annotation'].unique():
    # first 100 rows of the first column of the ranked-genes table for this group
    degs = sc.get.rank_genes_groups_df(rna, group=ct).iloc[:100,0].values
    rna.var.loc[rna.var.index.isin(degs),'mf_features']=True
print(np.sum(rna.var['mf_features']))

## cell type composition

In [None]:
ax = sns.histplot(codex.obs, x='condition',hue='cell_type', multiple='stack', legend=False)
for container in ax.containers:
    ax.bar_label(container, label_type='center')

In [None]:
plt.subplots(figsize=(12,6))
ax = sns.histplot(codex.obs, x='Image',hue='cell_type', multiple='stack', legend=False)
for container in ax.containers:
    ax.bar_label(container, label_type='center')
plt.xticks(rotation=90);

In [None]:
ax = sns.histplot(rna.obs, x='Sample',hue='new_annotation', multiple='stack', legend=False)
for container in ax.containers:
    ax.bar_label(container, label_type='center')

## Create Inputs: define weak-linked features i.e. connect protein and gene expression features

Does it need to be 1:1 feature mapping?

In [None]:
conversion = pd.read_csv('protein_gene_conversion.csv', index_col=0)

In [None]:
codex.var_names

In [None]:
h_m_map = pd.read_csv('human2mouse (1).txt', sep='\t', index_col=0)
h_m_map.reset_index(inplace=True)

In [None]:
# Split the CODEX markers by whether their capitalized name is an scRNA-seq gene name.
found_rna = []
not_found = []
for gene in codex.var_names:
    candidate = gene.capitalize()
    bucket = found_rna if candidate in rna.var_names else not_found
    bucket.append(candidate)

In [None]:
# Move markers whose name appears in the 'Mouse' column of the human-to-mouse
# table out of `not_found`. Both lists are rebuilt instead of popping from
# `not_found` while iterating over it: removing an item mid-loop shifts the
# remaining elements, so the entry following each removal was never examined.
mouse_symbols = set(h_m_map['Mouse'].values)
found_h_m_map = [gene.capitalize() for gene in not_found if gene.capitalize() in mouse_symbols]
not_found = [gene for gene in not_found if gene.capitalize() not in mouse_symbols]

In [None]:
# Resolve markers that are listed verbatim in the protein-to-gene conversion
# table, recording them as "<marker>:<RNA name>". The lists are rebuilt rather
# than popped from during iteration, which would skip the entry after each removal.
conversion_names = set(conversion.index.values)
found_protein_conversion = [
    gene + ':' + conversion.loc[gene, 'RNA name']
    for gene in not_found
    if gene in conversion_names
]
not_found = [gene for gene in not_found if gene not in conversion_names]

In [None]:
# Second pass over the conversion table, this time matching on the upper-cased
# marker name. The lists are rebuilt rather than popped from during iteration,
# which would skip the entry after each removal.
conversion_names = set(conversion.index.values)
found_protein_conversion2 = [
    gene + ':' + conversion.loc[gene.upper(), 'RNA name']
    for gene in not_found
    if gene.upper() in conversion_names
]
not_found = [gene for gene in not_found if gene.upper() not in conversion_names]

In [None]:
print('found in rna:', found_rna)
print('needs human mapping:', found_h_m_map)
print('found_protein_conversion', found_protein_conversion)
print('found_protein_conversion2', found_protein_conversion2)
print(not_found)

In [None]:
protein_mapping = {
    'cd103':'Itgae', 
    'ki67':'Mki67', 
    'foxp3':'Foxp3', 
    'cd140': 'Pdgfra', # CD140 protein same as PDGFRA gene? 
    'cx3cr1': 'Cx3cr1', 
    'cd3':'Cd3d', # or Cd3e or Cd3g 
    'cd8': 'Cd8b1', # or Cd8a
    'nkp46': 'Ncr1', # NKP46 protein same as NCR1 gene?
    'tim 3': 'Havcr2', # TIM3 protein same as HAVCR2 gene?  
    'xcr1': 'Xcr1', 
    'sirp-alpha': 'Sirpa', 
    'gzmB':'Gzmb', 
    'pd1':'Pdcd1', 
    'cd206': 'Mrc1',  
    'cd4':'Cd4',
    'caspase 3': 'Casp3',  
    'cd45': 'Ptprc', # or Ptprcap
    'Lag3':'Lag3', 
    'cd64': 'Fcgr1',  
    'f4-80': 'Adgre1',  
    'cd38':'Cd38',
    'cd31':'Pecam1', 
    'cd11c': 'Itgax',  
    'cd24':'Cd24a', 
    'inos': 'Nos2',  
    'cd11b': 'Itgam',  
    'ly6G':'Ly6g', 
    'cd90':'Thy1', 
    'mhcii': None, # composed of HLA-DPA1, HLA-DPB1, HLA-DQA1, HLA-DQB1, HLA-DRA? # not including because biased towards treated condition in scRNA, vs. codex
    'pdL1':'Cd274',
}

In [None]:
print(sorted(list(rna.var[rna.var_names.str.contains('H2')].index))[8:])

In [None]:
sc.pl.umap(codex, color = ['mhcii', 'condition'])

In [None]:
# h_m_map = pd.read_csv('human2mouse (1).txt', sep='\t', index_col=0)
# _ = ['HLA-DQA1','HLA-DQB1','HLA-DRA', 'HLA-DPA1', 'HLA-DPB1']
# _ = set(h_m_map.loc[h_m_map.index.isin(_),'Mouse'].values)
# _ = list(_ & set(rna.var_names))

# sc.pl.umap(rna, color = ['Cd8a', 'Cd8b1'])
# sc.pl.umap(rna, color = ['Cd3d', 'Cd3e', 'Cd3eap', 'Cd3g'])
# sc.pl.umap(rna, color = ['Ptprc', 'Ptprcap'])
# sc.pl.umap(rna, color = _, vmin='p10', vmax='p90')

sc.pl.umap(rna, color = ['Sirpa','Mrc1','Casp3','Fcgr1','Adgre1','Itgax','Itgam'], vmax='p90')

In [None]:
# Keep only the proteins that have an RNA counterpart (mapping value is not None).
# Both lists follow the dict's order, so position i of one pairs with position i of the other.
protein_index = [protein for protein, gene in protein_mapping.items() if gene is not None]
RNA_index = [gene for gene in protein_mapping.values() if gene is not None]
print(protein_index)
print(RNA_index)

In [None]:
rna_shared = rna[:,RNA_index].copy()
codex_shared = codex[:,protein_index].copy()
print(rna_shared.shape)
print(codex_shared.shape)

In [None]:
# only 18 of the ~30 shared features are HVGs in scRNA-seq
np.sum(rna_shared.var['mf_features'])

In [None]:
rna.var.loc[RNA_index,'mf_features'] = True
rna_shared.var.loc[RNA_index,'mf_features'] = True
print(np.sum(rna.var['mf_features']))

In [None]:
sc.pp.neighbors(rna_shared, n_neighbors=15, use_rep='X')
sc.tl.umap(rna_shared)

In [None]:
sc.pl.umap(rna_shared, color=['Sample'])
sc.pl.umap(rna_shared, color=['new_annotation'])
sc.pl.umap(rna_shared, color=['leiden'])

In [None]:
rna_shared = rna_shared.X.copy()
codex_shared = codex_shared.X.copy()

In [None]:
rna_active = rna[:,rna.var['mf_features']].copy()
sc.pp.scale(rna_active) # preprocessing in the tutorial, makes it mean=0 and std var
rna_active = rna_active.X

In [None]:
# not sure if needed to scale protein measurements (they don't do it in tutorial, but the scale might be [0,1] based on methods section)
# Use the full CODEX feature matrix directly. No AnnData copy is made: it would
# be discarded immediately, and nothing here modifies `codex`.
codex_active = codex.X

In [None]:
# Convert every input to a dense NumPy array before it is handed to the Fusor.
rna_active = np.asarray(rna_active) # already dense numpy array
codex_active, rna_shared, codex_shared = (
    sparse_mat.toarray() for sparse_mat in (codex_active, rna_shared, codex_shared)
)

for dense_arr in (rna_active, codex_active, rna_shared, codex_shared):
    print(dense_arr.shape)

## fit MaxFuse

arr1 := RNA ; arr2 := CODEX

### step 1: prep

In [None]:
# use cell labels to guide MaxFuse smoothing steps
labels_rna = rna.obs['new_annotation'].values
labels_codex = codex.obs['cell_type'].values

display(labels_rna)
display(labels_codex)

In [None]:
fusor = mf.model.Fusor(
    shared_arr1 = rna_shared,
    active_arr1 = rna_active,
    labels1 = labels_rna,
    shared_arr2 = codex_shared,
    active_arr2 = codex_active,
    labels2 = labels_codex,
)

In [None]:
# see tutorial for explanation -- the below reduces computational complexity
fusor.split_into_batches(
    max_outward_size=8000,
    matching_ratio=4,
    metacell_size=2,
    verbose=True
)

In [None]:
# plot top singular values of active_arr1 on a random batch
fusor.plot_singular_values(target='active_arr1', n_components=None); # can also explicitly specify the number of components

In [None]:
# plot top singular values of active_arr2 on a random batch
fusor.plot_singular_values(target='active_arr2', n_components=None);

In [None]:
# SVD components passed to construct_graphs for each modality (1 = RNA, 2 = CODEX).
svd_components1 = 40
svd_components2 = 15

fusor.construct_graphs(
    n_neighbors1=15,
    n_neighbors2=15,
    svd_components1=svd_components1,
    svd_components2=svd_components2,
    resolution1=2,
    resolution2=2,
    # if two resolutions differ less than resolution_tol
    # then we do not distinguish between them
    resolution_tol=0.1,
    verbose=True
)

### step 2: finding initial pivots

In [None]:
fusor.plot_singular_values(target='shared_arr1',n_components=None);

In [None]:
fusor.plot_singular_values(target='shared_arr2',n_components=None);

In [None]:
svd_components1=20
svd_components2=20

fusor.find_initial_pivots(
    wt1=0.3, wt2=0.3, # weights of first and second modality; smaller = greater strength of fuzzy smoothing, 1 = original data used
    svd_components1=svd_components1, svd_components2=svd_components2)

### step 3: finding refined pivots

In [None]:
# plot top canonical correlations in a random batch
fusor.plot_canonical_correlations(
    svd_components1=40,
    svd_components2=None,
    cca_components=30
);

In [None]:
fusor.refine_pivots(
    wt1=0.3, wt2=0.3,
    svd_components1=40, svd_components2=None,
    cca_components=25,
    n_iters=1,
    randomized_svd=False, 
    svd_runs=1,
    verbose=True
)

In [None]:
fusor.filter_bad_matches(target='pivot', filter_prop=0.5) # 50% recommended by tutorial for spatial data

In [None]:
# check performance based on cell type accuracy (pivot matching)
pivot_matching = fusor.get_matching(order=(2, 1),target='pivot')

lv1_acc = mf.metrics.get_matching_acc(matching=pivot_matching, 
    labels1=labels_rna, 
    labels2=labels_codex,
    order = (2,1)
)
lv1_acc

In [None]:
# We can inspect the first pivot pair.
[pivot_matching[0][0], pivot_matching[1][0], pivot_matching[2][0]]

In [None]:
# Row-normalised (percent) confusion matrix: RNA label vs. label of the matched CODEX cell.
matched_rna = np.asarray(labels_rna[pivot_matching[0]])
matched_codex = np.asarray(labels_codex[pivot_matching[1]])
# Pin the label set explicitly and reuse it for the tick labels; otherwise
# confusion_matrix infers it from the labels present in the matched pairs,
# which need not equal np.unique(labels_rna).
cm_labels = np.unique(np.concatenate((matched_rna, matched_codex)))
cm = confusion_matrix(matched_rna, matched_codex, labels=cm_labels)
# Labels that occur only on the CODEX side give all-zero rows; leave those at 0
# instead of dividing by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cm_percent = np.round(
    np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals > 0) * 100
)
ConfusionMatrixDisplay(
    confusion_matrix=cm_percent,
    display_labels=cm_labels,
).plot()

### step 4: propagation

In [None]:
fusor.propagate(
    svd_components1=40, 
    svd_components2=None, 
    wt1=0.7,
    wt2=0.7,
)

In [None]:
fusor.filter_bad_matches(target='propagated',filter_prop=0.3) # recommended filter_prop between 0.1 - 0.4

In [None]:
full_matching = fusor.get_matching(order=(2, 1), target='full_data') # we want rna (1) to match with multiple codex (2), not other way around

In [None]:
pd.DataFrame(list(zip(full_matching[0], full_matching[1], full_matching[2])), 
             columns = ['mod1_indx', 'mod2_indx', 'score'])
# columns: cell idx in mod1, cell idx in mod2, and matching scores

In [None]:
# compute the cell type level matching accuracy, for the full (filtered version) dataset
lv1_acc = mf.metrics.get_matching_acc(matching=full_matching, 
    labels1=labels_rna, 
    labels2=labels_codex 
)
lv1_acc

### step 5: downstream analysis (subsampled codex)

In [None]:
# if adding other metadata
codex_obs = sc.read_h5ad('codex_umap.h5ad').obs
rna_obs = sc.read_h5ad('rna_umap.h5ad').obs

In [None]:
display(codex_obs.columns)
display(rna_obs.columns)

In [None]:
rna_cca, protein_cca_sub = fusor.get_embedding(
    active_arr1=fusor.active_arr1,
    active_arr2=fusor.active_arr2[full_matching[1],:] # cells in codex remained after filtering
)

In [None]:
np.random.seed(42)
# Subsample up to 13k matched CODEX cells. Capping at the number of available
# rows keeps np.random.choice(..., replace=False) from raising when the
# filtered matching holds fewer than 13k cells.
subs = min(13000, protein_cca_sub.shape[0])
randix = np.random.choice(protein_cca_sub.shape[0],subs, replace = False)

dim_use = 20 # dimensions of the CCA embedding to be used for UMAP etc

# RNA cells are stacked first, then the subsampled CODEX cells; every obs
# column assigned below follows that same order.
# NOTE(review): the `dtype` argument of AnnData may be deprecated in newer anndata releases -- confirm.
cca_adata = ad.AnnData(
    np.concatenate((rna_cca[:,:dim_use], protein_cca_sub[randix,:dim_use]), axis=0), 
    dtype=np.float32
)
cca_adata.obs['data_type'] = ['rna'] * rna_cca.shape[0] + ['protein'] * subs
# Rows of protein_cca_sub correspond to full_matching[1], so CODEX metadata is
# indexed by full_matching[1] first and by randix second.
cca_adata.obs['cell_type'] = list(np.concatenate((labels_rna,
                                                  labels_codex[full_matching[1]][randix]), axis = 0))
# add other metadata
condition_rna = rna_obs['Sample'].values
condition_codex = codex_obs['condition'].values
cca_adata.obs['condition'] = list(np.concatenate((condition_rna,
                                                  condition_codex[full_matching[1]][randix]), axis = 0))
# Harmonise the RNA sample names and the CODEX condition names to control/treated.
cca_adata.obs['condition'] = cca_adata.obs['condition'].replace({
    'Control':'control',
    'aPD1_aCTLA4':'treated', 
    'CONTROL':'control', 
    'ICT':'treated',
})
cca_adata.obs['condition'] = cca_adata.obs['condition'].astype('category')

In [None]:
rna_shared

In [None]:
sc.pp.neighbors(cca_adata, n_neighbors=15)
sc.tl.umap(cca_adata)
sc.pl.umap(cca_adata, color='data_type')

In [None]:
sc.pl.umap(cca_adata, color='cell_type')

In [None]:
sc.pl.umap(cca_adata[cca_adata.obs['data_type']=='protein'], color='cell_type', title='just protein')

In [None]:
sc.pl.umap(cca_adata[cca_adata.obs['data_type']=='rna'], color='cell_type', title='just rna')

In [None]:
sc.pl.umap(cca_adata[cca_adata.obs['data_type']=='protein'], color='cell_type', groups=['neutrophils','cDC-1','IFN-gamma actv macs','T-reg'], title='just protein, sparse cell types')

In [None]:
sc.pl.umap(cca_adata[cca_adata.obs['data_type']=='rna'], color='cell_type', groups=['neutrophils','cDC-1','IFN-gamma actv macs','T-reg'], title='just rna, sparse cell types')

### step 5: downstream analysis (full codex)

In [None]:
rna_cca, protein_cca_full = fusor.get_embedding(
    active_arr1=fusor.active_arr1,
    active_arr2=fusor.active_arr2[full_matching[1],:] # cells in codex remained after filtering
)

In [None]:
# np.random.seed(42)
# subs = 13000 # subsample 13k CODEX cells
# randix = np.random.choice(protein_cca_sub.shape[0],subs, replace = False)

dim_use = 15 # dimensions of the CCA embedding to be used for UMAP etc

cca_adata_full = ad.AnnData(
    np.concatenate((rna_cca[:,:dim_use], protein_cca_full[:,:dim_use]), axis=0), 
    dtype=np.float32
)
cca_adata_full.obs['data_type'] = ['rna'] * rna_cca.shape[0] + ['protein'] * protein_cca_full.shape[0]
cca_adata_full.obs['cell_type'] = list(np.concatenate((labels_rna,
                                                  labels_codex[full_matching[1]]), axis = 0))

In [None]:
sc.pp.neighbors(cca_adata_full, n_neighbors=15)
sc.tl.umap(cca_adata_full)
sc.pl.umap(cca_adata_full, color='data_type')

In [None]:
sc.pl.umap(cca_adata_full, color='cell_type')

In [None]:
sc.pl.umap(cca_adata_full[cca_adata_full.obs['data_type']=='protein'], color='cell_type', title='just protein')

In [None]:
sc.pl.umap(cca_adata_full[cca_adata_full.obs['data_type']=='rna'], color='cell_type', title='just rna')

In [None]:
sc.pl.umap(cca_adata_full[cca_adata_full.obs['data_type']=='protein'], color='cell_type', groups=['neutrophils','cDC-1','IFN-gamma actv macs'], title='just protein, sparse cell types')

In [None]:
sc.pl.umap(cca_adata_full[cca_adata_full.obs['data_type']=='rna'], color='cell_type', groups=['neutrophils','cDC-1','IFN-gamma actv macs'], title='just rna, sparse cell types')

# explore CN information on merged data

In [None]:
# RNA cells have no cellular-neighborhood (CN) label, so they get a placeholder.
# The placeholder array is created with an explicit object dtype: np.full_like
# would inherit the dtype of `labels_rna` and could clip the text if that were
# a fixed-width string array.
rna_cn_placeholder = np.full(len(labels_rna), 'na (RNA)', dtype=object)
codex_cn = np.asarray(codex.obs['CN'].values[full_matching[1]][randix], dtype=object)
cca_adata.obs['CN'] = list(np.concatenate((rna_cn_placeholder, codex_cn), axis = 0))
cca_adata.obs.head()

In [None]:
sc.pl.umap(cca_adata, color='cell_type', title='rna+protein')

In [None]:
sc.pl.umap(cca_adata, color='CN')

In [None]:
sc.pl.umap(cca_adata[cca_adata.obs['data_type']=='protein'], color='CN', title='just protein')

In [None]:
# Take an independent copy instead of a view: embedding_density and the later
# `protein.obs[...]` assignments write into this object.
protein = cca_adata[cca_adata.obs['data_type']=='protein'].copy()
sc.tl.embedding_density(protein, groupby='CN')

In [None]:
sc.pl.umap(cca_adata[cca_adata.obs['data_type']=='rna'], color='cell_type', title='rna only')

In [None]:
sc.pl.umap(protein, color='cell_type', title='protein only')

In [None]:
sc.pl.umap(protein, color='cell_type', groups=['T-reg','cd4+ T cells','cd8+T cells'])

In [None]:
sc.pl.embedding_density(protein, groupby='CN', ncols=2)

In [None]:
sc.pl.umap(cca_adata, color='condition')

## find centroids for each cell type in each CN

In [None]:
# full matching gives the links between modalities: mod1=RNA, mod2=CODEX. RNA can map to multiple CODEX
pd.DataFrame(list(zip(full_matching[0], full_matching[1], full_matching[2])), 
             columns = ['mod1_indx', 'mod2_indx', 'score'])

In [None]:
# rna indices 0 through 5545, codex indices 5546 through 18545
cca_adata

In [None]:
for ct in protein.obs['cell_type'].unique():
    sc.pl.umap(protein, color='cell_type', groups=[ct])

In [None]:
# from visual inspection, drop cell_types from CNs that don't have much representation

cn_map = {
    'CN1':'CN1 Tumor Boundary',
    'CN2':'CN2 Tumor Bulk',
    'CN3':'CN3 Neutrophils + Dead cells',
    'CN4':'CN4 CX3CR1+ Macrophage',
    'CN5':'CN5 Dead Cells Center',
    'CN6':'CN6 Lymphoid Rich',
    'CN7':'CN7 INOS+ and IFN-g Actv Macs',
}
cn_map = dict((v,k) for k,v in cn_map.items()) # flip the dict

drop = {
    'T-reg': ['CN3','CN5',], # CN3, CN5
    'cx3cr1+ macs': ['CN3','CN5',], # CN5 
    'cd206+ macs': ['CN5',], # CN5 
    'cd8+T cells': ['CN5',], # CN5
    'IFN-gamma actv macs': ['CN5',], # CN5
    'inos+ macrophages': ['CN5',], # CN5 
    'cd4+ T cells': ['CN5',], # CN5
    'nk cells': ['CN3','CN5',], # CN3, CN5
    'cDC-1': ['CN3','CN5',], # CN3, CN5
    'neutrophils': ['CN1','CN2','CN4','CN6','CN7'], # all but CN3, CN5
}

In [None]:
# calculate centroid using protein data
# Shorten the CN labels to their "CNx" codes via cn_map, then build a combined
# "<cell_type> <CNx>" category and an integer code for each combination.
protein.obs['cn'] = protein.obs['CN'].replace(cn_map)
protein.obs['ct_cn'] = protein.obs['cell_type'].astype('str') + ' ' + protein.obs['cn'].astype('str')
protein.obs['ct_cn'] = protein.obs['ct_cn'].astype('category')
protein.obs['ct_cn_i'] = protein.obs['ct_cn'].cat.codes

# NOTE(review): presumably returns one centroid row per distinct code, ordered by code --
# the label join in the following cells relies on that; confirm against the maxfuse source.
centroids = mf.utils.get_centroids(protein.X, protein.obs['ct_cn_i'])
centroids = pd.DataFrame(centroids)

In [None]:
# Lookup table from integer code to "<cell_type> <CN>" label: one row per code, sorted by code.
tmp = (
    protein.obs[['ct_cn', 'ct_cn_i']]
    .drop_duplicates()
    .set_index('ct_cn_i')
    .sort_index()
)

In [None]:
centroids['ct_cn'] = tmp['ct_cn']
centroids.set_index('ct_cn', inplace=True)
centroids.head()

## redo UMAP with cell_type/CN centroids visualized

In [None]:
centroids_copy = centroids.copy()

In [None]:
centroids = ad.AnnData(centroids_copy)
centroids.obs['data_type'] = 'centroids'
centroids.obs['centroid'] = centroids.obs.index.astype('str') + ' centroid'
centroids.obs['cell_type'] = centroids.obs['centroid'].str.split(' CN', expand=True)[0]
centroids.obs['CN'] = centroids.obs['centroid'].str.split('CN', expand=True)[1]
centroids.obs['CN'] = 'CN' + centroids.obs['CN'].str.split(' ', expand=True)[0]

In [None]:
cca_adata.obs['centroid'] = 'NA'

In [None]:
cca_adata_centroids = ad.concat([cca_adata, centroids])
cca_adata_centroids

In [None]:
sc.pp.neighbors(cca_adata_centroids, n_neighbors=15)
sc.tl.umap(cca_adata_centroids)
sc.pl.umap(cca_adata_centroids, color='data_type')

In [None]:
# Re-slice every subset from the combined object so that each one carries the
# UMAP coordinates computed on cells and centroids together.
# NOTE: `centroids`, `cca_adata`, `protein` and `rna` are rebound here. From this
# point on `rna` is the RNA part of the joint embedding, not the scRNA-seq AnnData
# loaded earlier in the notebook.
centroids = cca_adata_centroids[cca_adata_centroids.obs['data_type']=='centroids']
cca_adata = cca_adata_centroids[cca_adata_centroids.obs['data_type']!='centroids']
protein = cca_adata_centroids[cca_adata_centroids.obs['data_type']=='protein']
rna = cca_adata_centroids[cca_adata_centroids.obs['data_type']=='rna']

In [None]:
sc.pl.umap(cca_adata, color='cell_type')
sc.pl.umap(cca_adata, color='CN', groups = sorted(cca_adata.obs['CN'].unique())[:-1])

In [None]:
sc.pl.umap(protein, color='cell_type', title='just protein')


In [None]:
sc.pl.umap(rna, color='cell_type', title='just rna')

In [None]:
# Plotting table for the centroids: metadata, UMAP coordinates and one colour per CN.
# The column selection is copied so the assignments below add columns to an
# independent frame instead of a slice of `centroids.obs`.
centroids_plot = centroids.obs[['cell_type','CN','centroid']].copy()
centroids_plot['UMAP1'] = np.array(centroids.obsm['X_umap'][:,0])
centroids_plot['UMAP2'] = np.array(centroids.obsm['X_umap'][:,1])
centroids_plot['color'] = centroids_plot['CN'].replace({
    'CN1': '#ff7f0e', 
    'CN2': '#d62728', 
    'CN3': '#8c564b', 
    'CN4': '#b5bd61', 
    'CN5': '#aec7e8', 
    'CN6': '#98df8a', 
    'CN7': '#c5b0d5',
})
centroids_plot

In [None]:
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib import rcParams

# One figure per cell type: the UMAP with only that cell type highlighted,
# overlaid with the cell type's CN centroids drawn as stars in their CN color.
for ct in cca_adata.obs['cell_type'].unique():
    fig, ax = plt.subplots(figsize=(8, 6))
    sc.pl.umap(
        cca_adata,
        color='cell_type',
        groups=[ct],
        ax=ax,
        title=ct+' CN centroids',
        legend_loc=False,
        show=False,
    )
    # Rows of the plotting table that belong to this cell type.
    ct_centroids = centroids_plot.loc[centroids_plot['cell_type']==ct]
    plt.scatter(
        x=ct_centroids['UMAP1'],
        y=ct_centroids['UMAP2'],
        color=ct_centroids['color'],
        marker='*',
        s=200,
    )

In [None]:
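# Uncomment this block to add the condition metadata back (ideally this should be done earlier, where cca_adata is first built).
# The cells below rely on the 'condition' obs column and on `rna_centroids`, which are only created by this block (or by the commented-out reload cell further down).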

# condition_rna = rna_obs['Sample'].values
# condition_codex = codex_obs['condition'].values
# condition_array = list(np.concatenate((condition_rna, condition_codex[full_matching[1]][randix], ['NA'] * centroids.shape[0]), axis = 0))
# cca_adata_centroids.obs['condition'] = condition_array
# cca_adata_centroids.obs['condition'] = cca_adata_centroids.obs['condition'].replace({
#     'Control':'control',
#     'aPD1_aCTLA4':'treated', 
#     'CONTROL':'control', 
#     'ICT':'treated',
#     'NA' : 'NA',
# })
# cca_adata_centroids.obs['condition'] = cca_adata_centroids.obs['condition'].astype('category')
# 
# centroids = cca_adata_centroids[cca_adata_centroids.obs['data_type']=='centroids']
# cca_adata = cca_adata_centroids[cca_adata_centroids.obs['data_type']!='centroids']
# protein = cca_adata_centroids[cca_adata_centroids.obs['data_type']=='protein']
# rna = cca_adata_centroids[cca_adata_centroids.obs['data_type']=='rna']
# rna_centroids = cca_adata_centroids[cca_adata_centroids.obs['data_type']!='protein']

In [None]:
# UMAP of the cells colored by treatment condition; needs a 'condition' column in cca_adata.obs.
sc.pl.umap(cca_adata, color='condition')

In [None]:
# cca_adata_centroids.write_h5ad('cca_adata_centroids.h5ad')

## find scRNA-seq nearest to cell_type/CN centroids

Do this separately for the treated and the untreated cells.

In [None]:
# cca_adata_centroids = sc.read_h5ad('cca_adata_centroids.h5ad')

# centroids = cca_adata_centroids[cca_adata_centroids.obs['data_type']=='centroids']
# cca_adata = cca_adata_centroids[cca_adata_centroids.obs['data_type']!='centroids']
# protein = cca_adata_centroids[cca_adata_centroids.obs['data_type']=='protein']
# rna = cca_adata_centroids[cca_adata_centroids.obs['data_type']=='rna']
# rna_centroids = cca_adata_centroids[cca_adata_centroids.obs['data_type']!='protein']

In [None]:
# Split the RNA + centroid rows by condition. 'NA' is kept in both subsets;
# presumably that is the condition assigned to the centroid rows, so each
# subset still contains the centroids — confirm against the condition cell.
# NOTE(review): `rna_centroids` is only defined in commented-out cells in this
# part of the notebook; one of them has to be run before this cell.
treated = rna_centroids[rna_centroids.obs['condition'].isin(['treated','NA'])]
untreated = rna_centroids[rna_centroids.obs['condition'].isin(['control','NA'])]

In [None]:
# For each cell_type/CN combination, separately for treated and untreated:
# subset the cell type, build a fully connected neighbor graph, take the
# centroid's connectivity to every other cell in the subset, sort in descending
# order and keep the top CUTOFF cells (or all of them when fewer exist). Those
# cells are flagged as the representative single cells for that cell_type/CN.
CUTOFF = 50


def _cells_nearest_centroid(adata, ct, cn, cutoff):
    """Return the names of the cells most strongly connected to a ct/cn centroid.

    adata  : AnnData to search; it must contain the centroid row, whose obs
             name is ct + ' ' + cn.
    ct     : cell type; only rows with this obs['cell_type'] are considered.
    cn     : CN label of the centroid; rows with obs['CN'] equal to `cn` or to
             'na (RNA)' are considered.
    cutoff : maximum number of cell names to return.

    Returns the index (obs names) of at most `cutoff` cells, ordered by
    decreasing connectivity to the centroid; the centroid itself is excluded.
    """
    centroid = ct + ' ' + cn
    subset = adata[(adata.obs['cell_type'] == ct)
                   & (adata.obs['CN'].isin([cn, 'na (RNA)']))].copy()
    sc.pp.neighbors(subset, n_neighbors=subset.shape[0])  # connect full neighborhood graph
    cells = list(subset.obs.index)  # grab cell names
    nn = pd.DataFrame(subset.obsp['connectivities'].todense(), index=cells, columns=cells)
    cells.remove(centroid)  # the centroid is not its own neighbor
    # Keep only the centroid's row, turned into one column indexed by cell name.
    nn = nn.loc[[centroid], cells].T
    nn = nn.sort_values(by=centroid, ascending=False)
    if nn.shape[0] < cutoff:
        print('<' + str(cutoff) + ' NNs...finding', nn.shape[0], 'NNs')
    # Top `cutoff` cells, or however many are in the subset. The slice caps the
    # selection; taking max(n_rows, cutoff) here would keep every row.
    nn = nn.iloc[:cutoff]
    nn = nn.loc[nn[centroid] > 0]  # must have connectivity > 0
    return nn.index


treated_ct_cns = {}
untreated_ct_cns = {}
for ct in rna_centroids.obs['cell_type'].unique():
    treated_ct_cns[ct] = []
    untreated_ct_cns[ct] = []
    for cn in ['CN1', 'CN2', 'CN3', 'CN4', 'CN5', 'CN6', 'CN7']:
        if cn in drop[ct]:  # skip the ones that don't have great CN representation
            continue
        print('ct:', ct, '| cn:', cn)
        # Same procedure for both conditions: treated first, then untreated.
        for cond_label, cond_adata, cond_columns in (
            ('treated', treated, treated_ct_cns),
            ('untreated', untreated, untreated_ct_cns),
        ):
            nn_cells = _cells_nearest_centroid(cond_adata, ct, cn, CUTOFF)
            nn_column = 'nn ' + cond_label + ' ' + ct + ' ' + cn
            # Boolean obs column: True only for the selected cells.
            cca_adata_centroids.obs[nn_column] = False
            cca_adata_centroids.obs.loc[nn_cells, nn_column] = True
            cond_columns[ct].append(nn_column)

cca_adata_centroids.obs

In [None]:
# For every cell type, plot the treated and untreated nearest-neighbor flag
# columns side by side on the combined UMAP.
for ct, treated_columns in treated_ct_cns.items():
    flag_columns = treated_columns + untreated_ct_cns[ct]
    sc.pl.umap(cca_adata_centroids, color=flag_columns, ncols=5, legend_loc=False)

In [None]:
# Save the combined object, including the nearest-neighbor flag columns added above.
# NOTE(review): the file name 'cca_' has no .h5ad extension and looks unfinished — confirm the intended output path.
cca_adata_centroids.write_h5ad('cca_')