In [35]:
from os.path import join as opj
import numpy as np
import matplotlib as mpl
# Default resolution (dots per inch) for matplotlib figures created in this session.
mpl.rcParams['figure.dpi'] = 100

In [36]:
# Absolute, machine-specific location of the dataset directory.
data_root = '/om2/user/rogerjin/data/Ben'
data_path = opj(data_root, 'data_files_new.npz')
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code; only load this archive from a trusted source.
data = np.load(data_path, allow_pickle=True)
# Arrays stored under the 'rna_train' and 'atac_train_small' keys of the archive.
rna = data['rna_train']
atac = data['atac_train_small']

In [37]:
def zero_frac(arr):
    """Return the fraction of entries in ``arr`` that are exactly zero.

    Parameters
    ----------
    arr : array_like
        Numeric data of any shape. Lists and other array-likes are converted
        with ``np.asarray`` so the element-wise comparison and ``.size`` work.

    Returns
    -------
    float
        ``(number of entries == 0) / (total number of entries)``, or ``nan``
        when ``arr`` has no entries (the fraction is undefined there).
    """
    values = np.asarray(arr)
    if values.size == 0:
        # Dividing by a size of zero would raise ZeroDivisionError.
        return float('nan')
    zero_count = np.count_nonzero(values == 0)
    return zero_count / values.size

In [38]:
# Report the fraction of exactly-zero entries in each training array.
print('rna zero frac:', zero_frac(rna))
print('atac zero frac:', zero_frac(atac))

rna zero frac: 0.0
atac zero frac: 0.9740022531413609


In [39]:
# Show the names of the arrays stored in the loaded archive.
list(data.keys())

['peak_selector',
 'atac_train_small',
 'atac_test_small',
 'rna_train',
 'rna_test',
 'rna_good_feats',
 'train_idx',
 'test_idx',
 'cell_type_train',
 'cell_type_test']

In [40]:
import torch
# Check whether PyTorch can see a CUDA device; later cells place tensors on 'cuda:0'.
torch.cuda.is_available()

True

In [53]:
def self_correlation(matrix, device='cuda:0'):
    """Compute the Pearson correlation between the columns of ``matrix``.

    Parameters
    ----------
    matrix : array_like
        Two-dimensional data; each column is treated as one variable and each
        row as one observation. Converted to a float32 tensor.
    device : str or torch.device, optional
        Device on which the correlation is computed. If a CUDA device is
        requested but CUDA is not available, the computation falls back to the
        CPU and a warning is issued.

    Returns
    -------
    torch.Tensor
        Square correlation matrix (one row/column per input column), detached
        and moved to the CPU.

    Notes
    -----
    NOTE(review): columns with zero variance are expected to produce NaN
    entries (division by a zero standard deviation) - confirm if the input may
    contain constant columns.
    """
    import warnings

    target = torch.device(device)
    if target.type == 'cuda' and not torch.cuda.is_available():
        warnings.warn(
            f'CUDA is not available; computing correlation on CPU instead of {device!r}.'
        )
        target = torch.device('cpu')

    features = torch.as_tensor(matrix, dtype=torch.float32).to(target)
    # torch.corrcoef treats rows as variables, so transpose to correlate columns.
    return torch.corrcoef(features.T).detach().cpu()

# Read the training arrays (the same archive keys as `rna` / `atac`) and compute
# the column-by-column correlation matrix of each on the default device.
train_rna = data['rna_train']
train_atac = data['atac_train_small']
corr_rna = self_correlation(train_rna)
corr_atac = self_correlation(train_atac)

In [54]:
import pandas as pd
# Load the gene list; header=None tells pandas the CSV has no header row.
gene_list = pd.read_csv(f'{data_root}/gene_list.csv', header=None)

In [55]:
# Subset the gene list with the 'rna_good_feats' array from the archive.
# NOTE(review): a boolean mask here filters rows of the DataFrame, whereas an
# integer index array would be interpreted as column labels - confirm which it is.
chosen_genes = gene_list[data['rna_good_feats']]

In [56]:
# Array stored under 'cell_type_train' in the archive.
cell_labels_train = data['cell_type_train']

In [57]:
# Inspect the shape of the training label array.
cell_labels_train.shape

(6897,)

In [58]:
from GanoliModel import GanoliLogisticGAN
# Model class used to restore the trained checkpoint.
model_cls = GanoliLogisticGAN

In [59]:
from sklearn.decomposition import PCA

# Fit a 20-component PCA on each self-correlation matrix. Only the fitted
# components are used afterwards, so call fit() instead of fit_transform(),
# whose returned projection was previously computed and thrown away.
# NOTE(review): PCA may select a randomized SVD solver for inputs of this size;
# consider passing random_state for reproducible embeddings - confirm.
pca_rna = PCA(n_components=20)
pca_atac = PCA(n_components=20)
pca_rna.fit(corr_rna)
pca_atac.fit(corr_atac)

# components_ is (n_components, n_features); transposing yields one
# 20-dimensional embedding row per feature, placed on GPU 0.
rna_embedding = torch.Tensor(pca_rna.components_.T).to('cuda:0')
atac_embedding = torch.Tensor(pca_atac.components_.T).to('cuda:0')

In [60]:
# Absolute, machine-specific path to the checkpoint to restore.
checkpoint_dir = '/om2/user/rogerjin/GANOLI/ganoli/models/logs/logistic_embed_pca_corr_lr=0.0002_beta1=0.5/default/version_0/checkpoints'
checkpoint = 'step=123119-epoch=569-val_oracle_total=1.07.ckpt'
checkpoint_path = f'{checkpoint_dir}/{checkpoint}'
# NOTE(review): the recorded output of this cell is a RuntimeError - the
# checkpoint's layers have input dimension 20, while the model built from these
# arguments has input dimensions 7445 / 3808. Presumably the embedding arguments
# are not being applied the way they were at training time; confirm against the
# GanoliLogisticGAN constructor. Cells that use `ckpt_model` cannot run until
# this load succeeds.
ckpt_model = model_cls.load_from_checkpoint(checkpoint_path=checkpoint_path, rna_shape=7445, atac_shape=3808, rna_embedding=rna_embedding, atac_embedding=atac_embedding)

RuntimeError: Error(s) in loading state_dict for GanoliLogisticGAN:
	size mismatch for generator_rna2atac.linear.weight: copying a param with shape torch.Size([3808, 20]) from checkpoint, the shape in current model is torch.Size([3808, 7445]).
	size mismatch for generator_atac2rna.model.weight: copying a param with shape torch.Size([7445, 20]) from checkpoint, the shape in current model is torch.Size([7445, 3808]).
	size mismatch for discriminator_rna.model.weight: copying a param with shape torch.Size([1, 20]) from checkpoint, the shape in current model is torch.Size([1, 7445]).
	size mismatch for discriminator_atac.model.weight: copying a param with shape torch.Size([1, 20]) from checkpoint, the shape in current model is torch.Size([1, 3808]).

In [None]:
# Move the restored model to GPU 0.
ckpt_model.to('cuda:0')

In [None]:
# Array stored under 'atac_test_small' in the archive, used here as validation input.
atac_val = data['atac_test_small']

In [None]:
import torch
# Convert the ATAC array to a tensor and place it on GPU 0.
# NOTE(review): this rebinds `atac_val`, so the cell is not idempotent - on a
# re-run torch.Tensor receives the GPU tensor rather than the original array.
atac_val = torch.Tensor(atac_val).to('cuda:0')

In [None]:
# First 10 rows of the ATAC tensor (a small sample; not referenced by the cells shown after it).
atac_val_mini = atac_val[:10, :]

In [None]:
# Run the model on the full ATAC tensor.
# NOTE(review): data_type='atac' presumably selects the ATAC -> RNA direction - confirm
# against GanoliLogisticGAN.forward. The call is not wrapped in torch.no_grad(), so the
# result may stay attached to the autograd graph.
rna_val_pred = ckpt_model(atac_val,data_type='atac')

In [None]:
# Inspect the shape of the model output.
rna_val_pred.shape

In [None]:
# Array stored under 'cell_type_test' in the archive, used to colour the UMAP.
rna_val_labels = data['cell_type_test']

In [None]:
import scanpy as sc
import anndata as ad
# Render scanpy figures at 300 dpi.
sc.set_figure_params(dpi=300)
# Configure parallelism through the public settings object instead of
# overwriting the attribute on the private ScanpyConfig class.
sc.settings.n_jobs = 4

def plot_umap(data, labels=None, label_name=None):
    """Build a neighbor graph, cluster it, and draw a PAGA-initialised UMAP.

    Parameters
    ----------
    data : array_like or tensor
        Matrix passed to ``anndata.AnnData``. Objects exposing ``detach``
        (for example a torch tensor that lives on a GPU or is attached to an
        autograd graph) are first converted to a NumPy array on the CPU.
    labels : sequence, optional
        Per-row annotation stored in ``.obs`` and used to colour the plot.
    label_name : str, optional
        Column name under which ``labels`` is stored and by which the UMAP is
        coloured. Defaults to ``'label'`` when ``labels`` is given without a
        name, so the labels are not stored under a ``None`` key and silently
        left out of the plot.

    Returns
    -------
    anndata.AnnData
        The object built from ``data`` after the neighbors, Leiden, PAGA and
        UMAP steps have been run on it.
    """
    if hasattr(data, 'detach'):
        # Tensor input: drop autograd history and copy to host memory first.
        data = data.detach().cpu().numpy()
    adata = ad.AnnData(data)
    if labels is not None:
        if label_name is None:
            label_name = 'label'
        adata.obs[label_name] = labels
    sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
    sc.tl.leiden(adata)
    sc.tl.paga(adata)
    # plot=False: run the PAGA layout without drawing it; umap(init_pos='paga') uses it.
    sc.pl.paga(adata, plot=False)
    sc.tl.umap(adata, init_pos='paga')
    sc.pl.umap(adata, color=label_name)
    return adata

In [None]:
# Plot a UMAP of the predicted RNA profiles, coloured by the 'cell_type_test' labels.
plot_umap(rna_val_pred, rna_val_labels, label_name='Cell Type')