# Run Savercat with all genes

In [1]:
#Import Packages
import random
import os
import numpy as np
import scanpy as sc
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras import backend as K
from keras.utils.vis_utils import plot_model
from tensorflow.keras import layers
from tensorflow.keras.layers import Input, Dense, Dropout, Activation, BatchNormalization, LeakyReLU, Lambda
from tensorflow.keras import Model
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, scale
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as pl

In [2]:
# Report where the notebook is running and which package versions are in use.
base_name = os.path.basename(os.getcwd())
print(base_name)
print(sc.__version__)
sc.settings.verbosity = 3  # scanpy logging level (3 = up to "hint" messages)
sc.logging.print_versions()

# Fix the seeds of every random number generator imported by this notebook so
# that the stochastic steps below (weight initialisation, mini-batch order,
# latent sampling) are as repeatable as possible under Restart & Run All.
# NOTE(review): this does not remove GPU-level non-determinism, and any seed
# handling inside utils_0509 / network_0509 is not visible from here.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
if hasattr(tf.random, 'set_seed'):
    # TensorFlow 2.x API
    tf.random.set_seed(SEED)
else:
    # TensorFlow 1.x fallback
    tf.compat.v1.set_random_seed(SEED)

bin
1.5.1
scanpy==1.5.1 anndata==0.7.4 umap==0.4.6 numpy==1.20.1 scipy==1.4.1 pandas==1.0.3 scikit-learn==0.23.1 statsmodels==0.11.1 python-igraph==0.8.2 leidenalg==0.8.1


# 1. Load data

In [3]:
# Read the AnnData object from disk (the path is relative to the notebook's
# working directory) and print its summary to check dimensions and obs columns.
adata = sc.read_h5ad('../data/adata_subsample_allg.h5ad')
print(adata)

AnnData object with n_obs × n_vars = 29259 × 20042
    obs: 'Cycle', 'patient'


# 2. Savercat preprocess

In [4]:
# import utils functions in utils_0509.py
from utils_0509 import *

In [5]:
# Names of the adata.obs columns that are passed to savercat_preprocess below.
predict_key = 'Cycle' # the name of the cell-level label to be predicted
batch_key = 'patient' # the name of the cell-level label to be adjusted for

In [6]:
# SAVERCAT preprocessing step: predict_key names the label to be predicted and
# batch_key is passed as adjust_key, the label to be adjusted for.
# NOTE(review): adata is overwritten with the preprocessed object, so re-running
# this cell without reloading the data first would preprocess it a second time.
adata = savercat_preprocess(adata, predict_key=predict_key, adjust_key=batch_key)
# Bare last expression: display the summary of the preprocessed AnnData object.
adata

normalizing by total count per cell
    finished (0:00:24): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)


AnnData object with n_obs × n_vars = 29259 × 20042
    obs: 'Cycle', 'patient', 'n_counts', 'size_factors'
    var: 'mean', 'std'
    uns: 'log1p'
    obsm: 'saver_targetL', 'B_raw', 'B', 'loglib', 'saver_batch'

# 3. Build the model

In [7]:
# import the network-building functions from network_0509.py
from network_0509 import * 

In [8]:
# Build and compile the CVAE for the all-genes run.
# If you train on highly variable genes only, use enc=(256, 256, 128) and
# dec=(128, 256, 256) instead of the layer sizes below; leave all the other
# parameters unchanged.
SAVER_net = CVAE(x_input_size = adata.n_vars, # number of genes
                 b_input_size = adata.obsm['saver_batch'].shape[1], # number of batches including lib-size
                 lb_input_size = adata.obsm['saver_targetL'].shape[1], # number of labels to predict
                 enc = (512, 256, 128), # dim of the encoder
                 dec = (128, 256, 512), # dim of the decoder
                 latent_k = 30) # dimension of the low-dimensional latent space
SAVER_net.build()
# pred_weight and kl_weight are both 1 for the initialization step
# (presumably the weights of the label-prediction and KL loss terms -- confirm
# in network_0509.py).
SAVER_net.compile_model(pred_weight=1, kl_weight=1)

# 4. Initialize the model

In [9]:
# no need to modify this block
# label-guided initialization step; the return value is kept in `loss`
# (presumably the training loss history -- confirm in network_0509.py)
loss = SAVER_net.model_initialize(adata, fit_verbose=1)

Epoch 1/300
Epoch 2/300
Epoch 00002: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 3/300
Epoch 4/300
Epoch 00004: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 5/300
Epoch 00005: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 00005: early stopping


In [10]:
# fill in the directory where you want to save the file
# 'weights_init.h5' is the file name; the fine-tuning step loads this same file,
# so keep the two paths in sync if you change it
SAVER_net.model.save_weights('../data/weights_init.h5') 

# 5. Fine-tune the model

In [11]:
# Rebuild the network with the same architecture as in block 8, then start
# from the initialization weights that were just saved.
# If you train on highly variable genes only, use enc=(256, 256, 128) and
# dec=(128, 256, 256) instead of the layer sizes below; leave all the other
# parameters unchanged.
SAVER_net = CVAE(x_input_size = adata.n_vars,
                 b_input_size = adata.obsm['saver_batch'].shape[1],
                 lb_input_size = adata.obsm['saver_targetL'].shape[1],
                 enc = (512, 256, 128),
                 dec = (128, 256, 512),
                 latent_k = 30)
SAVER_net.build()
SAVER_net.load_weights('../data/weights_init.h5') # fill in the weight file you just saved
# Unlike the initialization step (pred_weight=1), pred_weight is 0 here
# (presumably this switches off the label-prediction loss -- confirm in
# network_0509.py).
SAVER_net.compile_model(pred_weight=0., kl_weight=1)

In [12]:
# no need to modify this block
# train the SAVERCAT model that performs the dimension reduction;
# the return value overwrites `loss` from the initialization step
loss = SAVER_net.model_finetune(adata, fit_verbose=1)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 00013: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 00018: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 19/300
Epoch 20/300
Epoch 00020: early stopping


In [13]:
# Compute the low-dimensional embedding of every cell with the 'mean_out'
# model and write it to a CSV file, one row per cell (indexed like adata.obs)
# and one column per latent dimension (saver1, saver2, ...).
meta_df_train = adata.obs
z_train = SAVER_net.extra_models['mean_out'].predict(
    [adata.X, adata.obsm['saver_batch']]
)
z_df = pd.DataFrame(
    z_train,
    index=meta_df_train.index,
    columns=['saver{}'.format(dim) for dim in range(1, SAVER_net.latent_k + 1)],
)
# where you want to save the low-dimensional embeddings learned by SAVERCAT
z_df.to_csv('../data/lowdim_savercat_allg.csv')