# Data Format Transformation

This document shows how to generate, from the original data files, an AnnData object that can be used directly by Savercat.

In [1]:
import random
import os
import numpy as np
import scanpy as sc
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras import backend as K
from keras.utils.vis_utils import plot_model
from tensorflow.keras import layers
from tensorflow.keras.layers import Input, Dense, Dropout, Activation, BatchNormalization, LeakyReLU, Lambda
from tensorflow.keras import Model
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, scale
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as pl
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
# Turn on rpy2's pandas conversion so objects returned from R calls are converted for Python use.
pandas2ri.activate()
# Python handle to R's `readRDS`; used below to load the .rds matrices.
readRDS = robjects.r['readRDS']

In [2]:
# Report the name of the working directory and the scanpy version in use.
base_name = os.path.split(os.getcwd())[-1]
print(base_name)
print(sc.__version__)
# scanpy verbosity levels: errors (0), warnings (1), info (2), hints (3)
sc.settings.verbosity = 3
# Log the versions of scanpy and its main dependencies.
sc.logging.print_versions()

bin
1.5.1
scanpy==1.5.1 anndata==0.7.4 umap==0.4.6 numpy==1.20.1 scipy==1.4.1 pandas==1.0.3 scikit-learn==0.23.1 statsmodels==0.11.1 python-igraph==0.8.2 leidenalg==0.8.1


## AnnData with only highly variable genes

In [3]:
# Load the highly-variable-gene matrix (subsample_hvg.rds) and its axis names.
# As the displayed frame shows, rows are genes and columns are cell barcodes.
counts = readRDS('../data/subsample_hvg.rds')
# Each names file is tab-separated with a header row; the labels are in column 'x'.
row_names = pd.read_csv(
    '../data/subsample_hvg_row.txt', header=0, index_col=0, sep='\t'
)['x'].to_list()
col_names = pd.read_csv(
    '../data/subsample_hvg_col.txt', header=0, index_col=0, sep='\t'
)['x'].to_list()
# Label the matrix: row_names become the index, col_names the columns.
adata = pd.DataFrame(counts, index=row_names, columns=col_names)
adata

Unnamed: 0,AAACCTGAGAAGGACA-1_1_1,AAACCTGAGAGCAATT-1_1_1,AAACCTGAGGAGCGAG-1_1_1,AAACCTGCATCTGGTA-1_1_1,AAACCTGGTTCACCTC-1_1_1,AAACCTGGTTGTGGCC-1_1_1,AAACGGGCAAGCCTAT-1_1_1,AAACGGGCATCGTCGG-1_1_1,AAACGGGTCGAACGGA-1_1_1,AAAGATGAGAGATGAG-1_1_1,...,TTTGTCAAGAAACCTA-1_4_2,TTTGTCAAGAAACGCC-1_4_2,TTTGTCAAGCTTCGCG-1_4_2,TTTGTCACACATGACT-1_4_2,TTTGTCACAGATGGCA-1_4_2,TTTGTCACAGCGTTCG-1_4_2,TTTGTCACATCCGTGG-1_4_2,TTTGTCACATTCGACA-1_4_2,TTTGTCAGTCATGCCG-1_4_2,TTTGTCATCGAATGGG-1_4_2
IGKV1-9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
S100A9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IGKV3-20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
S100A8,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IGKV3-11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DACT3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LAMC3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
HTR3A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SEMA5A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Generate the AnnData object in three steps:
#   1. shuffle the columns (cells) reproducibly with a fixed random_state,
#   2. wrap the genes x cells frame in an AnnData object,
#   3. transpose so that observations are cells and variables are genes.
# NOTE: `adata` enters this cell as a pandas DataFrame and leaves it as an AnnData
# object, so the loading cell above must be re-run before re-running this one.
adata = sc.AnnData(
    adata.sample(frac=1, axis=1, random_state=210509)
).transpose()

In [5]:
# Read the per-cell metadata table and copy its 'Cycle' and 'patient' columns into
# adata.obs. Rows are looked up by cell name, so they follow adata's own cell order.
meta_df = pd.read_csv('../data/subsample_hvg_meta.txt', sep='\t', header=0, index_col=0)
for obs_key in ("Cycle", "patient"):
    adata.obs[obs_key] = meta_df.loc[adata.obs_names, obs_key]
# Print the object summary, then display the annotated observation table.
print(adata)
adata.obs

AnnData object with n_obs × n_vars = 29259 × 2668
    obs: 'Cycle', 'patient'


Unnamed: 0,Cycle,patient
TCGAGGCGTCTACCTC-1_3_1,Cycle4,12
TTCTCAATCCAGAAGG-1_4_2,Cycle6,4
GTGAAGGCATCCGGGT-1_4_2,Cycle6,4
CGTAGGCTCTTAACCT-1_4_2,Cycle6,4
TGGCCAGCACGAGAGT-1_3_2,Cycle4,4
...,...,...
AAGGCAGTCCTGCCAT-1_4_2,Cycle6,4
TACTTACGTGTGCGTC-1_1_2,Cycle1,4
CAACCAACACGCCAGT-1_4_1,Cycle6,12
CTGATCCCAAGGACTG-1_2_1,Cycle2,12


In [6]:
# Save the highly-variable-gene AnnData object to disk in .h5ad format.
# (As the log output below shows, 'Cycle' is stored as a categorical on write.)
adata.write_h5ad('../data/adata_subsample_hvg.h5ad')

... storing 'Cycle' as categorical


## AnnData with all genes

In [7]:
# Load the all-genes matrix (subsample_allg.rds) and its axis names.
# As the displayed frame shows, rows are genes and columns are cell barcodes.
counts = readRDS('../data/subsample_allg.rds')
# Each names file is tab-separated with a header row; the labels are in column 'x'.
row_names = pd.read_csv(
    '../data/subsample_allg_row.txt', header=0, index_col=0, sep='\t'
)['x'].to_list()
col_names = pd.read_csv(
    '../data/subsample_allg_col.txt', header=0, index_col=0, sep='\t'
)['x'].to_list()
# Label the matrix: row_names become the index, col_names the columns.
adata = pd.DataFrame(counts, index=row_names, columns=col_names)
adata

Unnamed: 0,AAACCTGAGAAGGACA-1_1_1,AAACCTGAGAGCAATT-1_1_1,AAACCTGAGGAGCGAG-1_1_1,AAACCTGCATCTGGTA-1_1_1,AAACCTGGTTCACCTC-1_1_1,AAACCTGGTTGTGGCC-1_1_1,AAACGGGCAAGCCTAT-1_1_1,AAACGGGCATCGTCGG-1_1_1,AAACGGGTCGAACGGA-1_1_1,AAAGATGAGAGATGAG-1_1_1,...,TTTGTCAAGAAACCTA-1_4_2,TTTGTCAAGAAACGCC-1_4_2,TTTGTCAAGCTTCGCG-1_4_2,TTTGTCACACATGACT-1_4_2,TTTGTCACAGATGGCA-1_4_2,TTTGTCACAGCGTTCG-1_4_2,TTTGTCACATCCGTGG-1_4_2,TTTGTCACATTCGACA-1_4_2,TTTGTCAGTCATGCCG-1_4_2,TTTGTCATCGAATGGG-1_4_2
AL627309.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL669831.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FAM87B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LINC00115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FAM41C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
IGLVI-20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL031593.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CU633967.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GRIK1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Generate the AnnData object in three steps:
#   1. shuffle the columns (cells) reproducibly with a fixed random_state,
#   2. wrap the genes x cells frame in an AnnData object,
#   3. transpose so that observations are cells and variables are genes.
# NOTE: `adata` enters this cell as a pandas DataFrame and leaves it as an AnnData
# object, so the loading cell above must be re-run before re-running this one.
adata = sc.AnnData(
    adata.sample(frac=1, axis=1, random_state=210509)
).transpose()

In [9]:
# Read the per-cell metadata table and copy its 'Cycle' and 'patient' columns into
# adata.obs. Rows are looked up by cell name, so they follow adata's own cell order.
meta_df = pd.read_csv('../data/subsample_allg_meta.txt', sep='\t', header=0, index_col=0)
for obs_key in ("Cycle", "patient"):
    adata.obs[obs_key] = meta_df.loc[adata.obs_names, obs_key]
# Print the object summary, then display the annotated observation table.
print(adata)
adata.obs

AnnData object with n_obs × n_vars = 29259 × 20042
    obs: 'Cycle', 'patient'


Unnamed: 0,Cycle,patient
TCGAGGCGTCTACCTC-1_3_1,Cycle4,12
TTCTCAATCCAGAAGG-1_4_2,Cycle6,4
GTGAAGGCATCCGGGT-1_4_2,Cycle6,4
CGTAGGCTCTTAACCT-1_4_2,Cycle6,4
TGGCCAGCACGAGAGT-1_3_2,Cycle4,4
...,...,...
AAGGCAGTCCTGCCAT-1_4_2,Cycle6,4
TACTTACGTGTGCGTC-1_1_2,Cycle1,4
CAACCAACACGCCAGT-1_4_1,Cycle6,12
CTGATCCCAAGGACTG-1_2_1,Cycle2,12


In [10]:
# Save the all-genes AnnData object to disk in .h5ad format.
# (As the log output below shows, 'Cycle' is stored as a categorical on write.)
adata.write_h5ad('../data/adata_subsample_allg.h5ad')

... storing 'Cycle' as categorical
