In [None]:
import gc
import pandas as pd
import numpy as np
import mudata
from mudata import AnnData, MuData
import scanpy as sc
import muon as mu
from scipy.sparse import hstack,vstack,csr_matrix,save_npz,load_npz
from sklearn.decomposition import NMF,LatentDirichletAllocation,TruncatedSVD
from muon import prot as pt
import json

############################################################################
#----- work folder -----
############################################################################
# Load the work-folder locations from settings.json. The context manager
# guarantees the file handle is closed (a bare open() leaked it).
with open('./settings.json') as settings_file:
    settings = json.load(settings_file)

input_path = settings['input_path']
features_path = settings['features_path']
# Singular alias so that code written against `feature_path` resolves too
# instead of raising NameError on a fresh kernel.
feature_path = features_path
model_path = settings['model_path']
sub_path = settings['sub_path']


# save transformed multi inputs to csr_matrix and id list

In [None]:
%%time
train_multi_inputs = pd.read_hdf(input_path+'train_multi_inputs.h5').reset_index(drop=True)
metadata = pd.read_csv(input_path+'metadata.csv.zip')

# id list
train_multi_inputs_id = pd.read_hdf(input_path+'train_multi_inputs.h5').reset_index()[['cell_id']]
train_multi_inputs_id = train_multi_inputs_id.merge(metadata,on=['cell_id'],how='left')
train_multi_inputs_id = train_multi_inputs_id.drop(['technology'],axis=1)
train_multi_inputs_id.to_feather(feature_path+'train_multi_inputs_id.feather')

# csr_matrix
train_multi_inputs_sparse = csr_matrix(train_multi_inputs.to_numpy())
save_npz(feature_path+"train_multi_inputs_sparse.npz", train_multi_inputs_sparse)

del train_multi_inputs,train_multi_inputs_id
gc.collect()

test_multi_inputs = pd.read_hdf(input_path+'test_multi_inputs.h5').reset_index(drop=True)

# id list
test_multi_inputs_id = pd.read_hdf(input_path+'test_multi_inputs.h5').reset_index()[['cell_id']]
test_multi_inputs_id = test_multi_inputs_id.merge(metadata,on=['cell_id'],how='left')
test_multi_inputs_id = test_multi_inputs_id.drop(['technology'],axis=1)
test_multi_inputs_id.to_feather(feature_path+'test_multi_inputs_id.feather')

# csr_matrix
test_multi_inputs_sparse = csr_matrix(test_multi_inputs.to_numpy())
save_npz(feature_path+"test_multi_inputs_sparse.npz", test_multi_inputs_sparse)

del test_multi_inputs,test_multi_inputs_id
gc.collect()

multi_inputs_sparse = vstack([train_multi_inputs_sparse,test_multi_inputs_sparse])
save_npz(feature_path+"multi_inputs_sparse.npz", multi_inputs_sparse)

del train_multi_inputs_sparse,test_multi_inputs_sparse,multi_inputs_sparse
gc.collect()

# sparse matrix to 100d tsvd

In [None]:
%%time

multi_inputs_sparse = load_npz(feature_path+"multi_inputs_sparse.npz")
tsvd = TruncatedSVD(n_components=100, algorithm='arpack')
multi_inputs_svd = tsvd.fit_transform(multi_inputs_sparse)
np.save(feature_path+'multi_inputs_svd_100.npy', multi_inputs_svd)

# save raw count multi inputs to csr_matrix

In [None]:
%%time
train_multi_inputs_raw = pd.read_hdf(input_path+'train_multi_inputs_raw.h5').reset_index(drop=True)

# id list
train_multi_inputs_id_raw = pd.read_hdf(input_path+'train_multi_inputs_raw.h5').reset_index()[['cell_id']]
train_multi_inputs_id_raw.to_feather(feature_path+'train_multi_inputs_id_raw.feather')

# csr_matrix
train_multi_inputs_raw_sparse = csr_matrix(train_multi_inputs_raw.to_numpy())
save_npz(feature_path+"train_multi_inputs_raw_sparse.npz", train_multi_inputs_raw_sparse)

del train_multi_inputs_raw,train_multi_inputs_id
gc.collect()

test_multi_inputs_raw = pd.read_hdf(input_path+'test_multi_inputs_raw.h5').reset_index(drop=True)

# id list
test_multi_inputs_id_raw = pd.read_hdf(input_path+'test_multi_inputs_raw.h5').reset_index()[['cell_id']]
test_multi_inputs_id_raw.to_feather(feature_path+'test_multi_inputs_id_raw.feather')

# csr_matrix
test_multi_inputs_raw_sparse = csr_matrix(test_multi_inputs_raw.to_numpy())
save_npz(feature_path+"test_multi_inputs_raw_sparse.npz", test_multi_inputs_raw_sparse)

del test_multi_inputs_raw
gc.collect()

multi_inputs_raw_sparse = vstack([train_multi_inputs_raw_sparse,test_multi_inputs_raw_sparse])
save_npz(feature_path+"multi_inputs_raw_sparse.npz", multi_inputs_raw_sparse)

del train_multi_inputs_raw_sparse,test_multi_inputs_raw_sparse,multi_inputs_raw_sparse
gc.collect()

# centered log ratio (clr) for raw counts

In [None]:
%%time
print ('load train_multi_inputs_raw')
train_rna_df = pd.read_hdf(input_path+'train_multi_inputs_raw.h5')
print ('load test_multi_inputs_raw')
test_rna_df = pd.read_hdf(input_path+'test_multi_inputs_raw.h5')
print ('concat multi_inputs_raw')
rna_df = pd.concat([train_rna_df,test_rna_df])

del train_rna_df,test_rna_df
gc.collect()

print ('AnnData')
rna = AnnData(csr_matrix(rna_df))
rna.obs_names = rna_df.index.values
rna.var_names = rna_df.columns.values

del rna_df
gc.collect()

pt.pp.clr(rna)

multi_inputs_clr_sparse = rna.X
save_npz(feature_path+'multi_inputs_clr_sparse.npz', multi_inputs_clr_sparse)

del multi_inputs_clr_sparse,rna
gc.collect()

# clr to 100d tsvd

In [None]:
%%time

multi_inputs_sparse = load_npz(feature_path+"multi_inputs_clr_sparse.npz")
tsvd = TruncatedSVD(n_components=100, algorithm='arpack')
multi_inputs_svd = tsvd.fit_transform(multi_inputs_sparse)
np.save(feature_path+'multi_inputs_svd_clr_100.npy', multi_inputs_svd)