In [1]:
import numpy as np
import tensorflow as tf
import scipy.special as sp
import scipy.stats as st
import scqtl
import pandas as pd

In [2]:
# Generate some ZINB-distributed counts: one simulation per seed (0-9),
# keeping only the first column of the first array each simulation returns.
num_samples = 1000
simulated_columns = []
for trial in range(10):
  simulated = scqtl.simulation.simulate(
    num_samples=num_samples,
    size=1e5,
    seed=trial)[0]
  simulated_columns.append(simulated[:,:1])
umi = np.concatenate(simulated_columns, axis=1)
size_factor = np.ones((num_samples, 1)) * 1e5

# Null design matrix: a single all-zero covariate column
design = np.zeros((num_samples, 1))

# Every sample belongs to the same individual/condition, i.e. a single set
# of ZINB parameters is shared by all rows
onehot = np.ones((num_samples, 1))

# Keyword arguments shared by both fitting passes.
# Important: casting to float32 is required
fit_inputs = dict(
  umi=umi.astype(np.float32),
  onehot=onehot.astype(np.float32),
  design=design.astype(np.float32),
  size_factor=size_factor.astype(np.float32),
  learning_rate=1e-3,
  max_epochs=20000,
  verbose=True,
)

# Pass 1: find the NB MLE
init = scqtl.tf.fit(**fit_inputs)

# Pass 2: find the ZINB MLE, warm-started from the first three outputs of
# the NB fit
log_mu, log_phi, logodds, nb_llik, zinb_llik = scqtl.tf.fit(
  warm_start=init[:3],
  **fit_inputs)

19999 13788.233
19999 13010.246


In [3]:
# Report the dimensions of every input handed to the fit, one per line.
for fit_input in (design, umi, onehot, size_factor):
  print(fit_input.shape)

(1000, 1)
(1000, 10)
(1000, 1)
(1000, 1)


In [2]:
def recode(annotations, key):
  """One-hot encode a single column of a table.

  Parameters
  ----------
  annotations : pandas.DataFrame
    Table with one row per observation.
  key : str
    Name of the column of ``annotations`` to encode.

  Returns
  -------
  numpy.ndarray
    Float array of shape ``(n_rows, n_categories)``. Column ``j`` corresponds
    to the ``j``-th distinct value of ``annotations[key]`` in sorted order,
    and row ``i`` holds a 1 in the column matching the value found in row
    ``i`` and 0 everywhere else. An empty table yields a ``(0, 0)`` array.
  """
  n = annotations.shape[0]
  cat = sorted(set(annotations[key]))
  # Look categories up through a dict rather than list.index, which rescans
  # the category list once per row.
  column_of = {value: j for j, value in enumerate(cat)}
  # An explicit integer dtype keeps this a valid index array even when the
  # table has no rows.
  codes = np.fromiter(
    (column_of[value] for value in annotations[key]), dtype=np.intp)
  onehot = np.zeros((n, len(cat)))
  onehot[np.arange(n), codes] = 1
  return onehot

In [3]:
# Directory the UMI count table is read from (input location, despite the
# variable's name).
save_dir = "/work-zfs/abattle4/prashanthi/Single_cell_eQTL/data/UMI_counts/expr/"
expr_path = save_dir + "megakaryocytes.csv"
expr = pd.read_csv(expr_path)

In [4]:
pseudobulk_path = "/work-zfs/abattle4/prashanthi/Single_cell_eQTL/data/expr/" + "megakaryocytes.txt"
pseudobulk = pd.read_csv(pseudobulk_path, sep="\t")

# Discard every pseudobulk row that contains a zero in any column.
row_has_zero = (pseudobulk == 0).any(axis=1)
pseudobulk = pseudobulk[~row_has_zero]

# Restrict expr to its "index" column followed by the columns named after the
# row labels that survived the filter above.
kept_columns = np.insert(pseudobulk.index.values, 0, "index", axis=0)
expr = expr.loc[:, kept_columns]

In [5]:
meta_data_path = "/work-zfs/abattle4/prashanthi/Single_cell_eQTL/data/UMI_counts/metadata/megakaryocytes.csv"
meta_data = pd.read_csv(meta_data_path)

# Prints True only when the metadata rows are listed in exactly the same
# order (and with the same labels) as the rows of expr.
rows_match_expr = meta_data['index'].equals(expr['index'])
print(rows_match_expr)
meta_data.head()

True


Unnamed: 0,index,disease_cov,ct_cov,pop_cov,ind_cov,well,batch_cov,batch,percent_mito,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,pct_counts_in_top_200_genes,pct_counts_in_top_500_genes,n_genes
0,AAACCTGAGCAATATG-1-0-0-0-0-0-0-0-0-0-0-0-0-0,sle,Megakaryocytes,WHITE,901560200_901560200,YE_8-16-1,lupus8.16,0,0.046385,548,6.308098,1145.0,7.044033,42.270742,56.244541,69.606987,95.80786,548
1,AACCGCGTCGACCAGC-1-0-0-0-0-0-0-0-0-0-0-0-0-0,sle,Megakaryocytes,ASIAN,1597_1597,YE_8-16-1,lupus8.16,0,0.02409,537,6.287859,1444.0,7.275865,51.038781,66.481994,76.66205,97.437673,537
2,AACTCAGGTTGGTTTG-1-0-0-0-0-0-0-0-0-0-0-0-0-0,sle,Megakaryocytes,ASIAN,1597_1597,YE_8-16-1,lupus8.16,0,0.025518,538,6.289716,1568.0,7.358194,51.594388,67.47449,78.443878,97.576531,538
3,ACCAGTAAGCAGATCG-1-0-0-0-0-0-0-0-0-0-0-0-0-0,sle,Megakaryocytes,ASIAN,1597_1597,YE_8-16-1,lupus8.16,0,0.045817,752,6.624065,1688.0,7.431892,35.781991,50.592417,64.632701,85.07109,752
4,ACCTTTAAGTGTCCCG-1-0-0-0-0-0-0-0-0-0-0-0-0-0,sle,Megakaryocytes,ASIAN,1597_1597,YE_8-16-1,lupus8.16,0,0.027282,565,6.338594,1496.0,7.311218,44.518717,61.163102,75.601604,95.65508,565


In [6]:
# Examine the meta data distributions: number of rows per level of each
# covariate column, printed in this order.
for covariate in ("batch", "batch_cov", "pop_cov", "well"):
  print(meta_data[covariate].value_counts())

0    10188
Name: batch, dtype: int64
lupus7.20    1346
lupus8.23    1284
lupus8.17    1110
lupus8.2     1004
lupus8.3      987
lupus7.19     974
lupus1.10     915
lupus7.26     866
lupus8.16     661
lupus8.9      643
lupus7.13     398
Name: batch_cov, dtype: int64
ASIAN    5719
WHITE    4469
Name: pop_cov, dtype: int64
YE_7-20-3    354
YE_8-23-1    345
YE_8-17-4    334
YE_8-23-3    334
YE_7-20-4    333
YE_7-20-1    332
YE_8-9-4     329
YE_7-20-2    327
YE_8-23-2    324
YE_8-9-3     314
YE_8-17-3    303
YE_8-23-4    281
YE_7-26-1    281
YE_8-2-1     274
YE_8-2-3     271
YE_7-19-3    265
YE_8-2-4     263
YE_8-3-1     258
YE_8-3-3     256
YE_7-19-1    245
YE_7-19-4    240
YE_8-17-1    239
YE_8-3-4     239
YE110-4      235
YE_8-3-2     234
YE_8-17-2    234
YE110-1      229
YE110-3      228
YE_7-19-2    224
YE110-2      223
YE_7-26-4    211
YE_8-16-2    211
YE_7-26-2    210
YE_8-2-2     196
YE_8-16-3    193
YE_8-16-4    189
YE_7-26-3    164
YE_7-13-2    102
YE_7-13-1    102
YE_7-13-4     99

In [6]:
# Everything after the first column of expr, as a plain array. After the
# column selection done earlier, that first column is "index".
count_columns = expr.iloc[:, 1:]
umi = count_columns.values
umi

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [3., 0., 0., ..., 0., 0., 1.]])

In [7]:
size_factor_table = pd.read_csv("/work-zfs/abattle4/prashanthi/Single_cell_eQTL/data/size_factor/megakaryocytes.csv")

# Prints True only when the size-factor rows line up with the rows of expr.
print(size_factor_table['index'].equals(expr['index']))

# Column vector with one total count per row of umi.
n_cells = umi.shape[0]
size_factor = size_factor_table['total_counts'].values.reshape((n_cells, 1))

True


In [8]:
# One-hot encode each covariate column of the metadata.
batch, batch_cov, well, pop_cov = (
  recode(meta_data, covariate)
  for covariate in ("batch", "batch_cov", "well", "pop_cov")
)

# Stack the encodings side by side and centre every column on zero.
design = np.concatenate((batch, batch_cov, well, pop_cov), axis=1)
design = design - design.mean(axis=0)

# Membership matrix built from the ind_cov column: one column per distinct
# value.
onehot = recode(meta_data, "ind_cov")

In [9]:
# Preallocate the arrays that the chunked fitting loop fills in.
# log_mu, log_phi and logodds have one row per column of onehot and one
# column per column of umi.
n_groups = onehot.shape[1]
n_fitted_genes = umi.shape[1]
log_mu = np.empty([n_groups, n_fitted_genes])
log_phi = np.empty([n_groups, n_fitted_genes])
logodds = np.empty([n_groups, n_fitted_genes])

# The fitting loop writes into nb_llik and zinb_llik with the same
# start_index:end_index slices it uses for the columns of umi, so both must
# be as long as umi has columns. Sizing them by onehot.shape[1] meant slices
# beyond that length were truncated or empty, so those results were dropped
# (or the assignment failed).
nb_llik = np.empty([n_fitted_genes])
zinb_llik = np.empty([n_fitted_genes])

In [10]:
# Report the dimensions of every input handed to the fit, one per line.
for fit_input in (design, umi, onehot, size_factor):
  print(fit_input.shape)

(10188, 56)
(10188, 1686)
(10188, 119)
(10188, 1)


In [None]:
# Fit the model on blocks of n_genes consecutive columns of umi and store
# each block's estimates in the matching columns of the result arrays.
n_genes = 200

# These inputs are the same for every block, so cast them to float32 once
# instead of twice per iteration.
onehot_f32 = onehot.astype(np.float32)
design_f32 = design.astype(np.float32)
size_factor_f32 = size_factor.astype(np.float32)

for start_index in range(0, umi.shape[1], n_genes):
    # The last block may hold fewer than n_genes columns.
    end_index = min(start_index + n_genes, umi.shape[1])
    umi_sub = umi[:, start_index:end_index]
    print(umi_sub.shape)
    umi_sub_f32 = umi_sub.astype(np.float32)

    # First pass: fit without a warm start.
    init = scqtl.tf.fit(
      umi=umi_sub_f32,
      onehot=onehot_f32,
      design=design_f32,
      size_factor=size_factor_f32,
      learning_rate=1e-4,
      max_epochs=100000,
      verbose=True)

    # Second pass: refit, warm-started from the first three outputs of the
    # first pass.
    block_log_mu, block_log_phi, block_logodds, block_nb_llik, block_zinb_llik = scqtl.tf.fit(
      umi=umi_sub_f32,
      onehot=onehot_f32,
      design=design_f32,
      size_factor=size_factor_f32,
      learning_rate=1e-4,
      max_epochs=100000,
      warm_start=init[:3],
      verbose=True)

    # Store this block's results (same targets and order as before).
    log_mu[:,start_index:end_index] = block_log_mu
    log_phi[:,start_index:end_index] = block_log_phi
    logodds[:,start_index:end_index] = block_logodds
    nb_llik[start_index:end_index] = block_nb_llik
    zinb_llik[start_index:end_index] = block_zinb_llik

(10188, 200)
99999 1562369.8
99999 1539600.5
(10188, 200)
19000 3553104.5

In [None]:
# Write each result array to its own comma-separated file under res_dir.
res_dir = "/work-zfs/abattle4/prashanthi/Single_cell_eQTL/results/ZINB_summary_stats/"+ "megakaryocytes"
outputs = (
    ("/log_mu.csv", log_mu),
    ("/log_phi.csv", log_phi),
    ("/logodds.csv", logodds),
    ("/nb_llik.csv", nb_llik),
    ("/zinb_llik.csv", zinb_llik),
)
for file_name, values in outputs:
    np.savetxt(res_dir + file_name, values, delimiter=",")