In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import h5py
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from benchmarking import *
% matplotlib inline

In [2]:
# Relative directory prefix for everything this notebook reads and writes:
# the HDF5 inputs and the files under "imputation/".
data_path = "data/10x1M/"

The data has already been:
- randomly shuffled
- filtered to the 880 top-variance genes
- subsampled to
    - 1M training cells
    - 10k test cells

# import and format

In [13]:
# Main file: the expression matrices are kept as h5py datasets (sliced on demand
# later on), whereas the batch labels are read fully into memory via [:].
f = h5py.File(data_path + "data.hdf5", "r")
X_train, X_test = f["data_train"], f["data_test"]
b_train, b_test = f["batch_train"][:], f["batch_test"][:]

# Companion file holding the "log_data_*" matrices (presumably the log-transformed
# version of the matrices above -- TODO confirm). Both file handles stay open
# because the datasets are sliced by later cells.
f_log = h5py.File(data_path + "data_log.hdf5", "r")
X_train_log, X_test_log = f_log["log_data_train"], f_log["log_data_test"]

#### Create input files for BISCUIT and MAGIC

In [5]:
# Disabled export of the first 100000 training rows, once as tab-separated text
# and once as .npy. Uncomment to regenerate the files.
#np.savetxt("expression_mRNA_1m.txt", X_train[:100000], header="", delimiter="\t")
#np.save("expression_mRNA_1m", X_train[:100000])

# Benchmarking

In [6]:
from sklearn.decomposition import PCA
from sklearn.decomposition import FactorAnalysis
from ZIFA import ZIFA
from ZINB import ZINB
from SIMLR import SIMLR

In [8]:
# Latent dimensionality, passed as n_components to FactorAnalysis, ZIFA and ZINB.
n_latent = 10
# Larger sweep of training-set sizes, currently disabled:
#n_cells_list = [4000, 10000, 15000, 30000, 50000, 100000]
# Each entry is the number of leading training rows used for one benchmark pass.
n_cells_list = [10000]

In [14]:
for n_cells in n_cells_list:
    print n_cells, " cells"
    
    print "Running FA"
    fa = FactorAnalysis(n_components=n_latent)
    %time fa.fit(X_train_log[:n_cells])
    latent = fa.transform(X_train_log[:n_cells])
    print entropy_batch_mixing(latent, b_train)
    res = fa.score(X_test_log[:n_cells]) - np.mean(np.sum(X_test_log[:n_cells], axis=-1))
    print "FA: ", res
        
    print "Running SIMLR"
    simlr = SIMLR(n_clusters=10)
    %time simlr.fit_transform(X_train[:n_cells])
    print entropy_batch_mixing(simlr.F, b_train)

    print "Running ZIFA"
    zifa = ZIFA(n_components=n_latent)
    %time zifa.fit(X_train_log[:n_cells])
    latent = zifa.transform(data_train)
    print entropy_batch_mixing(latent, b_train)
    res = zifa.score(X_test_log[:n_cells]) - np.mean(np.sum(X_test_log[:n_cells], axis=-1))
    print "ZIFA: ", res

    print "Running ZINB"
    zinb = ZINB(n_components=n_latent, learn_V=True)
    % time zinb.fit(X_train[:n_cells])
    % time latent = zinb.transform(X_train[:n_cells])
    print entropy_batch_mixing(latent, b_train)
    res = zinb.score(X_test[:n_cells])
    print "ZINB: ", res

10000  cells
Running FA
CPU times: user 7.58 s, sys: 13.5 s, total: 21.1 s
Wall time: 2.83 s
0.660695669479
FA:  -1320.67298705
Running SIMLR
Performing fast PCA.
Performing k-nearest neighbour search.
Computing the multiple Kernels.
Performing the iterative procedure  5  times.
Iteration:  1 
Iteration:  2 
Iteration:  3 
Iteration:  4 
Iteration:  5 
Performing Kmeans.
Performing t-SNE.
The main loop will be now performed with a maximum of 300 iterations.
Performing iteration 1.
Performing iteration 2.
Performing iteration 3.
Performing iteration 4.
Performing iteration 5.
Performing iteration 6.
Performing iteration 7.
Performing iteration 8.
Performing iteration 9.
Performing iteration 10.
Performing iteration 11.
Performing iteration 12.
Performing iteration 13.
Performing iteration 14.
Performing iteration 15.
Performing iteration 16.
Performing iteration 17.
Performing iteration 18.
Performing iteration 19.
Performing iteration 20.
Performing iteration 21.
Performing iteration 2

0.659771218641
Running ZIFA


KeyboardInterrupt: 

## imputation

In [7]:
# Build the corrupted matrix from the first 10000 training rows. `dropout` is
# parameterised by the decay coefficient stored on `zifa`, so a `zifa` object
# with populated `params` must already exist in the kernel.
X_zero, i, j, ix = dropout(
    X_train[:10000], zifa.params["decay_coef"], uniform=True
)

# Cache the corrupted matrix and the three index arrays returned alongside it,
# so the imputation cells can reload them without re-running `dropout`.
for out_path, saved in (
    (data_path + "imputation/X_zero.npy", X_zero),
    (data_path + "imputation/i.npy", i),
    (data_path + "imputation/j.npy", j),
    (data_path + "imputation/ix.npy", ix),
):
    np.save(out_path, saved)

In [3]:
# Reload the cached corrupted matrix and its index arrays from the
# "imputation/" directory under data_path.
X_zero = np.load(data_path + "imputation/X_zero.npy")
i = np.load(data_path + "imputation/i.npy")
j = np.load(data_path + "imputation/j.npy")
ix = np.load(data_path + "imputation/ix.npy")

In [4]:
# Tab-separated export of the first 5000 corrupted rows, preceded by a single
# all-zero row (presumably a placeholder header line expected by the external
# tool -- TODO confirm).
np.savetxt(
    "expression_mRNA_1M_impute.txt",
    np.vstack((np.zeros(X_zero.shape[1]), X_zero[:5000])),
    delimiter="\t"
)

In [None]:
# ZIFA Imputation
# Fit ZIFA on log(1 + X_zero), i.e. the corrupted matrix on a log scale.
zifa = ZIFA(n_components=n_latent)
%time zifa.fit(np.log(1 + X_zero))
# "EX" is taken from the dict returned by output_estimation (presumably the
# model's expected expression on the log scale -- TODO confirm in ZIFA).
%time X_ZIFA_log = zifa.output_estimation(np.log(1 + X_zero))["EX"]
# exp(-decay_coef * x^2) evaluated at the estimate; presumably the ZIFA dropout
# probability -- TODO confirm. Not used again in the visible cells.
p_ZIFA = np.exp(- zifa.params["decay_coef"] * X_ZIFA_log**2)

In [10]:
# Score the ZIFA estimate: exp(x) - 1 undoes the log(1 + x) transform before the
# comparison against the first 10000 uncorrupted training rows.
print("ZIFA", imputation_error(
    np.exp(X_ZIFA_log) - 1, X_train[:10000], X_zero, i, j, ix))

('ZIFA', 1.4898509912731444)


In [12]:
# ZINB Imputation
# Fit ZINB with n_latent components on the corrupted matrix X_zero as-is
# (no log transform is applied before fitting).
zinb = ZINB(n_components=n_latent, learn_V=True)
%time zinb.fit(X_zero)
# The result of score() is not stored; the call is only timed.
# NOTE(review): output_estimation() takes no data argument, so it presumably
# reuses state left on `zinb` by fit()/score() -- keep this call order unless
# confirmed otherwise.
%time zinb.score(X_zero)
fit_ZINB = zinb.output_estimation()
def softplus(x):
    """Return softplus(x) = log(1 + exp(x)), element-wise for array input.

    Evaluated as logaddexp(0, x), which is the same quantity but does not
    overflow to inf when x is large (exp(x) alone overflows past x ~ 709).
    """
    return np.logaddexp(0, x)
# Unpack the first three entries of the ZINB estimation output.
X_ZINB, logit_ZINB, theta_ZINB = fit_ZINB[0], fit_ZINB[1], fit_ZINB[2]

# -softplus(-z) equals log(sigmoid(z)): the log of the probability encoded by the logits.
log_p_ZINB = -softplus(-logit_ZINB)

# Score the ZINB estimate against the first 10000 uncorrupted training rows.
print("ZINB", imputation_error(
    X_ZINB, X_train[:10000], X_zero, i, j, ix))

CPU times: user 6min 27s, sys: 14.7 s, total: 6min 42s
Wall time: 29min 1s
CPU times: user 42.7 s, sys: 6.89 s, total: 49.6 s
Wall time: 3min 7s
('ZINB', 1.0771223055642813)


In [11]:
# MAGIC imputation: the estimate is loaded from a precomputed file rather than
# computed in this notebook, then scored like the other methods.
mean_MAGIC = np.load(data_path + "imputation/X_zero_MAGIC.npy")
print("MAGIC", imputation_error(
    mean_MAGIC, X_train[:10000], X_zero, i, j, ix))

('MAGIC', 1.7571007940906977)
