In [37]:
import h5py 
import numpy as np
import scipy.sparse as sp_sparse
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
# Open the 10x Genomics HDF5 file read-only and keep a handle on its "mm10"
# group, which holds every dataset used below.
INPUT_H5_PATH = "1M_neurons_filtered_gene_bc_matrices_h5.h5"
f = h5py.File(INPUT_H5_PATH, mode="r")
dsets = f["mm10"]

In [3]:
# List the datasets stored in the "mm10" group.
# print(...) with a single argument prints the same text on Python 2 and 3.
for name in f["mm10"]:
    print(name)

barcodes
data
gene_names
genes
indices
indptr
shape


In [4]:
# Peek at the barcodes. Each one ends in "-<number>"; in the output below that
# numeric suffix runs from 1 up to 133.
f["mm10"]["barcodes"][:]

array(['AAACCTGAGATAGGAG-1', 'AAACCTGAGCGGCTTC-1', 'AAACCTGAGGAATCGC-1',
       ..., 'TTTGTCAGTGCGATAG-133', 'TTTGTCAGTTAAAGTG-133',
       'TTTGTCATCTGAAAGA-133'],
      dtype='|S20')

In [5]:
string_10x = """E18 20161004 Neurons Sample 64 Fraction of Reads Kept	87.1%
E18 20161004 Neurons Sample 63 Fraction of Reads Kept	76.5%
E18 20161004 Neurons Sample 62 Fraction of Reads Kept	93.5%
E18 20161004 Neurons Sample 61 Fraction of Reads Kept	77.6%
E18 20160930 Neurons Sample 25 Fraction of Reads Kept	70.0%
E18 20160930 Neurons Sample 27 Fraction of Reads Kept	72.2%
E18 20160930 Neurons Sample 28 Fraction of Reads Kept	73.0%
E18 20160930 Neurons Sample 29 Fraction of Reads Kept	72.0%
E18 20160930 Neurons Sample 21 Fraction of Reads Kept	71.9%
E18 20160930 Neurons Sample 22 Fraction of Reads Kept	74.7%
E18 20160930 Neurons Sample 23 Fraction of Reads Kept	75.2%
E18 20160930 Neurons Sample 24 Fraction of Reads Kept	76.0%
E18 20160930 Neurons Sample 30 Fraction of Reads Kept	72.0%
E18 20160930 Neurons Sample 31 Fraction of Reads Kept	72.4%
E18 20160930 Neurons Sample 04 Fraction of Reads Kept	76.4%
E18 20160930 Neurons Sample 08 Fraction of Reads Kept	73.1%
E18 20161004 Neurons Sample 18 Fraction of Reads Kept	86.7%
E18 20161004 Neurons Sample 51 Fraction of Reads Kept	74.6%
E18 20161004 Neurons Sample 52 Fraction of Reads Kept	83.7%
E18 20161004 Neurons Sample 53 Fraction of Reads Kept	93.5%
E18 20161004 Neurons Sample 54 Fraction of Reads Kept	74.0%
E18 20161004 Neurons Sample 55 Fraction of Reads Kept	79.9%
E18 20161004 Neurons Sample 56 Fraction of Reads Kept	76.5%
E18 20161004 Neurons Sample 57 Fraction of Reads Kept	91.4%
E18 20161004 Neurons Sample 58 Fraction of Reads Kept	80.9%
E18 20161004 Neurons Sample 59 Fraction of Reads Kept	76.4%
E18 20161004 Neurons Sample 60 Fraction of Reads Kept	77.4%
E18 20160930 Neurons Sample 61 Fraction of Reads Kept	84.6%
E18 20160930 Neurons Sample 60 Fraction of Reads Kept	83.0%
E18 20160930 Neurons Sample 57 Fraction of Reads Kept	82.1%
E18 20160930 Neurons Sample 56 Fraction of Reads Kept	83.4%
E18 20160930 Neurons Sample 59 Fraction of Reads Kept	65.4%
E18 20160930 Neurons Sample 58 Fraction of Reads Kept	82.4%
E18 20160930 Neurons Sample 53 Fraction of Reads Kept	83.9%
E18 20160930 Neurons Sample 52 Fraction of Reads Kept	81.3%
E18 20160930 Neurons Sample 55 Fraction of Reads Kept	81.1%
E18 20160930 Neurons Sample 54 Fraction of Reads Kept	82.4%
E18 20161004 Neurons Sample 46 Fraction of Reads Kept	79.2%
E18 20161004 Neurons Sample 45 Fraction of Reads Kept	83.7%
E18 20161004 Neurons Sample 48 Fraction of Reads Kept	77.8%
E18 20161004 Neurons Sample 47 Fraction of Reads Kept	74.5%
E18 20161004 Neurons Sample 42 Fraction of Reads Kept	81.8%
E18 20161004 Neurons Sample 41 Fraction of Reads Kept	74.9%
E18 20161004 Neurons Sample 44 Fraction of Reads Kept	83.7%
E18 20161004 Neurons Sample 43 Fraction of Reads Kept	82.3%
E18 20161004 Neurons Sample 13 Fraction of Reads Kept	84.9%
E18 20161004 Neurons Sample 14 Fraction of Reads Kept	80.7%
E18 20161004 Neurons Sample 11 Fraction of Reads Kept	81.1%
E18 20161004 Neurons Sample 12 Fraction of Reads Kept	83.2%
E18 20161004 Neurons Sample 50 Fraction of Reads Kept	76.3%
E18 20161004 Neurons Sample 49 Fraction of Reads Kept	81.7%
E18 20161004 Neurons Sample 15 Fraction of Reads Kept	78.1%
E18 20161004 Neurons Sample 16 Fraction of Reads Kept	83.9%
E18 20160930 Neurons Sample 03 Fraction of Reads Kept	76.9%
E18 20160930 Neurons Sample 07 Fraction of Reads Kept	69.2%
E18 20161004 Neurons Sample 39 Fraction of Reads Kept	69.9%
E18 20161004 Neurons Sample 40 Fraction of Reads Kept	87.0%
E18 20161004 Neurons Sample 33 Fraction of Reads Kept	75.7%
E18 20161004 Neurons Sample 34 Fraction of Reads Kept	77.5%
E18 20161004 Neurons Sample 31 Fraction of Reads Kept	93.7%
E18 20161004 Neurons Sample 32 Fraction of Reads Kept	74.9%
E18 20161004 Neurons Sample 37 Fraction of Reads Kept	70.1%
E18 20161004 Neurons Sample 38 Fraction of Reads Kept	63.3%
E18 20161004 Neurons Sample 35 Fraction of Reads Kept	66.5%
E18 20161004 Neurons Sample 36 Fraction of Reads Kept	70.7%
E18 20160930 Neurons Sample 41 Fraction of Reads Kept	74.8%
E18 20160930 Neurons Sample 40 Fraction of Reads Kept	73.8%
E18 20160930 Neurons Sample 35 Fraction of Reads Kept	77.1%
E18 20160930 Neurons Sample 34 Fraction of Reads Kept	76.3%
E18 20160930 Neurons Sample 33 Fraction of Reads Kept	74.4%
E18 20160930 Neurons Sample 32 Fraction of Reads Kept	72.9%
E18 20160930 Neurons Sample 39 Fraction of Reads Kept	74.5%
E18 20160930 Neurons Sample 38 Fraction of Reads Kept	74.3%
E18 20160930 Neurons Sample 37 Fraction of Reads Kept	69.9%
E18 20160930 Neurons Sample 36 Fraction of Reads Kept	74.4%
E18 20161004 Neurons Sample 17 Fraction of Reads Kept	85.4%
E18 20160930 Neurons Sample 62 Fraction of Reads Kept	84.6%
E18 20160930 Neurons Sample 63 Fraction of Reads Kept	87.8%
E18 20160930 Neurons Sample 64 Fraction of Reads Kept	83.2%
E18 20160930 Neurons Sample 66 Fraction of Reads Kept	87.0%
E18 20160930 Neurons Sample 67 Fraction of Reads Kept	82.0%
E18 20160930 Neurons Sample 68 Fraction of Reads Kept	80.0%
E18 20160930 Neurons Sample 69 Fraction of Reads Kept	84.7%
E18 20160930 Neurons Sample 70 Fraction of Reads Kept	81.7%
E18 20160930 Neurons Sample 71 Fraction of Reads Kept	79.3%
E18 20160930 Neurons Sample 72 Fraction of Reads Kept	81.7%
E18 20160930 Neurons Sample 02 Fraction of Reads Kept	77.8%
E18 20160930 Neurons Sample 06 Fraction of Reads Kept	74.0%
E18 20161004 Neurons Sample 30 Fraction of Reads Kept	84.6%
E18 20161004 Neurons Sample 29 Fraction of Reads Kept	82.3%
E18 20161004 Neurons Sample 22 Fraction of Reads Kept	89.5%
E18 20161004 Neurons Sample 21 Fraction of Reads Kept	68.8%
E18 20161004 Neurons Sample 24 Fraction of Reads Kept	88.3%
E18 20161004 Neurons Sample 23 Fraction of Reads Kept	100.0%
E18 20161004 Neurons Sample 26 Fraction of Reads Kept	81.2%
E18 20161004 Neurons Sample 25 Fraction of Reads Kept	93.9%
E18 20161004 Neurons Sample 28 Fraction of Reads Kept	97.0%
E18 20161004 Neurons Sample 27 Fraction of Reads Kept	79.3%
E18 20160930 Neurons Sample 11 Fraction of Reads Kept	74.4%
E18 20160930 Neurons Sample 10 Fraction of Reads Kept	73.8%
E18 20160930 Neurons Sample 13 Fraction of Reads Kept	62.6%
E18 20160930 Neurons Sample 12 Fraction of Reads Kept	73.3%
E18 20160930 Neurons Sample 16 Fraction of Reads Kept	76.5%
E18 20160930 Neurons Sample 15 Fraction of Reads Kept	77.2%
E18 20160930 Neurons Sample 18 Fraction of Reads Kept	73.5%
E18 20160930 Neurons Sample 17 Fraction of Reads Kept	76.4%
E18 20160930 Neurons Sample 20 Fraction of Reads Kept	74.9%
E18 20160930 Neurons Sample 19 Fraction of Reads Kept	75.2%
E18 20161004 Neurons Sample 19 Fraction of Reads Kept	88.4%
E18 20161004 Neurons Sample 20 Fraction of Reads Kept	90.4%
E18 20160930 Neurons Sample 50 Fraction of Reads Kept	71.1%
E18 20160930 Neurons Sample 51 Fraction of Reads Kept	97.2%
E18 20160930 Neurons Sample 48 Fraction of Reads Kept	80.1%
E18 20160930 Neurons Sample 49 Fraction of Reads Kept	70.0%
E18 20160930 Neurons Sample 46 Fraction of Reads Kept	69.5%
E18 20160930 Neurons Sample 47 Fraction of Reads Kept	79.6%
E18 20160930 Neurons Sample 44 Fraction of Reads Kept	92.2%
E18 20160930 Neurons Sample 45 Fraction of Reads Kept	83.8%
E18 20160930 Neurons Sample 42 Fraction of Reads Kept	77.4%
E18 20160930 Neurons Sample 43 Fraction of Reads Kept	84.1%
E18 20160930 Neurons Sample 01 Fraction of Reads Kept	82.5%
E18 20160930 Neurons Sample 05 Fraction of Reads Kept	75.8%
E18 20160930 Neurons Sample 09 Fraction of Reads Kept	74.3%
E18 20161004 Neurons Sample 08 Fraction of Reads Kept	82.9%
E18 20161004 Neurons Sample 07 Fraction of Reads Kept	79.1%
E18 20161004 Neurons Sample 06 Fraction of Reads Kept	84.8%
E18 20161004 Neurons Sample 05 Fraction of Reads Kept	84.8%
E18 20161004 Neurons Sample 04 Fraction of Reads Kept	87.1%
E18 20161004 Neurons Sample 03 Fraction of Reads Kept	82.0%
E18 20161004 Neurons Sample 02 Fraction of Reads Kept	86.4%
E18 20161004 Neurons Sample 01 Fraction of Reads Kept	84.2%
E18 20161004 Neurons Sample 10 Fraction of Reads Kept	80.1%
E18 20161004 Neurons Sample 09 Fraction of Reads Kept	80.1%"""

In [6]:
# Parse the pasted table with fixed character offsets. Every row starts with
# "E18 <8 digits> Neurons Sample <2 digits> ...".
rows = string_10x.split("\n")
# Characters 8-9 are the 5th/6th digits of the 8-digit field (presumably the
# month of a YYYYMMDD run date): "09" maps to batch 0 and "10" to batch 1.
batch = [int(row[8:10]) - 9 for row in rows]
# Characters 28-29 are the two-digit sample number.
library = [int(row[28:30]) for row in rows]

In [7]:
# Map every barcode to its batch label.
# The integer after the last "-" in a barcode is used as a 1-based index into
# `batch` (one entry per row of string_10x).
# NOTE(review): this assumes the rows of string_10x are listed in the same order
# as the barcode suffixes -- confirm against the dataset documentation.
barcodes = f["mm10"]["barcodes"][:]
# The barcodes are fixed-length byte strings (dtype '|S20'), so split on a bytes
# separator; this works on both Python 2 and Python 3.
barcode_suffix = np.array([int(bc.rsplit(b"-", 1)[-1]) for bc in barcodes])
batch_id = np.asarray(batch)[barcode_suffix - 1]

In [34]:
# Assemble the sparse count matrix from the (data, indices, indptr) triplet
# stored in the file, using the stored shape.
csc_triplet = (dsets['data'], dsets['indices'], dsets['indptr'])
matrix = sp_sparse.csc_matrix(csc_triplet, shape=dsets['shape'])

In [35]:
# Sanity check: shape of the sparse matrix and of each array it was built from.
tuple(arr.shape for arr in (matrix, dsets['data'], dsets['indices'], dsets['indptr']))

((27998, 1306127), (2624828308,), (2624828308,), (1306128,))

### Downsampling data to get variable genes

In [10]:
# Estimate the per-row variance on the first 100000 columns only.
# The subset is kept in its own variable so that `matrix` still holds the full
# dataset for the cells below (overwriting it would make a top-to-bottom run
# build datasets from the truncated matrix).
N_COLS_FOR_VARIANCE = 100000
matrix_sub = matrix[:, :N_COLS_FOR_VARIANCE]
# Var[x] = E[x^2] - E[x]^2, computed row-wise; mean(1) returns an (n_rows, 1)
# matrix, so take column 0 to get a flat vector.
mean_of_squares = np.array(matrix_sub.multiply(matrix_sub).mean(1))[:, 0]
squared_mean = np.array(matrix_sub.mean(1))[:, 0] ** 2
variance = mean_of_squares - squared_mean

In [32]:
# Number of rows (genes) whose estimated variance is at least 0.03.
np.sum(variance >= 0.03)

10326

In [33]:
# Boolean gene-selection masks at two variance cut-offs.
# The loose cut-off keeps 10326 rows and the strict one 720 (see the counts and
# shapes printed in this notebook).
VARIANCE_CUTOFF_SMALL = 1.912
VARIANCE_CUTOFF_BIG = 0.03
mask_small = variance >= VARIANCE_CUTOFF_SMALL
mask_big = variance >= VARIANCE_CUTOFF_BIG

In [12]:
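# RELOAD YOUR DATASET! If `matrix` was truncated to its first 100000 columns above,
# re-run the cell that builds `matrix` from the HDF5 file before continuing.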

### Coming back to variable genes and constructing datasets

In [36]:
# Keep only the rows selected by mask_small, transpose so each row is one
# column (cell) of `matrix`, and densify. toarray() is the explicit spelling of
# the .A shorthand.
selected_rows = matrix[mask_small, :]
dataset = selected_rows.T.toarray()
dataset.shape

(1306127, 720)

In [38]:
# 90/10 train/test split with a fixed seed, keeping each row paired with its
# batch label.
X_train, X_test, b_train, b_test = train_test_split(dataset, batch_id, test_size=0.1, random_state=0)

# Raw counts. The `with` block closes the file even if a write fails.
with h5py.File('data_small.hdf5', 'w') as hf:
    hf.create_dataset('data_train', data=X_train)
    hf.create_dataset('data_test', data=X_test)
    hf.create_dataset('batch_train', data=b_train)
    hf.create_dataset('batch_test', data=b_test)

# log(1 + counts). The test set must be built from X_test; it was previously
# written from X_train, so 'log_data_test' did not match 'batch_test'.
with h5py.File('data_log_small.hdf5', 'w') as hf:
    hf.create_dataset('log_data_train', data=np.log(1 + X_train))
    hf.create_dataset('log_data_test', data=np.log(1 + X_test))
    hf.create_dataset('batch_train', data=b_train)
    hf.create_dataset('batch_test', data=b_test)

In [71]:
# Draw the column indices for the "big" train and test sets.
# A single draw WITHOUT replacement is split in two, so no column appears twice
# and the train and test sets are disjoint. Independent draws with replacement
# would allow duplicates and train/test overlap.
# A fixed seed makes the selection reproducible across kernel restarts.
N_BIG_TRAIN = 500000
N_BIG_TEST = 10000
rng = np.random.RandomState(0)
sampled_columns = rng.choice(matrix.shape[1], size=N_BIG_TRAIN + N_BIG_TEST, replace=False)
line_train = sampled_columns[:N_BIG_TRAIN]
line_test = sampled_columns[N_BIG_TRAIN:]

In [74]:
# Create the "big" output file. It is left open on purpose: data_train is only
# allocated here and is filled chunk by chunk afterwards.
hf = h5py.File('data_big.hdf5', 'w')
# The test matrix is written in one go: select the sampled columns, keep the
# rows in mask_big, transpose and densify (toarray() is the explicit spelling of
# the .A shorthand).
hf.create_dataset('data_test', data=matrix[:, line_test][mask_big].T.toarray())
hf.create_dataset('batch_train', data=batch_id[line_train])
hf.create_dataset('batch_test', data=batch_id[line_test])
# One row per sampled training column, one column per row kept by mask_big.
train_shape = (line_train.shape[0], np.sum(mask_big))
hf.create_dataset('data_train', shape=train_shape, dtype=np.int32)


<HDF5 dataset "data_train": shape (500000, 10326), type "<i4">

In [75]:
# Fill the pre-allocated "data_train" dataset in chunks so that only one dense
# block is held in memory at a time.
# The chunk boundaries are derived from len(line_train) rather than assuming
# exactly 10 chunks of 50000 rows.
CHUNK_ROWS = 50000
n_train_rows = line_train.shape[0]
for i, start in enumerate(range(0, n_train_rows, CHUNK_ROWS)):
    print(i)  # progress: index of the chunk being written
    stop = min(start + CHUNK_ROWS, n_train_rows)
    hf["data_train"][start:stop] = matrix[:, line_train[start:stop]][mask_big].T.toarray()

0
1
2
3
4
5
6
7
8
9


In [76]:

# Close data_big.hdf5 now that data_train has been filled.
hf.close()