In [7]:
import os

DATA_FOLDER = '../00_data'
FIGURE_FOLDER = '../04_figures'

notebook_name = '003_make_combined_singlecell_fibroblast_datasets'

data_folder = os.path.join(DATA_FOLDER, notebook_name)
figure_folder = os.path.join(FIGURE_FOLDER, notebook_name)

print('Data folder for notebook:', data_folder)
print('Figure folder for notebook:', figure_folder)

! mkdir -p $figure_folder
! mkdir -p $data_folder

Data folder for notebook: ../00_data/003_make_combined_singlecell_fibroblast_datasets
Figure folder for notebook: ../04_figures/003_make_combined_singlecell_fibroblast_datasets


In [8]:
# Numerical python
import numpy as np

# Pandas for dataframes
import pandas as pd

# Labeled N-dimensional arrays
import xarray as xr

In [9]:
%matplotlib inline

In [14]:
input_folder = os.path.join(DATA_FOLDER, '002_sum_counts_with_same_gene_symbol')
! ls -lh $input_folder

total 36536
-rw-r--r--  1 olgabot  staff   2.6M Sep 29 14:56 Group1.matrix.csv.gz
-rw-r--r--  1 olgabot  staff   2.6M Sep 29 14:50 Group2.matrix.csv.gz
-rw-r--r--  1 olgabot  staff   6.2M Sep 29 14:54 Group3.matrix.csv.gz
-rw-r--r--  1 olgabot  staff    33K Sep 29 14:54 Group4.matrix.csv.gz
-rw-r--r--  1 olgabot  staff   4.3M Sep 29 14:26 Group5.matrix.csv.gz
-rw-r--r--  1 olgabot  staff   2.0M Sep 29 14:27 Group8.matrix.csv.gz


In [15]:
group_numbers = 1, 2, 3, 4, 5, 8

In [35]:
%%time
import glob

dfs = []

for n in group_numbers:
    print(f'--- Group #{n} ---')
    basename = f'Group{n}.matrix.csv.gz'
    filename = os.path.join(input_folder, basename)
    print('\t--- Time to read gzipped csv: ---')
    try:
        %time df = pd.read_csv(filename, compression='gzip', index_col=0)
    except EOFError:
        break
    
#     # Remove genes that are zero for all cells
#     all_zero = (df == 0).all(axis=1)
#     print('\tbefore:', df.shape)
#     df = df.loc[~all_zero]
#     print('\tafter:', df.shape)

    # Add group id "cNNNN" for the cell number
    df.columns = ['group{n}_{i}'.format(i=str(i).zfill(4), n=n) 
                  for i in range(len(df.columns))]
    
    print(f"\tAre the columns unique? {df.columns.is_unique}")
    dfs.append(df)

counts = pd.concat(dfs, axis=1)
counts = counts.fillna(0)
print('Concatenated data:', counts.shape)

--- Group #1 ---
	--- Time to read gzipped csv: ---
CPU times: user 5.83 s, sys: 630 ms, total: 6.46 s
Wall time: 6.47 s
	Are the columns unique? True
--- Group #2 ---
	--- Time to read gzipped csv: ---
CPU times: user 5.96 s, sys: 614 ms, total: 6.57 s
Wall time: 6.6 s
	Are the columns unique? True
--- Group #3 ---
	--- Time to read gzipped csv: ---
CPU times: user 46.2 s, sys: 4.02 s, total: 50.2 s
Wall time: 50.4 s
	Are the columns unique? True
--- Group #4 ---
	--- Time to read gzipped csv: ---
CPU times: user 29.2 s, sys: 3.27 s, total: 32.4 s
Wall time: 32.9 s
	Are the columns unique? True
--- Group #5 ---
	--- Time to read gzipped csv: ---
CPU times: user 9.28 s, sys: 936 ms, total: 10.2 s
Wall time: 10.2 s
	Are the columns unique? True
--- Group #8 ---
	--- Time to read gzipped csv: ---
CPU times: user 4.59 s, sys: 429 ms, total: 5.02 s
Wall time: 5.03 s
	Are the columns unique? True
(58828, 13300)
CPU times: user 1min 51s, sys: 32.6 s, total: 2min 23s
Wall time: 2min 31s


In [18]:
counts.head()

Unnamed: 0,group1_0000,group1_0001,group1_0002,group1_0003,group1_0004,group1_0005,group1_0006,group1_0007,group1_0008,group1_0009,...,group8_0790,group8_0791,group8_0792,group8_0793,group8_0794,group8_0795,group8_0796,group8_0797,group8_0798,group8_0799
5S_rRNA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5_8S_rRNA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7SK,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A1BG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A1BG-AS1,0,0,0,0,1,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [70]:
(counts.head() >= 0).any(axis=1)

5S_rRNA      True
5_8S_rRNA    True
7SK          True
A1BG         True
A1BG-AS1     True
dtype: bool

In [71]:
genes_all_zeros = (counts >= 0).any(axis=1)
print(genes_all_zeros.sum())

58828


## Make gene metadata

In [7]:
genes_metadata = counts.index.to_frame()
genes_metadata.index = genes_metadata.index.droplevel(-1)
genes_metadata = genes_metadata.drop(0, axis=1)
print(genes_metadata.shape)
genes_metadata.index.name = 'gene'
genes_metadata = genes_metadata.rename(columns={1:'symbol'})
genes_metadata.head()

NameError: name 'counts' is not defined

In [None]:
genes_metadata.groupby('symbol').size()

In [None]:
counts.index = genes_metadata.index
print(counts.shape)
counts.head()

In [None]:
counts.tail()

## Make cell metadata

In [37]:
cell_metadata = pd.DataFrame(index=counts.columns)
cell_metadata['group'] = [x.split('_')[0] for x in cell_metadata.index]
cell_metadata.head()

Unnamed: 0,group
group1_0000,group1
group1_0001,group1
group1_0002,group1
group1_0003,group1
group1_0004,group1


### Add cell number

In [38]:
cell_metadata['cell_number'] = [f'c{i+1}' for i in range(len(cell_metadata.index))]
print(cell_metadata.shape)
cell_metadata.head()

(13300, 2)


Unnamed: 0,group,cell_number
group1_0000,group1,c1
group1_0001,group1,c2
group1_0002,group1,c3
group1_0003,group1,c4
group1_0004,group1,c5


In [39]:
cell_metadata.tail()

Unnamed: 0,group,cell_number
group8_0795,group8,c13296
group8_0796,group8,c13297
group8_0797,group8,c13298
group8_0798,group8,c13299
group8_0799,group8,c13300


### Add permuted group name for negative controL

In [43]:
np.random.seed(0)

cell_metadata['group_permuted'] = np.random.permutation(cell_metadata['group'])
cell_metadata.head()

Unnamed: 0,group,cell_number,group_permuted
group1_0000,group1,c1,group3
group1_0001,group1,c2,group3
group1_0002,group1,c3,group2
group1_0003,group1,c4,group3
group1_0004,group1,c5,group3


In [51]:
cell_metadata.to_dict?

In [54]:
d = cell_metadata.head().to_dict(orient='list')
d

{'cell_number': ['c1', 'c2', 'c3', 'c4', 'c5'],
 'group': ['group1', 'group1', 'group1', 'group1', 'group1'],
 'group_permuted': ['group3', 'group3', 'group2', 'group3', 'group3']}

## Add metadata as "coordinates" for slicing

In [56]:
coords = {k: ('cell', v) for k, v in d.items()}
coords

{'cell_number': ('cell', ['c1', 'c2', 'c3', 'c4', 'c5']),
 'group': ('cell', ['group1', 'group1', 'group1', 'group1', 'group1']),
 'group_permuted': ('cell',
  ['group3', 'group3', 'group2', 'group3', 'group3'])}

In [57]:
cell_metadata_dict = cell_metadata.to_dict(orient='list')
coords = {k: ('cell', v) for k, v in cell_metadata_dict.items()}
coords.update(dict(gene=counts.index, cell=counts.columns))
print(coords.keys())

dict_keys(['group', 'cell_number', 'group_permuted', 'gene', 'cell'])


In [74]:
ds = xr.Dataset(
    {'counts': (['cell', 'gene'], counts.T),
#      'gene_metadata': (['gene', 'gene_feature'], genes_metadata),
#      'cell_metadata': (['cell', 'cell_feature'], cell_metadata)
    },
    coords=coords)
#        coords={'gene': counts.index, 
#                'cell': counts.columns, 
# #                'gene_feature': genes_metadata.columns,
#                'cell': cell_metadata.columns})
ds

<xarray.Dataset>
Dimensions:        (cell: 13300, cell_metadata: 3, gene: 58828)
Coordinates:
  * gene           (gene) object '5S_rRNA' '5_8S_rRNA' '7SK' 'A1BG' ...
  * cell           (cell) object 'group1_0000' 'group1_0001' 'group1_0002' ...
  * cell_metadata  (cell_metadata) object 'group' 'cell_number' 'group_permuted'
Data variables:
    counts         (cell, gene) int64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...

In [75]:
%%time

netcdf = os.path.join(data_folder, 'cshl-fibroblast.netcdf')
ds.to_netcdf(netcdf)

CPU times: user 1.38 s, sys: 3.42 s, total: 4.8 s
Wall time: 6.08 s


In [76]:
ls -lha $netcdf

-rw-r--r--  1 olgabot  staff   5.8G Sep 29 17:16 ../00_data/003_make_combined_singlecell_fibroblast_datasets/cshl-fibroblast.netcdf
