In [1]:
import glob
import os
import sys
import numpy as np
import pandas as pd
import math

import scanpy as sc
import pandas as pd

from scipy.io import mmread
from scipy.sparse import csr_matrix
from scipy.sparse import vstack
import gzip
import re
import mygene

import tqdm.notebook as tq
import matplotlib.pyplot as plt

from collections import Counter
from utils import write_as_singles

In [2]:
# Root directory (note the trailing slash) that the glob cells in this notebook
# search recursively for .h5 and obs.csv files.
path = "/n/data1/hms/cellbio/sander/judy/resource_paper/"

In [3]:
# Gather every .h5 file below `path`, dropping any whose path contains one of
# the tags "preprocessed", "_qc", "_normalized" or "McFarland".
h5files = [
    candidate
    for candidate in glob.glob(path + "**/*.h5", recursive=True)
    if not any(
        tag in candidate
        for tag in ("preprocessed", "_qc", "_normalized", "McFarland")
    )
]

In [41]:
# Number of entries currently held in h5files.
len(h5files)

21

In [42]:
# Display the selected file paths.
h5files

['/n/data1/hms/cellbio/sander/judy/resource_paper/GasperiniShendure2019/highMOI/GasperiniShendure2019_highMOI.h5',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/GasperiniShendure2019/lowMOI/GasperiniShendure2019_lowMOI.h5',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/GasperiniShendure2019/atscale/GasperiniShendure2019_atscale.h5',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRi/CRISPRi1/TianKampmann2021_CRISPRi1.h5',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRi/CRISPRi4/TianKampmann2021_CRISPRi4.h5',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRi/CRISPRi3/TianKampmann2021_CRISPRi3.h5',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRi/CRISPRi2/TianKampmann2021_CRISPRi2.h5',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRa/CRISPRa1/TianKampmann2021_CRISPRa1.h5',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRa/CRISPRa

In [4]:
def _axis_sum(matrix, axis):
    """Sum ``matrix`` along ``axis`` and return a flat list of Python numbers.

    Works for scipy sparse matrices (whose ``sum`` returns a 2-D ``np.matrix``)
    as well as for dense numpy arrays (whose ``sum`` returns a 1-D array).
    """
    return np.asarray(matrix.sum(axis=axis)).ravel().tolist()


def _prefix_mask(names, prefixes):
    """Boolean array, True where the upper-cased name starts with any of ``prefixes``."""
    return np.array([str(name).upper().startswith(prefixes) for name in names], dtype=bool)


def _percent(part, total):
    """Element-wise ``part / total * 100``; entries with ``total == 0`` become NaN."""
    # 0/0 is expected for empty cells, so silence numpy's warning and keep the NaN.
    with np.errstate(divide="ignore", invalid="ignore"):
        return np.array(part) / np.array(total) * 100


def qc_h5(h5file, writeadata=False):
    """Read an AnnData .h5 file and annotate it with QC metrics.

    Columns added to ``adata.obs`` (one value per cell):
        nperts        0 for "control", otherwise the number of "_"-separated
                      tokens in ``obs.perturbation``
        ngenes        number of genes with a value > 0
        ncounts       sum of ``X`` over all genes
        percent_mito  percentage of counts from genes starting with "MT-"
        percent_ribo  percentage of counts from genes starting with "RPS"/"RPL"
        percent_hemo  percentage of counts from genes starting with "HBA"/"HBB";
                      only added when ``obs.tissue_type`` contains "primary"
    Gene-name prefixes are matched case-insensitively. Percentages are NaN for
    cells whose total count is zero.

    Columns added to ``adata.var`` (one value per gene):
        ncounts       sum of ``X`` over all cells
        ncells        number of cells with a value > 0

    Parameters
    ----------
    h5file : str
        Path of the input file, read with ``sc.read_h5ad``.
    writeadata : bool, default False
        If True, write the annotated object to ``<name>_qc.h5`` in the same
        directory as ``h5file`` (``<name>`` is the file name up to its first
        "."), replacing an existing file, and return None.
        If False, return the annotated object without writing anything.

    Returns
    -------
    The annotated AnnData object when ``writeadata`` is False, otherwise None.
    """
    directory = os.path.dirname(h5file)
    prefix = os.path.basename(h5file).split(".")[0]

    adata = sc.read_h5ad(h5file)
    adata.var.index = adata.var.index.astype("str")
    adata.obs.index = adata.obs.index.astype("str")
    adata.obs_names_make_unique()
    adata.var_names_make_unique()

    ## Perturbation-wise

    # number of targets per cell: 0 for controls, otherwise count the "_"-joined names
    adata.obs["nperts"] = [
        0 if x == "control" else len(x.split("_")) for x in adata.obs.perturbation
    ]

    ## Cell-wise

    # number of genes expressed
    adata.obs["ngenes"] = _axis_sum(adata.X > 0, axis=1)
    # number of counts in total
    n_counts_per_cell = _axis_sum(adata.X, axis=1)
    adata.obs["ncounts"] = n_counts_per_cell

    def percent_of_total(prefixes):
        # share of each cell's counts that falls on genes with the given name prefixes
        gene_mask = _prefix_mask(adata.var.index, prefixes)
        return _percent(_axis_sum(adata.X[:, gene_mask], axis=1), n_counts_per_cell)

    # percentage of mitochondrial genes
    adata.obs["percent_mito"] = percent_of_total(("MT-",))
    # percentage of ribosomal genes
    adata.obs["percent_ribo"] = percent_of_total(("RPS", "RPL"))
    # percentage of hemoglobin genes, only for datasets that contain primary tissue;
    # a missing tissue_type column is treated as "no primary tissue" instead of crashing
    if "tissue_type" in adata.obs.columns and "primary" in list(adata.obs["tissue_type"]):
        adata.obs["percent_hemo"] = percent_of_total(("HBA", "HBB"))

    ## Gene-wise

    # number of counts of each gene across all cells
    adata.var["ncounts"] = _axis_sum(adata.X, axis=0)
    # number of cells in which each gene is expressed
    adata.var["ncells"] = _axis_sum(adata.X > 0, axis=0)

    if writeadata:
        # os.path.join keeps a bare file name relative (no accidental "/<name>_qc.h5")
        out_path = os.path.join(directory, prefix + "_qc.h5")
        # remove a stale output first; pure Python instead of shelling out to `rm`
        if os.path.exists(out_path):
            os.remove(out_path)
        adata.write(out_path)
    else:
        return adata

In [58]:
# NOTE(review): no top-level cell visible above assigns `adata` (qc_h5 only uses a
# local of that name), so this cell appears to rely on leftover kernel state --
# confirm whether it is still needed.
adata.var_names_make_unique()

In [60]:
# Run QC on every file from the fourth entry of h5files onwards; with
# writeadata=True qc_h5 writes a "<name>_qc.h5" next to each input.
# NOTE(review): the [3:] offset depends on the order glob returned the files --
# confirm that skipping the first three entries is intended.
for h5file in h5files[3:]:
    qc_h5(h5file, writeadata=True)

Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Observation names are not unique. To make them unique, call `.obs_names_make_u

In [64]:
# QC the McFarland dataset, which the general file list excludes.
# "*.h5" also matches the "<name>_qc.h5" that qc_h5 writes into this same folder, so
# derived files are filtered out (same tags as the general file list); otherwise a
# re-run could pick the QC output as input and produce "<name>_qc_qc.h5".
mcfarlandh5 = sorted(
    x
    for x in glob.glob('/n/data1/hms/cellbio/sander/judy/resource_paper/McFarlandTshemiak2020/*.h5')
    if not any(tag in x for tag in ("preprocessed", "_qc", "_normalized"))
)
if not mcfarlandh5:
    raise FileNotFoundError("no raw .h5 file found in McFarlandTshemiak2020/")
qc_h5(mcfarlandh5[0], writeadata=True)

Observation names are not unique. To make them unique, call `.obs_names_make_unique`.


In [6]:
# Every "*qc.h5" file below `path` except McFarland paths, followed by the single
# McFarland QC file given explicitly.
h5files = [
    qc_file
    for qc_file in glob.glob(path + "**/*qc.h5", recursive=True)
    if "McFarland" not in qc_file
] + ['/n/data1/hms/cellbio/sander/judy/resource_paper/McFarlandTshemiak2020/all_expts_combined_qc.h5']

In [5]:
# Number of entries currently held in h5files.
len(h5files)

22

In [8]:
# Display the current contents of h5files.
h5files

['/n/data1/hms/cellbio/sander/judy/resource_paper/PapalexiSatija2021/eccite_arrayed/PapalexiSatija2021_eccite_arrayed_protein_qc.h5',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/PapalexiSatija2021/eccite_arrayed/PapalexiSatija2021_eccite_arrayed_RNA_qc.h5',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/PapalexiSatija2021/eccite/PapalexiSatija2021_eccite_RNA_qc.h5',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/PapalexiSatija2021/eccite/PapalexiSatija2021_eccite_protein_qc.h5',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/FrangiehIzar2021/FrangiehIzar2021/FrangiehIzar2021_protein_qc.h5',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/FrangiehIzar2021/FrangiehIzar2021/FrangiehIzar2021_RNA_qc.h5']

In [7]:
# Four combined TianKampmann input files, listed explicitly.
tians = ['/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRi/TianKampmann2021_CRISPRi.h5',
        '/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRa/TianKampmann2021_CRISPRa.h5',
        '/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2019/day7neuron/TianKampmann2019_day7neuron.h5',
        '/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2019/iPSC/TianKampmann2019_iPSC.h5']
# Run QC on each; with writeadata=True qc_h5 writes "<name>_qc.h5" next to the input.
for h5file in tians:
    qc_h5(h5file, writeadata=True)

Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [8]:
# QC outputs of the four combined TianKampmann files.
tians = [
    '/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRi/TianKampmann2021_CRISPRi_qc.h5',
    '/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRa/TianKampmann2021_CRISPRa_qc.h5',
    '/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2019/day7neuron/TianKampmann2019_day7neuron_qc.h5',
    '/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2019/iPSC/TianKampmann2019_iPSC_qc.h5',
]
for h5file in tians:
    # ".../<directory>/<prefix>/<file>": the grandparent folder is the target
    # directory and the parent folder's name is the prefix
    directory, prefix = h5file.rsplit("/", 2)[:2]
    adata = sc.read_h5ad(h5file)
    write_as_singles(adata, directory, prefix)

  warn(f'{folder} already exists. Possibly overwriting files...')
  warn(f'{folder} already exists. Possibly overwriting files...')
  warn(f'{folder} already exists. Possibly overwriting files...')
  warn(f'{folder} already exists. Possibly overwriting files...')


In [12]:
# Export every QC file from the third entry of h5files onwards.
for h5file in h5files[2:]:
    path_parts = h5file.split("/")
    parent_dir = "/".join(path_parts[:-1])
    if "_protein" in h5file:
        # modality files: target is the file's own folder, fixed prefix
        directory, prefix = parent_dir, "protein"
    elif "_RNA" in h5file:
        directory, prefix = parent_dir, "RNA"
    else:
        # everything else: target is the grandparent folder, prefix is the parent folder's name
        directory, prefix = "/".join(path_parts[:-2]), path_parts[-2]
    adata = sc.read_h5ad(h5file)
    write_as_singles(adata, directory, prefix)

  warn(f'{folder} already exists. Possibly overwriting files...')
  warn(f'{folder} already exists. Possibly overwriting files...')
  warn(f'{folder} already exists. Possibly overwriting files...')
  warn(f'{folder} already exists. Possibly overwriting files...')
  warn(f'{folder} already exists. Possibly overwriting files...')
  warn(f'{folder} already exists. Possibly overwriting files...')
  warn(f'{folder} already exists. Possibly overwriting files...')
  warn(f'{folder} already exists. Possibly overwriting files...')
  warn(f'{folder} already exists. Possibly overwriting files...')
  warn(f'{folder} already exists. Possibly overwriting files...')
  warn(f'{folder} already exists. Possibly overwriting files...')
  warn(f'{folder} already exists. Possibly overwriting files...')
  warn(f'{folder} already exists. Possibly overwriting files...')
  warn(f'{folder} already exists. Possibly overwriting files...')
  warn(f'{folder} already exists. Possibly overwriting files...')
  warn(f'{

In [11]:
# Re-export the per-modality (RNA / protein) QC files with add_h5=True.
for h5file in h5files:
    # "_RNA" is tested first so that it wins if both tags occur in a path.
    if "_RNA" in h5file:
        prefix = "RNA"
    elif "_protein" in h5file:
        prefix = "protein"
    else:
        # Files without a modality tag have no defined prefix. Skip them rather than
        # fail with a NameError or silently reuse the previous iteration's prefix.
        continue
    directory = os.path.dirname(h5file)
    adata = sc.read_h5ad(h5file)
    write_as_singles(adata, directory, prefix, add_h5=True)

In [4]:
# Collect every obs.csv found at any depth below `path`.
obss = glob.glob(path+"**/obs.csv",recursive=True)

In [5]:
# Display the obs.csv paths that were found.
obss

['/n/data1/hms/cellbio/sander/judy/resource_paper/GasperiniShendure2019/highMOI/obs.csv',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/GasperiniShendure2019/lowMOI/obs.csv',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/GasperiniShendure2019/atscale/obs.csv',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRi/CRISPRi1/obs.csv',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRi/CRISPRi4/obs.csv',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRi/CRISPRi3/obs.csv',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRi/CRISPRi2/obs.csv',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRa/CRISPRa1/obs.csv',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRa/CRISPRa2/obs.csv',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/McFarlandTshemiak2020/obs.csv',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/NormanWeissman2019/raw/obs.csv',
 '

In [18]:
# For each obs.csv, look up the QC .h5 file it belongs to.
h5files = []
for obs in obss:
    # An obs.csv whose path contains "RNA" or "protein" is matched against the
    # folder two levels up; any other obs.csv against its own folder.
    if "RNA" in obs:
        levels_up, pattern = 2, "/*RNA_qc.h5"
    elif "protein" in obs:
        levels_up, pattern = 2, "/*protein_qc.h5"
    else:
        levels_up, pattern = 1, "/*_qc.h5"
    directory = '/'.join(obs.split("/")[:-levels_up])
    h5file = glob.glob(directory + pattern)
    # keep the first match (raises IndexError when nothing matches)
    h5files.append(h5file[0])

In [21]:
# Display the QC files that were matched to the obs.csv files.
h5files

['/n/data1/hms/cellbio/sander/judy/resource_paper/GasperiniShendure2019/highMOI/GasperiniShendure2019_highMOI_qc.h5',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/GasperiniShendure2019/lowMOI/GasperiniShendure2019_lowMOI_qc.h5',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/GasperiniShendure2019/atscale/GasperiniShendure2019_atscale_qc.h5',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRi/CRISPRi1/TianKampmann2021_CRISPRi1_qc.h5',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRi/CRISPRi4/TianKampmann2021_CRISPRi4_qc.h5',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRi/CRISPRi3/TianKampmann2021_CRISPRi3_qc.h5',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRi/CRISPRi2/TianKampmann2021_CRISPRi2_qc.h5',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRa/CRISPRa1/TianKampmann2021_CRISPRa1_qc.h5',
 '/n/data1/hms/cellbio/sander/judy/resource_paper/TianKamp

In [38]:
for h5file in h5files:
    if "RNA" in h5file:
        directory = '/'.join(h5file.split("/")[8:-1])+"/RNA/"
    elif "protein" in h5file:
        directory = '/'.join(h5file.split("/")[8:-1])+"/protein/"        
    else:
        directory = '/'.join(h5file.split("/")[8:-1])+"/"
    filename = "_".join(h5file.split("/")[-1].split("_")[:-1])+".h5"
    if filename == "all_expts_combined.h5":
        directory = "McFarlandTshemiak2020/McFarlandTshemiak2020/"
        filename = "McFarlandTshemiak2020_all_expts_combined.h5"
    !/home/cs308/dropbox_uploader.sh upload {h5file} {directory+filename}
    !/home/cs308/dropbox_uploader.sh upload {'/n/data1/hms/cellbio/sander/judy/resource_paper/'+directory+'obs.csv'} {directory}
    !/home/cs308/dropbox_uploader.sh upload {'/n/data1/hms/cellbio/sander/judy/resource_paper/'+directory+'var.csv'} {directory}
    !/home/cs308/dropbox_uploader.sh upload {'/n/data1/hms/cellbio/sander/judy/resource_paper/'+directory+'counts.mtx.gz'} {directory}
#    print(filename)
#    print(directory)

which: no shasum in (/home/cs308/cellbox_analysis/bin:/n/app/R/3.6.1/bin:/n/app/pango/1.40.3/include:/n/app/pango/1.40.3/bin:/n/app/cairo/1.14.6/include/cairo:/n/app/cairo/1.14.6/bin:/n/app/harfbuzz/1.3.4/bin:/n/app/fontconfig/2.12.1/bin:/n/app/libffi/3.2.1/lib/libffi-3.2.1/include:/n/app/freetype/2.7/include/freetype2:/n/app/freetype/2.7/bin:/n/app/libxml2/2.9.4/include/libxml2:/n/app/libxml2/2.9.4/bin:/n/app/glib/2.50.2/bin:/n/app/hdf5/1.10.1/include:/n/app/hdf5/1.10.1/bin:/n/app/gsl/2.3/include:/n/app/gsl/2.3/bin:/n/app/boost/1.62.0/include:/n/app/xz/5.2.3/include:/n/app/xz/5.2.3/bin:/n/app/pixman/0.34.0/include/pixman-1:/n/app/pixman/0.34.0/include:/n/app/tiff/4.0.7/include:/n/app/tiff/4.0.7/bin:/n/app/libpng/1.6.26/include:/n/app/libpng/1.6.26/bin:/n/app/jpeg/9b/include:/n/app/jpeg/9b/bin:/n/app/java/jdk-1.8u112/bin:/n/app/openblas/0.2.19/include:/n/app/python/3.6.0/include:/n/app/python/3.6.0/bin:/n/app/gcc/6.2.0/bin:/n/groups/marks/software/anaconda_o2/bin:/n/cluster/bin:/opt/si

 > Uploading "/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRi/CRISPRi1/TianKampmann2021_CRISPRi1_qc.h5" to "/TianKampmann2021/CRISPRi/CRISPRi1/TianKampmann2021_CRISPRi1.h5" by 6 chunks ...... DONE
 > Uploading "/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRi/CRISPRi1/obs.csv" to "/TianKampmann2021/CRISPRi/CRISPRi1/obs.csv"... DONE
 > Uploading "/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRi/CRISPRi1/var.csv" to "/TianKampmann2021/CRISPRi/CRISPRi1/var.csv"... DONE
 > Uploading "/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRi/CRISPRi1/counts.mtx.gz" to "/TianKampmann2021/CRISPRi/CRISPRi1/counts.mtx.gz"... DONE
which: no shasum in (/home/cs308/cellbox_analysis/bin:/n/app/R/3.6.1/bin:/n/app/pango/1.40.3/include:/n/app/pango/1.40.3/bin:/n/app/cairo/1.14.6/include/cairo:/n/app/cairo/1.14.6/bin:/n/app/harfbuzz/1.3.4/bin:/n/app/fontconfig/2.12.1/bin:/n/app/libffi/3.2.1/lib/libffi-3.2.1/include:

 > Uploading "/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRa/CRISPRa1/TianKampmann2021_CRISPRa1_qc.h5" to "/TianKampmann2021/CRISPRa/CRISPRa1/TianKampmann2021_CRISPRa1.h5" by 6 chunks ...... DONE
 > Uploading "/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRa/CRISPRa1/obs.csv" to "/TianKampmann2021/CRISPRa/CRISPRa1/obs.csv"... DONE
 > Uploading "/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRa/CRISPRa1/var.csv" to "/TianKampmann2021/CRISPRa/CRISPRa1/var.csv"... DONE
 > Uploading "/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRa/CRISPRa1/counts.mtx.gz" to "/TianKampmann2021/CRISPRa/CRISPRa1/counts.mtx.gz"... DONE
which: no shasum in (/home/cs308/cellbox_analysis/bin:/n/app/R/3.6.1/bin:/n/app/pango/1.40.3/include:/n/app/pango/1.40.3/bin:/n/app/cairo/1.14.6/include/cairo:/n/app/cairo/1.14.6/bin:/n/app/harfbuzz/1.3.4/bin:/n/app/fontconfig/2.12.1/bin:/n/app/libffi/3.2.1/lib/libffi-3.2.1/include:

 > Uploading "/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2019/day7neuron/day7neuron2/var.csv" to "/TianKampmann2019/day7neuron/day7neuron2/var.csv"... DONE
 > Uploading "/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2019/day7neuron/day7neuron2/counts.mtx.gz" to "/TianKampmann2019/day7neuron/day7neuron2/counts.mtx.gz" by 4 chunks .... DONE
which: no shasum in (/home/cs308/cellbox_analysis/bin:/n/app/R/3.6.1/bin:/n/app/pango/1.40.3/include:/n/app/pango/1.40.3/bin:/n/app/cairo/1.14.6/include/cairo:/n/app/cairo/1.14.6/bin:/n/app/harfbuzz/1.3.4/bin:/n/app/fontconfig/2.12.1/bin:/n/app/libffi/3.2.1/lib/libffi-3.2.1/include:/n/app/freetype/2.7/include/freetype2:/n/app/freetype/2.7/bin:/n/app/libxml2/2.9.4/include/libxml2:/n/app/libxml2/2.9.4/bin:/n/app/glib/2.50.2/bin:/n/app/hdf5/1.10.1/include:/n/app/hdf5/1.10.1/bin:/n/app/gsl/2.3/include:/n/app/gsl/2.3/bin:/n/app/boost/1.62.0/include:/n/app/xz/5.2.3/include:/n/app/xz/5.2.3/bin:/n/app/pixman/0.34.0/include/pixma

In [9]:
for h5file in tians:
    directory = '/'.join(h5file.split("/")[8:-1])+"/"
    filename = "_".join(h5file.split("/")[-1].split("_")[:-1])+".h5"
    !/home/cs308/dropbox_uploader.sh upload {h5file} {directory+filename}
    !/home/cs308/dropbox_uploader.sh upload {'/n/data1/hms/cellbio/sander/judy/resource_paper/'+directory+'obs.csv'} {directory}
    !/home/cs308/dropbox_uploader.sh upload {'/n/data1/hms/cellbio/sander/judy/resource_paper/'+directory+'var.csv'} {directory}
    !/home/cs308/dropbox_uploader.sh upload {'/n/data1/hms/cellbio/sander/judy/resource_paper/'+directory+'counts.mtx.gz'} {directory}
#    print(filename)
#    print(directory)

 > Uploading "/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRi/TianKampmann2021_CRISPRi_qc.h5" to "/TianKampmann2021/CRISPRi/TianKampmann2021_CRISPRi.h5" by 22 chunks ...................... DONE
 > Uploading "/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRi/obs.csv" to "/TianKampmann2021/CRISPRi/obs.csv"... DONE
 > Uploading "/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRi/var.csv" to "/TianKampmann2021/CRISPRi/var.csv"... DONE
 > Uploading "/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRi/counts.mtx.gz" to "/TianKampmann2021/CRISPRi/counts.mtx.gz" by 9 chunks ......... DONE
 > Uploading "/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRa/TianKampmann2021_CRISPRa_qc.h5" to "/TianKampmann2021/CRISPRa/TianKampmann2021_CRISPRa.h5" by 12 chunks ............ DONE
 > Uploading "/n/data1/hms/cellbio/sander/judy/resource_paper/TianKampmann2021/CRISPRa/obs.csv" to "/TianKampman

In [49]:
# Print each file's name followed by its size in GiB (bytes / 1024**3).
for h5file in h5files:
    print(os.path.basename(h5file))
    print(os.path.getsize(h5file) / 1024 ** 3)

GasperiniShendure2019_highMOI_qc.h5
1.375312639400363
GasperiniShendure2019_lowMOI_qc.h5
1.1704791374504566
GasperiniShendure2019_atscale_qc.h5
5.696730896830559
TianKampmann2021_CRISPRi1_qc.h5
0.27218893356621265
TianKampmann2021_CRISPRi4_qc.h5
0.29750340059399605
TianKampmann2021_CRISPRi3_qc.h5
0.23738515190780163
TianKampmann2021_CRISPRi2_qc.h5
0.27679960802197456
TianKampmann2021_CRISPRa1_qc.h5
0.2928194282576442
TianKampmann2021_CRISPRa2_qc.h5
0.26540362648665905
all_expts_combined_qc.h5
5.340465493500233
NormanWeissman2019_raw_qc.h5
2.74808413349092
NormanWeissman2019_filtered_qc.h5
2.712737698107958
PapalexiSatija2021_eccite_arrayed_protein_qc.h5
0.0011401474475860596
PapalexiSatija2021_eccite_arrayed_RNA_qc.h5
0.17370563372969627
PapalexiSatija2021_eccite_protein_qc.h5
0.002593059092760086
PapalexiSatija2021_eccite_RNA_qc.h5
0.5222308021038771
TianKampmann2019_day7neuron1_qc.h5
0.4456041231751442
TianKampmann2019_day7neuron2_qc.h5
0.4140484854578972
TianKampmann2019_iPSC1_qc.h5