Merge pull request #46 from sansomlab/sns_fix_hashing

Sns fix hashing
sansomlab · Sep 1, 2023 · a8d5e14 · a8d5e14
2 parents 56b880d + febc1b4
commit a8d5e14
Show file tree

Hide file tree

Showing 7 changed files with 68 additions and 45 deletions.
diff --git a/cellhub/pipeline_cellranger.py b/cellhub/pipeline_cellranger.py
@@ -44,25 +44,6 @@
 
 * "sample_id" a unique identifier for the biological sample being analysed
 * "library_id" is a unique identifier for the sequencing libraries generated from a single channel on a single 10x chip. Use the same "library_id" for Gene Expression, Antibody Capture, VDJ-T and VDJ-B libraries that are generated from the same channel.
-* "chemistry": The 10x reaction chemistry, the options are: 
-
-  * 'auto' for autodetection, 
-  * 'threeprime' for Single Cell 3', 
-  * 'fiveprime' for  Single Cell 5', 
-  * 'SC3Pv1',
-  * 'SC3Pv2',
-  * 'SC3Pv3', 
-  * 'SC5P-PE',
-  * 'SC5P-R2' for Single Cell 5', paired-end/R2-only,
-  * 'SC-FB' for Single Cell Antibody-only 3' v2 or 5'.
-  
-* "expect_cells": An integer specifying the expected number of cells
-
-It is recommended to include the following columns
-
-* "chip": a unique ID for the 10x Chip
-* "channel_id": an integer denoting the channel on the chip
-* "date": the date the 10x run was performed
 
 Additional arbitrary columns describing the sample metadata should be included
 to aid the downstream analysis, for example
@@ -74,6 +55,9 @@
 * "age"
 * "sex"
 
+For HTO hashing experiments, include a column containing details of the hash tag, e.g.
+
+* "hto_id"
 
 
 (ii) libraries.tsv
@@ -88,6 +72,26 @@
 * "feature_type": One of "Gene Expression", "Antibody Capture", "VDJ-T" or "VDJ-B". Case sensitive.
 * "fastq_path": the location of the folder containing the fastq files
 * "sample": this will be passed to the "--sample" parameter of the cellranger pipelines (see: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/fastq-input). It is only used to match the relevant FASTQ files: it does not have to match the "sample_id" provided in the "samples.tsv" table, and is not used in downstream analysis.
+* "chemistry": The 10x reaction chemistry, the options are: 
+
+  * 'auto' for autodetection, 
+  * 'threeprime' for Single Cell 3', 
+  * 'fiveprime' for  Single Cell 5', 
+  * 'SC3Pv1' for Single Cell 3' v1,
+  * 'SC3Pv2' for Single Cell 3' v2,
+  * 'SC3Pv3' for Single Cell 3' v3,
+  * 'SC5P-PE' for Single Cell 5' paired-end,
+  * 'SC5P-R2' for Single Cell 5' R2-only,
+  * 'SC-FB' for Single Cell Antibody-only 3' v2 or 5'.
+  
+* "expect_cells": An integer specifying the expected number of cells
+
+It is recommended to include the following columns
+
+* "chip": a unique ID for the 10x Chip
+* "channel_id": an integer denoting the channel on the chip
+* "date": the date the 10x run was performed
+
 
 Note: Use of the cellranger "--lanes" parameter is not supported. This means that data from all the lanes present in the given location for the given "sample" prefix will be run. This applies to both Gene Expression and VDJ analysis. If you need to analyse data from a single lane, link the data from that lane into a different location. 
 
@@ -222,7 +226,7 @@ def count(infile, outfile):
 
     this_library_id = os.path.basename(infile)[:-len(".csv")]
 
-    library_parameters = S.samples[this_library_id]
+    library_parameters = S.library_parameters[this_library_id]
 
     # provide references for the present feature types
     lib_types = S.lib_types(this_library_id)
@@ -232,7 +236,7 @@ def count(infile, outfile):
         transcriptome = "--transcriptome=" + PARAMS["gex_reference"]
 
     if "Antibody Capture" in lib_types:
-        feature_ref =  "--feature-ref" + PARAMS["feature_reference"]
+        feature_ref =  "--feature-ref=" + PARAMS["feature_reference"]
 
     # add read trimming if specified
     r1len, r2len = "", ""

diff --git a/cellhub/pipeline_dehash.py b/cellhub/pipeline_dehash.py
@@ -84,7 +84,7 @@ def gmmDemux(infile, outfile):
         os.makedirs(gmm_working_dir)
 
     if PARAMS["hto_per_library"] == True:
-        HTOs = "_".join([PARAMS["hto"], library_id])
+        HTOs = PARAMS["hto_" + library_id]
     else:
         HTOs = PARAMS["hto_names"]
 

diff --git a/cellhub/tasks/samples.py b/cellhub/tasks/samples.py
@@ -64,16 +64,18 @@ def __init__(self,
                  library_tsv = None):
 
         samples = pd.read_csv(sample_tsv, sep="\t")
-        sample_cols = ["sample_id", "library_id", 
-                       "chemistry", "expect_cells"]
+        required_sample_cols = ["sample_id", "library_id"]
 
-        check_cols(samples,sample_cols,
+        check_cols(samples,required_sample_cols,
                    "samples.tsv")
 
         libs = pd.read_csv(library_tsv, sep="\t")
 
-        check_cols(libs, ["library_id","feature_type",
-                            "sample","fastq_path"],
+        required_library_cols = ["library_id","feature_type",
+                                 "sample","fastq_path",
+                                 "chemistry", "expect_cells"]
+
+        check_cols(libs, required_library_cols,
                    "libraries.tsv")
 
         self.known_feature_library_types = ["Gene Expression",
@@ -86,12 +88,10 @@ def __init__(self,
         check_values(libs, "feature_type", 
                      self.known_feature_library_types + self.known_vdj_types)
 
-        if not samples["library_id"].is_unique:
-            raise ValueError("Non-unique library_ids provided")
-
-        libraries = samples["library_id"].values
+        if not samples["sample_id"].is_unique:
+            raise ValueError("Non-unique sample_ids provided")
 
-        samples.index = samples["library_id"]
+        samples.index = samples["sample_id"]
         self.samples = samples.to_dict(orient="index")
 
         libs.sort_values(["library_id",
@@ -100,6 +100,17 @@ def __init__(self,
 
         self.libs = libs
 
+        # make a dictionary, keyed by library_id, for looking up the "expect_cells" and "chemistry" values
+        library_parameters = libs[["library_id","chemistry","expect_cells"]].drop_duplicates()
+
+        if not library_parameters["library_id"].is_unique:
+            raise ValueError('Different "chemistry" and/or "expect_cells" values are mapped'
+                             ' to the same library_id in the libraries.tsv file')
+
+        library_parameters.index = library_parameters["library_id"]
+
+        self.library_parameters = library_parameters.to_dict(orient="index")
+
         self.feature_types = set(libs["feature_type"])
 
 
@@ -132,12 +143,14 @@ def get_samples_and_fastqs(self, library_id, feature_type):
         # check that the fastq paths are different (see note in pipeline_cellranger.py)
         fqps = [x.strip() for x in fastq_path.split(",")]
 
-        if len(set(fqps)) < len(fqps):
+        if feature_type in ["VDJ-T", "VDJ-B"]:
+
+            if len(set(fqps)) < len(fqps):
 
-            raise ValueError("Duplicate FASTQ paths detected for data from different"
-                             " flow cells. This is not supported by the 'cellranger vdj' command." 
-                             " VDJ data from different flow cells must be arranged in different" 
-                             " folders. See note in pipeline_cellranger.py")
+                raise ValueError("Duplicate FASTQ paths detected for data from different"
+                                 " flow cells. This is not supported by the 'cellranger vdj' command." 
+                                 " VDJ data from different flow cells must be arranged in different" 
+                                 " folders. See note in pipeline_cellranger.py")
 
         sample = x["sample"].values[0]
 

diff --git a/examples/ifnb_pbmc/cellhub/libraries.tsv b/examples/ifnb_pbmc/cellhub/libraries.tsv
@@ -0,0 +1,3 @@
+library_id	sample	feature_type	fastq_path	expect_cells	chemistry
+GSM2560248	bamtofastq	Gene Expression	/well/kir/mirror/geo/GSE96583/GSM2560248/	8000	auto
+GSM2560249	bamtofastq	Gene Expression	/well/kir/mirror/geo/GSE96583/GSM2560249/	8000	auto
diff --git a/examples/pbmc10k/libraries.tsv b/examples/pbmc10k/libraries.tsv
@@ -1,6 +1,6 @@
-library_id	sample	feature_type	fastq_path
-10k_PBMC_5pv2	10k_PBMC_5pv2_nextgem_Chromium_Controller_gex_1	Gene Expression	/well/kir/mirror/10xgenomics/10k_human_pbmc_v2/10k_PBMC_5pv2_nextgem_Chromium_Controller_fastqs/10k_PBMC_5pv2_nextgem_Chromium_Controller_gex_fastqs
-10k_PBMC_5pv2	10k_PBMC_5pv2_nextgem_Chromium_Controller_gex_2	Gene Expression	/well/kir/mirror/10xgenomics/10k_human_pbmc_v2/10k_PBMC_5pv2_nextgem_Chromium_Controller_fastqs/10k_PBMC_5pv2_nextgem_Chromium_Controller_gex_fastqs
-10k_PBMC_5pv2	10k_PBMC_5pv2_nextgem_Chromium_Controller_vdj_t_1	VDJ-T	/well/kir/mirror/10xgenomics/10k_human_pbmc_v2/10k_PBMC_5pv2_nextgem_Chromium_Controller_fastqs/10k_PBMC_5pv2_nextgem_Chromium_Controller_vdj_t_1_fastqs
-10k_PBMC_5pv2	10k_PBMC_5pv2_nextgem_Chromium_Controller_vdj_t_2	VDJ-T	/well/kir/mirror/10xgenomics/10k_human_pbmc_v2/10k_PBMC_5pv2_nextgem_Chromium_Controller_fastqs/10k_PBMC_5pv2_nextgem_Chromium_Controller_vdj_t_2_fastqs
-10k_PBMC_5pv2	10k_PBMC_5pv2_nextgem_Chromium_Controller_vdj_b_1	VDJ-B	/well/kir/mirror/10xgenomics/10k_human_pbmc_v2/10k_PBMC_5pv2_nextgem_Chromium_Controller_fastqs/10k_PBMC_5pv2_nextgem_Chromium_Controller_vdj_b_fastqs
+library_id	sample	feature_type	fastq_path	chemistry	expect_cells
+10k_PBMC_5pv2	10k_PBMC_5pv2_nextgem_Chromium_Controller_gex_1	Gene Expression	/well/kir/mirror/10xgenomics/10k_human_pbmc_v2/10k_PBMC_5pv2_nextgem_Chromium_Controller_fastqs/10k_PBMC_5pv2_nextgem_Chromium_Controller_gex_fastqs	auto	10000
+10k_PBMC_5pv2	10k_PBMC_5pv2_nextgem_Chromium_Controller_gex_2	Gene Expression	/well/kir/mirror/10xgenomics/10k_human_pbmc_v2/10k_PBMC_5pv2_nextgem_Chromium_Controller_fastqs/10k_PBMC_5pv2_nextgem_Chromium_Controller_gex_fastqs	auto	10000
+10k_PBMC_5pv2	10k_PBMC_5pv2_nextgem_Chromium_Controller_vdj_t_1	VDJ-T	/well/kir/mirror/10xgenomics/10k_human_pbmc_v2/10k_PBMC_5pv2_nextgem_Chromium_Controller_fastqs/10k_PBMC_5pv2_nextgem_Chromium_Controller_vdj_t_1_fastqs	auto	10000
+10k_PBMC_5pv2	10k_PBMC_5pv2_nextgem_Chromium_Controller_vdj_t_2	VDJ-T	/well/kir/mirror/10xgenomics/10k_human_pbmc_v2/10k_PBMC_5pv2_nextgem_Chromium_Controller_fastqs/10k_PBMC_5pv2_nextgem_Chromium_Controller_vdj_t_2_fastqs	auto	10000
+10k_PBMC_5pv2	10k_PBMC_5pv2_nextgem_Chromium_Controller_vdj_b_1	VDJ-B	/well/kir/mirror/10xgenomics/10k_human_pbmc_v2/10k_PBMC_5pv2_nextgem_Chromium_Controller_fastqs/10k_PBMC_5pv2_nextgem_Chromium_Controller_vdj_b_fastqs	auto	10000
diff --git a/examples/pbmc10k/samples.tsv b/examples/pbmc10k/samples.tsv
@@ -1,2 +1,2 @@
-sample_id	library_id	chemistry	expect_cells
-10k_PBMC_5pv2	10k_PBMC_5pv2	auto	10000
+sample_id	library_id
+10k_PBMC_5pv2	10k_PBMC_5pv2
diff --git a/python/fetch_cells_from_h5.py b/python/fetch_cells_from_h5.py
@@ -73,6 +73,9 @@
                            library_id,
                            "h5","data.h5")
 
+    if not os.path.exists(h5_path):
+        raise ValueError("h5 path does not exist: " + h5_path)
+
     try:
 
         x = sc.read_10x_h5(h5_path)