Skip to content

Commit

Permalink
Merge pull request #46 from sansomlab/sns_fix_hashing
Browse files Browse the repository at this point in the history
Sns fix hashing
  • Loading branch information
snsansom committed Sep 1, 2023
2 parents 56b880d + febc1b4 commit a8d5e14
Show file tree
Hide file tree
Showing 7 changed files with 68 additions and 45 deletions.
46 changes: 25 additions & 21 deletions cellhub/pipeline_cellranger.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,25 +44,6 @@
* "sample_id" is a unique identifier for the biological sample being analysed
* "library_id" is a unique identifier for the sequencing libraries generated from a single channel on a single 10x chip. Use the same "library_id" for Gene Expression, Antibody Capture, VDJ-T and VDJ-B libraries that are generated from the same channel.
* "chemistry": The 10x reaction chemistry, the options are:
* 'auto' for autodetection,
* 'threeprime' for Single Cell 3',
* 'fiveprime' for Single Cell 5',
* 'SC3Pv1',
* 'SC3Pv2',
* 'SC3Pv3',
* 'SC5P-PE',
* 'SC5P-R2' for Single Cell 5', paired-end/R2-only,
* 'SC-FB' for Single Cell Antibody-only 3' v2 or 5'.
* "expect_cells": An integer specifying the expected number of cells
It is recommended to include the following columns
* "chip": a unique ID for the 10x Chip
* "channel_id": an integer denoting the channel on the chip
* "date": the date the 10x run was performed
Additional arbitrary columns describing the sample metadata should be included
to aid the downstream analysis, for example
Expand All @@ -74,6 +55,9 @@
* "age"
* "sex"
For HTO hashing experiments include a column containing details of the hash tag, e.g.
* "hto_id"
(ii) libraries.tsv
Expand All @@ -88,6 +72,26 @@
* "feature_type": One of "Gene Expression", "Antibody Capture", "VDJ-T" or "VDJ-B". Case sensitive.
* "fastq_path": the location of the folder containing the fastq files
* "sample": this will be passed to the "--sample" parameter of the cellranger pipelines (see: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/fastq-input). It is only used to match the relevant FASTQ files: it does not have to match the "sample_id" provided in the "samples.tsv" table, and is not used in downstream analysis.
* "chemistry": The 10x reaction chemistry, the options are:
* 'auto' for autodetection,
* 'threeprime' for Single Cell 3',
* 'fiveprime' for Single Cell 5',
* 'SC3Pv1',
* 'SC3Pv2',
* 'SC3Pv3',
* 'SC5P-PE',
* 'SC5P-R2' for Single Cell 5', paired-end/R2-only,
* 'SC-FB' for Single Cell Antibody-only 3' v2 or 5'.
* "expect_cells": An integer specifying the expected number of cells
It is recommended to include the following columns:
* "chip": a unique ID for the 10x Chip
* "channel_id": an integer denoting the channel on the chip
* "date": the date the 10x run was performed
Note: Use of the cellranger "--lanes" parameter is not supported. This means that data from all the lanes present in the given location for the given "sample" prefix will be run. This applies to both Gene Expression and VDJ analysis. If you need to analyse data from a single lane, link the data from that lane into a different location.
Expand Down Expand Up @@ -222,7 +226,7 @@ def count(infile, outfile):

this_library_id = os.path.basename(infile)[:-len(".csv")]

library_parameters = S.samples[this_library_id]
library_parameters = S.library_parameters[this_library_id]

# provide references for the present feature types
lib_types = S.lib_types(this_library_id)
Expand All @@ -232,7 +236,7 @@ def count(infile, outfile):
transcriptome = "--transcriptome=" + PARAMS["gex_reference"]

if "Antibody Capture" in lib_types:
feature_ref = "--feature-ref" + PARAMS["feature_reference"]
feature_ref = "--feature-ref=" + PARAMS["feature_reference"]

# add read trimming if specified
r1len, r2len = "", ""
Expand Down
2 changes: 1 addition & 1 deletion cellhub/pipeline_dehash.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def gmmDemux(infile, outfile):
os.makedirs(gmm_working_dir)

if PARAMS["hto_per_library"] == True:
HTOs = "_".join([PARAMS["hto"], library_id])
HTOs = PARAMS["hto_" + library_id]
else:
HTOs = PARAMS["hto_names"]

Expand Down
43 changes: 28 additions & 15 deletions cellhub/tasks/samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,16 +64,18 @@ def __init__(self,
library_tsv = None):

samples = pd.read_csv(sample_tsv, sep="\t")
sample_cols = ["sample_id", "library_id",
"chemistry", "expect_cells"]
required_sample_cols = ["sample_id", "library_id"]

check_cols(samples,sample_cols,
check_cols(samples,required_sample_cols,
"samples.tsv")

libs = pd.read_csv(library_tsv, sep="\t")

check_cols(libs, ["library_id","feature_type",
"sample","fastq_path"],
required_library_cols = ["library_id","feature_type",
"sample","fastq_path",
"chemistry", "expect_cells"]

check_cols(libs, required_library_cols,
"libraries.tsv")

self.known_feature_library_types = ["Gene Expression",
Expand All @@ -86,12 +88,10 @@ def __init__(self,
check_values(libs, "feature_type",
self.known_feature_library_types + self.known_vdj_types)

if not samples["library_id"].is_unique:
raise ValueError("Non-unique library_ids provided")

libraries = samples["library_id"].values
if not samples["sample_id"].is_unique:
raise ValueError("Non-unique sample_ids provided")

samples.index = samples["library_id"]
samples.index = samples["sample_id"]
self.samples = samples.to_dict(orient="index")

libs.sort_values(["library_id",
Expand All @@ -100,6 +100,17 @@ def __init__(self,

self.libs = libs

# make a dictionary for looking up the expect cells and chemistry information
library_parameters = libs[["library_id","chemistry","expect_cells"]].drop_duplicates()

if not library_parameters["library_id"].is_unique:
raise ValueError('Different "chemistry" and/or "expect_cells" values are mapped'
' to the same library_id in the libraries.tsv file')

library_parameters.index = library_parameters["library_id"]

self.library_parameters = library_parameters.to_dict(orient="index")

self.feature_types = set(libs["feature_type"])


Expand Down Expand Up @@ -132,12 +143,14 @@ def get_samples_and_fastqs(self, library_id, feature_type):
# check that the fastq paths are different (see note in pipeline_cellranger.py)
fqps = [x.strip() for x in fastq_path.split(",")]

if len(set(fqps)) < len(fqps):
if feature_type in ["VDJ-T", "VDJ-B"]:

if len(set(fqps)) < len(fqps):

raise ValueError("Duplicate FASTQ paths detected for data from different"
" flow cells. This is not supported by the 'cellranger vdj' command."
" VDJ data from different flow cells must be arranged in different"
" folders. See note in pipeline_cellranger.py")
raise ValueError("Duplicate FASTQ paths detected for data from different"
" flow cells. This is not supported by the 'cellranger vdj' command."
" VDJ data from different flow cells must be arranged in different"
" folders. See note in pipeline_cellranger.py")

sample = x["sample"].values[0]

Expand Down
3 changes: 3 additions & 0 deletions examples/ifnb_pbmc/cellhub/libraries.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
library_id sample feature_type fastq_path expect_cells chemistry
GSM2560248 bamtofastq Gene Expression /well/kir/mirror/geo/GSE96583/GSM2560248/ 8000 auto
GSM2560249 bamtofastq Gene Expression /well/kir/mirror/geo/GSE96583/GSM2560249/ 8000 auto
12 changes: 6 additions & 6 deletions examples/pbmc10k/libraries.tsv
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
library_id sample feature_type fastq_path
10k_PBMC_5pv2 10k_PBMC_5pv2_nextgem_Chromium_Controller_gex_1 Gene Expression /well/kir/mirror/10xgenomics/10k_human_pbmc_v2/10k_PBMC_5pv2_nextgem_Chromium_Controller_fastqs/10k_PBMC_5pv2_nextgem_Chromium_Controller_gex_fastqs
10k_PBMC_5pv2 10k_PBMC_5pv2_nextgem_Chromium_Controller_gex_2 Gene Expression /well/kir/mirror/10xgenomics/10k_human_pbmc_v2/10k_PBMC_5pv2_nextgem_Chromium_Controller_fastqs/10k_PBMC_5pv2_nextgem_Chromium_Controller_gex_fastqs
10k_PBMC_5pv2 10k_PBMC_5pv2_nextgem_Chromium_Controller_vdj_t_1 VDJ-T /well/kir/mirror/10xgenomics/10k_human_pbmc_v2/10k_PBMC_5pv2_nextgem_Chromium_Controller_fastqs/10k_PBMC_5pv2_nextgem_Chromium_Controller_vdj_t_1_fastqs
10k_PBMC_5pv2 10k_PBMC_5pv2_nextgem_Chromium_Controller_vdj_t_2 VDJ-T /well/kir/mirror/10xgenomics/10k_human_pbmc_v2/10k_PBMC_5pv2_nextgem_Chromium_Controller_fastqs/10k_PBMC_5pv2_nextgem_Chromium_Controller_vdj_t_2_fastqs
10k_PBMC_5pv2 10k_PBMC_5pv2_nextgem_Chromium_Controller_vdj_b_1 VDJ-B /well/kir/mirror/10xgenomics/10k_human_pbmc_v2/10k_PBMC_5pv2_nextgem_Chromium_Controller_fastqs/10k_PBMC_5pv2_nextgem_Chromium_Controller_vdj_b_fastqs
library_id sample feature_type fastq_path chemistry expect_cells
10k_PBMC_5pv2 10k_PBMC_5pv2_nextgem_Chromium_Controller_gex_1 Gene Expression /well/kir/mirror/10xgenomics/10k_human_pbmc_v2/10k_PBMC_5pv2_nextgem_Chromium_Controller_fastqs/10k_PBMC_5pv2_nextgem_Chromium_Controller_gex_fastqs auto 10000
10k_PBMC_5pv2 10k_PBMC_5pv2_nextgem_Chromium_Controller_gex_2 Gene Expression /well/kir/mirror/10xgenomics/10k_human_pbmc_v2/10k_PBMC_5pv2_nextgem_Chromium_Controller_fastqs/10k_PBMC_5pv2_nextgem_Chromium_Controller_gex_fastqs auto 10000
10k_PBMC_5pv2 10k_PBMC_5pv2_nextgem_Chromium_Controller_vdj_t_1 VDJ-T /well/kir/mirror/10xgenomics/10k_human_pbmc_v2/10k_PBMC_5pv2_nextgem_Chromium_Controller_fastqs/10k_PBMC_5pv2_nextgem_Chromium_Controller_vdj_t_1_fastqs auto 10000
10k_PBMC_5pv2 10k_PBMC_5pv2_nextgem_Chromium_Controller_vdj_t_2 VDJ-T /well/kir/mirror/10xgenomics/10k_human_pbmc_v2/10k_PBMC_5pv2_nextgem_Chromium_Controller_fastqs/10k_PBMC_5pv2_nextgem_Chromium_Controller_vdj_t_2_fastqs auto 10000
10k_PBMC_5pv2 10k_PBMC_5pv2_nextgem_Chromium_Controller_vdj_b_1 VDJ-B /well/kir/mirror/10xgenomics/10k_human_pbmc_v2/10k_PBMC_5pv2_nextgem_Chromium_Controller_fastqs/10k_PBMC_5pv2_nextgem_Chromium_Controller_vdj_b_fastqs auto 10000
4 changes: 2 additions & 2 deletions examples/pbmc10k/samples.tsv
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
sample_id library_id chemistry expect_cells
10k_PBMC_5pv2 10k_PBMC_5pv2 auto 10000
sample_id library_id
10k_PBMC_5pv2 10k_PBMC_5pv2
3 changes: 3 additions & 0 deletions python/fetch_cells_from_h5.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@
library_id,
"h5","data.h5")

if not os.path.exists(h5_path):
raise ValueError("h5 path does not exist: " + h5_path)

try:

x = sc.read_10x_h5(h5_path)
Expand Down

0 comments on commit a8d5e14

Please sign in to comment.