diff --git a/.github/workflows/qiita-plugin-ci.yml b/.github/workflows/qiita-plugin-ci.yml index 90216fa..e1da299 100644 --- a/.github/workflows/qiita-plugin-ci.yml +++ b/.github/workflows/qiita-plugin-ci.yml @@ -64,7 +64,7 @@ jobs: conda config --add channels conda-forge conda create -q --yes -n qiita python=3.9 libgfortran numpy nginx cython redis conda activate qiita - pip install sphinx sphinx-bootstrap-theme nose-timer codecov Click + pip install sphinx sphinx-bootstrap-theme nose-timer Click - name: Qiita install shell: bash -l {0} @@ -137,7 +137,7 @@ jobs: nosetests --with-doctest --with-coverage -v --cover-package=qp_klp - - uses: codecov/codecov-action@v1 + - uses: codecov/codecov-action@v3 with: token: ${{ secrets.CODECOV_TOKEN }} file: codecov.yml diff --git a/qp_klp/klp_util.py b/qp_klp/klp_util.py index 6ec65b9..b61d4ef 100644 --- a/qp_klp/klp_util.py +++ b/qp_klp/klp_util.py @@ -128,3 +128,20 @@ def write(self, failed_ids, job_name): with open(self.output_path, 'w') as f: f.write(df.to_html(border=2, index=False, justify="left", render_links=True, escape=False)) + + +def parse_prep_file(prep_file_path): + metadata = pd.read_csv(prep_file_path, + dtype=str, + delimiter='\t', + # forces Pandas to not make the first column the + # index even when the values appear numeric. + index_col=False) + + if metadata is None: + raise ValueError(f"{prep_file_path} does not exist.") + + metadata.set_index('sample_name', inplace=True) + + # convert to standard dictionary. + return metadata.to_dict('index') diff --git a/qp_klp/process_amplicon_job.py b/qp_klp/process_amplicon_job.py index 39fe083..20b54f1 100644 --- a/qp_klp/process_amplicon_job.py +++ b/qp_klp/process_amplicon_job.py @@ -4,7 +4,7 @@ from os.path import exists, join, isfile, basename from qiita_client import ArtifactInfo from qp_klp.klp_util import (map_sample_names_to_tube_ids, - update_blanks_in_qiita) + update_blanks_in_qiita, parse_prep_file) from random import sample as rsampl from sequence_processing_pipeline.ConvertJob import ConvertJob from sequence_processing_pipeline.FastQCJob import FastQCJob @@ -305,10 +305,7 @@ def process_amplicon(mapping_file_path, qclient, run_identifier, out_dir, for study_id in gpf_job.prep_file_paths: for prep_file_path in gpf_job.prep_file_paths[study_id]: - metadata = pd.read_csv(prep_file_path, - delimiter='\t', - index_col='sample_name').to_dict( - 'index') + metadata = parse_prep_file(prep_file_path) # determine data_type based on target_gene column. target_gene = metadata[list(metadata.keys())[0]]['target_gene'] diff --git a/qp_klp/process_metagenomics_job.py b/qp_klp/process_metagenomics_job.py index b2a9efd..2c1ea01 100644 --- a/qp_klp/process_metagenomics_job.py +++ b/qp_klp/process_metagenomics_job.py @@ -15,7 +15,7 @@ from random import sample as rsampl import pandas as pd from qp_klp.klp_util import (map_sample_names_to_tube_ids, FailedSamplesRecord, - update_blanks_in_qiita) + update_blanks_in_qiita, parse_prep_file) from json import dumps from itertools import chain from collections import defaultdict @@ -286,11 +286,7 @@ def process_metagenomics(sample_sheet_path, lane_number, qclient, for study_id in gpf_job.prep_file_paths: for prep_file_path in gpf_job.prep_file_paths[study_id]: - metadata = pd.read_csv(prep_file_path, - delimiter='\t', - index_col='sample_name').to_dict( - 'index') - + metadata = parse_prep_file(prep_file_path) # determine data_type based on sample-sheet # value will be from the Assay field data = {'prep_info': dumps(metadata), diff --git a/qp_klp/tests/good-prep-file-small.txt b/qp_klp/tests/good-prep-file-small.txt new file mode 100644 index 0000000..afb1cfb --- /dev/null +++ b/qp_klp/tests/good-prep-file-small.txt @@ -0,0 +1,6 @@ +sample_name experiment_design_description library_construction_protocol platform run_center run_date run_prefix sequencing_meth center_name center_project_name instrument_model runid lane sample project well_description i5_index_id sample_plate index2 index sample_well i7_index_id raw_reads quality_filtered_reads non_host_reads +363192526 sample project Knight Lab Kapa HyperPlus Illumina KLM 2022-04-18 363192526_S9_L001 sequencing by synthesis UCSD Sample_Project Illumina iSeq 20220101_FS10001776_07_ABC12345-4567 1 Sample_Project Sample_Project_99999_1-4.363192526.A3 iTru5_09_A Sample_Project_99999_1-4 TCTGAGAG CATCTACG A3 iTru7_114_05 10749 1 4 +363192073 sample project Knight Lab Kapa HyperPlus Illumina KLM 2022-04-18 363192073_S195_L001 sequencing by synthesis UCSD Sample_Project Illumina iSeq 20220101_FS10001776_07_ABC12345-4567 1 Sample_Project Sample_Project_99999_1-4.363192073.F1 iTru5_103_A Sample_Project_99999_1-4 TGGTCCTT GCAATTCG F1 iTru7_305_11 16435 2 5 +363193755 sample project Knight Lab Kapa HyperPlus Illumina KLM 2022-04-18 363193755_S7_L001 sequencing by synthesis UCSD Sample_Project Illumina iSeq 20220101_FS10001776_07_ABC12345-4567 1 Sample_Project Sample_Project_99999_1-4.363193755.M1 iTru5_07_A Sample_Project_99999_1-4 GGTGTCTT GATTGCTC M1 iTru7_114_03 14303 3 6 +1e-3 sample project Knight Lab Kapa HyperPlus Illumina KLM 2022-04-18 363192073_S195_L001 sequencing by synthesis UCSD Sample_Project Illumina iSeq 20220101_FS10001776_07_ABC12345-4567 1 Sample_Project Sample_Project_99999_1-4.363192073.F1 iTru5_103_A Sample_Project_99999_1-4 TGGTCCTT GCAATTCG F1 iTru7_305_11 16435 11 13 +123.000 sample project Knight Lab Kapa HyperPlus Illumina KLM 2022-04-18 363193755_S7_L001 sequencing by synthesis UCSD Sample_Project Illumina iSeq 20220101_FS10001776_07_ABC12345-4567 1 Sample_Project Sample_Project_99999_1-4.363193755.M1 iTru5_07_A Sample_Project_99999_1-4 GGTGTCTT GATTGCTC M1 iTru7_114_03 14303 12 14 diff --git a/qp_klp/tests/test_klp.py b/qp_klp/tests/test_klp.py index e46fd20..0d5c420 100644 --- a/qp_klp/tests/test_klp.py +++ b/qp_klp/tests/test_klp.py @@ -129,7 +129,7 @@ def setUp(self): "Date,2/26/20,,,,,,,,,\n", "Workflow,GenerateFASTQ,,,,,,,,,\n", "Application,FASTQ Only,,,,,,,,,\n", - "Assay,Metagenomics,,,,,,,,,\n", + "Assay,Metagenomic,,,,,,,,,\n", "Description,,,,,,,,,,\n", "Chemistry,Default,,,,,,,,,\n", ",,,,,,,,,,\n", diff --git a/qp_klp/tests/test_klp_util.py b/qp_klp/tests/test_klp_util.py new file mode 100644 index 0000000..d420447 --- /dev/null +++ b/qp_klp/tests/test_klp_util.py @@ -0,0 +1,130 @@ +# ----------------------------------------------------------------------------- +# Copyright (c) 2014--, The Qiita Development Team. +# +# Distributed under the terms of the BSD 3-clause License. +# +# The full license is in the file LICENSE, distributed with this software. +# ----------------------------------------------------------------------------- +from unittest import TestCase +from qp_klp.klp_util import parse_prep_file +from os.path import join + + +class KLPUtilTests(TestCase): + def test_parse_prep_file(self): + good_prep_file = join('qp_klp', 'tests', 'good-prep-file-small.txt') + + obs = parse_prep_file(good_prep_file) + + # assert that prep-files that begin with sample-names of the form + # '363192526', '1e-3', and '123.000' are parsed as strings instead of + # numeric values. + exp = {'363192526': {'experiment_design_description': 'sample project', + 'library_construction_protocol': ('Knight Lab Kap' + 'a HyperPlus'), + 'platform': 'Illumina', 'run_center': 'KLM', + 'run_date': '2022-04-18', + 'run_prefix': '363192526_S9_L001', + 'sequencing_meth': 'sequencing by synthesis', + 'center_name': 'UCSD', + 'center_project_name': 'Sample_Project', + 'instrument_model': 'Illumina iSeq', + 'runid': '20220101_FS10001776_07_ABC12345-4567', + 'lane': '1', 'sample project': 'Sample_Project', + 'well_description': ('Sample_Project_99999_1-' + '4.363192526.A3'), + 'i5_index_id': 'iTru5_09_A', + 'sample_plate': 'Sample_Project_99999_1-4', + 'index2': 'TCTGAGAG', 'index': 'CATCTACG', + 'sample_well': 'A3', + 'i7_index_id': 'iTru7_114_05', + 'raw_reads': '10749', + 'quality_filtered_reads': '1', + 'non_host_reads': '4'}, + '363192073': {'experiment_design_description': 'sample project', + 'library_construction_protocol': ('Knight Lab Ka' + 'pa HyperPlus'), + 'platform': 'Illumina', 'run_center': 'KLM', + 'run_date': '2022-04-18', + 'run_prefix': '363192073_S195_L001', + 'sequencing_meth': 'sequencing by synthesis', + 'center_name': 'UCSD', + 'center_project_name': 'Sample_Project', + 'instrument_model': 'Illumina iSeq', + 'runid': '20220101_FS10001776_07_ABC12345-4567', + 'lane': '1', 'sample project': 'Sample_Project', + 'well_description': ('Sample_Project_99999_1-' + '4.363192073.F1'), + 'i5_index_id': 'iTru5_103_A', + 'sample_plate': 'Sample_Project_99999_1-4', + 'index2': 'TGGTCCTT', 'index': 'GCAATTCG', + 'sample_well': 'F1', + 'i7_index_id': 'iTru7_305_11', + 'raw_reads': '16435', + 'quality_filtered_reads': '2', + 'non_host_reads': '5'}, + '363193755': {'experiment_design_description': 'sample project', + 'library_construction_protocol': ('Knight Lab Ka' + 'pa HyperPlus'), + 'platform': 'Illumina', 'run_center': 'KLM', + 'run_date': '2022-04-18', + 'run_prefix': '363193755_S7_L001', + 'sequencing_meth': 'sequencing by synthesis', + 'center_name': 'UCSD', + 'center_project_name': 'Sample_Project', + 'instrument_model': 'Illumina iSeq', + 'runid': '20220101_FS10001776_07_ABC12345-4567', + 'lane': '1', 'sample project': 'Sample_Project', + 'well_description': ('Sample_Project_99999_1-' + '4.363193755.M1'), + 'i5_index_id': 'iTru5_07_A', + 'sample_plate': 'Sample_Project_99999_1-4', + 'index2': 'GGTGTCTT', 'index': 'GATTGCTC', + 'sample_well': 'M1', + 'i7_index_id': 'iTru7_114_03', + 'raw_reads': '14303', + 'quality_filtered_reads': '3', + 'non_host_reads': '6'}, + '1e-3': {'experiment_design_description': 'sample project', + 'library_construction_protocol': ('Knight Lab Kapa ' + 'HyperPlus'), + 'platform': 'Illumina', 'run_center': 'KLM', + 'run_date': '2022-04-18', + 'run_prefix': '363192073_S195_L001', + 'sequencing_meth': 'sequencing by synthesis', + 'center_name': 'UCSD', + 'center_project_name': 'Sample_Project', + 'instrument_model': 'Illumina iSeq', + 'runid': '20220101_FS10001776_07_ABC12345-4567', + 'lane': '1', 'sample project': 'Sample_Project', + 'well_description': ('Sample_Project_99999_1-' + '4.363192073.F1'), + 'i5_index_id': 'iTru5_103_A', + 'sample_plate': 'Sample_Project_99999_1-4', + 'index2': 'TGGTCCTT', 'index': 'GCAATTCG', + 'sample_well': 'F1', 'i7_index_id': 'iTru7_305_11', + 'raw_reads': '16435', 'quality_filtered_reads': '11', + 'non_host_reads': '13'}, + '123.000': {'experiment_design_description': 'sample project', + 'library_construction_protocol': ('Knight Lab Kapa' + ' HyperPlus'), + 'platform': 'Illumina', 'run_center': 'KLM', + 'run_date': '2022-04-18', + 'run_prefix': '363193755_S7_L001', + 'sequencing_meth': 'sequencing by synthesis', + 'center_name': 'UCSD', + 'center_project_name': 'Sample_Project', + 'instrument_model': 'Illumina iSeq', + 'runid': '20220101_FS10001776_07_ABC12345-4567', + 'lane': '1', 'sample project': 'Sample_Project', + 'well_description': ('Sample_Project_99999_1-' + '4.363193755.M1'), + 'i5_index_id': 'iTru5_07_A', + 'sample_plate': 'Sample_Project_99999_1-4', + 'index2': 'GGTGTCTT', 'index': 'GATTGCTC', + 'sample_well': 'M1', 'i7_index_id': 'iTru7_114_03', + 'raw_reads': '14303', + 'quality_filtered_reads': '12', + 'non_host_reads': '14'}} + + self.assertDictEqual(obs, exp)