Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

hotfix to ensure sample-names are always strings #49

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/qiita-plugin-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ jobs:
conda config --add channels conda-forge
conda create -q --yes -n qiita python=3.9 libgfortran numpy nginx cython redis
conda activate qiita
pip install sphinx sphinx-bootstrap-theme nose-timer codecov Click
pip install sphinx sphinx-bootstrap-theme nose-timer Click

- name: Qiita install
shell: bash -l {0}
Expand Down Expand Up @@ -137,7 +137,7 @@ jobs:

nosetests --with-doctest --with-coverage -v --cover-package=qp_klp

- uses: codecov/codecov-action@v1
- uses: codecov/codecov-action@v3
with:
token: ${{ secrets.CODECOV_TOKEN }}
file: codecov.yml
Expand Down
17 changes: 17 additions & 0 deletions qp_klp/klp_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,3 +128,20 @@ def write(self, failed_ids, job_name):
with open(self.output_path, 'w') as f:
f.write(df.to_html(border=2, index=False, justify="left",
render_links=True, escape=False))


def parse_prep_file(prep_file_path):
    """Load a tab-delimited prep-info file into a dict keyed by sample name.

    All columns, including 'sample_name', are read with dtype=str so that
    sample names that look numeric (e.g. '363192526', '1e-3', '123.000')
    are kept verbatim instead of being coerced to int or float.

    Parameters
    ----------
    prep_file_path : str
        Path to the tab-delimited prep-info file. The file must contain a
        'sample_name' column.

    Returns
    -------
    dict
        A dictionary of the form
        {sample_name: {column_name: value, ...}, ...}.

    Raises
    ------
    FileNotFoundError
        Raised by pandas when prep_file_path does not exist.
    KeyError
        Raised when the file has no 'sample_name' column.
    """
    # pd.read_csv() never returns None; a missing or unreadable file raises
    # from inside pandas, so the result does not need a None-check.
    metadata = pd.read_csv(prep_file_path,
                           dtype=str,
                           delimiter='\t',
                           # forces Pandas to not make the first column the
                           # index even when the values appear numeric.
                           index_col=False)

    # index by sample-name and convert to a standard dictionary.
    return metadata.set_index('sample_name').to_dict('index')
7 changes: 2 additions & 5 deletions qp_klp/process_amplicon_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from os.path import exists, join, isfile, basename
from qiita_client import ArtifactInfo
from qp_klp.klp_util import (map_sample_names_to_tube_ids,
update_blanks_in_qiita)
update_blanks_in_qiita, parse_prep_file)
from random import sample as rsampl
from sequence_processing_pipeline.ConvertJob import ConvertJob
from sequence_processing_pipeline.FastQCJob import FastQCJob
Expand Down Expand Up @@ -305,10 +305,7 @@ def process_amplicon(mapping_file_path, qclient, run_identifier, out_dir,

for study_id in gpf_job.prep_file_paths:
for prep_file_path in gpf_job.prep_file_paths[study_id]:
metadata = pd.read_csv(prep_file_path,
delimiter='\t',
index_col='sample_name').to_dict(
'index')
metadata = parse_prep_file(prep_file_path)

# determine data_type based on target_gene column.
target_gene = metadata[list(metadata.keys())[0]]['target_gene']
Expand Down
8 changes: 2 additions & 6 deletions qp_klp/process_metagenomics_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from random import sample as rsampl
import pandas as pd
from qp_klp.klp_util import (map_sample_names_to_tube_ids, FailedSamplesRecord,
update_blanks_in_qiita)
update_blanks_in_qiita, parse_prep_file)
from json import dumps
from itertools import chain
from collections import defaultdict
Expand Down Expand Up @@ -286,11 +286,7 @@ def process_metagenomics(sample_sheet_path, lane_number, qclient,

for study_id in gpf_job.prep_file_paths:
for prep_file_path in gpf_job.prep_file_paths[study_id]:
metadata = pd.read_csv(prep_file_path,
delimiter='\t',
index_col='sample_name').to_dict(
'index')

metadata = parse_prep_file(prep_file_path)
# determine data_type based on sample-sheet
# value will be from the Assay field
data = {'prep_info': dumps(metadata),
Expand Down
6 changes: 6 additions & 0 deletions qp_klp/tests/good-prep-file-small.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
sample_name experiment_design_description library_construction_protocol platform run_center run_date run_prefix sequencing_meth center_name center_project_name instrument_model runid lane sample project well_description i5_index_id sample_plate index2 index sample_well i7_index_id raw_reads quality_filtered_reads non_host_reads
363192526 sample project Knight Lab Kapa HyperPlus Illumina KLM 2022-04-18 363192526_S9_L001 sequencing by synthesis UCSD Sample_Project Illumina iSeq 20220101_FS10001776_07_ABC12345-4567 1 Sample_Project Sample_Project_99999_1-4.363192526.A3 iTru5_09_A Sample_Project_99999_1-4 TCTGAGAG CATCTACG A3 iTru7_114_05 10749 1 4
363192073 sample project Knight Lab Kapa HyperPlus Illumina KLM 2022-04-18 363192073_S195_L001 sequencing by synthesis UCSD Sample_Project Illumina iSeq 20220101_FS10001776_07_ABC12345-4567 1 Sample_Project Sample_Project_99999_1-4.363192073.F1 iTru5_103_A Sample_Project_99999_1-4 TGGTCCTT GCAATTCG F1 iTru7_305_11 16435 2 5
363193755 sample project Knight Lab Kapa HyperPlus Illumina KLM 2022-04-18 363193755_S7_L001 sequencing by synthesis UCSD Sample_Project Illumina iSeq 20220101_FS10001776_07_ABC12345-4567 1 Sample_Project Sample_Project_99999_1-4.363193755.M1 iTru5_07_A Sample_Project_99999_1-4 GGTGTCTT GATTGCTC M1 iTru7_114_03 14303 3 6
1e-3 sample project Knight Lab Kapa HyperPlus Illumina KLM 2022-04-18 363192073_S195_L001 sequencing by synthesis UCSD Sample_Project Illumina iSeq 20220101_FS10001776_07_ABC12345-4567 1 Sample_Project Sample_Project_99999_1-4.363192073.F1 iTru5_103_A Sample_Project_99999_1-4 TGGTCCTT GCAATTCG F1 iTru7_305_11 16435 11 13
123.000 sample project Knight Lab Kapa HyperPlus Illumina KLM 2022-04-18 363193755_S7_L001 sequencing by synthesis UCSD Sample_Project Illumina iSeq 20220101_FS10001776_07_ABC12345-4567 1 Sample_Project Sample_Project_99999_1-4.363193755.M1 iTru5_07_A Sample_Project_99999_1-4 GGTGTCTT GATTGCTC M1 iTru7_114_03 14303 12 14
2 changes: 1 addition & 1 deletion qp_klp/tests/test_klp.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def setUp(self):
"Date,2/26/20,,,,,,,,,\n",
"Workflow,GenerateFASTQ,,,,,,,,,\n",
"Application,FASTQ Only,,,,,,,,,\n",
"Assay,Metagenomics,,,,,,,,,\n",
"Assay,Metagenomic,,,,,,,,,\n",
"Description,,,,,,,,,,\n",
"Chemistry,Default,,,,,,,,,\n",
",,,,,,,,,,\n",
Expand Down
130 changes: 130 additions & 0 deletions qp_klp/tests/test_klp_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# -----------------------------------------------------------------------------
# Copyright (c) 2014--, The Qiita Development Team.
#
# Distributed under the terms of the BSD 3-clause License.
#
# The full license is in the file LICENSE, distributed with this software.
# -----------------------------------------------------------------------------
from unittest import TestCase
from qp_klp.klp_util import parse_prep_file
from os.path import join


class KLPUtilTests(TestCase):
    def test_parse_prep_file(self):
        """Sample-names that look numeric must be parsed as strings."""
        # columns whose value is identical for every row of the fixture.
        shared = {
            'experiment_design_description': 'sample project',
            'library_construction_protocol': 'Knight Lab Kapa HyperPlus',
            'platform': 'Illumina',
            'run_center': 'KLM',
            'run_date': '2022-04-18',
            'sequencing_meth': 'sequencing by synthesis',
            'center_name': 'UCSD',
            'center_project_name': 'Sample_Project',
            'instrument_model': 'Illumina iSeq',
            'runid': '20220101_FS10001776_07_ABC12345-4567',
            'lane': '1',
            'sample project': 'Sample_Project',
            'sample_plate': 'Sample_Project_99999_1-4',
        }

        # columns that differ between rows; each tuple in per_sample lists
        # its values in this same order.
        varying = ('run_prefix', 'well_description', 'i5_index_id',
                   'index2', 'index', 'sample_well', 'i7_index_id',
                   'raw_reads', 'quality_filtered_reads', 'non_host_reads')

        # prep-files that begin with sample-names of the form '363192526',
        # '1e-3', and '123.000' should yield string keys, not numeric ones.
        per_sample = {
            '363192526': ('363192526_S9_L001',
                          'Sample_Project_99999_1-4.363192526.A3',
                          'iTru5_09_A', 'TCTGAGAG', 'CATCTACG', 'A3',
                          'iTru7_114_05', '10749', '1', '4'),
            '363192073': ('363192073_S195_L001',
                          'Sample_Project_99999_1-4.363192073.F1',
                          'iTru5_103_A', 'TGGTCCTT', 'GCAATTCG', 'F1',
                          'iTru7_305_11', '16435', '2', '5'),
            '363193755': ('363193755_S7_L001',
                          'Sample_Project_99999_1-4.363193755.M1',
                          'iTru5_07_A', 'GGTGTCTT', 'GATTGCTC', 'M1',
                          'iTru7_114_03', '14303', '3', '6'),
            '1e-3': ('363192073_S195_L001',
                     'Sample_Project_99999_1-4.363192073.F1',
                     'iTru5_103_A', 'TGGTCCTT', 'GCAATTCG', 'F1',
                     'iTru7_305_11', '16435', '11', '13'),
            '123.000': ('363193755_S7_L001',
                        'Sample_Project_99999_1-4.363193755.M1',
                        'iTru5_07_A', 'GGTGTCTT', 'GATTGCTC', 'M1',
                        'iTru7_114_03', '14303', '12', '14'),
        }

        exp = {name: {**shared, **dict(zip(varying, values))}
               for name, values in per_sample.items()}

        obs = parse_prep_file(join('qp_klp', 'tests',
                                   'good-prep-file-small.txt'))

        self.assertDictEqual(obs, exp)