Skip to content

Commit 0e93e6c

Browse files
committed
Merge branch 'fix-1084' of https://github.com/biocore/qiita into fix-1084
2 parents 36ee292 + 3ac6ab9 commit 0e93e6c

28 files changed

+1455
-578
lines changed

qiita_db/metadata_template/__init__.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,13 @@
88

99
from .sample_template import SampleTemplate
1010
from .prep_template import PrepTemplate
11-
from .util import load_template_to_dataframe
11+
from .util import load_template_to_dataframe, looks_like_qiime_mapping_file
1212
from .constants import (TARGET_GENE_DATA_TYPES, SAMPLE_TEMPLATE_COLUMNS,
1313
PREP_TEMPLATE_COLUMNS,
14-
PREP_TEMPLATE_COLUMNS_TARGET_GENE)
14+
PREP_TEMPLATE_COLUMNS_TARGET_GENE, CONTROLLED_COLS)
1515

1616

1717
__all__ = ['SampleTemplate', 'PrepTemplate', 'load_template_to_dataframe',
1818
'TARGET_GENE_DATA_TYPES', 'SAMPLE_TEMPLATE_COLUMNS',
19-
'PREP_TEMPLATE_COLUMNS', 'PREP_TEMPLATE_COLUMNS_TARGET_GENE']
19+
'PREP_TEMPLATE_COLUMNS', 'PREP_TEMPLATE_COLUMNS_TARGET_GENE',
20+
'CONTROLLED_COLS', 'looks_like_qiime_mapping_file']

qiita_db/metadata_template/test/test_util.py

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@
1717
QiitaDBError)
1818
from qiita_db.metadata_template.util import (
1919
get_datatypes, as_python_types, prefix_sample_names_with_id,
20-
load_template_to_dataframe, get_invalid_sample_names)
20+
load_template_to_dataframe, get_invalid_sample_names,
21+
looks_like_qiime_mapping_file, _parse_mapping_file)
2122

2223

2324
class TestUtil(TestCase):
@@ -64,6 +65,17 @@ def test_load_template_to_dataframe(self):
6465
exp.index.name = 'sample_name'
6566
assert_frame_equal(obs, exp)
6667

68+
def test_load_template_to_dataframe_qiime_map(self):
    # Load the mapping file using the QIIME sample identifier column
    mapping_file = StringIO(QIIME_TUTORIAL_MAP_SUBSET)
    observed = load_template_to_dataframe(mapping_file, index='#SampleID')

    # The loaded frame is compared against one indexed by 'SampleID', i.e.
    # without the leading '#'
    expected = pd.DataFrame.from_dict(QIIME_TUTORIAL_MAP_DICT_FORM)
    expected.index.name = 'SampleID'

    # Normalize row and column order on both frames before comparing them
    for frame in (observed, expected):
        for axis in (0, 1):
            frame.sort_index(axis=axis, inplace=True)

    assert_frame_equal(observed, expected)
78+
6779
def test_load_template_to_dataframe_duplicate_cols(self):
6880
obs = load_template_to_dataframe(
6981
StringIO(EXP_SAMPLE_TEMPLATE_DUPE_COLS))
@@ -218,6 +230,43 @@ def test_invalid_lat_long(self):
218230
# prevent flake8 from complaining
219231
str(obs)
220232

233+
def test_looks_like_qiime_mapping_file(self):
    # A sample template is not reported as a QIIME mapping file
    self.assertFalse(
        looks_like_qiime_mapping_file(StringIO(EXP_SAMPLE_TEMPLATE)))

    # A file whose first column is '#SampleID' is reported as one
    self.assertTrue(
        looks_like_qiime_mapping_file(StringIO(QIIME_TUTORIAL_MAP_SUBSET)))

    # An empty file is not reported as a QIIME mapping file
    self.assertFalse(looks_like_qiime_mapping_file(StringIO()))
244+
245+
def test_parse_mapping_file(self):
    # Tests ported over from QIIME
    expected = ([['x', 'y', 'z'], ['i', 'j', 'k']],
                ['sample', 'a', 'b'],
                ['comment line to skip', 'more skip'])

    plain_lines = ['#sample\ta\tb', '#comment line to skip',
                   'x \t y \t z ', ' ', '#more skip', 'i\tj\tk']
    self.assertEqual(_parse_mapping_file(plain_lines), expected)

    # check that we strip double quotes by default
    quoted_lines = ['#sample\ta\tb', '#comment line to skip',
                    '"x "\t" y "\t z ', ' ', '"#more skip"', 'i\t"j"\tk']
    self.assertEqual(_parse_mapping_file(quoted_lines), expected)
260+
261+
262+
# Two-sample mapping file in QIIME format: a header line whose first column is
# '#SampleID', followed by one tab-separated line per sample
QIIME_TUTORIAL_MAP_SUBSET = (
    "#SampleID\tBarcodeSequence\tLinkerPrimerSequence\t"
    "Treatment\tDOB\tDescription\n"
    "PC.354\tAGCACGAGCCTA\tYATGCTGCCTCCCGTAGGAGT\t"
    "Control\t20061218\tControl_mouse_I.D._354\n"
    "PC.607\tAACTGTGCGTAC\tYATGCTGCCTCCCGTAGGAGT\t"
    "Fast\t20071112\tFasting_mouse_I.D._607\n"
)
221270

222271
EXP_SAMPLE_TEMPLATE = (
223272
"sample_name\tcollection_timestamp\tdescription\thas_extracted_data\t"
@@ -685,6 +734,19 @@ def test_invalid_lat_long(self):
685734
'2.Sample3': 'type1'},
686735
'str_column': {'2.Sample1': 'NA', '2.Sample2': 'NA', '2.Sample3': 'NA'}}
687736

737+
# Column-oriented form of the two-sample QIIME mapping file:
# {column name: {sample id: value}}. DOB values are integers, the rest are
# strings.
QIIME_TUTORIAL_MAP_DICT_FORM = {
    column: dict(zip(('PC.354', 'PC.607'), values))
    for column, values in (
        ('BarcodeSequence', ('AGCACGAGCCTA', 'AACTGTGCGTAC')),
        ('LinkerPrimerSequence', ('YATGCTGCCTCCCGTAGGAGT',
                                  'YATGCTGCCTCCCGTAGGAGT')),
        ('Treatment', ('Control', 'Fast')),
        ('DOB', (20061218, 20071112)),
        ('Description', ('Control_mouse_I.D._354',
                         'Fasting_mouse_I.D._607')),
    )
}
749+
688750
EXP_PREP_TEMPLATE = (
689751
'sample_name\tbarcodesequence\tcenter_name\tcenter_project_name\t'
690752
'ebi_submission_accession\temp_status\texperiment_design_description\t'

qiita_db/metadata_template/util.py

Lines changed: 141 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -122,8 +122,8 @@ def prefix_sample_names_with_id(md_template, study_id):
122122
md_template.index.name = None
123123

124124

125-
def load_template_to_dataframe(fn, strip_whitespace=True):
126-
"""Load a sample or a prep template into a data frame
125+
def load_template_to_dataframe(fn, strip_whitespace=True, index='sample_name'):
126+
"""Load a sample/prep template or a QIIME mapping file into a data frame
127127
128128
Parameters
129129
----------
@@ -132,6 +132,8 @@ def load_template_to_dataframe(fn, strip_whitespace=True):
132132
strip_whitespace : bool, optional
133133
Defaults to True. Whether or not to strip whitespace from values in the
134134
input file
135+
index : str, optional
136+
Defaults to 'sample_name'. The index to use in the loaded information
135137
136138
Returns
137139
-------
@@ -167,6 +169,8 @@ def load_template_to_dataframe(fn, strip_whitespace=True):
167169
+=======================+==============+
168170
| sample_name | str |
169171
+-----------------------+--------------+
172+
| #SampleID | str |
173+
+-----------------------+--------------+
170174
| physical_location | str |
171175
+-----------------------+--------------+
172176
| has_physical_specimen | bool |
@@ -203,6 +207,17 @@ def load_template_to_dataframe(fn, strip_whitespace=True):
203207
controlled_cols.update(CONTROLLED_COLS)
204208
holdfile[0] = '\t'.join(c.lower() if c.lower() in controlled_cols else c
205209
for c in cols)
210+
211+
if index == "#SampleID":
212+
# We're going to parse a QIIME mapping file. We are going to first
213+
# parse it with the QIIME function so we can remove the comments
214+
# easily and make sure that QIIME will accept this as a mapping file
215+
data, headers, comments = _parse_mapping_file(holdfile)
216+
holdfile = ["%s\n" % '\t'.join(d) for d in data]
217+
holdfile.insert(0, "%s\n" % '\t'.join(headers))
218+
# The QIIME parser fixes the index and removes the #
219+
index = 'SampleID'
220+
206221
# index_col:
207222
# is set as False, otherwise it is cast as a float and we want a string
208223
# keep_default:
@@ -224,7 +239,7 @@ def load_template_to_dataframe(fn, strip_whitespace=True):
224239
keep_default_na=False, na_values=[''],
225240
parse_dates=True, index_col=False, comment='\t',
226241
mangle_dupe_cols=False, converters={
227-
'sample_name': lambda x: str(x).strip(),
242+
index: lambda x: str(x).strip(),
228243
# required sample template information
229244
'physical_location': str,
230245
'sample_type': str,
@@ -263,21 +278,22 @@ def load_template_to_dataframe(fn, strip_whitespace=True):
263278

264279
initial_columns = set(template.columns)
265280

266-
if 'sample_name' not in template.columns:
267-
raise QiitaDBColumnError("The 'sample_name' column is missing from "
268-
"your template, this file cannot be parsed.")
281+
if index not in template.columns:
282+
raise QiitaDBColumnError("The '%s' column is missing from "
283+
"your template, this file cannot be parsed."
284+
% index)
269285

270286
# remove rows that have no sample identifier but that may have other data
271287
# in the rest of the columns
272-
template.dropna(subset=['sample_name'], how='all', inplace=True)
288+
template.dropna(subset=[index], how='all', inplace=True)
273289

274290
# set the sample name as the index
275-
template.set_index('sample_name', inplace=True)
291+
template.set_index(index, inplace=True)
276292

277293
# it is not uncommon to find templates that have empty columns
278294
template.dropna(how='all', axis=1, inplace=True)
279295

280-
initial_columns.remove('sample_name')
296+
initial_columns.remove(index)
281297
dropped_cols = initial_columns - set(template.columns)
282298
if dropped_cols:
283299
warnings.warn('The following column(s) were removed from the template '
@@ -315,3 +331,119 @@ def get_invalid_sample_names(sample_names):
315331
inv.append(s)
316332

317333
return inv
334+
335+
336+
def looks_like_qiime_mapping_file(fp):
    """Checks if the file looks like a QIIME mapping file

    Parameters
    ----------
    fp : str or file-like object
        filepath to check if it looks like a QIIME mapping file

    Returns
    -------
    bool
        True if fp looks like a QIIME mapping file, false otherwise.

    Notes
    -----
    This is not doing a validation of the QIIME mapping file. It simply checks
    the first line in the file and it returns true if the first
    whitespace-delimited token of that line is exactly '#SampleID', since a
    sample/prep template will start with 'sample_name' or some other different
    column. An empty file, or a file whose first line is blank, does not look
    like a QIIME mapping file.
    """
    with open_file(fp, mode='U') as f:
        first_line = f.readline()

    # split() with no arguments returns an empty list for an empty or a
    # whitespace-only line, so check that there is a first token before
    # indexing; otherwise a file starting with a blank line would raise an
    # IndexError instead of returning False
    tokens = first_line.split()
    return bool(tokens) and tokens[0] == '#SampleID'
365+
366+
367+
def _parse_mapping_file(lines, strip_quotes=True, suppress_stripping=False):
368+
"""Parser for map file that relates samples to metadata.
369+
370+
Format: header line with fields
371+
optionally other comment lines starting with #
372+
tab-delimited fields
373+
374+
Parameters
375+
----------
376+
lines : iterable of str
377+
The contents of the QIIME mapping file
378+
strip_quotes : bool, optional
379+
Defaults to true. If true, quotes are removed from the data
380+
suppress_stripping : bool, optional
381+
Defaults to false. If true, spaces are not stripped
382+
383+
Returns
384+
-------
385+
list of lists, list of str, list of str
386+
The data in the mapping file, the headers and the comments
387+
388+
Raises
389+
------
390+
QiitaDBError
391+
If there is any error parsing the mapping file
392+
393+
Notes
394+
-----
395+
This code has been ported from QIIME.
396+
"""
397+
if strip_quotes:
398+
if suppress_stripping:
399+
# remove quotes but not spaces
400+
401+
def strip_f(x):
402+
return x.replace('"', '')
403+
else:
404+
# remove quotes and spaces
405+
406+
def strip_f(x):
407+
return x.replace('"', '').strip()
408+
else:
409+
if suppress_stripping:
410+
# don't remove quotes or spaces
411+
412+
def strip_f(x):
413+
return x
414+
else:
415+
# remove spaces but not quotes
416+
417+
def strip_f(x):
418+
return x.strip()
419+
420+
# Create lists to store the results
421+
mapping_data = []
422+
header = []
423+
comments = []
424+
425+
# Begin iterating over lines
426+
for line in lines:
427+
line = strip_f(line)
428+
if not line or (suppress_stripping and not line.strip()):
429+
# skip blank lines when not stripping lines
430+
continue
431+
432+
if line.startswith('#'):
433+
line = line[1:]
434+
if not header:
435+
header = line.strip().split('\t')
436+
else:
437+
comments.append(line)
438+
else:
439+
# Will add empty string to empty fields
440+
tmp_line = map(strip_f, line.split('\t'))
441+
if len(tmp_line) < len(header):
442+
tmp_line.extend([''] * (len(header) - len(tmp_line)))
443+
mapping_data.append(tmp_line)
444+
if not header:
445+
raise QiitaDBError("No header line was found in mapping file.")
446+
if not mapping_data:
447+
raise QiitaDBError("No data found in mapping file.")
448+
449+
return mapping_data, header, comments

qiita_db/support_files/patches/python_patches/25.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,14 @@
2626
WHERE raw_data_id = %s AND study_id = %s"""
2727
sql_unlink = "DELETE FROM qiita.raw_filepath WHERE raw_data_id = %s"
2828
sql_delete = "DELETE FROM qiita.raw_data WHERE raw_data_id = %s"
29+
sql_studies = """SELECT study_id FROM qiita.study_raw_data
30+
WHERE raw_data_id = %s"""
2931
move_files = []
3032
for rd_id in rd_ids:
3133
rd = RawData(rd_id)
3234
filepaths = rd.get_filepaths()
33-
studies = sorted(rd.studies)
35+
studies = [s[0] for s in conn_handler.execute_fetchall(sql_studies,
36+
(rd_id,))]
3437
if filepaths:
3538
# we need to move the files to a study. We chose the one with lower
3639
# study id. Currently there is no case in the live database in which a

qiita_db/test/test_commands.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -563,7 +563,7 @@ def test_update_preprocessed_data_from_cmd(self):
563563
# We need to sort the list returned from the db because the ordering
564564
# on that list is based on db modification time, rather than id
565565
obs_fps = sorted(ppd.get_filepaths())
566-
self.assertEqual(obs_fps, exp_fps)
566+
self.assertEqual(obs_fps, sorted(exp_fps))
567567

568568
# Check that the checksums have been updated
569569
sql = "SELECT checksum FROM qiita.filepath WHERE filepath_id=%s"

qiita_db/test/test_data.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,11 @@ def test_delete(self):
261261
RawData.delete(rd.id, self.pt1.id)
262262

263263
# Clear the files so we can actually remove the RawData
264+
study_id = rd.studies[0]
265+
path_for_removal = join(get_mountpoint("uploads")[0][1], str(study_id))
266+
self._clean_up_files.extend([join(path_for_removal,
267+
basename(f).split('_', 1)[1])
268+
for _, f, _ in rd.get_filepaths()])
264269
rd.clear_filepaths()
265270

266271
RawData.delete(rd.id, self.pt1.id)
@@ -545,7 +550,7 @@ def test_get_filepaths(self):
545550
"preprocessed_fastq"),
546551
(5, join(self.db_test_ppd_dir, '1_seqs.demux'),
547552
"preprocessed_demux")]
548-
self.assertEqual(obs, exp)
553+
self.assertItemsEqual(obs, exp)
549554

550555
def test_processed_data(self):
551556
"""Correctly returns the processed data id"""

qiita_db/test/test_util.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -575,7 +575,11 @@ def test_move_upload_files_to_trash(self):
575575

576576
# create file to move to trash
577577
fid, folder = get_mountpoint("uploads")[0]
578-
open(join(folder, '1', test_filename), 'w').write('test')
578+
test_fp = join(folder, '1', test_filename)
579+
with open(test_fp, 'w') as f:
580+
f.write('test')
581+
582+
self.files_to_remove.append(test_fp)
579583

580584
exp = [(fid, 'this_is_a_test_file.txt'), (fid, 'uploaded_file.txt')]
581585
obs = get_files_from_uploads_folders("1")

0 commit comments

Comments
 (0)