Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions qiita_db/metadata_template/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,13 @@

from .sample_template import SampleTemplate
from .prep_template import PrepTemplate
from .util import load_template_to_dataframe
from .util import load_template_to_dataframe, looks_like_qiime_mapping_file
from .constants import (TARGET_GENE_DATA_TYPES, SAMPLE_TEMPLATE_COLUMNS,
PREP_TEMPLATE_COLUMNS,
PREP_TEMPLATE_COLUMNS_TARGET_GENE)
PREP_TEMPLATE_COLUMNS_TARGET_GENE, CONTROLLED_COLS)


__all__ = ['SampleTemplate', 'PrepTemplate', 'load_template_to_dataframe',
'TARGET_GENE_DATA_TYPES', 'SAMPLE_TEMPLATE_COLUMNS',
'PREP_TEMPLATE_COLUMNS', 'PREP_TEMPLATE_COLUMNS_TARGET_GENE']
'PREP_TEMPLATE_COLUMNS', 'PREP_TEMPLATE_COLUMNS_TARGET_GENE',
'CONTROLLED_COLS', 'looks_like_qiime_mapping_file']
64 changes: 63 additions & 1 deletion qiita_db/metadata_template/test/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
QiitaDBError)
from qiita_db.metadata_template.util import (
get_datatypes, as_python_types, prefix_sample_names_with_id,
load_template_to_dataframe, get_invalid_sample_names)
load_template_to_dataframe, get_invalid_sample_names,
looks_like_qiime_mapping_file, _parse_mapping_file)


class TestUtil(TestCase):
Expand Down Expand Up @@ -64,6 +65,17 @@ def test_load_template_to_dataframe(self):
exp.index.name = 'sample_name'
assert_frame_equal(obs, exp)

def test_load_template_to_dataframe_qiime_map(self):
    """A QIIME mapping file is loaded when index='#SampleID' is given"""
    observed = load_template_to_dataframe(
        StringIO(QIIME_TUTORIAL_MAP_SUBSET), index='#SampleID')

    # The loaded frame is expected to be indexed by 'SampleID' (no '#')
    expected = pd.DataFrame.from_dict(QIIME_TUTORIAL_MAP_DICT_FORM)
    expected.index.name = 'SampleID'

    # Sort rows (axis 0) and then columns (axis 1) of both frames so the
    # comparison does not depend on their ordering
    for frame in (observed, expected):
        for axis in (0, 1):
            frame.sort_index(axis=axis, inplace=True)

    assert_frame_equal(observed, expected)

def test_load_template_to_dataframe_duplicate_cols(self):
obs = load_template_to_dataframe(
StringIO(EXP_SAMPLE_TEMPLATE_DUPE_COLS))
Expand Down Expand Up @@ -218,6 +230,43 @@ def test_invalid_lat_long(self):
# prevent flake8 from complaining
str(obs)

def test_looks_like_qiime_mapping_file(self):
    """Only content whose first column is '#SampleID' is detected"""
    # A sample template is not a QIIME mapping file
    self.assertFalse(
        looks_like_qiime_mapping_file(StringIO(EXP_SAMPLE_TEMPLATE)))

    # The mapping file fixture starts with '#SampleID'
    self.assertTrue(
        looks_like_qiime_mapping_file(StringIO(QIIME_TUTORIAL_MAP_SUBSET)))

    # Empty content has no first line to inspect
    self.assertFalse(looks_like_qiime_mapping_file(StringIO()))

def test_parse_mapping_file(self):
    """Data, headers and comments are split out of the mapping file lines"""
    # Tests ported over from QIIME
    expected_data = [['x', 'y', 'z'], ['i', 'j', 'k']]
    expected_header = ['sample', 'a', 'b']
    expected_comments = ['comment line to skip', 'more skip']
    expected = (expected_data, expected_header, expected_comments)

    plain_lines = ['#sample\ta\tb', '#comment line to skip',
                   'x \t y \t z ', ' ', '#more skip', 'i\tj\tk']
    self.assertEqual(_parse_mapping_file(plain_lines), expected)

    # check that we strip double quotes by default
    quoted_lines = ['#sample\ta\tb', '#comment line to skip',
                    '"x "\t" y "\t z ', ' ', '"#more skip"', 'i\t"j"\tk']
    self.assertEqual(_parse_mapping_file(quoted_lines), expected)


# QIIME mapping file fixture: a '#SampleID' header row followed by two
# samples. Each tuple below is one row; its fields are joined with tabs and
# every row is terminated with a newline.
QIIME_TUTORIAL_MAP_SUBSET = "".join(
    "%s\n" % "\t".join(row) for row in (
        ("#SampleID", "BarcodeSequence", "LinkerPrimerSequence",
         "Treatment", "DOB", "Description"),
        ("PC.354", "AGCACGAGCCTA", "YATGCTGCCTCCCGTAGGAGT",
         "Control", "20061218", "Control_mouse_I.D._354"),
        ("PC.607", "AACTGTGCGTAC", "YATGCTGCCTCCCGTAGGAGT",
         "Fast", "20071112", "Fasting_mouse_I.D._607"),
    ))

EXP_SAMPLE_TEMPLATE = (
"sample_name\tcollection_timestamp\tdescription\thas_extracted_data\t"
Expand Down Expand Up @@ -685,6 +734,19 @@ def test_invalid_lat_long(self):
'2.Sample3': 'type1'},
'str_column': {'2.Sample1': 'NA', '2.Sample2': 'NA', '2.Sample3': 'NA'}}

QIIME_TUTORIAL_MAP_DICT_FORM = {
'BarcodeSequence': {'PC.354': 'AGCACGAGCCTA',
'PC.607': 'AACTGTGCGTAC'},
'LinkerPrimerSequence': {'PC.354': 'YATGCTGCCTCCCGTAGGAGT',
'PC.607': 'YATGCTGCCTCCCGTAGGAGT'},
'Treatment': {'PC.354': 'Control',
'PC.607': 'Fast'},
'DOB': {'PC.354': 20061218,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Strictly speaking, QIIME mapping files do not do any sort of data type inference, but I guess it's OK in this context as we need these data types to insert the information into the DB?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, we need the type inference to be able to search correctly over metadata. Also, the idea is that the DB will not know that this data came from a mapping file.

'PC.607': 20071112},
'Description': {'PC.354': 'Control_mouse_I.D._354',
'PC.607': 'Fasting_mouse_I.D._607'}
}

EXP_PREP_TEMPLATE = (
'sample_name\tbarcodesequence\tcenter_name\tcenter_project_name\t'
'ebi_submission_accession\temp_status\texperiment_design_description\t'
Expand Down
150 changes: 141 additions & 9 deletions qiita_db/metadata_template/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,8 @@ def prefix_sample_names_with_id(md_template, study_id):
md_template.index.name = None


def load_template_to_dataframe(fn, strip_whitespace=True):
"""Load a sample or a prep template into a data frame
def load_template_to_dataframe(fn, strip_whitespace=True, index='sample_name'):
"""Load a sample/prep template or a QIIME mapping file into a data frame

Parameters
----------
Expand All @@ -132,6 +132,8 @@ def load_template_to_dataframe(fn, strip_whitespace=True):
strip_whitespace : bool, optional
Defaults to True. Whether or not to strip whitespace from values in the
input file
index : str, optional
Defaults to 'sample_name'. The index to use in the loaded information

Returns
-------
Expand Down Expand Up @@ -167,6 +169,8 @@ def load_template_to_dataframe(fn, strip_whitespace=True):
+=======================+==============+
| sample_name | str |
+-----------------------+--------------+
| #SampleID | str |
+-----------------------+--------------+
| physical_location | str |
+-----------------------+--------------+
| has_physical_specimen | bool |
Expand Down Expand Up @@ -203,6 +207,17 @@ def load_template_to_dataframe(fn, strip_whitespace=True):
controlled_cols.update(CONTROLLED_COLS)
holdfile[0] = '\t'.join(c.lower() if c.lower() in controlled_cols else c
for c in cols)

if index == "#SampleID":
# We're going to parse a QIIME mapping file. We are going to first
# parse it with the QIIME function so we can remove the comments
# easily and make sure that QIIME will accept this as a mapping file
data, headers, comments = _parse_mapping_file(holdfile)
holdfile = ["%s\n" % '\t'.join(d) for d in data]
holdfile.insert(0, "%s\n" % '\t'.join(headers))
# The QIIME parser fixes the index and removes the #
index = 'SampleID'

# index_col:
# is set as False, otherwise it is cast as a float and we want a string
# keep_default:
Expand All @@ -224,7 +239,7 @@ def load_template_to_dataframe(fn, strip_whitespace=True):
keep_default_na=False, na_values=[''],
parse_dates=True, index_col=False, comment='\t',
mangle_dupe_cols=False, converters={
'sample_name': lambda x: str(x).strip(),
index: lambda x: str(x).strip(),
# required sample template information
'physical_location': str,
'sample_type': str,
Expand Down Expand Up @@ -263,21 +278,22 @@ def load_template_to_dataframe(fn, strip_whitespace=True):

initial_columns = set(template.columns)

if 'sample_name' not in template.columns:
raise QiitaDBColumnError("The 'sample_name' column is missing from "
"your template, this file cannot be parsed.")
if index not in template.columns:
raise QiitaDBColumnError("The '%s' column is missing from "
"your template, this file cannot be parsed."
% index)

# remove rows that have no sample identifier but that may have other data
# in the rest of the columns
template.dropna(subset=['sample_name'], how='all', inplace=True)
template.dropna(subset=[index], how='all', inplace=True)

# set the sample name as the index
template.set_index('sample_name', inplace=True)
template.set_index(index, inplace=True)

# it is not uncommon to find templates that have empty columns
template.dropna(how='all', axis=1, inplace=True)

initial_columns.remove('sample_name')
initial_columns.remove(index)
dropped_cols = initial_columns - set(template.columns)
if dropped_cols:
warnings.warn('The following column(s) were removed from the template '
Expand Down Expand Up @@ -315,3 +331,119 @@ def get_invalid_sample_names(sample_names):
inv.append(s)

return inv


def looks_like_qiime_mapping_file(fp):
    """Checks if the file looks like a QIIME mapping file

    Parameters
    ----------
    fp : str or file-like object
        filepath to check if it looks like a QIIME mapping file

    Returns
    -------
    bool
        True if fp looks like a QIIME mapping file, false otherwise. An
        empty file, or a file whose first line is blank, is not considered
        a QIIME mapping file.

    Notes
    -----
    This is not doing a validation of the QIIME mapping file. It simply checks
    the first line in the file and it returns true if the first
    whitespace-delimited field of that line is '#SampleID', since a
    sample/prep template will start with 'sample_name' or some other different
    column.
    """
    with open_file(fp, mode='U') as f:
        first_line = f.readline()

    # split() yields no fields for an empty or whitespace-only line, so
    # check that there is a first field before looking at it
    fields = first_line.split()
    return bool(fields) and fields[0] == '#SampleID'
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should you also check for LinkerPrimer, BarcodeSequence, and ReverseBarcodeSequence (optional)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is just a quick check to see if the file is a mapping file. Note that templates do not have any # sign at the beginning, not even for comments. The fact that the file starts with '#SampleID' is enough to know that we are not parsing a template, but a mapping file.



def _parse_mapping_file(lines, strip_quotes=True, suppress_stripping=False):
    """Parser for map file that relates samples to metadata.

    Format: header line with fields
            optionally other comment lines starting with #
            tab-delimited fields

    Parameters
    ----------
    lines : iterable of str
        The contents of the QIIME mapping file
    strip_quotes : bool, optional
        Defaults to true. If true, quotes are removed from the data
    suppress_stripping : bool, optional
        Defaults to false. If true, spaces are not stripped

    Returns
    -------
    list of lists, list of str, list of str
        The data in the mapping file, the headers and the comments

    Raises
    ------
    QiitaDBError
        If no header line or no data line is found in the mapping file

    Notes
    -----
    This code has been ported from QIIME.

    The first line starting with '#' is taken as the header; any later line
    starting with '#' is stored as a comment (without the leading '#'). Data
    rows with fewer fields than the header are padded with empty strings.
    """
    def strip_f(x):
        # Clean up a line or a single field according to the two flags
        if strip_quotes:
            x = x.replace('"', '')
        if not suppress_stripping:
            x = x.strip()
        return x

    # Create lists to store the results
    mapping_data = []
    header = []
    comments = []

    # Begin iterating over lines
    for line in lines:
        line = strip_f(line)
        # When stripping is suppressed a whitespace-only line is still a
        # non-empty string, so blank lines need the explicit second check
        if not line or (suppress_stripping and not line.strip()):
            continue

        if line.startswith('#'):
            line = line[1:]
            if not header:
                header = line.strip().split('\t')
            else:
                comments.append(line)
        else:
            # Build a real list (map() returns an iterator on Python 3, which
            # supports neither len() nor extend())
            fields = [strip_f(field) for field in line.split('\t')]
            # Will add empty string to empty fields
            missing = len(header) - len(fields)
            if missing > 0:
                fields.extend([''] * missing)
            mapping_data.append(fields)

    if not header:
        raise QiitaDBError("No header line was found in mapping file.")
    if not mapping_data:
        raise QiitaDBError("No data found in mapping file.")

    return mapping_data, header, comments
Loading