Commit
Merge pull request #224 from pymzml/feature/custom_regex_for_index_parsing

Feature/custom regex for index parsing
MKoesters committed May 26, 2020
2 parents a3d950f + 11d5741 commit b110673
Showing 8 changed files with 653 additions and 79 deletions.
23 changes: 23 additions & 0 deletions docs/source/quick_start.rst
@@ -33,4 +33,27 @@ no matter if an index was included into the file or not.
spectrum_with_id_2540 = run[ 2540 ]
Reading mzML indices with a custom regular expression
------------------------------------------------------

When reading mzML files whose index entries are neither plain integers nor IDs containing "scan=1" or similar,
you can pass a custom regular expression to parse the index when initializing the reader.

Say, for example, you have an index entry like the one in the example file Manuels_customs_ids.mzML::

    <offset idRef="ManuelsCustomID=1 diesdas">4026</offset>

.. code-block:: python

    #!/usr/bin/env python
    import pymzml
    import re

    index_re = re.compile(
        b'.*idRef="ManuelsCustomID=(?P<ID>.*) diesdas">(?P<offset>[0-9]*)</offset>'
    )
    run = pymzml.run.Reader(your_file_path, index_regex=index_re)
    spec_1 = run[1]

The regular expression has to contain a group named ID and a group named offset.
Also be aware that your regular expression needs to be compiled from a byte string.
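
As an illustrative sketch (not taken from the pymzml documentation), one quick way to verify that a custom pattern satisfies both requirements before handing it to the reader is to run it against a single raw offset line; the sample line below simply reuses the ManuelsCustomID format from above:

.. code-block:: python

    import re

    index_re = re.compile(
        b'.*idRef="ManuelsCustomID=(?P<ID>.*) diesdas">(?P<offset>[0-9]*)</offset>'
    )
    # one raw line copied out of the file's index section
    sample = b'<offset idRef="ManuelsCustomID=1 diesdas">4026</offset>'

    match = index_re.search(sample)
    assert match is not None, "pattern does not match the index line"
    # both named groups are required by the index parser
    assert match.group("ID") == b"1"
    assert match.group("offset") == b"4026"

If the captured ID can be converted to an integer, pymzml stores the spectrum under that integer key; otherwise the raw string is used as the key.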

99 changes: 39 additions & 60 deletions pymzml/file_classes/standardMzml.py
@@ -42,14 +42,15 @@ class StandardMzml(object):
"""
"""

def __init__(self, path, encoding, build_index_from_scratch=False):
def __init__(self, path, encoding, build_index_from_scratch=False, index_regex=None):
"""
Initialize wrapper object for standard mzML files.
Arguments:
path (str) : path to the file
encoding (str) : encoding of the file
"""
self.index_regex = index_regex
self.path = path
self.file_handler = self.get_file_handler(encoding)
self.offset_dict = dict()
@@ -131,7 +132,6 @@ def __getitem__(self, identifier):
elif type(identifier) == str:
return self._search_string_identifier(identifier)
else:
# spectrum = self._interpol_search(identifier)
spectrum = self._binary_search(identifier)

return spectrum
@@ -153,7 +153,6 @@ def _binary_search(self, target_index):
jump_history = {'forwards': 0, 'backwards': 0}
# This will be used if no spec was found at all during a jump
# self._average_bytes_per_spec *= 10
# print(f"self.seek_list : {self.seek_list}")
with open(self.path, "rb") as seeker:
if target_index not in self.offset_dict.keys():
for jump in range(40):
@@ -184,13 +183,6 @@
average_spec_between_m1_p1 = int(
round(byte_diff_m1_p1 / scan_diff_m1_p1)
)
# print("\n------------")
# print(f"jump {jump}")
# print(f"insert_pos {insert_position}")
# print(f"average_spec_between_m1_p1 {average_spec_between_m1_p1}")
# print(f"diff target to m1 / spec_offset_m1 {spec_offset_m1}")
# print(f"diff target to p1 / spec_offset_p1 {spec_offset_p1}")

# which side are we closer to ...
if spec_offset_m1 < spec_offset_p1:
# print("Closer to m1 - jumping forward")
@@ -205,19 +197,13 @@
# and read chunks until found
byte_offset = element_before[1]
else:
# print("Closer to p1 - jumping backwards")
jump_direction = 'backwards'
jump_history['forwards'] = 0
jump_history['backwards'] += 1
byte_offset = element_after[1] - jump_history['backwards'] * (
offset_scale * average_spec_between_m1_p1 * spec_offset_p1
)
byte_offset = int(byte_offset)
# print(f"jump_history {jump_history}")
# print(f"bytes offset {byte_offset}")
# print(f"offset_scale {offset_scale}")
# print(f"seek_list: {min(self.seek_list)} - {max(self.seek_list)} .. n: {len(self.seek_list)}")
# print(f"seek_list[:-10]: {self.seek_list[:10]}")
found_scan = False
chunk = b""
break_outer = False
@@ -227,7 +213,6 @@
max([os.SEEK_SET + byte_offset + x * chunk_size, 1])
)
chunk += seeker.read(chunk_size)
# print(f'read {len(chunk)}')
matches = re.finditer(regex_patterns.SPECTRUM_OPEN_PATTERN, chunk)
for _match_number, match in enumerate(matches):
if match is not None:
@@ -248,7 +233,6 @@
offset_scale = 1

if scan in self.offset_dict.keys():
# print("Have seen this scan {scan} already")
continue
found_scan = True
new_entry = (
@@ -274,12 +258,10 @@
break

start = self.offset_dict[target_index]
# print(f"reading spec at pos {start}")
seeker.seek(start[0])
match = None
data = b""
while b"</spectrum>" not in data:
# print("reading to end")
data += seeker.read(chunk_size)
end = data.find(b"</spectrum>")
seeker.seek(start[0])
@@ -308,13 +290,6 @@ def _build_index(self, from_scratch=False):
"""
# Declare the pre-seeker
seeker = self.get_binary_file_handler()
# Reading last 1024 bytes to find chromatogram Pos and SpectrumIndex Pos
index_list_offset_pattern = re.compile(
b"<indexListOffset>(?P<indexListOffset>[0-9]*)</indexListOffset>"
)
chromatogram_offset_pattern = re.compile(
b'(?P<WTF>[nativeID|idRef])="TIC">(?P<offset>[0-9]*)</offset'
)
self.offset_dict["TIC"] = None
seeker.seek(0, 2)
index_found = False
Expand All @@ -331,7 +306,7 @@ def _build_index(self, from_scratch=False):
break
# File is smaller than 10kbytes ...
for line in seeker:
match = chromatogram_offset_pattern.search(line)
match = regex_patterns.CHROMATOGRAM_OFFSET_PATTERN.search(line)
if match:
self.offset_dict["TIC"] = int(bytes.decode(match.group("offset")))

@@ -340,16 +315,12 @@ def _build_index(self, from_scratch=False):
spec_byte_offset = int(bytes.decode(match_spec.group("offset")))
sanity_check_set.add(spec_byte_offset)

match = index_list_offset_pattern.search(line)
match = regex_patterns.INDEX_LIST_OFFSET_PATTERN.search(line)
if match:
index_found = True
# print(int(match.group('indexListOffset').decode('utf-8')))
# print(line)
# exit(1)
index_list_offset = int(
match.group("indexListOffset").decode("utf-8")
)
# break

if index_found is True and self.offset_dict["TIC"] is not None:
break
@@ -365,33 +336,42 @@ def _build_index(self, from_scratch=False):
if match_spec and match_spec.group("nativeID") == b"":
match_spec = None
match_sim = sim_index_pattern.search(line)
if match_spec:
offset = int(bytes.decode(match_spec.group("offset")))
native_id = int(bytes.decode(match_spec.group("nativeID")))
self.offset_dict[native_id] = offset
elif match_sim:
offset = int(bytes.decode(match_sim.group("offset")))
native_id = bytes.decode(match_sim.group("nativeID"))
# if native_id == 'DECOY_126104_C[160]NVVISGGTGSGK/2_y10':
try:
native_id = int(
regex_patterns.SPECTRUM_ID_PATTERN.search(native_id).group(
1
if self.index_regex is None:
if match_spec:
offset = int(bytes.decode(match_spec.group("offset")))
native_id = int(bytes.decode(match_spec.group("nativeID")))
self.offset_dict[native_id] = offset
elif match_sim:
offset = int(bytes.decode(match_sim.group("offset")))
native_id = bytes.decode(match_sim.group("nativeID"))
try:
native_id = int(
regex_patterns.SPECTRUM_ID_PATTERN2.search(native_id).group(
2
)
)
)
# exit(1)
except AttributeError:
# match is None and has no attribute group,
# so use the whole string as ID
pass
self.offset_dict[native_id] = (offset,)
except AttributeError:
# match is None and has no attribute group,
# so use the whole string as ID
pass
self.offset_dict[native_id] = (offset,)
else:
match = self.index_regex.search(line)
if match:
native_id = match.group("ID")
try:
native_id = int(native_id)
except ValueError:
pass
offset = int(match.group("offset"))
self.offset_dict[native_id] = (offset,)

elif from_scratch is True:
seeker.seek(0)
self._build_index_from_scratch(seeker)
else:
print('[Warning] No index found and build_index_from_scratch is False')

seeker.close()


@@ -532,7 +512,6 @@ def _interpol_search(self, target_index, chunk_size=8, fallback_cutoff=100):

self.offset_dict[current_index] = (spec_start_offset,)
if current_index in used_indices:
# seeker.close()
if current_index > target_index:
jumper_scaling -= 0.1
else:
@@ -543,7 +522,6 @@ def _interpol_search(self, target_index, chunk_size=8, fallback_cutoff=100):
dist = current_index - target_index
if dist < -1 and dist > -(fallback_cutoff):
spectrum = self._search_linear(seeker, target_index)
# seeker.close()
spectrum_found = True
break
elif dist > 0 and dist < fallback_cutoff:
@@ -560,7 +538,6 @@ def _interpol_search(self, target_index, chunk_size=8, fallback_cutoff=100):
)
seeker.seek(current_position)
spectrum = self._search_linear(seeker, target_index)
# seeker.close()
spectrum_found = True
break

@@ -571,7 +548,6 @@ def _interpol_search(self, target_index, chunk_size=8, fallback_cutoff=100):
seeker.seek(start)
self.offset_dict[current_index] = (start, end)
xml_string = seeker.read(end - start)
# seeker.close()
spectrum = spec.Spectrum(XML(xml_string), measured_precision=5e-6)
spectrum_found = True
break
@@ -665,9 +641,12 @@ def _read_extremes(self):
match = regex_patterns.SPECTRUM_OPEN_PATTERN_SIMPLE.search(buffer)
if match is not None:
id_match = regex_patterns.SPECTRUM_ID_PATTERN_SIMPLE.search(buffer)
first_scan = int(
re.search(b"[0-9]*$", id_match.group("id")).group()
)
try:
first_scan = int(
re.search(b"[0-9]*$", id_match.group("id")).group()
)
except ValueError:
first_scan = 0
#
seek_list.append(
(first_scan, seeker.tell() - chunk_size + match.start())
5 changes: 3 additions & 2 deletions pymzml/file_interface.py
@@ -13,7 +13,7 @@
class FileInterface(object):
"""Interface to different mzML formats."""

def __init__(self, path, encoding, build_index_from_scratch=False):
def __init__(self, path, encoding, build_index_from_scratch=False, index_regex=None):
"""
Initialize an object interface to mzML files.
@@ -24,6 +24,7 @@ def __init__(self, path, encoding, build_index_from_scratch=False):
"""
self.build_index_from_scratch = build_index_from_scratch
self.encoding = encoding
self.index_regex = index_regex
self.file_handler = self._open(path)
self.offset_dict = self.file_handler.offset_dict

@@ -55,7 +56,7 @@ def _open(self, path_or_file):
else:
return standardGzip.StandardGzip(path_or_file, self.encoding)
return standardMzml.StandardMzml(
path_or_file, self.encoding, self.build_index_from_scratch
path_or_file, self.encoding, self.build_index_from_scratch, index_regex=self.index_regex
)

def _indexed_gzip(self, path):
9 changes: 9 additions & 0 deletions pymzml/regex_patterns.py
@@ -24,6 +24,7 @@
"""
SPECTRUM_PATTERN3 = regex.compile(r"((\w+)=(\w+\s*))+")
SPECTRUM_ID_PATTERN = re.compile(r'="{0,1}([0-9]*)"{0,1}>{0,1}$')
SPECTRUM_ID_PATTERN2 = re.compile(r'(scan|scanId)=(\d+)')
"""
Simplified spectrum id regex. Greedily catches ints at the end of a line
"""
@@ -70,3 +71,11 @@
r"<\s*(chromatogram|spectrum)\s*(id=(\".*?\")|index=\".*?\")\s(id=(\".*?\"))*\s*.*\sdefaultArrayLength=\"[0-9]+\">"
)
"""Regex to catch combined chromatogram and spectrum patterns"""

INDEX_LIST_OFFSET_PATTERN = re.compile(
b"<indexListOffset>(?P<indexListOffset>[0-9]*)</indexListOffset>"
)

CHROMATOGRAM_OFFSET_PATTERN = re.compile(
b'(?P<WTF>[nativeID|idRef])="TIC">(?P<offset>[0-9]*)</offset'
)
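
For orientation, a rough sketch of what these module-level patterns pick up during index parsing; the byte strings below are invented sample lines rather than output from a real file, and the import path assumes the installed pymzml package layout:

.. code-block:: python

    from pymzml import regex_patterns

    # byte offset of the index list, normally found near the end of the file
    m = regex_patterns.INDEX_LIST_OFFSET_PATTERN.search(
        b"<indexListOffset>4033</indexListOffset>"
    )
    assert int(m.group("indexListOffset")) == 4033

    # byte offset of the TIC chromatogram inside the index
    m = regex_patterns.CHROMATOGRAM_OFFSET_PATTERN.search(
        b'<offset idRef="TIC">5021</offset>'
    )
    assert int(m.group("offset")) == 5021

    # scan number extracted from a conventional nativeID string
    m = regex_patterns.SPECTRUM_ID_PATTERN2.search(
        "controllerType=0 controllerNumber=1 scan=2540"
    )
    assert int(m.group(2)) == 2540
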
3 changes: 3 additions & 0 deletions pymzml/run.py
@@ -82,9 +82,11 @@ def __init__(
obo_version=None,
build_index_from_scratch=False,
skip_chromatogram=True,
index_regex=None,
**kwargs
):
"""Initialize and set required attributes."""
self.index_regex = index_regex
self.build_index_from_scratch = build_index_from_scratch
self.skip_chromatogram = skip_chromatogram
if MS_precisions is None:
@@ -221,6 +223,7 @@ def _open_file(self, path_or_file):
path_or_file,
self.info["encoding"],
build_index_from_scratch=self.build_index_from_scratch,
index_regex=self.index_regex
)

def _guess_encoding(self, mzml_file):
20 changes: 12 additions & 8 deletions pymzml/spec.py
@@ -808,7 +808,7 @@ def ID(self):
except ValueError:
self._ID = match.group(1)
else:
self._ID = ""
self._ID = self.element.get("id")
return self._ID

@property
@@ -821,14 +821,18 @@ def id_dict(self):
"""
if self._id_dict is None:
tuples = []
captures = regex_patterns.SPECTRUM_PATTERN3.match(
match = regex_patterns.SPECTRUM_PATTERN3.match(
self.element.attrib["id"]
).captures(1)
for element in captures:
k, v = element.strip().split("=")
v = int(v)
tuples.append([k, v])
self._id_dict = dict(tuples)
)
if match is not None:
captures = match.captures(1)
for element in captures:
k, v = element.strip().split("=")
v = int(v)
tuples.append([k, v])
self._id_dict = dict(tuples)
else:
self._id_dict = {}
return self._id_dict

@property