Commit
Merge pull request #224 from pymzml/feature/custom_regex_for_index_parsing

Feature/custom regex for index parsing
MKoesters committed May 26, 2020
2 parents a3d950f + 11d5741 commit b110673
Showing 8 changed files with 653 additions and 79 deletions.
23 changes: 23 additions & 0 deletions docs/source/quick_start.rst
@@ -33,4 +33,27 @@ no matter if an index was included into the file or not.
spectrum_with_id_2540 = run[ 2540 ]
Reading mzML indices with a custom regular expression
------------------------------------------------------

When reading mzML files whose index entries are neither plain integers nor IDs containing "scan=1" or similar,
you can pass a custom regular expression to parse the index when initializing the reader.

Say, for example, you have an index entry like the one in the example file Manuels_customs_ids.mzML::

    <offset idRef="ManuelsCustomID=1 diesdas">4026</offset>

.. code-block:: python

    #!/usr/bin/env python
    import pymzml
    import re

    index_re = re.compile(
        b'.*idRef="ManuelsCustomID=(?P<ID>.*) diesdas">(?P<offset>[0-9]*)</offset>'
    )
    run = pymzml.run.Reader(your_file_path, index_regex=index_re)
    spec_1 = run[1]

The regular expression has to contain a group named ID and a group named offset.
Also be aware that your regular expression needs to be compiled from a byte string.
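
As an illustrative sketch (not taken from the pymzml documentation), one quick way to verify that a custom pattern satisfies both requirements before handing it to the reader is to run it against a single raw offset line; the sample line below simply reuses the ManuelsCustomID format from above:

.. code-block:: python

    import re

    index_re = re.compile(
        b'.*idRef="ManuelsCustomID=(?P<ID>.*) diesdas">(?P<offset>[0-9]*)</offset>'
    )
    # one raw line copied out of the file's index section
    sample = b'<offset idRef="ManuelsCustomID=1 diesdas">4026</offset>'

    match = index_re.search(sample)
    assert match is not None, "pattern does not match the index line"
    # both named groups are required by the index parser
    assert match.group("ID") == b"1"
    assert match.group("offset") == b"4026"

If the captured ID can be converted to an integer, pymzml stores the spectrum under that integer key; otherwise the raw string is used as the key.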

99 changes: 39 additions & 60 deletions pymzml/file_classes/standardMzml.py
@@ -42,14 +42,15 @@ class StandardMzml(object):
"""
"""

def __init__(self, path, encoding, build_index_from_scratch=False):
def __init__(self, path, encoding, build_index_from_scratch=False, index_regex=None):
"""
Initialize wrapper object for standard mzML files.
Arguments:
path (str) : path to the file
encoding (str) : encoding of the file
"""
self.index_regex = index_regex
self.path = path
self.file_handler = self.get_file_handler(encoding)
self.offset_dict = dict()
@@ -131,7 +132,6 @@ def __getitem__(self, identifier):
elif type(identifier) == str:
return self._search_string_identifier(identifier)
else:
# spectrum = self._interpol_search(identifier)
spectrum = self._binary_search(identifier)

return spectrum
@@ -153,7 +153,6 @@ def _binary_search(self, target_index):
jump_history = {'forwards': 0, 'backwards': 0}
# This will be used if no spec was found at all during a jump
# self._average_bytes_per_spec *= 10
# print(f"self.seek_list : {self.seek_list}")
with open(self.path, "rb") as seeker:
if target_index not in self.offset_dict.keys():
for jump in range(40):
@@ -184,13 +183,6 @@
average_spec_between_m1_p1 = int(
round(byte_diff_m1_p1 / scan_diff_m1_p1)
)
# print("\n------------")
# print(f"jump {jump}")
# print(f"insert_pos {insert_position}")
# print(f"average_spec_between_m1_p1 {average_spec_between_m1_p1}")
# print(f"diff target to m1 / spec_offset_m1 {spec_offset_m1}")
# print(f"diff target to p1 / spec_offset_p1 {spec_offset_p1}")

# which side are we closer to ...
if spec_offset_m1 < spec_offset_p1:
# print("Closer to m1 - jumping forward")
@@ -205,19 +197,13 @@
# and read chunks until found
byte_offset = element_before[1]
else:
# print("Closer to p1 - jumping backwards")
jump_direction = 'backwards'
jump_history['forwards'] = 0
jump_history['backwards'] += 1
byte_offset = element_after[1] - jump_history['backwards'] * (
offset_scale * average_spec_between_m1_p1 * spec_offset_p1
)
byte_offset = int(byte_offset)
# print(f"jump_history {jump_history}")
# print(f"bytes offset {byte_offset}")
# print(f"offset_scale {offset_scale}")
# print(f"seek_list: {min(self.seek_list)} - {max(self.seek_list)} .. n: {len(self.seek_list)}")
# print(f"seek_list[:-10]: {self.seek_list[:10]}")
found_scan = False
chunk = b""
break_outer = False
@@ -227,7 +213,6 @@
max([os.SEEK_SET + byte_offset + x * chunk_size, 1])
)
chunk += seeker.read(chunk_size)
# print(f'read {len(chunk)}')
matches = re.finditer(regex_patterns.SPECTRUM_OPEN_PATTERN, chunk)
for _match_number, match in enumerate(matches):
if match is not None:
@@ -248,7 +233,6 @@
offset_scale = 1

if scan in self.offset_dict.keys():
# print("Have seen this scan {scan} already")
continue
found_scan = True
new_entry = (
@@ -274,12 +258,10 @@
break

start = self.offset_dict[target_index]
# print(f"reading spec at pos {start}")
seeker.seek(start[0])
match = None
data = b""
while b"</spectrum>" not in data:
# print("reading to end")
data += seeker.read(chunk_size)
end = data.find(b"</spectrum>")
seeker.seek(start[0])
@@ -308,13 +290,6 @@ def _build_index(self, from_scratch=False):
"""
# Declare the pre-seeker
seeker = self.get_binary_file_handler()
# Reading last 1024 bytes to find chromatogram Pos and SpectrumIndex Pos
index_list_offset_pattern = re.compile(
b"<indexListOffset>(?P<indexListOffset>[0-9]*)</indexListOffset>"
)
chromatogram_offset_pattern = re.compile(
b'(?P<WTF>[nativeID|idRef])="TIC">(?P<offset>[0-9]*)</offset'
)
self.offset_dict["TIC"] = None
seeker.seek(0, 2)
index_found = False
Expand All @@ -331,7 +306,7 @@ def _build_index(self, from_scratch=False):
break
# File is smaller than 10kbytes ...
for line in seeker:
match = chromatogram_offset_pattern.search(line)
match = regex_patterns.CHROMATOGRAM_OFFSET_PATTERN.search(line)
if match:
self.offset_dict["TIC"] = int(bytes.decode(match.group("offset")))

@@ -340,16 +315,12 @@ def _build_index(self, from_scratch=False):
spec_byte_offset = int(bytes.decode(match_spec.group("offset")))
sanity_check_set.add(spec_byte_offset)

match = index_list_offset_pattern.search(line)
match = regex_patterns.INDEX_LIST_OFFSET_PATTERN.search(line)
if match:
index_found = True
# print(int(match.group('indexListOffset').decode('utf-8')))
# print(line)
# exit(1)
index_list_offset = int(
match.group("indexListOffset").decode("utf-8")
)
# break

if index_found is True and self.offset_dict["TIC"] is not None:
break
@@ -365,33 +336,42 @@ def _build_index(self, from_scratch=False):
if match_spec and match_spec.group("nativeID") == b"":
match_spec = None
match_sim = sim_index_pattern.search(line)
if match_spec:
offset = int(bytes.decode(match_spec.group("offset")))
native_id = int(bytes.decode(match_spec.group("nativeID")))
self.offset_dict[native_id] = offset
elif match_sim:
offset = int(bytes.decode(match_sim.group("offset")))
native_id = bytes.decode(match_sim.group("nativeID"))
# if native_id == 'DECOY_126104_C[160]NVVISGGTGSGK/2_y10':
try:
native_id = int(
regex_patterns.SPECTRUM_ID_PATTERN.search(native_id).group(
1
if self.index_regex is None:
if match_spec:
offset = int(bytes.decode(match_spec.group("offset")))
native_id = int(bytes.decode(match_spec.group("nativeID")))
self.offset_dict[native_id] = offset
elif match_sim:
offset = int(bytes.decode(match_sim.group("offset")))
native_id = bytes.decode(match_sim.group("nativeID"))
try:
native_id = int(
regex_patterns.SPECTRUM_ID_PATTERN2.search(native_id).group(
2
)
)
)
# exit(1)
except AttributeError:
# match is None and has no attribute group,
# so use the whole string as ID
pass
self.offset_dict[native_id] = (offset,)
except AttributeError:
# match is None and has no attribute group,
# so use the whole string as ID
pass
self.offset_dict[native_id] = (offset,)
else:
match = self.index_regex.search(line)
if match:
native_id = match.group("ID")
try:
native_id = int(native_id)
except ValueError:
pass
offset = int(match.group("offset"))
self.offset_dict[native_id] = (offset,)

elif from_scratch is True:
seeker.seek(0)
self._build_index_from_scratch(seeker)
else:
print('[Warning] No index found and build_index_from_scratch is False')

seeker.close()


@@ -532,7 +512,6 @@ def _interpol_search(self, target_index, chunk_size=8, fallback_cutoff=100):

self.offset_dict[current_index] = (spec_start_offset,)
if current_index in used_indices:
# seeker.close()
if current_index > target_index:
jumper_scaling -= 0.1
else:
@@ -543,7 +522,6 @@ def _interpol_search(self, target_index, chunk_size=8, fallback_cutoff=100):
dist = current_index - target_index
if dist < -1 and dist > -(fallback_cutoff):
spectrum = self._search_linear(seeker, target_index)
# seeker.close()
spectrum_found = True
break
elif dist > 0 and dist < fallback_cutoff:
@@ -560,7 +538,6 @@ def _interpol_search(self, target_index, chunk_size=8, fallback_cutoff=100):
)
seeker.seek(current_position)
spectrum = self._search_linear(seeker, target_index)
# seeker.close()
spectrum_found = True
break

@@ -571,7 +548,6 @@ def _interpol_search(self, target_index, chunk_size=8, fallback_cutoff=100):
seeker.seek(start)
self.offset_dict[current_index] = (start, end)
xml_string = seeker.read(end - start)
# seeker.close()
spectrum = spec.Spectrum(XML(xml_string), measured_precision=5e-6)
spectrum_found = True
break
@@ -665,9 +641,12 @@ def _read_extremes(self):
match = regex_patterns.SPECTRUM_OPEN_PATTERN_SIMPLE.search(buffer)
if match is not None:
id_match = regex_patterns.SPECTRUM_ID_PATTERN_SIMPLE.search(buffer)
first_scan = int(
re.search(b"[0-9]*$", id_match.group("id")).group()
)
try:
first_scan = int(
re.search(b"[0-9]*$", id_match.group("id")).group()
)
except ValueError:
first_scan = 0
#
seek_list.append(
(first_scan, seeker.tell() - chunk_size + match.start())
5 changes: 3 additions & 2 deletions pymzml/file_interface.py
@@ -13,7 +13,7 @@
class FileInterface(object):
"""Interface to different mzML formats."""

def __init__(self, path, encoding, build_index_from_scratch=False):
def __init__(self, path, encoding, build_index_from_scratch=False, index_regex=None):
"""
Initialize an object interface to mzML files.
@@ -24,6 +24,7 @@ def __init__(self, path, encoding, build_index_from_scratch=False):
"""
self.build_index_from_scratch = build_index_from_scratch
self.encoding = encoding
self.index_regex = index_regex
self.file_handler = self._open(path)
self.offset_dict = self.file_handler.offset_dict

@@ -55,7 +56,7 @@ def _open(self, path_or_file):
else:
return standardGzip.StandardGzip(path_or_file, self.encoding)
return standardMzml.StandardMzml(
path_or_file, self.encoding, self.build_index_from_scratch
path_or_file, self.encoding, self.build_index_from_scratch, index_regex=self.index_regex
)

def _indexed_gzip(self, path):
9 changes: 9 additions & 0 deletions pymzml/regex_patterns.py
@@ -24,6 +24,7 @@
"""
SPECTRUM_PATTERN3 = regex.compile(r"((\w+)=(\w+\s*))+")
SPECTRUM_ID_PATTERN = re.compile(r'="{0,1}([0-9]*)"{0,1}>{0,1}$')
SPECTRUM_ID_PATTERN2 = re.compile(r'(scan|scanId)=(\d+)')
"""
Simplified spectrum id regex. Greedily catches ints at the end of a line
"""
@@ -70,3 +71,11 @@
r"<\s*(chromatogram|spectrum)\s*(id=(\".*?\")|index=\".*?\")\s(id=(\".*?\"))*\s*.*\sdefaultArrayLength=\"[0-9]+\">"
)
"""Regex to catch combined chromatogram and spectrum patterns"""

INDEX_LIST_OFFSET_PATTERN = re.compile(
b"<indexListOffset>(?P<indexListOffset>[0-9]*)</indexListOffset>"
)

CHROMATOGRAM_OFFSET_PATTERN = re.compile(
b'(?P<WTF>[nativeID|idRef])="TIC">(?P<offset>[0-9]*)</offset'
)
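
For orientation, a rough sketch of what these module-level patterns pick up during index parsing; the byte strings below are invented sample lines rather than output from a real file, and the import path assumes the installed pymzml package layout:

.. code-block:: python

    from pymzml import regex_patterns

    # byte offset of the index list, normally found near the end of the file
    m = regex_patterns.INDEX_LIST_OFFSET_PATTERN.search(
        b"<indexListOffset>4033</indexListOffset>"
    )
    assert int(m.group("indexListOffset")) == 4033

    # byte offset of the TIC chromatogram inside the index
    m = regex_patterns.CHROMATOGRAM_OFFSET_PATTERN.search(
        b'<offset idRef="TIC">5021</offset>'
    )
    assert int(m.group("offset")) == 5021

    # scan number extracted from a conventional nativeID string
    m = regex_patterns.SPECTRUM_ID_PATTERN2.search(
        "controllerType=0 controllerNumber=1 scan=2540"
    )
    assert int(m.group(2)) == 2540
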
3 changes: 3 additions & 0 deletions pymzml/run.py
@@ -82,9 +82,11 @@ def __init__(
obo_version=None,
build_index_from_scratch=False,
skip_chromatogram=True,
index_regex=None,
**kwargs
):
"""Initialize and set required attributes."""
self.index_regex = index_regex
self.build_index_from_scratch = build_index_from_scratch
self.skip_chromatogram = skip_chromatogram
if MS_precisions is None:
@@ -221,6 +223,7 @@ def _open_file(self, path_or_file):
path_or_file,
self.info["encoding"],
build_index_from_scratch=self.build_index_from_scratch,
index_regex=self.index_regex
)

def _guess_encoding(self, mzml_file):
20 changes: 12 additions & 8 deletions pymzml/spec.py
@@ -808,7 +808,7 @@ def ID(self):
except ValueError:
self._ID = match.group(1)
else:
self._ID = ""
self._ID = self.element.get("id")
return self._ID

@property
@@ -821,14 +821,18 @@ def id_dict(self):
"""
if self._id_dict is None:
tuples = []
captures = regex_patterns.SPECTRUM_PATTERN3.match(
match = regex_patterns.SPECTRUM_PATTERN3.match(
self.element.attrib["id"]
).captures(1)
for element in captures:
k, v = element.strip().split("=")
v = int(v)
tuples.append([k, v])
self._id_dict = dict(tuples)
)
if match is not None:
captures = match.captures(1)
for element in captures:
k, v = element.strip().split("=")
v = int(v)
tuples.append([k, v])
self._id_dict = dict(tuples)
else:
self._id_dict = {}
return self._id_dict

@property