.pdb header reading

samirelanduk · Sep 17, 2018 · 27a9b0b · 27a9b0b
1 parent 638e262
commit 27a9b0b
Show file tree

Hide file tree

Showing 5 changed files with 534 additions and 7 deletions.
diff --git a/atomium/mmcif.py b/atomium/mmcif.py
@@ -266,7 +266,7 @@ def mmcif_to_data_transfer(mmcif_dict, data_dict, d_cat, d_key, m_table, m_key,
     :param bool split: if True, the value will be split on commas.
     :param bool multi: if True, every row in the table will be read.
     :param function func: if given, this will be applied to the value."""
-    
+
     try:
         if multi:
             value = [row[m_key] for row in mmcif_dict[m_table]]

diff --git a/atomium/pdb.py b/atomium/pdb.py
@@ -1,5 +1,8 @@
 """Contains functions for dealing with the .pdb file format."""
 
+from datetime import datetime
+import re
+
 def pdb_string_to_pdb_dict(filestring):
     """Takes a .pdb filestring and turns into a ``dict`` which represents its
     record structure. Only lines which aren't empty are used.
@@ -51,3 +54,192 @@ def update_dict(d, key, value):
     try:
         d[key].append(value)
     except: d[key] = [value]
+
+
+def pdb_dict_to_data_dict(pdb_dict):
+    """Converts a .pdb dictionary into an atomium data dictionary, with the
+    same standard layout that the other file formats get converted into.
+
+    :param dict pdb_dict: the .pdb dictionary.
+    :rtype: ``dict``"""
+
+    data_dict = {
+     "description": {
+      "code": None, "title": None, "deposition_date": None,
+      "classification": None, "keywords": [], "authors": []
+     }, "experiment": {
+      "technique": None, "source_organism": None, "expression_system": None
+     }, "quality": {"resolution": None, "rvalue": None, "rfree": None}
+    }
+    update_description_dict(pdb_dict, data_dict)
+    update_experiment_dict(pdb_dict, data_dict)
+    update_quality_dict(pdb_dict, data_dict)
+    return data_dict
+
+
+def update_description_dict(pdb_dict, data_dict):
+    """Creates the description component of a standard atomium data dictionary
+    from a .pdb dictionary.
+
+    :param dict pdb_dict: The .pdb dictionary to read.
+    :param dict data_dict: The data dictionary to update."""
+
+    extract_header(pdb_dict, data_dict["description"])
+    extract_title(pdb_dict, data_dict["description"])
+    extract_keywords(pdb_dict, data_dict["description"])
+    extract_authors(pdb_dict, data_dict["description"])
+
+
+def update_experiment_dict(pdb_dict, data_dict):
+    """Creates the experiment component of a standard atomium data dictionary
+    from a .pdb dictionary.
+
+    :param dict pdb_dict: The .pdb dictionary to read.
+    :param dict data_dict: The data dictionary to update."""
+
+    extract_technique(pdb_dict, data_dict["experiment"])
+    extract_source(pdb_dict, data_dict["experiment"])
+
+
+def update_quality_dict(pdb_dict, data_dict):
+    """Creates the quality component of a standard atomium data dictionary
+    from a .pdb dictionary.
+
+    :param dict pdb_dict: The .pdb dictionary to read.
+    :param dict data_dict: The data dictionary to update."""
+
+    extract_resolution_remark(pdb_dict, data_dict["quality"])
+    extract_rvalue_remark(pdb_dict, data_dict["quality"])
+
+
+def extract_header(pdb_dict, description_dict):
+    """Takes a ``dict`` and adds header information to it by parsing the HEADER
+    line.
+
+    :param dict pdb_dict: the ``dict`` to read.
+    :param dict description_dict: the ``dict`` to update."""
+
+    if pdb_dict.get("HEADER"):
+        line = pdb_dict["HEADER"][0]
+        if line[50:59].strip():
+            description_dict["deposition_date"] = datetime.strptime(
+             line[50:59], "%d-%b-%y"
+            ).date()
+        if line[62:66].strip(): description_dict["code"] = line[62:66]
+        if line[10:50].strip():
+            description_dict["classification"] = line[10:50].strip()
+
+
+def extract_title(pdb_dict, description_dict):
+    """Takes a ``dict`` and adds header information to it by parsing the TITLE
+    lines.
+
+    :param dict pdb_dict: the ``dict`` to read.
+    :param dict description_dict: the ``dict`` to update."""
+
+    if pdb_dict.get("TITLE"):
+        description_dict["title"] = merge_lines(pdb_dict["TITLE"], 10)
+
+
+def extract_keywords(pdb_dict, description_dict):
+    """Takes a ``dict`` and adds header information to it by parsing the KEYWDS
+    line.
+
+    :param dict pdb_dict: the ``dict`` to read.
+    :param dict description_dict: the ``dict`` to update."""
+
+    if pdb_dict.get("KEYWDS"):
+        text = merge_lines(pdb_dict["KEYWDS"], 10)
+        description_dict["keywords"] = [w.strip() for w in text.split(",")]
+
+
+def extract_authors(pdb_dict, description_dict):
+    """Takes a ``dict`` and adds header information to it by parsing the AUTHOR
+    line.
+
+    :param dict pdb_dict: the ``dict`` to read.
+    :param dict description_dict: the ``dict`` to update."""
+
+    if pdb_dict.get("AUTHOR"):
+        text = merge_lines(pdb_dict["AUTHOR"], 10)
+        description_dict["authors"] = [w.strip() for w in text.split(",")]
+
+
+def extract_technique(pdb_dict, experiment_dict):
+    """Takes a ``dict`` and adds technique information to it by parsing EXPDTA
+    lines.
+
+    :param dict pdb_dict: the ``dict`` to read.
+    :param dict experiment_dict: the ``dict`` to update."""
+
+    if pdb_dict.get("EXPDTA"):
+        if pdb_dict["EXPDTA"][0].strip():
+            experiment_dict["technique"] = pdb_dict["EXPDTA"][0][6:].strip()
+
+
+def extract_source(pdb_dict, experiment_dict):
+    """Takes a ``dict`` and adds source information to it by parsing SOURCE
+    lines.
+
+    :param dict pdb_dict: the ``dict`` to read.
+    :param dict experiment_dict: the ``dict`` to update."""
+
+    if pdb_dict.get("SOURCE"):
+        data = merge_lines(pdb_dict["SOURCE"], 10)
+        patterns = {
+         "source_organism": r"ORGANISM_SCIENTIFIC\: (.+?);",
+         "expression_system": r"EXPRESSION_SYSTEM\: (.+?);"
+        }
+        for attribute, pattern in patterns.items():
+            matches = re.findall(pattern, data)
+            if matches:
+                experiment_dict[attribute] = matches[0]
+
+
+def extract_resolution_remark(pdb_dict, quality_dict):
+    """Takes a ``dict`` and adds resolution information to it by parsing REMARK
+    2 lines.
+
+    :param dict pdb_dict: the ``dict`` to read.
+    :param dict quality_dict: the ``dict`` to update."""
+
+    if pdb_dict.get("REMARK") and pdb_dict["REMARK"].get("2"):
+        for remark in pdb_dict["REMARK"]["2"]:
+            try:
+                quality_dict["resolution"] = float(remark[10:].strip().split()[1])
+                break
+            except: pass
+
+
+def extract_rvalue_remark(pdb_dict, quality_dict):
+    """Takes a ``dict`` and adds R value information to it by parsing REMARK
+    3 lines.
+
+    :param dict pdb_dict: the ``dict`` to read.
+    :param dict quality_dict: the ``dict`` to update."""
+
+    if pdb_dict.get("REMARK") and pdb_dict["REMARK"].get("3"):
+        patterns = {
+         "rvalue": r"R VALUE[ ]{2,}\(WORKING SET\) : (.+)",
+         "rfree": r"FREE R VALUE[ ]{2,}: (.+)",
+        }
+        for attribute, pattern in patterns.items():
+            for remark in pdb_dict["REMARK"]["3"]:
+                matches = re.findall(pattern, remark.strip())
+                if matches:
+                    try:
+                        quality_dict[attribute] = float(matches[0].strip())
+                    except: pass
+                    break
+
+
+def merge_lines(lines, start, join=" "):
+    """Gets a single continuous string from a sequence of lines.
+
+    :param list lines: The lines to merge.
+    :param int start: The start point in each record.
+    :param str join: The string to join on.
+    :rtype: ``str``"""
+
+    string = join.join([line[start:].strip() for line in lines])
+    return string
diff --git a/atomium/utilities.py b/atomium/utilities.py
@@ -4,7 +4,7 @@
 from requests import get
 from .mmcif import mmcif_string_to_mmcif_dict, mmcif_dict_to_data_dict
 from .mmtf import mmtf_bytes_to_mmtf_dict, mmtf_dict_to_data_dict
-from .pdb import pdb_string_to_pdb_dict
+from .pdb import pdb_string_to_pdb_dict, pdb_dict_to_data_dict
 
 def open(path, *args, **kwargs):
     try:
@@ -44,5 +44,5 @@ def get_parse_functions(filestring, path):
             return {
              "cif": (mmcif_string_to_mmcif_dict, mmcif_dict_to_data_dict),
              "mmtf": (mmtf_bytes_to_mmtf_dict, mmtf_dict_to_data_dict),
-             "pdb": (pdb_string_to_pdb_dict, None)
+             "pdb": (pdb_string_to_pdb_dict, pdb_dict_to_data_dict)
             }[ending]
diff --git a/tests/integration/test_file_reading.py b/tests/integration/test_file_reading.py
@@ -144,3 +144,26 @@ def test_5xme_file_dict(self):
          d["MODEL"][1][4],
          "ATOM      5  CB  ALA A 199      36.093  -8.556  -1.452  1.00  0.00           C"
         )
+
+
+    def test_1lol_data_dict(self):
+        d = atomium.open("tests/integration/files/1lol.pdb", data_dict=True)
+        self.assertEqual(set(d.keys()), {
+         "description", "experiment", "quality"
+        })
+        self.assertEqual(d["description"], {
+         "code": "1LOL",
+         "title": "CRYSTAL STRUCTURE OF OROTIDINE MONOPHOSPHATE DECARBOXYLASE COMPLEX WITH XMP",
+         "deposition_date": date(2002, 5, 6),
+         "classification": "LYASE",
+         "keywords": ["TIM BARREL", "LYASE"],
+         "authors": ["N.WU", "E.F.PAI"]
+        })
+        self.assertEqual(d["experiment"], {
+         "technique": "X-RAY DIFFRACTION",
+         "source_organism": "METHANOTHERMOBACTER THERMAUTOTROPHICUS STR. DELTA H",
+         "expression_system": "ESCHERICHIA COLI"
+        })
+        self.assertEqual(d["quality"], {
+         "resolution": 1.9, "rvalue": 0.193, "rfree": 0.229
+        })