Skip to content

Commit

Permalink
.pdb header reading
Browse files Browse the repository at this point in the history
  • Loading branch information
samirelanduk committed Sep 17, 2018
1 parent 638e262 commit 27a9b0b
Show file tree
Hide file tree
Showing 5 changed files with 534 additions and 7 deletions.
2 changes: 1 addition & 1 deletion atomium/mmcif.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@ def mmcif_to_data_transfer(mmcif_dict, data_dict, d_cat, d_key, m_table, m_key,
:param bool split: if True, the value will be split on commas.
:param bool multi: if True, every row in the table will be read.
:param function func: if given, this will be applied to the value."""

try:
if multi:
value = [row[m_key] for row in mmcif_dict[m_table]]
Expand Down
192 changes: 192 additions & 0 deletions atomium/pdb.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
"""Contains functions for dealing with the .mmtf file format."""

from datetime import datetime
import re

def pdb_string_to_pdb_dict(filestring):
"""Takes a .pdb filestring and turns into a ``dict`` which represents its
record structure. Only lines which aren't empty are used.
Expand Down Expand Up @@ -51,3 +54,192 @@ def update_dict(d, key, value):
try:
d[key].append(value)
except: d[key] = [value]


def pdb_dict_to_data_dict(pdb_dict):
    """Builds an atomium data dictionary from a .pdb dictionary, using the
    same standard layout that the other file formats get converted into.

    The result has three sections - "description", "experiment" and
    "quality" - each starting out with empty defaults and then being filled
    in by the corresponding update function.

    :param dict pdb_dict: the .pdb dictionary.
    :rtype: ``dict``"""

    description = {
        "code": None, "title": None, "deposition_date": None,
        "classification": None, "keywords": [], "authors": [],
    }
    experiment = {
        "technique": None, "source_organism": None, "expression_system": None,
    }
    quality = {"resolution": None, "rvalue": None, "rfree": None}
    data_dict = {
        "description": description,
        "experiment": experiment,
        "quality": quality,
    }
    updaters = (
        update_description_dict, update_experiment_dict, update_quality_dict,
    )
    for update in updaters:
        update(pdb_dict, data_dict)
    return data_dict


def update_description_dict(pdb_dict, data_dict):
    """Fills in the description component of a standard atomium data
    dictionary by running each description extractor over a .pdb dictionary.

    :param dict pdb_dict: The .pdb dictionary to read.
    :param dict data_dict: The data dictionary to update."""

    description = data_dict["description"]
    extractors = (
        extract_header, extract_title, extract_keywords, extract_authors,
    )
    for extract in extractors:
        extract(pdb_dict, description)


def update_experiment_dict(pdb_dict, data_dict):
    """Fills in the experiment component of a standard atomium data
    dictionary by running each experiment extractor over a .pdb dictionary.

    :param dict pdb_dict: The .pdb dictionary to read.
    :param dict data_dict: The data dictionary to update."""

    experiment = data_dict["experiment"]
    for extract in (extract_technique, extract_source):
        extract(pdb_dict, experiment)


def update_quality_dict(pdb_dict, data_dict):
    """Fills in the quality component of a standard atomium data dictionary
    by running each quality extractor over a .pdb dictionary.

    :param dict pdb_dict: The .pdb dictionary to read.
    :param dict data_dict: The data dictionary to update."""

    quality = data_dict["quality"]
    for extract in (extract_resolution_remark, extract_rvalue_remark):
        extract(pdb_dict, quality)


def extract_header(pdb_dict, description_dict):
    """Reads the first HEADER line of a .pdb dictionary and copies any
    non-blank fields it contains into the description dictionary.

    The fields are taken from fixed character ranges of the line: the
    classification from 10-50, the deposition date from 50-59 (parsed as
    ``%d-%b-%y``) and the code from 62-66.

    :param dict pdb_dict: the ``dict`` to read.
    :param dict description_dict: the ``dict`` to update."""

    records = pdb_dict.get("HEADER")
    if not records:
        return
    header = records[0]
    date_field = header[50:59]
    code_field = header[62:66]
    classification = header[10:50].strip()
    if date_field.strip():
        parsed = datetime.strptime(date_field, "%d-%b-%y")
        description_dict["deposition_date"] = parsed.date()
    if code_field.strip():
        description_dict["code"] = code_field
    if classification:
        description_dict["classification"] = classification


def extract_title(pdb_dict, description_dict):
    """Sets the title of the description dictionary by merging the TITLE
    lines of a .pdb dictionary, if there are any.

    :param dict pdb_dict: the ``dict`` to read.
    :param dict description_dict: the ``dict`` to update."""

    records = pdb_dict.get("TITLE")
    if not records:
        return
    description_dict["title"] = merge_lines(records, 10)


def extract_keywords(pdb_dict, description_dict):
    """Sets the keywords of the description dictionary by merging the KEYWDS
    lines of a .pdb dictionary and splitting the result on commas.

    :param dict pdb_dict: the ``dict`` to read.
    :param dict description_dict: the ``dict`` to update."""

    records = pdb_dict.get("KEYWDS")
    if not records:
        return
    merged = merge_lines(records, 10)
    description_dict["keywords"] = list(map(str.strip, merged.split(",")))


def extract_authors(pdb_dict, description_dict):
    """Sets the authors of the description dictionary by merging the AUTHOR
    lines of a .pdb dictionary and splitting the result on commas.

    :param dict pdb_dict: the ``dict`` to read.
    :param dict description_dict: the ``dict`` to update."""

    records = pdb_dict.get("AUTHOR")
    if not records:
        return
    merged = merge_lines(records, 10)
    description_dict["authors"] = list(map(str.strip, merged.split(",")))


def extract_technique(pdb_dict, experiment_dict):
    """Takes a ``dict`` and adds technique information to it by parsing the
    first EXPDTA line.

    The technique is whatever follows the six-character record name, with
    surrounding whitespace removed. If nothing follows the record name, the
    existing value is left untouched rather than being replaced with an
    empty string.

    :param dict pdb_dict: the ``dict`` to read.
    :param dict experiment_dict: the ``dict`` to update."""

    records = pdb_dict.get("EXPDTA")
    if records:
        # Test the text after the record name, not the whole line - the line
        # itself is never blank because it always carries "EXPDTA".
        technique = records[0][6:].strip()
        if technique:
            experiment_dict["technique"] = technique


def extract_source(pdb_dict, experiment_dict):
    """Takes a ``dict`` and adds source information to it by parsing SOURCE
    lines.

    The SOURCE lines are merged into one string, and the values of the
    ``ORGANISM_SCIENTIFIC`` and ``EXPRESSION_SYSTEM`` tokens are pulled out of
    it. A value runs up to the next semicolon, or to the end of the merged
    text if it is the final token and has no terminating semicolon.

    :param dict pdb_dict: the ``dict`` to read.
    :param dict experiment_dict: the ``dict`` to update."""

    if pdb_dict.get("SOURCE"):
        data = merge_lines(pdb_dict["SOURCE"], 10)
        # The (?:;|$) tail lets the last token in the record match even when
        # it is not followed by a semicolon.
        patterns = {
            "source_organism": r"ORGANISM_SCIENTIFIC\: (.+?)(?:;|$)",
            "expression_system": r"EXPRESSION_SYSTEM\: (.+?)(?:;|$)",
        }
        for attribute, pattern in patterns.items():
            match = re.search(pattern, data)
            if match:
                experiment_dict[attribute] = match.group(1)


def extract_resolution_remark(pdb_dict, quality_dict):
    """Takes a ``dict`` and adds resolution information to it by parsing REMARK
    2 lines.

    Each REMARK 2 line is tried in turn: the second whitespace-separated word
    after the first ten characters is converted to a ``float``. The first line
    for which this works supplies the resolution; lines that are too short or
    whose second word is not a number are skipped.

    :param dict pdb_dict: the ``dict`` to read.
    :param dict quality_dict: the ``dict`` to update."""

    remarks = pdb_dict.get("REMARK")
    if not remarks:
        return
    for remark in remarks.get("2") or ():
        try:
            quality_dict["resolution"] = float(remark[10:].strip().split()[1])
        except (IndexError, ValueError):
            # Fewer than two words, or a non-numeric second word - only these
            # expected parse failures are skipped, nothing broader.
            continue
        break


def extract_rvalue_remark(pdb_dict, quality_dict):
    """Takes a ``dict`` and adds R-value information to it by parsing REMARK
    3 lines.

    For each of ``rvalue`` (working set R value) and ``rfree`` (free R value),
    the REMARK 3 lines are searched for the first line matching that
    attribute's pattern. If the matched text converts to a ``float`` it is
    stored; if not, the attribute is left as it was. Either way, no further
    lines are examined for that attribute.

    :param dict pdb_dict: the ``dict`` to read.
    :param dict quality_dict: the ``dict`` to update."""

    remarks = pdb_dict.get("REMARK")
    if not (remarks and remarks.get("3")):
        return
    patterns = {
        "rvalue": r"R VALUE[ ]{2,}\(WORKING SET\) : (.+)",
        "rfree": r"FREE R VALUE[ ]{2,}: (.+)",
    }
    for attribute, pattern in patterns.items():
        for remark in remarks["3"]:
            matches = re.findall(pattern, remark.strip())
            if matches:
                try:
                    quality_dict[attribute] = float(matches[0].strip())
                except ValueError:
                    # The field is present but not numeric - leave the
                    # attribute unchanged.
                    pass
                break


def merge_lines(lines, start, join=" "):
    """Builds one continuous string out of a sequence of lines, by taking
    each line from a given position onwards, stripping the whitespace from
    its ends, and joining the pieces together.

    :param list lines: The lines to merge.
    :param int start: The start point in each record.
    :param str join: The string to join on.
    :rtype: ``str``"""

    pieces = (record[start:].strip() for record in lines)
    return join.join(pieces)
4 changes: 2 additions & 2 deletions atomium/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from requests import get
from .mmcif import mmcif_string_to_mmcif_dict, mmcif_dict_to_data_dict
from .mmtf import mmtf_bytes_to_mmtf_dict, mmtf_dict_to_data_dict
from .pdb import pdb_string_to_pdb_dict
from .pdb import pdb_string_to_pdb_dict, pdb_dict_to_data_dict

def open(path, *args, **kwargs):
try:
Expand Down Expand Up @@ -44,5 +44,5 @@ def get_parse_functions(filestring, path):
return {
"cif": (mmcif_string_to_mmcif_dict, mmcif_dict_to_data_dict),
"mmtf": (mmtf_bytes_to_mmtf_dict, mmtf_dict_to_data_dict),
"pdb": (pdb_string_to_pdb_dict, None)
"pdb": (pdb_string_to_pdb_dict, pdb_dict_to_data_dict)
}[ending]
23 changes: 23 additions & 0 deletions tests/integration/test_file_reading.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,3 +144,26 @@ def test_5xme_file_dict(self):
d["MODEL"][1][4],
"ATOM 5 CB ALA A 199 36.093 -8.556 -1.452 1.00 0.00 C"
)


def test_1lol_data_dict(self):
    """Opens the 1lol .pdb file as a data dictionary and checks that it has
    exactly the expected sections with the expected contents."""

    expected = {
        "description": {
            "code": "1LOL",
            "title": "CRYSTAL STRUCTURE OF OROTIDINE MONOPHOSPHATE DECARBOXYLASE COMPLEX WITH XMP",
            "deposition_date": date(2002, 5, 6),
            "classification": "LYASE",
            "keywords": ["TIM BARREL", "LYASE"],
            "authors": ["N.WU", "E.F.PAI"],
        },
        "experiment": {
            "technique": "X-RAY DIFFRACTION",
            "source_organism": "METHANOTHERMOBACTER THERMAUTOTROPHICUS STR. DELTA H",
            "expression_system": "ESCHERICHIA COLI",
        },
        "quality": {
            "resolution": 1.9, "rvalue": 0.193, "rfree": 0.229,
        },
    }
    data = atomium.open("tests/integration/files/1lol.pdb", data_dict=True)
    self.assertEqual(set(data.keys()), set(expected))
    for section in ("description", "experiment", "quality"):
        self.assertEqual(data[section], expected[section])
Loading

0 comments on commit 27a9b0b

Please sign in to comment.