pandasmol2 code

BioPandas · Apr 2, 2017 · c591821 · c591821
1 parent 9aa160d
commit c591821
Show file tree

Hide file tree

Showing 11 changed files with 5,848 additions and 3 deletions.
diff --git a/biopandas/__init__.py b/biopandas/__init__.py
@@ -4,5 +4,5 @@
 # Project Website: http://rasbt.github.io/biopandas/
 # Code Repository: https://github.com/rasbt/biopandas
 
-__version__ = '0.1.5'
+__version__ = '0.2.0.dev0'
 __author__ = "Sebastian Raschka <mail@sebastianraschka.com>"
diff --git a/biopandas/mol2/__init__.py b/biopandas/mol2/__init__.py
@@ -0,0 +1,14 @@
+# BioPandas
+# Author: Sebastian Raschka <mail@sebastianraschka.com>
+# License: BSD 3 clause
+# Project Website: http://rasbt.github.io/biopandas/
+# Code Repository: https://github.com/rasbt/biopandas
+
+"""
+BioPandas module for working with TRIPOS MOL2
+files in pandas DataFrames.
+"""
+
+from .pandas_mol2 import PandasMOL2
+
+__all__ = ["PandasMOL2"]
diff --git a/biopandas/mol2/mol2_io.py b/biopandas/mol2/mol2_io.py
@@ -0,0 +1,54 @@
+# BioPandas
+# Author: Sebastian Raschka <mail@sebastianraschka.com>
+# License: BSD 3 clause
+# Project Website: http://rasbt.github.io/biopandas/
+# Code Repository: https://github.com/rasbt/biopandas
+
+import gzip
+
+
+def split_multimol2(mol2_path):
+    """
+    Splits a multi-mol2 file into individual Mol2 file contents.
+
+    Parameters
+    -----------
+    mol2_path : str
+      Path to the multi-mol2 file. Parses gzip files if the filepath
+      ends on .gz.
+
+    Returns
+    -----------
+    A generator object for lists for every extracted mol2-file. Lists contain
+        the molecule ID and the mol2 file contents.
+        e.g., ['ID1234', ['@<TRIPOS>MOLECULE\n', '...']]. Note that bytestrings
+        are returned (for reasons of efficiency) if the Mol2 content is read
+        from a gzip (.gz) file.
+
+    Notes
+    -----------
+    Lines that precede the first '@<TRIPOS>MOLECULE' header (e.g., comments)
+    are discarded when a header follows them. A file without any header is
+    returned as one record with an empty ID, and an empty file yields
+    nothing. Molecules whose name line is blank are kept with an empty ID.
+
+    """
+    if mol2_path.endswith('.gz'):
+        open_file, read_mode = gzip.open, 'rb'
+        marker = b'@<TRIPOS>MOLECULE'
+    else:
+        open_file, read_mode = open, 'r'
+        marker = '@<TRIPOS>MOLECULE'
+
+    with open_file(mol2_path, read_mode) as f:
+        mol2 = ['', []]
+        # Tracks whether the current record was opened by a MOLECULE header.
+        # The ID alone cannot serve as that test since it may be blank.
+        has_header = False
+        for line in f:
+            if not line.startswith(marker):
+                mol2[1].append(line)
+                continue
+            if has_header:
+                yield mol2
+            mol2 = ['', [line]]
+            has_header = True
+            # The line right after the header holds the molecule name; the
+            # file object is its own iterator, so this advances the loop too.
+            mol2_id = next(f, None)
+            if mol2_id is None:
+                # Truncated file: the header was the very last line.
+                break
+            mol2[0] = mol2_id.rstrip()
+            mol2[1].append(mol2_id)
+        # Do not emit a placeholder record when the file had no content.
+        if has_header or mol2[1]:
+            yield mol2
diff --git a/biopandas/mol2/pandas_mol2.py b/biopandas/mol2/pandas_mol2.py
@@ -0,0 +1,213 @@
+# BioPandas
+# Author: Sebastian Raschka <mail@sebastianraschka.com>
+# License: BSD 3 clause
+# Project Website: http://rasbt.github.io/biopandas/
+# Code Repository: https://github.com/rasbt/biopandas
+
+import pandas as pd
+from .mol2_io import split_multimol2
+
+
+# Default layout of a 9-column Mol2 ATOM record as (name, dtype) pairs,
+# listed in the order in which the fields appear on each line.
+_ATOM_COLUMNS = (
+    ('atom_id', int),
+    ('atom_name', str),
+    ('x', float),
+    ('y', float),
+    ('z', float),
+    ('atom_type', str),
+    ('subst_id', int),
+    ('subst_name', str),
+    ('charge', float),
+)
+
+# Column labels used for the ATOM section DataFrame.
+COLUMN_NAMES = tuple(name for name, _ in _ATOM_COLUMNS)
+
+# Types the corresponding columns are cast to, aligned with COLUMN_NAMES.
+COLUMN_TYPES = tuple(dtype for _, dtype in _ATOM_COLUMNS)
+
+
+class PandasMOL2(object):
+    """ Object for working with Tripos Mol2 structure files.
+
+    Attributes
+    ----------
+    df : pandas.DataFrame
+        DataFrame of a Mol2's ATOM section
+
+    mol2_text : str
+        Mol2 file contents in string format
+
+    code : str
+        ID, code, or name of the molecule stored
+
+    """
+    def __init__(self):
+        self._df = None
+        self.mol2_text = ''
+        self.header = ''
+        self.code = ''
+
+    @property
+    def df(self):
+        """Accesses the pandas DataFrame"""
+        return self._df
+
+    def _load_mol2(self, mol2_lines, mol2_code, columns):
+        """Load mol2 contents into this PandasMOL2 instance"""
+        if columns is None:
+            col_names = COLUMN_NAMES
+            col_types = COLUMN_TYPES
+        else:
+            # `columns` is indexed by position 0..n-1; each entry is a
+            # (column name, column type) pair.
+            specs = [columns[i] for i in range(len(columns))]
+            col_names = [spec[0] for spec in specs]
+            col_types = [spec[1] for spec in specs]
+
+        try:
+            self.mol2_text = ''.join(mol2_lines)
+        except TypeError:
+            # Bytestrings (e.g., read from a gzip file) cannot be joined
+            # with a str separator; decode them first.
+            mol2_lines = [m.decode() for m in mol2_lines]
+            self.mol2_text = ''.join(mol2_lines)
+            # The code may already be a str (or None) even though the
+            # lines are bytes, so only decode it when necessary.
+            if isinstance(mol2_code, bytes):
+                mol2_code = mol2_code.decode()
+        self.code = mol2_code
+
+        self._df = self._construct_df(mol2_lines, col_names, col_types)
+
+    def read_mol2(self, path, columns=None):
+        """Reads Mol2 files (unzipped or gzipped) from local drive
+
+        Note that if your mol2 file contains more than one molecule,
+        only the first molecule is loaded into the DataFrame
+
+        Parameters
+        ----------
+        path : str
+            Path to the Mol2 file in .mol2 format or gzipped format (.mol2.gz)
+
+        columns : dict or None (default: None)
+            If None, this methods expects a 9-column ATOM section that contains
+            the following columns:
+
+            {0:('atom_id', int), 1:('atom_name', str),
+             2:('x', float), 3:('y', float), 4:('z', float),
+             5:('atom_type', str), 6:('subst_id', int),
+             7:('subst_name', str), 8:('charge', float)}
+
+            If your Mol2 files are formatted differently, you can provide your
+            own column_mapping dictionary in a format similar to the one above.
+            However, note that not all PandasMOL2 methods
+            may be supported then.
+
+        Returns
+        ---------
+        self
+
+        Raises
+        ---------
+        ValueError
+            If no Mol2 record could be read from `path`.
+
+        """
+        records = split_multimol2(path)
+        try:
+            mol2_code, mol2_lines = next(records)
+        except StopIteration:
+            raise ValueError('No Mol2 record found in %s' % path)
+        finally:
+            # Only the first record is needed; closing the generator
+            # releases the underlying file handle right away.
+            records.close()
+        self._load_mol2(mol2_lines, mol2_code, columns)
+        return self
+
+    def read_mol2_from_list(self, mol2_lines, mol2_code, columns=None):
+        """Reads Mol2 file from a list into DataFrames
+
+        Parameters
+        ----------
+        mol2_lines : list
+            A list of lines containing the mol2 file contents. For example,
+            ['@<TRIPOS>MOLECULE\n',
+             'ZINC38611810\n',
+             '   65    68     0     0     0\n',
+             'SMALL\n',
+             'NO_CHARGES\n',
+             '\n',
+             '@<TRIPOS>ATOM\n',
+             '      1 C1  -1.1786  2.7011  -4.0323 C.3  1 <0>   -0.1537\n',
+             '      2 C2  -1.2950  1.2442  -3.5798 C.3  1 <0>   -0.1156\n',
+             ...]
+
+        mol2_code : str or None
+            Name or ID of the molecule.
+
+        columns : dict or None (default: None)
+            If None, this methods expects a 9-column ATOM section that contains
+            the following columns:
+            {0:('atom_id', int), 1:('atom_name', str),
+             2:('x', float), 3:('y', float), 4:('z', float),
+             5:('atom_type', str), 6:('subst_id', int),
+             7:('subst_name', str), 8:('charge', float)}
+            If your Mol2 files are formatted differently, you can provide your
+            own column_mapping dictionary in a format similar to the one above.
+            However, note that not all PandasMOL2 methods may be
+            supported then.
+
+        Returns
+        ---------
+        self
+
+        """
+
+        self._load_mol2(mol2_lines, mol2_code, columns)
+        return self
+
+    def _construct_df(self, mol2_lines, col_names, col_types):
+        """Construct DataFrame from list of Mol2 lines."""
+        return self._atomsection_to_pandas(self._get_atomsection(mol2_lines),
+                                           col_names=col_names,
+                                           col_types=col_types)
+
+    @staticmethod
+    def _get_atomsection(mol2_lst):
+        """Returns atom section from mol2 provided as list of strings
+
+        The section runs from the line after '@<TRIPOS>ATOM' up to the next
+        '@<TRIPOS>' record, or to the end of the list if the ATOM section
+        is the last one. An empty list is returned if there is no ATOM
+        section at all.
+        """
+        first_idx = None
+        # Default to the end of the list so that a trailing ATOM section
+        # (not followed by another record) is still returned.
+        last_idx_plus1 = len(mol2_lst)
+        for idx, s in enumerate(mol2_lst):
+            if s.startswith('@<TRIPOS>ATOM'):
+                first_idx = idx + 1
+            elif first_idx is not None and s.startswith('@<TRIPOS>'):
+                last_idx_plus1 = idx
+                break
+        if first_idx is None:
+            return []
+        return mol2_lst[first_idx:last_idx_plus1]
+
+    @staticmethod
+    def _atomsection_to_pandas(mol2_atom_lst, col_names, col_types):
+        """Converts the lines of an ATOM section into a typed DataFrame
+
+        Each line is split on whitespace into one row; every column is then
+        cast to the type given at the same position in `col_types`.
+        """
+        # Blank lines (e.g., trailing newlines after the last atom) would
+        # otherwise become all-None rows and break the type conversion.
+        rows = [line.split() for line in mol2_atom_lst if line.strip()]
+        df = pd.DataFrame(rows, columns=list(col_names))
+
+        for name, col_type in zip(col_names, col_types):
+            df[name] = df[name].astype(col_type)
+
+        return df
+
+    @staticmethod
+    def rmsd(df1, df2, heavy_only=True):
+        """Compute the Root Mean Square Deviation between molecules
+
+        Parameters
+        ----------
+        df1 : pandas.DataFrame
+            DataFrame of a Mol2 ATOM section with at least the columns
+            'x', 'y', 'z' (and 'atom_type' if `heavy_only` is True)
+        df2 : pandas.DataFrame
+            Second DataFrame for RMSD computation against df1. Must have the
+            same number of entries as df1
+        heavy_only : bool (default: True)
+            Which atoms to compare to compute the RMSD. If `True` (default),
+            computes the RMSD between non-hydrogen atoms only, i.e., atoms
+            whose 'atom_type' is not 'H'.
+
+        Returns
+        ---------
+        rmsd : float
+            Root Mean Square Deviation between df1 and df2
+
+        Raises
+        ---------
+        AttributeError
+            If df1 and df2 have a different number of rows.
+
+        """
+        if df1.shape[0] != df2.shape[0]:
+            raise AttributeError('DataFrames have unequal lengths')
+
+        if heavy_only:
+            d1 = df1[df1['atom_type'] != 'H']
+            d2 = df2[df2['atom_type'] != 'H']
+        else:
+            d1, d2 = df1, df2
+
+        total = ((d1['x'] - d2['x'])**2 +
+                 (d1['y'] - d2['y'])**2 +
+                 (d1['z'] - d2['z'])**2)
+        # Average over the atoms that were actually compared; dividing by
+        # the unfiltered atom count would underestimate the heavy-atom RMSD.
+        n_atoms = d1.shape[0]
+        rmsd = round((total.sum() / n_atoms)**0.5, 4)
+        return rmsd
diff --git a/biopandas/mol2/tests/data/1b5e_1.mol2 b/biopandas/mol2/tests/data/1b5e_1.mol2
@@ -0,0 +1,74 @@
+@<TRIPOS>MOLECULE
+DCM Pose 1
+   32    33     0     0     0
+SMALL
+USER_CHARGES
+
+@<TRIPOS>ATOM
+      1 C1         18.8934    5.5819   24.1747 C.2       1 <0>       -0.1356 
+      2 C2         18.1301    4.7642   24.8969 C.2       1 <0>       -0.0410 
+      3 C3         18.2645    6.8544   23.7342 C.2       1 <0>        0.4856 
+      4 C4         16.2520    6.2866   24.7933 C.2       1 <0>        0.8410 
+      5 C5         15.3820    3.0682   25.1622 C.3       1 <0>        0.0000 
+      6 C6         15.4162    1.8505   26.0566 C.3       1 <0>        0.2800 
+      7 C7         16.7283    2.0138   26.8111 C.3       1 <0>        0.2800 
+      8 C8         16.0764    4.1199   26.0119 C.3       1 <0>        0.5801 
+      9 C9         17.9106    1.3823   26.0876 C.3       1 <0>        0.2800 
+     10 N1         17.0289    7.1510   24.0411 N.2       1 <0>       -0.6610 
+     11 N2         16.8196    5.0644   25.2302 N.am      1 <0>       -0.4691 
+     12 N3         19.0194    7.7275   22.9859 N.pl3     1 <0>       -0.8500 
+     13 O1         18.7676   -2.3524   26.1510 O.3       1 <0>       -1.0333 
+     14 O2         20.3972   -0.3812   26.2318 O.3       1 <0>       -1.0333 
+     15 O3         15.0888    6.5824   25.0727 O.2       1 <0>       -0.5700 
+     16 O4         18.9314   -0.7527   24.1606 O.2       1 <0>       -1.0333 
+     17 O5         16.9690    3.4315   26.8994 O.3       1 <0>       -0.5600 
+     18 O6         14.3223    1.8946   26.9702 O.3       1 <0>       -0.6800 
+     19 O7         17.9091   -0.0135   26.3390 O.3       1 <0>       -0.5512 
+     20 P1         19.0969   -0.9440   25.6653 P.3       1 <0>        1.3712 
+     21 H1         19.9176    5.3550   23.9105 H         1 <0>        0.1500 
+     22 H2         18.5100    3.8155   25.2595 H         1 <0>        0.1500 
+     23 H3         15.8520    2.8983   24.1870 H         1 <0>        0.0000 
+     24 H4         14.3405    3.3601   24.9711 H         1 <0>        0.0000 
+     25 H5         15.3663    0.9351   25.4839 H         1 <0>        0.0000 
+     26 H6         16.6681    1.6130   27.8171 H         1 <0>        0.0000 
+     27 H7         15.3483    4.6961   26.6094 H         1 <0>        0.0000 
+     28 H8         18.8490    1.8078   26.4511 H         1 <0>        0.0000 
+     29 H9         17.8303    1.5497   25.0110 H         1 <0>        0.0000 
+     30 H10        19.9527    7.4708   22.7715 H         1 <0>        0.4000 
+     31 H11        18.5977    8.5756   22.6932 H         1 <0>        0.4000 
+     32 H12        14.2530    1.0535   27.4278 H         1 <0>        0.4000 
+@<TRIPOS>BOND
+    1     1     2 2
+    2     1     3 1
+    3     2    11 1
+    4     3    10 2
+    5     3    12 1
+    6     4    10 1
+    7     4    11 am
+    8     4    15 2
+    9     5     6 1
+   10     5     8 1
+   11     6     7 1
+   12     6    18 1
+   13     7     9 1
+   14     7    17 1
+   15     8    11 1
+   16     8    17 1
+   17     9    19 1
+   18    13    20 1
+   19    14    20 1
+   20    16    20 2
+   21    19    20 1
+   22     1    21 1
+   23     2    22 1
+   24     5    23 1
+   25     5    24 1
+   26     6    25 1
+   27     7    26 1
+   28     8    27 1
+   29     9    28 1
+   30     9    29 1
+   31    12    30 1
+   32    12    31 1
+   33    18    32 1
+@<TRIPOS>SUBSTRUCTURE