Skip to content

Commit

Permalink
pandasmol2 code
Browse files Browse the repository at this point in the history
  • Loading branch information
rasbt committed Apr 2, 2017
1 parent 9aa160d commit c591821
Show file tree
Hide file tree
Showing 11 changed files with 5,848 additions and 3 deletions.
2 changes: 1 addition & 1 deletion biopandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@
# Project Website: http://rasbt.github.io/biopandas/
# Code Repository: https://github.com/rasbt/biopandas

# Package version string. Two assignments appear here; the later one
# overrides the earlier, so the effective value is '0.2.0.dev0'.
__version__ = '0.1.5'
__version__ = '0.2.0.dev0'
# Package author and contact address.
__author__ = "Sebastian Raschka <mail@sebastianraschka.com>"
14 changes: 14 additions & 0 deletions biopandas/mol2/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# BioPandas
# Author: Sebastian Raschka <mail@sebastianraschka.com>
# License: BSD 3 clause
# Project Website: http://rasbt.github.io/biopandas/
# Code Repository: https://github.com/rasbt/biopandas

"""
BioPandas module for working with TRIPOS MOL2
files in pandas DataFrames.
"""

from .pandas_mol2 import PandasMOL2

# Public API of this subpackage: only the PandasMOL2 class is exported.
__all__ = ["PandasMOL2"]
54 changes: 54 additions & 0 deletions biopandas/mol2/mol2_io.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# BioPandas
# Author: Sebastian Raschka <mail@sebastianraschka.com>
# License: BSD 3 clause
# Project Website: http://rasbt.github.io/biopandas/
# Code Repository: https://github.com/rasbt/biopandas

import gzip


def split_multimol2(mol2_path):
    """Split a multi-mol2 file into individual Mol2 file contents.

    Parameters
    ----------
    mol2_path : str
        Path to the multi-mol2 file. The file is opened with gzip if the
        path ends with '.gz', otherwise as a plain text file.

    Yields
    ------
    list
        A two-item list ``[molecule_id, lines]`` for every molecule
        record, e.g. ``['ID1234', ['@<TRIPOS>MOLECULE\\n', '...']]``.
        ``molecule_id`` is the line following the ``@<TRIPOS>MOLECULE``
        tag with trailing whitespace removed; ``lines`` holds all lines
        of the record, including the tag line and the ID line.
        Note that bytestrings are produced (for reasons of efficiency)
        if the Mol2 content is read from a gzip (.gz) file.

    Notes
    -----
    Lines that precede the first ``@<TRIPOS>MOLECULE`` tag are discarded
    once a tag is found. If the file contains no tag at all, a single
    item ``['', all_lines]`` is yielded (``['', []]`` for an empty file).
    """
    if mol2_path.endswith('.gz'):
        open_file, read_mode = gzip.open, 'rb'
        marker = b'@<TRIPOS>MOLECULE'
    else:
        open_file, read_mode = open, 'r'
        marker = '@<TRIPOS>MOLECULE'

    with open_file(mol2_path, read_mode) as f:
        mol2 = ['', []]
        # Track record state explicitly: using the molecule name as the
        # "has a record" flag would drop molecules whose name line is blank.
        in_record = False
        for line in f:
            if not line.startswith(marker):
                mol2[1].append(line)
                continue

            if in_record:
                yield mol2
            in_record = True
            mol2 = ['', [line]]

            # The line right after the tag is the molecule name/ID; it may
            # be missing if the file is truncated after the tag.
            mol2_id = next(f, None)
            if mol2_id is not None:
                mol2[0] = mol2_id.rstrip()
                mol2[1].append(mol2_id)

        yield mol2
213 changes: 213 additions & 0 deletions biopandas/mol2/pandas_mol2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
# BioPandas
# Author: Sebastian Raschka <mail@sebastianraschka.com>
# License: BSD 3 clause
# Project Website: http://rasbt.github.io/biopandas/
# Code Repository: https://github.com/rasbt/biopandas

import pandas as pd
from .mol2_io import split_multimol2


# Default layout of a Mol2 ATOM record: one (column name, Python type)
# pair per whitespace-separated field, in file order.
_DEFAULT_ATOM_FIELDS = (
    ('atom_id', int),
    ('atom_name', str),
    ('x', float),
    ('y', float),
    ('z', float),
    ('atom_type', str),
    ('subst_id', int),
    ('subst_name', str),
    ('charge', float),
)

# Column labels used for the ATOM-section DataFrame when no custom
# mapping is supplied.
COLUMN_NAMES = tuple(name for name, _ in _DEFAULT_ATOM_FIELDS)

# Types the corresponding columns are cast to, in the same order.
COLUMN_TYPES = tuple(dtype for _, dtype in _DEFAULT_ATOM_FIELDS)


class PandasMOL2(object):
    """Object for working with Tripos Mol2 structure files.

    Attributes
    ----------
    df : pandas.DataFrame
        DataFrame of a Mol2's ATOM section
    mol2_text : str
        Mol2 file contents in string format
    code : str
        ID, code, or name of the molecule stored
    header : str
        Initialized to an empty string; not populated by the readers
        of this class.
    """

    def __init__(self):
        self._df = None
        self.mol2_text = ''
        self.header = ''
        self.code = ''

    @property
    def df(self):
        """Access the pandas DataFrame holding the ATOM section."""
        return self._df

    def _load_mol2(self, mol2_lines, mol2_code, columns):
        """Load mol2 contents into this PandasMOL2 instance.

        Sets ``mol2_text``, ``code`` and the DataFrame behind ``df``.
        Bytes lines and a bytes molecule code (as produced when reading
        gzipped files) are decoded to ``str`` independently of each other.
        """
        if columns is None:
            col_names, col_types = COLUMN_NAMES, COLUMN_TYPES
        else:
            # `columns` is indexed by position 0..n-1; each entry is a
            # (column name, type) pair.
            col_names = [columns[i][0] for i in range(len(columns))]
            col_types = [columns[i][1] for i in range(len(columns))]

        mol2_lines = [m.decode() if isinstance(m, bytes) else m
                      for m in mol2_lines]
        if isinstance(mol2_code, bytes):
            mol2_code = mol2_code.decode()

        self.mol2_text = ''.join(mol2_lines)
        self.code = mol2_code
        self._df = self._construct_df(mol2_lines, col_names, col_types)

    def read_mol2(self, path, columns=None):
        """Read a Mol2 file (unzipped or gzipped) from local drive.

        Note that if your mol2 file contains more than one molecule,
        only the first molecule is loaded into the DataFrame.

        Parameters
        ----------
        path : str
            Path to the Mol2 file in .mol2 format or gzipped format
            (.mol2.gz)
        columns : dict or None (default: None)
            If None, this method expects a 9-column ATOM section that
            contains the following columns:
            {0:('atom_id', int), 1:('atom_name', str),
             2:('x', float), 3:('y', float), 4:('z', float),
             5:('atom_type', str), 6:('subst_id', int),
             7:('subst_name', str), 8:('charge', float)}
            If your Mol2 files are formatted differently, you can provide
            your own column mapping dictionary in a format similar to the
            one above. However, note that not all PandasMOL2 methods may
            be supported then.

        Returns
        -------
        self
        """
        molecules = split_multimol2(path)
        try:
            mol2_code, mol2_lines = next(molecules)
        finally:
            # Only the first molecule is needed; closing the generator
            # releases the underlying file handle right away.
            molecules.close()
        self._load_mol2(mol2_lines, mol2_code, columns)
        return self

    def read_mol2_from_list(self, mol2_lines, mol2_code, columns=None):
        """Read Mol2 file contents from a list of lines into a DataFrame.

        Parameters
        ----------
        mol2_lines : list
            A list of lines containing the mol2 file contents. For example,
            ['@<TRIPOS>MOLECULE\\n',
             'ZINC38611810\\n',
             '   65    68     0     0     0\\n',
             'SMALL\\n',
             'NO_CHARGES\\n',
             '\\n',
             '@<TRIPOS>ATOM\\n',
             '  1 C1  -1.1786  2.7011  -4.0323  C.3  1  <0>  -0.1537\\n',
             '  2 C2  -1.2950  1.2442  -3.5798  C.3  1  <0>  -0.1156\\n',
             ...]
        mol2_code : str or None
            Name or ID of the molecule.
        columns : dict or None (default: None)
            If None, this method expects a 9-column ATOM section that
            contains the following columns:
            {0:('atom_id', int), 1:('atom_name', str),
             2:('x', float), 3:('y', float), 4:('z', float),
             5:('atom_type', str), 6:('subst_id', int),
             7:('subst_name', str), 8:('charge', float)}
            If your Mol2 files are formatted differently, you can provide
            your own column mapping dictionary in a format similar to the
            one above. However, note that not all PandasMOL2 methods may
            be supported then.

        Returns
        -------
        self
        """
        self._load_mol2(mol2_lines, mol2_code, columns)
        return self

    def _construct_df(self, mol2_lines, col_names, col_types):
        """Construct the ATOM-section DataFrame from a list of Mol2 lines."""
        return self._atomsection_to_pandas(self._get_atomsection(mol2_lines),
                                           col_names=col_names,
                                           col_types=col_types)

    @staticmethod
    def _get_atomsection(mol2_lst):
        """Return the atom section from mol2 provided as list of strings.

        The section spans the lines after the '@<TRIPOS>ATOM' tag up to
        (not including) the next '@<TRIPOS>' tag, or up to the end of the
        list if the ATOM section is the last one. An empty list is returned
        if there is no ATOM section.
        """
        first_idx = None
        last_idx_plus1 = len(mol2_lst)
        for idx, s in enumerate(mol2_lst):
            if s.startswith('@<TRIPOS>ATOM'):
                first_idx = idx + 1
            elif first_idx is not None and s.startswith('@<TRIPOS>'):
                last_idx_plus1 = idx
                break
        if first_idx is None:
            return []
        return mol2_lst[first_idx:last_idx_plus1]

    @staticmethod
    def _atomsection_to_pandas(mol2_atom_lst, col_names, col_types):
        """Convert ATOM-section lines into a typed DataFrame.

        Each non-blank line is split on whitespace into one row; every
        column is then cast to the matching entry of `col_types`.
        """
        # Blank lines (e.g., trailing newlines after the last record) carry
        # no atom data and would otherwise produce empty rows.
        rows = [fields
                for fields in (line.split() for line in mol2_atom_lst)
                if fields]
        df = pd.DataFrame(rows, columns=list(col_names))

        for name, dtype in zip(col_names, col_types):
            df[name] = df[name].astype(dtype)

        return df

    @staticmethod
    def rmsd(df1, df2, heavy_only=True):
        """Compute the Root Mean Square Deviation between molecules.

        Parameters
        ----------
        df1 : pandas.DataFrame
            DataFrame of a Mol2 ATOM section; must provide the columns
            'x', 'y', 'z' and (if `heavy_only` is True) 'atom_type'.
        df2 : pandas.DataFrame
            Second DataFrame for RMSD computation against df1. Must have
            the same number of entries as df1.
        heavy_only : bool (default: True)
            Which atoms to compare to compute the RMSD. If `True`
            (default), computes the RMSD between non-hydrogen atoms only
            (rows whose 'atom_type' is not 'H').

        Returns
        -------
        rmsd : float
            Root Mean Square Deviation between df1 and df2, rounded to
            4 decimals.

        Raises
        ------
        AttributeError
            If df1 and df2 have a different number of rows.
        """
        if df1.shape[0] != df2.shape[0]:
            raise AttributeError('DataFrames have unequal lengths')

        if heavy_only:
            d1 = df1[df1['atom_type'] != 'H']
            d2 = df2[df2['atom_type'] != 'H']
        else:
            d1, d2 = df1, df2

        total = ((d1['x'] - d2['x'])**2 +
                 (d1['y'] - d2['y'])**2 +
                 (d1['z'] - d2['z'])**2)
        # Average over the atoms actually compared, not over all atoms of
        # the unfiltered frame.
        rmsd = round((total.sum() / d1.shape[0])**0.5, 4)
        return rmsd
74 changes: 74 additions & 0 deletions biopandas/mol2/tests/data/1b5e_1.mol2
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
@<TRIPOS>MOLECULE
DCM Pose 1
32 33 0 0 0
SMALL
USER_CHARGES

@<TRIPOS>ATOM
1 C1 18.8934 5.5819 24.1747 C.2 1 <0> -0.1356
2 C2 18.1301 4.7642 24.8969 C.2 1 <0> -0.0410
3 C3 18.2645 6.8544 23.7342 C.2 1 <0> 0.4856
4 C4 16.2520 6.2866 24.7933 C.2 1 <0> 0.8410
5 C5 15.3820 3.0682 25.1622 C.3 1 <0> 0.0000
6 C6 15.4162 1.8505 26.0566 C.3 1 <0> 0.2800
7 C7 16.7283 2.0138 26.8111 C.3 1 <0> 0.2800
8 C8 16.0764 4.1199 26.0119 C.3 1 <0> 0.5801
9 C9 17.9106 1.3823 26.0876 C.3 1 <0> 0.2800
10 N1 17.0289 7.1510 24.0411 N.2 1 <0> -0.6610
11 N2 16.8196 5.0644 25.2302 N.am 1 <0> -0.4691
12 N3 19.0194 7.7275 22.9859 N.pl3 1 <0> -0.8500
13 O1 18.7676 -2.3524 26.1510 O.3 1 <0> -1.0333
14 O2 20.3972 -0.3812 26.2318 O.3 1 <0> -1.0333
15 O3 15.0888 6.5824 25.0727 O.2 1 <0> -0.5700
16 O4 18.9314 -0.7527 24.1606 O.2 1 <0> -1.0333
17 O5 16.9690 3.4315 26.8994 O.3 1 <0> -0.5600
18 O6 14.3223 1.8946 26.9702 O.3 1 <0> -0.6800
19 O7 17.9091 -0.0135 26.3390 O.3 1 <0> -0.5512
20 P1 19.0969 -0.9440 25.6653 P.3 1 <0> 1.3712
21 H1 19.9176 5.3550 23.9105 H 1 <0> 0.1500
22 H2 18.5100 3.8155 25.2595 H 1 <0> 0.1500
23 H3 15.8520 2.8983 24.1870 H 1 <0> 0.0000
24 H4 14.3405 3.3601 24.9711 H 1 <0> 0.0000
25 H5 15.3663 0.9351 25.4839 H 1 <0> 0.0000
26 H6 16.6681 1.6130 27.8171 H 1 <0> 0.0000
27 H7 15.3483 4.6961 26.6094 H 1 <0> 0.0000
28 H8 18.8490 1.8078 26.4511 H 1 <0> 0.0000
29 H9 17.8303 1.5497 25.0110 H 1 <0> 0.0000
30 H10 19.9527 7.4708 22.7715 H 1 <0> 0.4000
31 H11 18.5977 8.5756 22.6932 H 1 <0> 0.4000
32 H12 14.2530 1.0535 27.4278 H 1 <0> 0.4000
@<TRIPOS>BOND
1 1 2 2
2 1 3 1
3 2 11 1
4 3 10 2
5 3 12 1
6 4 10 1
7 4 11 am
8 4 15 2
9 5 6 1
10 5 8 1
11 6 7 1
12 6 18 1
13 7 9 1
14 7 17 1
15 8 11 1
16 8 17 1
17 9 19 1
18 13 20 1
19 14 20 1
20 16 20 2
21 19 20 1
22 1 21 1
23 2 22 1
24 5 23 1
25 5 24 1
26 6 25 1
27 7 26 1
28 8 27 1
29 9 28 1
30 9 29 1
31 12 30 1
32 12 31 1
33 18 32 1
@<TRIPOS>SUBSTRUCTURE
Loading

0 comments on commit c591821

Please sign in to comment.