-
Notifications
You must be signed in to change notification settings - Fork 117
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
11 changed files
with
5,848 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# BioPandas | ||
# Author: Sebastian Raschka <mail@sebastianraschka.com> | ||
# License: BSD 3 clause | ||
# Project Website: http://rasbt.github.io/biopandas/ | ||
# Code Repository: https://github.com/rasbt/biopandas | ||
|
||
""" | ||
BioPandas module for working with TRIPOS MOL2 | ||
files in pandas DataFrames. | ||
""" | ||
|
||
from .pandas_mol2 import PandasMOL2 | ||
|
||
__all__ = ["PandasMOL2"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
# BioPandas | ||
# Author: Sebastian Raschka <mail@sebastianraschka.com> | ||
# License: BSD 3 clause | ||
# Project Website: http://rasbt.github.io/biopandas/ | ||
# Code Repository: https://github.com/rasbt/biopandas | ||
|
||
import gzip | ||
|
||
|
||
def split_multimol2(mol2_path):
    r"""Split a multi-mol2 file into the contents of its individual molecules.

    Parameters
    ----------
    mol2_path : str
        Path to the multi-mol2 file. If the path ends with ``.gz`` the file
        is opened with ``gzip.open`` in binary mode; otherwise it is opened
        with the built-in ``open`` in text mode.

    Yields
    ------
    list
        A two-element list ``[mol2_id, mol2_lines]`` per molecule, e.g.
        ``['ID1234', ['@<TRIPOS>MOLECULE\n', '...']]``. ``mol2_id`` is the
        right-stripped line that follows the ``@<TRIPOS>MOLECULE`` record,
        and ``mol2_lines`` holds the raw lines of that molecule (including
        the record line and the ID line). For ``.gz`` input both elements
        contain bytestrings, since the lines are not decoded.

    Notes
    -----
    Lines that precede the first ``@<TRIPOS>MOLECULE`` record are discarded
    once that record is reached. The record being accumulated when the end
    of the file is reached is always yielded, even if its ID is empty
    (e.g. for an empty file, which yields ``['', []]``).
    """
    if mol2_path.endswith('.gz'):
        opener, mode, marker = gzip.open, 'rb', b'@<TRIPOS>MOLECULE'
    else:
        opener, mode, marker = open, 'r', '@<TRIPOS>MOLECULE'

    with opener(mol2_path, mode) as handle:
        record = ['', []]
        for line in handle:
            if not line.startswith(marker):
                record[1].append(line)
                continue

            # A new molecule starts: hand out the previous one (if it had
            # an ID) and begin collecting into a fresh record.
            if record[0]:
                yield record
            record = ['', []]

            # The molecule ID is the line right after the record marker.
            # StopIteration must be caught here; letting it escape from a
            # generator body would turn into a RuntimeError.
            try:
                id_line = next(handle)
            except StopIteration:
                break
            record[0] = id_line.rstrip()
            record[1].append(line)
            record[1].append(id_line)

        yield record
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,213 @@ | ||
# BioPandas | ||
# Author: Sebastian Raschka <mail@sebastianraschka.com> | ||
# License: BSD 3 clause | ||
# Project Website: http://rasbt.github.io/biopandas/ | ||
# Code Repository: https://github.com/rasbt/biopandas | ||
|
||
import pandas as pd | ||
from .mol2_io import split_multimol2 | ||
|
||
|
||
# Default layout of a 9-column Mol2 ATOM record, in file order:
# one (column name, Python type used for the DataFrame column) pair per field.
_DEFAULT_ATOM_COLUMNS = (
    ('atom_id', int),
    ('atom_name', str),
    ('x', float),
    ('y', float),
    ('z', float),
    ('atom_type', str),
    ('subst_id', int),
    ('subst_name', str),
    ('charge', float),
)

# Column names and column types as two parallel tuples.
COLUMN_NAMES = tuple(name for name, _ in _DEFAULT_ATOM_COLUMNS)

COLUMN_TYPES = tuple(dtype for _, dtype in _DEFAULT_ATOM_COLUMNS)
|
||
|
||
class PandasMOL2(object):
    """Object for working with Tripos Mol2 structure files.

    Attributes
    ----------
    df : pandas.DataFrame
        DataFrame of a Mol2's ATOM section (``None`` until a file or a
        list of lines has been read).
    mol2_text : str
        Mol2 file contents in string format.
    code : str
        ID, code, or name of the molecule stored.
    header : str
        Initialised to an empty string; not modified by the methods of
        this class.
    """

    def __init__(self):
        self._df = None
        self.mol2_text = ''
        self.header = ''
        self.code = ''

    @property
    def df(self):
        """Access the pandas DataFrame holding the ATOM section."""
        return self._df

    def _load_mol2(self, mol2_lines, mol2_code, columns):
        """Load mol2 contents into this PandasMOL2 instance.

        Sets ``mol2_text``, ``code`` and the DataFrame behind ``df``.
        ``mol2_lines`` may contain ``str`` or ``bytes`` lines; bytes are
        decoded before use.
        """
        if columns is None:
            col_names = COLUMN_NAMES
            col_types = COLUMN_TYPES
        else:
            # `columns` maps the positions 0..n-1 to (name, type) pairs, so
            # it is indexed by position rather than iterated directly.
            col_names, col_types = [], []
            for i in range(len(columns)):
                col_names.append(columns[i][0])
                col_types.append(columns[i][1])

        try:
            mol2_text = ''.join(mol2_lines)
        except TypeError:
            # Byte strings (e.g. lines read from a gzip file) cannot be
            # joined with a str separator; decode them first.
            mol2_lines = [m.decode() if isinstance(m, bytes) else m
                          for m in mol2_lines]
            mol2_text = ''.join(mol2_lines)
            # The code may legitimately be None or already a str, in which
            # case there is nothing to decode.
            if isinstance(mol2_code, bytes):
                mol2_code = mol2_code.decode()

        self.mol2_text = mol2_text
        self.code = mol2_code
        self._df = self._construct_df(mol2_lines, col_names, col_types)

    def read_mol2(self, path, columns=None):
        """Read a Mol2 file (unzipped or gzipped) from the local drive.

        Note that if the mol2 file contains more than one molecule,
        only the first molecule is loaded into the DataFrame.

        Parameters
        ----------
        path : str
            Path to the Mol2 file in .mol2 format or gzipped format
            (.mol2.gz).
        columns : dict or None (default: None)
            If None, this method expects a 9-column ATOM section that
            contains the following columns:
            {0:('atom_id', int), 1:('atom_name', str),
            2:('x', float), 3:('y', float), 4:('z', float),
            5:('atom_type', str), 6:('subst_id', int),
            7:('subst_name', str), 8:('charge', float)}
            If your Mol2 files are formatted differently, you can provide
            your own column mapping dictionary in a format similar to the
            one above. However, note that not all PandasMOL2 methods may
            be supported then.

        Returns
        -------
        self
        """
        molecules = split_multimol2(path)
        try:
            mol2_code, mol2_lines = next(molecules)
        finally:
            # Only the first molecule is needed; closing the generator
            # releases the underlying file handle right away.
            molecules.close()
        self._load_mol2(mol2_lines, mol2_code, columns)
        return self

    def read_mol2_from_list(self, mol2_lines, mol2_code, columns=None):
        r"""Read Mol2 file contents from a list of lines into a DataFrame.

        Parameters
        ----------
        mol2_lines : list
            A list of lines containing the mol2 file contents. For example,
            ['@<TRIPOS>MOLECULE\n',
            'ZINC38611810\n',
            ' 65 68 0 0 0\n',
            'SMALL\n',
            'NO_CHARGES\n',
            '\n',
            '@<TRIPOS>ATOM\n',
            ' 1 C1 -1.1786 2.7011 -4.0323 C.3 1 <0> -0.1537\n',
            ' 2 C2 -1.2950 1.2442 -3.5798 C.3 1 <0> -0.1156\n',
            ...]
        mol2_code : str or None
            Name or ID of the molecule.
        columns : dict or None (default: None)
            If None, this method expects a 9-column ATOM section that
            contains the following columns:
            {0:('atom_id', int), 1:('atom_name', str),
            2:('x', float), 3:('y', float), 4:('z', float),
            5:('atom_type', str), 6:('subst_id', int),
            7:('subst_name', str), 8:('charge', float)}
            If your Mol2 files are formatted differently, you can provide
            your own column mapping dictionary in a format similar to the
            one above. However, note that not all PandasMOL2 methods may
            be supported then.

        Returns
        -------
        self
        """
        self._load_mol2(mol2_lines, mol2_code, columns)
        return self

    def _construct_df(self, mol2_lines, col_names, col_types):
        """Construct the ATOM DataFrame from a list of Mol2 lines."""
        return self._atomsection_to_pandas(self._get_atomsection(mol2_lines),
                                           col_names=col_names,
                                           col_types=col_types)

    @staticmethod
    def _get_atomsection(mol2_lst):
        """Return the atom section from mol2 provided as list of strings.

        The section spans the lines after the ``@<TRIPOS>ATOM`` record up
        to (excluding) the next ``@<TRIPOS>`` record, or up to the end of
        the list if no further record follows.

        Raises
        ------
        ValueError
            If the list contains no ``@<TRIPOS>ATOM`` record.
        """
        first_idx = None
        # If the ATOM section is the last record, it runs to the end.
        last_idx_plus1 = len(mol2_lst)
        for idx, s in enumerate(mol2_lst):
            if s.startswith('@<TRIPOS>ATOM'):
                first_idx = idx + 1
            elif first_idx is not None and s.startswith('@<TRIPOS>'):
                last_idx_plus1 = idx
                break

        if first_idx is None:
            raise ValueError('No @<TRIPOS>ATOM record found; is the input '
                             'in the Mol2 format?')
        return mol2_lst[first_idx:last_idx_plus1]

    @staticmethod
    def _atomsection_to_pandas(mol2_atom_lst, col_names, col_types):
        """Convert the lines of an atom section into a typed DataFrame.

        Each non-blank line is split on whitespace into one row; the
        columns are named after ``col_names`` and cast to ``col_types``.
        """
        # Blank lines (e.g. trailing ones at the end of a file) carry no
        # atom data and would otherwise produce rows that cannot be cast.
        rows = [line.split() for line in mol2_atom_lst if line.strip()]
        df = pd.DataFrame(rows, columns=col_names)

        for name, dtype in zip(col_names, col_types):
            df[name] = df[name].astype(dtype)

        return df

    @staticmethod
    def rmsd(df1, df2, heavy_only=True):
        """Compute the Root Mean Square Deviation between molecules.

        Parameters
        ----------
        df1 : pandas.DataFrame
            DataFrame of a Mol2 ATOM section; must provide the columns
            'x', 'y', 'z' and (if `heavy_only` is True) 'atom_type'.
        df2 : pandas.DataFrame
            Second DataFrame for RMSD computation against df1. Must have
            the same number of entries as df1.
        heavy_only : bool (default: True)
            Which atoms to compare to compute the RMSD. If `True`
            (default), only atoms whose 'atom_type' is not 'H' are
            compared.

        Returns
        -------
        rmsd : float
            Root Mean Square Deviation between df1 and df2, rounded to
            four decimals. NaN if there are no atoms to compare.

        Raises
        ------
        AttributeError
            If df1 and df2 have a different number of rows.
        """
        if df1.shape[0] != df2.shape[0]:
            raise AttributeError('DataFrames have unequal lengths')

        if heavy_only:
            d1 = df1[df1['atom_type'] != 'H']
            d2 = df2[df2['atom_type'] != 'H']
        else:
            d1, d2 = df1, df2

        # The mean must be taken over the atoms actually compared, not
        # over all atoms of the unfiltered input.
        n_atoms = d1.shape[0]
        if n_atoms == 0:
            return float('nan')

        total = ((d1['x'] - d2['x'])**2 +
                 (d1['y'] - d2['y'])**2 +
                 (d1['z'] - d2['z'])**2)
        rmsd = round((total.sum() / n_atoms)**0.5, 4)
        return rmsd
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
@<TRIPOS>MOLECULE | ||
DCM Pose 1 | ||
32 33 0 0 0 | ||
SMALL | ||
USER_CHARGES | ||
|
||
@<TRIPOS>ATOM | ||
1 C1 18.8934 5.5819 24.1747 C.2 1 <0> -0.1356 | ||
2 C2 18.1301 4.7642 24.8969 C.2 1 <0> -0.0410 | ||
3 C3 18.2645 6.8544 23.7342 C.2 1 <0> 0.4856 | ||
4 C4 16.2520 6.2866 24.7933 C.2 1 <0> 0.8410 | ||
5 C5 15.3820 3.0682 25.1622 C.3 1 <0> 0.0000 | ||
6 C6 15.4162 1.8505 26.0566 C.3 1 <0> 0.2800 | ||
7 C7 16.7283 2.0138 26.8111 C.3 1 <0> 0.2800 | ||
8 C8 16.0764 4.1199 26.0119 C.3 1 <0> 0.5801 | ||
9 C9 17.9106 1.3823 26.0876 C.3 1 <0> 0.2800 | ||
10 N1 17.0289 7.1510 24.0411 N.2 1 <0> -0.6610 | ||
11 N2 16.8196 5.0644 25.2302 N.am 1 <0> -0.4691 | ||
12 N3 19.0194 7.7275 22.9859 N.pl3 1 <0> -0.8500 | ||
13 O1 18.7676 -2.3524 26.1510 O.3 1 <0> -1.0333 | ||
14 O2 20.3972 -0.3812 26.2318 O.3 1 <0> -1.0333 | ||
15 O3 15.0888 6.5824 25.0727 O.2 1 <0> -0.5700 | ||
16 O4 18.9314 -0.7527 24.1606 O.2 1 <0> -1.0333 | ||
17 O5 16.9690 3.4315 26.8994 O.3 1 <0> -0.5600 | ||
18 O6 14.3223 1.8946 26.9702 O.3 1 <0> -0.6800 | ||
19 O7 17.9091 -0.0135 26.3390 O.3 1 <0> -0.5512 | ||
20 P1 19.0969 -0.9440 25.6653 P.3 1 <0> 1.3712 | ||
21 H1 19.9176 5.3550 23.9105 H 1 <0> 0.1500 | ||
22 H2 18.5100 3.8155 25.2595 H 1 <0> 0.1500 | ||
23 H3 15.8520 2.8983 24.1870 H 1 <0> 0.0000 | ||
24 H4 14.3405 3.3601 24.9711 H 1 <0> 0.0000 | ||
25 H5 15.3663 0.9351 25.4839 H 1 <0> 0.0000 | ||
26 H6 16.6681 1.6130 27.8171 H 1 <0> 0.0000 | ||
27 H7 15.3483 4.6961 26.6094 H 1 <0> 0.0000 | ||
28 H8 18.8490 1.8078 26.4511 H 1 <0> 0.0000 | ||
29 H9 17.8303 1.5497 25.0110 H 1 <0> 0.0000 | ||
30 H10 19.9527 7.4708 22.7715 H 1 <0> 0.4000 | ||
31 H11 18.5977 8.5756 22.6932 H 1 <0> 0.4000 | ||
32 H12 14.2530 1.0535 27.4278 H 1 <0> 0.4000 | ||
@<TRIPOS>BOND | ||
1 1 2 2 | ||
2 1 3 1 | ||
3 2 11 1 | ||
4 3 10 2 | ||
5 3 12 1 | ||
6 4 10 1 | ||
7 4 11 am | ||
8 4 15 2 | ||
9 5 6 1 | ||
10 5 8 1 | ||
11 6 7 1 | ||
12 6 18 1 | ||
13 7 9 1 | ||
14 7 17 1 | ||
15 8 11 1 | ||
16 8 17 1 | ||
17 9 19 1 | ||
18 13 20 1 | ||
19 14 20 1 | ||
20 16 20 2 | ||
21 19 20 1 | ||
22 1 21 1 | ||
23 2 22 1 | ||
24 5 23 1 | ||
25 5 24 1 | ||
26 6 25 1 | ||
27 7 26 1 | ||
28 8 27 1 | ||
29 9 28 1 | ||
30 9 29 1 | ||
31 12 30 1 | ||
32 12 31 1 | ||
33 18 32 1 | ||
@<TRIPOS>SUBSTRUCTURE |
Oops, something went wrong.