Skip to content

Commit

Permalink
get interface
Browse files Browse the repository at this point in the history
  • Loading branch information
rasbt committed Nov 23, 2015
1 parent 61baaba commit d0f93d4
Show file tree
Hide file tree
Showing 48 changed files with 19,620 additions and 120 deletions.
18 changes: 5 additions & 13 deletions biopandas/__init__.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,8 @@
"""
BioPandas
Author: Sebastian Raschka <mail@sebastianraschka.com>
License: BSD 3 clause
Project Website: http://rasbt.github.io/biopandas/
Code Repository: https://github.com/rasbt/biopandas
"""


from .pandas_pdb import PandasPDB
# BioPandas
# Author: Sebastian Raschka <mail@sebastianraschka.com>
# License: BSD 3 clause
# Project Website: http://rasbt.github.io/biopandas/
# Code Repository: https://github.com/rasbt/biopandas

__version__ = '0.1.0'
__author__ = "Sebastian Raschka <mail@sebastianraschka.com>"

__all__ = ["PandasPDB"]
14 changes: 14 additions & 0 deletions biopandas/pdb/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# BioPandas
# Author: Sebastian Raschka <mail@sebastianraschka.com>
# License: BSD 3 clause
# Project Website: http://rasbt.github.io/biopandas/
# Code Repository: https://github.com/rasbt/biopandas

"""
BioPandas module for working with Protein Data Bank (PDB)
files in pandas DataFrames.
"""

from .pandas_pdb import PandasPDB

__all__ = ["PandasPDB"]
13 changes: 5 additions & 8 deletions biopandas/engines.py → biopandas/pdb/engines.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
"""
BioPandas
Author: Sebastian Raschka <mail@sebastianraschka.com>
License: BSD 3 clause
Project Website: http://rasbt.github.io/biopandas/
Code Repository: https://github.com/rasbt/biopandas
"""
# BioPandas
# Author: Sebastian Raschka <mail@sebastianraschka.com>
# License: BSD 3 clause
# Project Website: http://rasbt.github.io/biopandas/
# Code Repository: https://github.com/rasbt/biopandas

import pandas as pd

Expand Down
114 changes: 59 additions & 55 deletions biopandas/pandas_pdb.py → biopandas/pdb/pandas_pdb.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
"""
BioPandas
Author: Sebastian Raschka <mail@sebastianraschka.com>
License: BSD 3 clause
Project Website: http://rasbt.github.io/biopandas/
Code Repository: https://github.com/rasbt/biopandas
"""
# BioPandas
# Author: Sebastian Raschka <mail@sebastianraschka.com>
# License: BSD 3 clause
# Project Website: http://rasbt.github.io/biopandas/
# Code Repository: https://github.com/rasbt/biopandas

import pandas as pd
import numpy as np
Expand All @@ -32,23 +29,17 @@ class PandasPDB(object):
pdb_text : str
PDB file contents in raw text format
title : str
header : str
PDB file description
code : str
PDB code
Examples
--------
>>> ppdb = PandasPDB()
>>> ppdb.fetch_pdb('3eiy')
>>> ppdb.df['ATOM'].head()
"""
def __init__(self):
self._df = {}
self.pdb_text = ''
self.title = ''
self.header = ''
self.code = ''
self._get_dict = {}

Expand All @@ -68,7 +59,7 @@ def read_pdb(self, path):
"""
self.pdb_text = self._read_pdb(path=path)
self._df = self._construct_df(pdb_lines=self.pdb_text.splitlines(True))
self.title, self.code = self._parse_title_code()
self.header, self.code = self._parse_header_code()

def fetch_pdb(self, pdb_code):
"""Fetches PDB file contents from the Protein Databank at rcsb.org.
Expand All @@ -82,18 +73,22 @@ def fetch_pdb(self, pdb_code):
self.pdb_text = self._fetch_pdb(pdb_code)
self._df = self._construct_df(pdb_lines=self.pdb_text.splitlines(True))

def get(self, s, df=None):
def get(self, s, df=None, invert=False):
"""Filter PDB DataFrames by properties
Parameters
----------
s : str {'main chain', 'hydrogen', 'no hydrogen', 'c-alpha'}
s : str in {'main chain', 'hydrogen', 'c-alpha'}
String to specify which entries to return
df : pandas.DataFrame (default : None)
df : pandas.DataFrame, default: None
Optional DataFrame to perform the filter operation on.
If df=None, filters on self.df['ATOM']
invert : bool, default: False
Inverts the search query. For example if s='hydrogen' and
invert=True, all but hydrogen entries are returned
Returns
--------
df : pandas.DataFrame
Expand All @@ -106,10 +101,10 @@ def get(self, s, df=None):
raise AttributeError('s must be in %s' % self._get_dict.keys())
if not df:
df = self._df['ATOM']
return self._get_dict[s](df)
return self._get_dict[s](df, invert=invert)

@staticmethod
def rmsd(df1, df2, s='no hydrogen'):
def rmsd(df1, df2, s='main chain', invert=False):
"""Compute the Root Mean Square Deviation between molecules.
Parameters
Expand All @@ -121,8 +116,13 @@ def rmsd(df1, df2, s='no hydrogen'):
Second DataFrame for RMSD computation against df1. Must have the
same number of entries as df1
s : str {'main chain', 'hydrogen', 'no hydrogen', 'c-alpha'}
String to specify which entries to consider
s : str in {'main chain', 'hydrogen', 'c-alpha'}, default: 'main chain'
String to specify which entries to consider.
invert : bool, default: False
Inverts the string query if true. For example, the setting
`s='hydrogen', invert=True` computes the RMSD based on all
but hydrogen atoms.
Returns
---------
Expand All @@ -136,8 +136,8 @@ def rmsd(df1, df2, s='no hydrogen'):
if s:
if s not in get_dict.keys():
raise AttributeError('s must be in %s or None' % get_dict.keys())
df1 = get_dict[s](df1)
df2 = get_dict[s](df2)
df1 = get_dict[s](df1, invert=invert)
df2 = get_dict[s](df2, invert=invert)

total = ((df1['x_coord'] - df2['x_coord'])**2 +
(df1['y_coord'] - df2['y_coord'])**2 +
Expand All @@ -147,17 +147,16 @@ def rmsd(df1, df2, s='no hydrogen'):


@staticmethod
"""Initialize dictionary for filter operations."""
def _init_get_dict():
"""Initialize dictionary for filter operations."""
get_dict = {'main chain': PandasPDB._get_mainchain,
'hydrogen': PandasPDB._get_hydrogen,
'no hydrogen': PandasPDB._get_no_hydrogen,
'c-alpha': PandasPDB._get_calpha}
return get_dict

@staticmethod
"""Read PDB file from local drive."""
def _read_pdb(path):
"""Read PDB file from local drive."""
r_mode = 'r'
openf = open
if path.endswith('.gz'):
Expand All @@ -173,8 +172,8 @@ def _read_pdb(path):
return txt

@staticmethod
"""Load PDB file from rcsb.org."""
def _fetch_pdb(pdb_code):
"""Load PDB file from rcsb.org."""
txt = None
try:
response = urlopen('http://www.rcsb.org/pdb/files/%s.pdb' % pdb_code.lower())
Expand All @@ -189,46 +188,51 @@ def _fetch_pdb(pdb_code):
print('URL Error %s' %e.args)
return txt

def _parse_title_code(self):
"""Extract title information and PDB code."""
code, title = '', ''
def _parse_header_code(self):
"""Extract header information and PDB code."""
code, header = '', ''
if 'OTHERS' in self.df:

header = self.df['OTHERS'][self.df['OTHERS']['record_name'] == 'HEADER']
if not header.empty:
title = header['entry'].values[0]
s = title.split()
header = header['entry'].values[0]
s = header.split()
if s:
code = s[-1].lower()
return title, code
return header, code


@staticmethod
def _get_mainchain(df):
def _get_mainchain(df, invert):
"""Return main chain atom entries from a DataFrame (all other entries if invert is True)"""
mc = df[(df['atom_name'] == 'C') |
(df['atom_name'] == 'O') |
(df['atom_name'] == 'N') |
(df['atom_name'] == 'CA')]
if invert:
mc = df[(df['atom_name'] != 'C') &
(df['atom_name'] != 'O') &
(df['atom_name'] != 'N') &
(df['atom_name'] != 'CA')]
else:
mc = df[ (df['atom_name'] == 'C') |
(df['atom_name'] == 'O') |
(df['atom_name'] == 'N') |
(df['atom_name'] == 'CA')]
return mc


@staticmethod
def _get_hydrogen(df):
def _get_hydrogen(df, invert):
"""Return hydrogen atom entries from a DataFrame (all but hydrogen entries if invert is True)"""
df_h = df[(df['atom_name'] == 'H')]
return df_h
@staticmethod
def _get_no_hydrogen(df):
"""Return all but hydrogen atom entries from a DataFrame"""
df_noh = df[(df['atom_name'] != 'H')]
return df_noh
if invert:
return df[(df['atom_name'] != 'H')]
else:
return df[(df['atom_name'] == 'H')]

@staticmethod
def _get_calpha(df):
def _get_calpha(df, invert):
"""Return c-alpha atom entries from a DataFrame (all but c-alpha entries if invert is True)"""
return df[df['atom_name'] == 'CA']
if invert:
return df[df['atom_name'] != 'CA']
else:
return df[df['atom_name'] == 'CA']

@staticmethod
def _construct_df(pdb_lines):
Expand Down Expand Up @@ -269,15 +273,15 @@ def to_pdb(self, path, records=None, gz=False, append_newline=True):
path : str
A valid output path for the pdb file
records : iterable (default: None)
records : iterable, default: None
A list of PDB record sections in
{'ATOM', 'HETATM', 'ANISOU', 'OTHERS'} that are to be written.
Writes all lines to PDB if records=None
gz : bool (default: False)
gz : bool, default: False
Writes a gzipped PDB file if True
append_newline : bool (default: True)
append_newline : bool, default: True
Appends a new line at the end of the PDB file if True
"""
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
"""
BioPandas
Author: Sebastian Raschka <mail@sebastianraschka.com>
License: BSD 3 clause
Project Website: http://rasbt.github.io/biopandas/
Code Repository: https://github.com/rasbt/biopandas
"""

from biopandas import PandasPDB
# BioPandas
# Author: Sebastian Raschka <mail@sebastianraschka.com>
# License: BSD 3 clause
# Project Website: http://rasbt.github.io/biopandas/
# Code Repository: https://github.com/rasbt/biopandas


from biopandas.pdb import PandasPDB
import os
import numpy as np
import pandas as pd
# from biopandas.testutils import assertMultiLineEqual
from nose.tools import raises


Expand Down Expand Up @@ -47,7 +45,7 @@ def test_fetch_pdb():
assert ppdb.pdb_text == txt
txt = ppdb._fetch_pdb('3ey')
err = "We're sorry, but the requested file is not available"
assert err in txt
assert err in txt

def test__read_pdb_gz():
"""Test public _read_pdb with gzip files"""
Expand Down Expand Up @@ -91,7 +89,7 @@ def test_get_exceptions():
def test_get_all():
ppdb = PandasPDB()
ppdb.read_pdb(TESTDATA_FILENAME)
for i in ['c-alpha', 'no hydrogen', 'hydrogen', 'main chain']:
for i in ['c-alpha', 'hydrogen', 'main chain']:
ppdb.get(i)

def test_get_df():
Expand All @@ -101,7 +99,7 @@ def test_get_df():
shape = ppdb.get('c-alpha').shape
assert shape == (174, 21), shape

shape = ppdb.get('no hydrogen').shape
shape = ppdb.get('hydrogen', invert=True).shape
assert shape == (1330, 21), shape

shape = ppdb.get('hydrogen').shape
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
"""
BioPandas
Author: Sebastian Raschka <mail@sebastianraschka.com>
License: BSD 3 clause
Project Website: http://rasbt.github.io/biopandas/
Code Repository: https://github.com/rasbt/biopandas
"""

from biopandas import PandasPDB
# BioPandas
# Author: Sebastian Raschka <mail@sebastianraschka.com>
# License: BSD 3 clause
# Project Website: http://rasbt.github.io/biopandas/
# Code Repository: https://github.com/rasbt/biopandas

from biopandas.pdb import PandasPDB
import os
import numpy as np
import pandas as pd
Expand Down Expand Up @@ -46,9 +44,9 @@ def test_invalid_query():
r = PandasPDB.rmsd(p1t48.df['ATOM'].loc[1:, :], p1t48.df['ATOM'], s='bla')

def test_protein():
r = PandasPDB.rmsd(p1t48.df['ATOM'], p1t49.df['ATOM'], s='c-alpha')
r = PandasPDB.rmsd(p1t48.df['ATOM'], p1t49.df['ATOM'], s='c-alpha', invert=False)
assert r == 0.4785, r

def test_ligand():
r = PandasPDB.rmsd(pl1.df['HETATM'], pl2.df['HETATM'], s='no hydrogen')
r = PandasPDB.rmsd(pl1.df['HETATM'], pl2.df['HETATM'], s='hydrogen', invert=True)
assert r == 2.6444, r

0 comments on commit d0f93d4

Please sign in to comment.