get interface

BioPandas · Nov 23, 2015 · d0f93d4 · d0f93d4
1 parent 61baaba
commit d0f93d4
Show file tree

Hide file tree

Showing 48 changed files with 19,620 additions and 120 deletions.
diff --git a/biopandas/__init__.py b/biopandas/__init__.py
@@ -1,16 +1,8 @@
-"""
-
-BioPandas
-Author: Sebastian Raschka <mail@sebastianraschka.com>
-License: BSD 3 clause
-Project Website: http://rasbt.github.io/biopandas/
-Code Repository: https://github.com/rasbt/biopandas
-"""
-
-
-from .pandas_pdb import PandasPDB
+# BioPandas
+# Author: Sebastian Raschka <mail@sebastianraschka.com>
+# License: BSD 3 clause
+# Project Website: http://rasbt.github.io/biopandas/
+# Code Repository: https://github.com/rasbt/biopandas
 
 __version__ = '0.1.0'
 __author__ = "Sebastian Raschka <mail@sebastianraschka.com>"
-
-__all__ = ["PandasPDB"]
diff --git a/biopandas/pdb/__init__.py b/biopandas/pdb/__init__.py
@@ -0,0 +1,14 @@
+# BioPandas
+# Author: Sebastian Raschka <mail@sebastianraschka.com>
+# License: BSD 3 clause
+# Project Website: http://rasbt.github.io/biopandas/
+# Code Repository: https://github.com/rasbt/biopandas
+
+"""
+BioPandas module for working with Protein Data Bank (PDB)
+files in pandas DataFrames.
+"""
+
+from .pandas_pdb import PandasPDB
+
+__all__ = ["PandasPDB"]
diff --git a/biopandas/engines.py → biopandas/pdb/engines.py b/biopandas/engines.py → biopandas/pdb/engines.py
@@ -1,11 +1,8 @@
-"""
-BioPandas
-
-Author: Sebastian Raschka <mail@sebastianraschka.com>
-License: BSD 3 clause
-Project Website: http://rasbt.github.io/biopandas/
-Code Repository: https://github.com/rasbt/biopandas
-"""
+# BioPandas
+# Author: Sebastian Raschka <mail@sebastianraschka.com>
+# License: BSD 3 clause
+# Project Website: http://rasbt.github.io/biopandas/
+# Code Repository: https://github.com/rasbt/biopandas
 
 import pandas as pd
 

diff --git a/biopandas/pandas_pdb.py → biopandas/pdb/pandas_pdb.py b/biopandas/pandas_pdb.py → biopandas/pdb/pandas_pdb.py
@@ -1,11 +1,8 @@
-"""
-BioPandas
-
-Author: Sebastian Raschka <mail@sebastianraschka.com>
-License: BSD 3 clause
-Project Website: http://rasbt.github.io/biopandas/
-Code Repository: https://github.com/rasbt/biopandas
-"""
+# BioPandas
+# Author: Sebastian Raschka <mail@sebastianraschka.com>
+# License: BSD 3 clause
+# Project Website: http://rasbt.github.io/biopandas/
+# Code Repository: https://github.com/rasbt/biopandas
 
 import pandas as pd
 import numpy as np
@@ -32,23 +29,17 @@ class PandasPDB(object):
     pdb_text : str
         PDB file contents in raw text format
 
-    title : str
+    header : str
         PDB file description
 
     code : str
         PDB code
 
-    Examples
-    --------
-    >>> ppdb = PandasPDB()
-    >>> ppdb.fetch_pdb('3eiy')
-    >>> ppdb.df['ATOM'].head()
-
     """
     def __init__(self):
         self._df = {}
         self.pdb_text = ''
-        self.title = ''
+        self.header = ''
         self.code = ''
         self._get_dict = {}
 
@@ -68,7 +59,7 @@ def read_pdb(self, path):
         """
         self.pdb_text = self._read_pdb(path=path)
         self._df = self._construct_df(pdb_lines=self.pdb_text.splitlines(True))
-        self.title, self.code = self._parse_title_code()
+        self.header, self.code = self._parse_header_code()
 
     def fetch_pdb(self, pdb_code):
         """Fetches PDB file contents from the Protein Databank at rcsb.org.
@@ -82,18 +73,22 @@ def fetch_pdb(self, pdb_code):
         self.pdb_text = self._fetch_pdb(pdb_code)
         self._df = self._construct_df(pdb_lines=self.pdb_text.splitlines(True))
 
-    def get(self, s, df=None):
+    def get(self, s, df=None, invert=False):
         """Filter PDB DataFrames by properties
 
         Parameters
         ----------
-        s : str {'main chain', 'hydrogen', 'no hydrogen', 'c-alpha'}
+        s : str in {'main chain', 'hydrogen', 'c-alpha'}
             String to specify which entries to return
 
-        df : pandas.DataFrame (default : None)
+        df : pandas.DataFrame, default: None
             Optional DataFrame to perform the filter operation on.
             If df=None, filters on self.df['ATOM']
 
+        invert : bool, default: False
+            Inverts the search query. For example, if s='hydrogen' and
+            invert=True, all but hydrogen entries are returned.
+
         Returns
         --------
         df : pandas.DataFrame
@@ -106,10 +101,10 @@ def get(self, s, df=None):
             raise AttributeError('s must be in %s' % self._get_dict.keys())
         if not df:
             df = self._df['ATOM']
-        return self._get_dict[s](df)
+        return self._get_dict[s](df, invert=invert)
 
     @staticmethod
-    def rmsd(df1, df2, s='no hydrogen'):
+    def rmsd(df1, df2, s='main chain', invert=False):
         """Compute the Root Mean Square Deviation between molecules.
 
         Parameters
@@ -121,8 +116,13 @@ def rmsd(df1, df2, s='no hydrogen'):
             Second DataFrame for RMSD computation against df1. Must have the
             same number of entries as df1
 
-        s : str {'main chain', 'hydrogen', 'no hydrogen', 'c-alpha'}
-            String to specify which entries to consider
+        s : str in {'main chain', 'hydrogen', 'c-alpha'}, default: 'main chain'
+            String to specify which entries to consider.
+
+        invert : bool, default: False
+            Inverts the string query if true. For example, the setting
+            `s='hydrogen', invert=True` computes the RMSD based on all
+            but hydrogen atoms.
 
         Returns
         ---------
@@ -136,8 +136,8 @@ def rmsd(df1, df2, s='no hydrogen'):
         if s:
             if s not in get_dict.keys():
                 raise AttributeError('s must be in %s or None' % get_dict.keys())
-            df1 = get_dict[s](df1)
-            df2 = get_dict[s](df2)
+            df1 = get_dict[s](df1, invert=invert)
+            df2 = get_dict[s](df2, invert=invert)
 
         total = ((df1['x_coord'] - df2['x_coord'])**2 +
                 (df1['y_coord'] - df2['y_coord'])**2 +
@@ -147,17 +147,16 @@ def rmsd(df1, df2, s='no hydrogen'):
 
 
     @staticmethod
-    """Initialize dictionary for filter operations."""
     def _init_get_dict():
+        """Initialize dictionary for filter operations."""
         get_dict = {'main chain': PandasPDB._get_mainchain,
                     'hydrogen': PandasPDB._get_hydrogen,
-                    'no hydrogen': PandasPDB._get_no_hydrogen,
                     'c-alpha': PandasPDB._get_calpha}
         return get_dict
 
     @staticmethod
-    """Read PDB file from local drive."""
     def _read_pdb(path):
+        """Read PDB file from local drive."""
         r_mode = 'r'
         openf = open
         if path.endswith('.gz'):
@@ -173,8 +172,8 @@ def _read_pdb(path):
         return txt
 
     @staticmethod
-    """Load PDB file from rcsb.org."""
     def _fetch_pdb(pdb_code):
+        """Load PDB file from rcsb.org."""
         txt = None
         try:
             response = urlopen('http://www.rcsb.org/pdb/files/%s.pdb' % pdb_code.lower())
@@ -189,46 +188,51 @@ def _fetch_pdb(pdb_code):
             print('URL Error %s' %e.args)
         return txt
 
-    def _parse_title_code(self):
-        """Extract title information and PDB code."""
-        code, title = '', ''
+    def _parse_header_code(self):
+        """Extract the HEADER entry text and the lower-cased PDB code."""
+        code, header_entry = '', ''  # 'header' below holds the DataFrame
         if 'OTHERS' in self.df:
 
             header = self.df['OTHERS'][self.df['OTHERS']['record_name'] == 'HEADER']
             if not header.empty:
-                title = header['entry'].values[0]
-                s = title.split()
+                header_entry = header['entry'].values[0]
+                s = header_entry.split()
                 if s:
                     code = s[-1].lower()
-        return title, code
+        return header_entry, code
 
 
     @staticmethod
-    def _get_mainchain(df):
+    def _get_mainchain(df, invert):
         """Return only main chain atom entries from a DataFrame"""
-        mc =  df[(df['atom_name'] == 'C') |
-                 (df['atom_name'] == 'O') |
-                 (df['atom_name'] == 'N') |
-                 (df['atom_name'] == 'CA')]
+        if invert:
+            mc = df[(df['atom_name'] != 'C') &
+                    (df['atom_name'] != 'O') &
+                    (df['atom_name'] != 'N') &
+                    (df['atom_name'] != 'CA')]
+        else:
+            mc = df[ (df['atom_name'] == 'C') |
+                     (df['atom_name'] == 'O') |
+                     (df['atom_name'] == 'N') |
+                     (df['atom_name'] == 'CA')]
         return mc
 
 
     @staticmethod
-    def _get_hydrogen(df):
+    def _get_hydrogen(df, invert):
         """Return only hydrogen atom entries from a DataFrame"""
-        df_h = df[(df['atom_name'] == 'H')]
-        return df_h
-
-    @staticmethod
-    def _get_no_hydrogen(df):
-        """Return all but hydrogen atom entries from a DataFrame"""
-        df_noh = df[(df['atom_name'] != 'H')]
-        return df_noh
+        if invert:
+            return df[(df['atom_name'] != 'H')]
+        else:
+            return df[(df['atom_name'] == 'H')]
 
     @staticmethod
-    def _get_calpha(df):
+    def _get_calpha(df, invert):
         """Return c-alpha atom entries from a DataFrame"""
-        return df[df['atom_name'] == 'CA']
+        if invert:
+            return df[df['atom_name'] != 'CA']
+        else:
+            return df[df['atom_name'] == 'CA']
 
     @staticmethod
     def _construct_df(pdb_lines):
@@ -269,15 +273,15 @@ def to_pdb(self, path, records=None, gz=False, append_newline=True):
         path : str
             A valid output path for the pdb file
 
-        records : iterable (default: None)
+        records : iterable, default: None
             A list of PDB record sections in
             {'ATOM', 'HETATM', 'ANISOU', 'OTHERS'} that are to be written.
             Writes all lines to PDB if records=None
 
-        gz : bool (default: False)
+        gz : bool, default: False
             Writes a gzipped PDB file if True
 
-        append_newline : bool (default: True)
+        append_newline : bool, default: True
             Appends a new line at the end of the PDB file if True
 
         """

diff --git a/...s/tests/test_pandas_pdb/data/1t48_995.pdb → biopandas/pdb/tests/data/1t48_995.pdb b/...s/tests/test_pandas_pdb/data/1t48_995.pdb → biopandas/pdb/tests/data/1t48_995.pdb
diff --git a/...s/tests/test_pandas_pdb/data/1t49_995.pdb → biopandas/pdb/tests/data/1t49_995.pdb b/...s/tests/test_pandas_pdb/data/1t49_995.pdb → biopandas/pdb/tests/data/1t49_995.pdb
diff --git a/...andas/tests/test_pandas_pdb/data/3eiy.pdb → biopandas/pdb/tests/data/3eiy.pdb b/...andas/tests/test_pandas_pdb/data/3eiy.pdb → biopandas/pdb/tests/data/3eiy.pdb
diff --git a/...as/tests/test_pandas_pdb/data/3eiy.pdb.gz → biopandas/pdb/tests/data/3eiy.pdb.gz b/...as/tests/test_pandas_pdb/data/3eiy.pdb.gz → biopandas/pdb/tests/data/3eiy.pdb.gz
diff --git a/...tests/test_pandas_pdb/data/lig_conf_1.pdb → biopandas/pdb/tests/data/lig_conf_1.pdb b/...tests/test_pandas_pdb/data/lig_conf_1.pdb → biopandas/pdb/tests/data/lig_conf_1.pdb
diff --git a/...tests/test_pandas_pdb/data/lig_conf_2.pdb → biopandas/pdb/tests/data/lig_conf_2.pdb b/...tests/test_pandas_pdb/data/lig_conf_2.pdb → biopandas/pdb/tests/data/lig_conf_2.pdb
diff --git a/...as/tests/test_pandas_pdb/test_read_pdb.py → biopandas/pdb/tests/test_read_pdb.py b/...as/tests/test_pandas_pdb/test_read_pdb.py → biopandas/pdb/tests/test_read_pdb.py
@@ -1,16 +1,14 @@
-"""
-BioPandas
-Author: Sebastian Raschka <mail@sebastianraschka.com>
-License: BSD 3 clause
-Project Website: http://rasbt.github.io/biopandas/
-Code Repository: https://github.com/rasbt/biopandas
-"""
-
-from biopandas import PandasPDB
+# BioPandas
+# Author: Sebastian Raschka <mail@sebastianraschka.com>
+# License: BSD 3 clause
+# Project Website: http://rasbt.github.io/biopandas/
+# Code Repository: https://github.com/rasbt/biopandas
+
+
+from biopandas.pdb import PandasPDB
 import os
 import numpy as np
 import pandas as pd
-# from biopandas.testutils import assertMultiLineEqual
 from nose.tools import raises
 
 
@@ -47,7 +45,7 @@ def test_fetch_pdb():
     assert ppdb.pdb_text == txt
     txt = ppdb._fetch_pdb('3ey')
     err = "We're sorry, but the requested file is not available"
-    assert err in txt 
+    assert err in txt
 
 def test__read_pdb_gz():
     """Test public _read_pdb with gzip files"""
@@ -91,7 +89,7 @@ def test_get_exceptions():
 def test_get_all():
     ppdb = PandasPDB()
     ppdb.read_pdb(TESTDATA_FILENAME)
-    for i in ['c-alpha', 'no hydrogen', 'hydrogen', 'main chain']:
+    for i in ['c-alpha', 'hydrogen', 'main chain']:
         ppdb.get(i)
 
 def test_get_df():
@@ -101,7 +99,7 @@ def test_get_df():
     shape = ppdb.get('c-alpha').shape
     assert shape == (174, 21), shape
 
-    shape = ppdb.get('no hydrogen').shape
+    shape = ppdb.get('hydrogen', invert=True).shape
     assert shape == (1330, 21), shape
 
     shape = ppdb.get('hydrogen').shape

diff --git a/biopandas/tests/test_pandas_pdb/test_rmsd.py → biopandas/pdb/tests/test_rmsd.py b/biopandas/tests/test_pandas_pdb/test_rmsd.py → biopandas/pdb/tests/test_rmsd.py
@@ -1,12 +1,10 @@
-"""
-BioPandas
-Author: Sebastian Raschka <mail@sebastianraschka.com>
-License: BSD 3 clause
-Project Website: http://rasbt.github.io/biopandas/
-Code Repository: https://github.com/rasbt/biopandas
-"""
-
-from biopandas import PandasPDB
+# BioPandas
+# Author: Sebastian Raschka <mail@sebastianraschka.com>
+# License: BSD 3 clause
+# Project Website: http://rasbt.github.io/biopandas/
+# Code Repository: https://github.com/rasbt/biopandas
+
+from biopandas.pdb import PandasPDB
 import os
 import numpy as np
 import pandas as pd
@@ -46,9 +44,9 @@ def test_invalid_query():
     r = PandasPDB.rmsd(p1t48.df['ATOM'].loc[1:, :], p1t48.df['ATOM'], s='bla')
 
 def test_protein():
-    r = PandasPDB.rmsd(p1t48.df['ATOM'], p1t49.df['ATOM'], s='c-alpha')
+    r = PandasPDB.rmsd(p1t48.df['ATOM'], p1t49.df['ATOM'], s='c-alpha', invert=False)
     assert r == 0.4785, r
 
 def test_ligand():
-    r = PandasPDB.rmsd(pl1.df['HETATM'], pl2.df['HETATM'], s='no hydrogen')
+    r = PandasPDB.rmsd(pl1.df['HETATM'], pl2.df['HETATM'], s='hydrogen', invert=True)
     assert r == 2.6444, r