Merge pull request #16 from rasbt/amino-1-letter

amino3to1 conversion
BioPandas · Feb 1, 2017 · a5ca7a0 · a5ca7a0
2 parents bced24c + bb0618d
commit a5ca7a0
Show file tree

Hide file tree

Showing 5 changed files with 311 additions and 1 deletion.
diff --git a/biopandas/pdb/engines.py b/biopandas/pdb/engines.py
@@ -6,6 +6,36 @@
 
 import pandas as pd
 
+# Lookup table from 3-letter residue names to 1-letter amino acid codes.
+# Besides the standard residues it covers the protonation/bonding-state
+# variants ASH (protonated ASP), GLH (protonated GLU), CYX
+# (disulfide-bonded CYS) and HID/HIE/HIP (HIS protonation states), as well
+# as HYP (hydroxyproline), MSE (selenomethionine), PYL and selenocysteine.
+# Names that are not listed here are left for the caller to handle.
+amino3to1dict = {'ALA': 'A',
+                 'CYX': 'C',
+                 'CYS': 'C',
+                 'ASH': 'D',  # protonated ASP is still aspartate, not ALA
+                 'ASP': 'D',
+                 'GLH': 'E',
+                 'GLU': 'E',
+                 'PHE': 'F',
+                 'GLY': 'G',
+                 'HIS': 'H',
+                 'HID': 'H',
+                 'HIE': 'H',
+                 'HIP': 'H',
+                 'ILE': 'I',
+                 'LYS': 'K',
+                 'LEU': 'L',
+                 'MET': 'M',
+                 'MSE': 'M',
+                 'ASN': 'N',
+                 'PYL': 'O',
+                 'HYP': 'P',
+                 'PRO': 'P',
+                 'GLN': 'Q',
+                 'ARG': 'R',
+                 'SER': 'S',
+                 'THR': 'T',
+                 'SEC': 'U',  # PDB residue name for selenocysteine
+                 'SEL': 'U',  # kept for backward compatibility
+                 'VAL': 'V',
+                 'TRP': 'W',
+                 'TYR': 'Y'}
+
 pdb_df_columns = {'record_name', 'atom_number', 'blank_1',
                   'atom_name', 'alt_loc', 'residue_name',
                   'blank_2', 'chain_id', 'residue_number',

diff --git a/biopandas/pdb/pandas_pdb.py b/biopandas/pdb/pandas_pdb.py
@@ -16,6 +16,7 @@
     from urllib2 import urlopen, HTTPError, URLError  # Python 2.7 compatible
 from .engines import pdb_records
 from .engines import pdb_df_columns
+from .engines import amino3to1dict
 
 
 class PandasPDB(object):
@@ -335,6 +336,37 @@ def _construct_df(pdb_lines):
             dfs[r[0]] = df
         return dfs
 
+    def amino3to1(self, record='ATOM',
+                  residue_col='residue_name', fillna='?'):
+        """Create 1-letter amino acid codes from a record DataFrame.
+
+        One code is produced per residue. A residue is identified by its
+        `residue_number` together with the `chain_id` and `insertion`
+        columns when the DataFrame has them, so that residues sharing a
+        number in different chains are not merged into one.
+
+        The 3-letter names are translated via `amino3to1dict`.
+        Non-canonical amino acids are meant to be converted as follows:
+        ASH (protonated ASP) => D
+        CYX (disulfide-bonded CYS) => C
+        GLH (protonated GLU) => E
+        HID/HIE/HIP (different protonation states of HIS) => H
+        HYP (hydroxyproline) => P
+        MSE (selenomethionine) => M
+
+        Parameters
+        ----------
+        record : str (default: 'ATOM')
+            Specifies the record DataFrame
+        residue_col : str (default: 'residue_name')
+            Column in `record` DataFrame to look for 3-letter amino acid
+            codes for the conversion
+        fillna : str (default: '?')
+            Placeholder string to use for unknown amino acids
+
+        Returns
+        -------
+        pandas.Series : Pandas Series object containing the 1-letter amino
+            acid codes after conversion
+
+        """
+        df = self.df[record]
+        # `residue_number` alone is not unique across chains, so widen the
+        # key with the chain / insertion-code columns whenever they exist.
+        # NOTE(review): 'insertion' is assumed to be the insertion-code
+        # column name; it is simply ignored if the frame lacks it.
+        residue_key = ['residue_number']
+        residue_key += [col for col in ('chain_id', 'insertion')
+                        if col in df.columns]
+        tmp = df.drop_duplicates(subset=residue_key)
+        return tmp[residue_col].map(amino3to1dict).fillna(fillna)
+
     def to_pdb(self, path, records=None, gz=False, append_newline=True):
         """Write record DataFrames to a PDB file or gzipped PDB file.
 

diff --git a/biopandas/pdb/tests/test_amino3to1.py b/biopandas/pdb/tests/test_amino3to1.py
@@ -0,0 +1,30 @@
+# BioPandas
+# Author: Sebastian Raschka <mail@sebastianraschka.com>
+# License: BSD 3 clause
+# Project Website: http://rasbt.github.io/biopandas/
+# Code Repository: https://github.com/rasbt/biopandas
+
+from biopandas.pdb import PandasPDB
+import os
+
+
+def test_defaults():
+    """Check `amino3to1` with default arguments on the 1t48 test file."""
+    data_dir = os.path.join(os.path.dirname(__file__), 'data')
+    pdb_path = os.path.join(data_dir, '1t48_995.pdb')
+
+    ppdb = PandasPDB()
+    ppdb.read_pdb(pdb_path)
+
+    # Expected 1-letter sequence, written nine residues per chunk.
+    expected = list('MEMEKEFEQ'
+                    'IDKSGSWAA'
+                    'IYQDIRHEA'
+                    'SDFPCRVAK'
+                    'LPKNKNRNR'
+                    'YRDVSPFDH'
+                    'SRIKLHQED'
+                    'NDYINASLI'
+                    'KMEEAQRSY'
+                    'ILTQGPLPN'
+                    'TCGHFWEMV'
+                    'WEQKSRGVV'
+                    'MLNRVMEKG'
+                    'SLK')
+    observed = list(ppdb.amino3to1().values)
+    assert expected == observed
diff --git a/docs/sources/CHANGELOG.md b/docs/sources/CHANGELOG.md
@@ -8,6 +8,8 @@
 
 ##### New Features
 
+- Added an `amino3to1` method to `PandasPDB` data frames to convert 3-letter amino acid codes to 1-letter codes.
+
 ##### Changes
 
 - Raises a warning if `PandasPDB` is written to PDB and the ATOM or HETATM section contains unexpected columns; these columns will now be skipped.

diff --git a/docs/sources/tutorials/Working_with_PDB_Structures_in_DataFrames.ipynb b/docs/sources/tutorials/Working_with_PDB_Structures_in_DataFrames.ipynb
@@ -1665,6 +1665,221 @@
     "[more to come]"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Converting Amino Acid codes from 3- to 1-letter codes"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Residues in the `residue_name` field can be converted into 1-letter amino acid codes, which may be useful for further sequence analysis, for example, pair-wise or multiple sequence alignments:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['S',\n",
+       " 'F',\n",
+       " 'S',\n",
+       " 'N',\n",
+       " 'V',\n",
+       " 'P',\n",
+       " 'A',\n",
+       " 'G',\n",
+       " 'K',\n",
+       " 'D',\n",
+       " 'L',\n",
+       " 'P',\n",
+       " 'Q',\n",
+       " 'D',\n",
+       " 'F',\n",
+       " 'N',\n",
+       " 'V',\n",
+       " 'I',\n",
+       " 'I',\n",
+       " 'E',\n",
+       " 'I',\n",
+       " 'P',\n",
+       " 'A',\n",
+       " 'Q',\n",
+       " 'S',\n",
+       " 'E',\n",
+       " 'P',\n",
+       " 'V',\n",
+       " 'K',\n",
+       " 'Y',\n",
+       " 'E',\n",
+       " 'A',\n",
+       " 'D',\n",
+       " 'K',\n",
+       " 'A',\n",
+       " 'L',\n",
+       " 'G',\n",
+       " 'L',\n",
+       " 'L',\n",
+       " 'V',\n",
+       " 'V',\n",
+       " 'D',\n",
+       " 'R',\n",
+       " 'F',\n",
+       " 'I',\n",
+       " 'G',\n",
+       " 'T',\n",
+       " 'G',\n",
+       " 'M',\n",
+       " 'R',\n",
+       " 'Y',\n",
+       " 'P',\n",
+       " 'V',\n",
+       " 'N',\n",
+       " 'Y',\n",
+       " 'G',\n",
+       " 'F',\n",
+       " 'I',\n",
+       " 'P',\n",
+       " 'Q',\n",
+       " 'T',\n",
+       " 'L',\n",
+       " 'S',\n",
+       " 'G',\n",
+       " 'D',\n",
+       " 'G',\n",
+       " 'D',\n",
+       " 'P',\n",
+       " 'V',\n",
+       " 'D',\n",
+       " 'V',\n",
+       " 'L',\n",
+       " 'V',\n",
+       " 'I',\n",
+       " 'T',\n",
+       " 'P',\n",
+       " 'F',\n",
+       " 'P',\n",
+       " 'L',\n",
+       " 'L',\n",
+       " 'A',\n",
+       " 'G',\n",
+       " 'S',\n",
+       " 'V',\n",
+       " 'V',\n",
+       " 'R',\n",
+       " 'A',\n",
+       " 'R',\n",
+       " 'A',\n",
+       " 'L',\n",
+       " 'G',\n",
+       " 'M',\n",
+       " 'L',\n",
+       " 'K',\n",
+       " 'M',\n",
+       " 'T',\n",
+       " 'D',\n",
+       " 'E',\n",
+       " 'S',\n",
+       " 'G',\n",
+       " 'V',\n",
+       " 'D',\n",
+       " 'A',\n",
+       " 'K',\n",
+       " 'L',\n",
+       " 'V',\n",
+       " 'A',\n",
+       " 'V',\n",
+       " 'P',\n",
+       " 'H',\n",
+       " 'D',\n",
+       " 'K',\n",
+       " 'V',\n",
+       " 'C',\n",
+       " 'P',\n",
+       " 'M',\n",
+       " 'T',\n",
+       " 'A',\n",
+       " 'N',\n",
+       " 'L',\n",
+       " 'K',\n",
+       " 'S',\n",
+       " 'I',\n",
+       " 'D',\n",
+       " 'D',\n",
+       " 'V',\n",
+       " 'P',\n",
+       " 'A',\n",
+       " 'Y',\n",
+       " 'L',\n",
+       " 'K',\n",
+       " 'D',\n",
+       " 'Q',\n",
+       " 'I',\n",
+       " 'K',\n",
+       " 'H',\n",
+       " 'F',\n",
+       " 'F',\n",
+       " 'E',\n",
+       " 'Q',\n",
+       " 'Y',\n",
+       " 'K',\n",
+       " 'A',\n",
+       " 'L',\n",
+       " 'E',\n",
+       " 'K',\n",
+       " 'G',\n",
+       " 'K',\n",
+       " 'W',\n",
+       " 'V',\n",
+       " 'K',\n",
+       " 'V',\n",
+       " 'E',\n",
+       " 'G',\n",
+       " 'W',\n",
+       " 'D',\n",
+       " 'G',\n",
+       " 'I',\n",
+       " 'D',\n",
+       " 'A',\n",
+       " 'A',\n",
+       " 'H',\n",
+       " 'K',\n",
+       " 'E',\n",
+       " 'I',\n",
+       " 'T',\n",
+       " 'D',\n",
+       " 'G',\n",
+       " 'V',\n",
+       " 'A',\n",
+       " 'N',\n",
+       " 'F',\n",
+       " 'K',\n",
+       " 'K']"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from biopandas.pdb import PandasPDB\n",
+    "ppdb = PandasPDB().read_pdb('./data/3eiy.pdb.gz')\n",
+    "ppdb.amino3to1()\n",
+    "# By default, `amino3to1` returns a pandas Series object,\n",
+    "# and to convert it into a Python list, you can wrap it in list\n",
+    "# constructor, e.g.,\n",
+    "# `list(ppdb.amino3to1())`"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -1743,8 +1958,9 @@
   }
  ],
  "metadata": {
+  "anaconda-cloud": {},
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python [default]",
    "language": "python",
    "name": "python3"
   },