Skip to content

Commit

Permalink
Merge pull request #16 from rasbt/amino-1-letter
Browse files Browse the repository at this point in the history
amino3to1 conversion
  • Loading branch information
rasbt committed Feb 1, 2017
2 parents bced24c + bb0618d commit a5ca7a0
Show file tree
Hide file tree
Showing 5 changed files with 311 additions and 1 deletion.
30 changes: 30 additions & 0 deletions biopandas/pdb/engines.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,36 @@

import pandas as pd

# Lookup table: 3-letter residue name -> 1-letter amino acid code.
# Besides the standard residue names it covers a few alternative names,
# each of which maps to the code of its parent amino acid.
amino3to1dict = {
    'ALA': 'A',
    'CYS': 'C',
    'CYX': 'C',  # disulfide-bonded CYS
    'ASP': 'D',
    'ASH': 'D',  # protonated ASP (was wrongly mapped to 'A')
    'GLU': 'E',
    'GLH': 'E',  # protonated GLU (documented by `amino3to1`, was missing)
    'PHE': 'F',
    'GLY': 'G',
    'HIS': 'H',
    'HID': 'H',  # HID/HIE/HIP: different protonation states of HIS
    'HIE': 'H',
    'HIP': 'H',
    'ILE': 'I',
    'LYS': 'K',
    'LEU': 'L',
    'MET': 'M',
    'MSE': 'M',  # selenomethionine
    'ASN': 'N',
    'PYL': 'O',
    'PRO': 'P',
    'HYP': 'P',  # hydroxyproline
    'GLN': 'Q',
    'ARG': 'R',
    'SER': 'S',
    'THR': 'T',
    'SEC': 'U',  # selenocysteine, standard residue name
    'SEL': 'U',  # kept so existing lookups of 'SEL' keep working
    'VAL': 'V',
    'TRP': 'W',
    'TYR': 'Y',
}

pdb_df_columns = {'record_name', 'atom_number', 'blank_1',
'atom_name', 'alt_loc', 'residue_name',
'blank_2', 'chain_id', 'residue_number',
Expand Down
32 changes: 32 additions & 0 deletions biopandas/pdb/pandas_pdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from urllib2 import urlopen, HTTPError, URLError # Python 2.7 compatible
from .engines import pdb_records
from .engines import pdb_df_columns
from .engines import amino3to1dict


class PandasPDB(object):
Expand Down Expand Up @@ -335,6 +336,37 @@ def _construct_df(pdb_lines):
dfs[r[0]] = df
return dfs

def amino3to1(self, record='ATOM',
              residue_col='residue_name', fillna='?'):
    """Creates 1-letter amino acid codes from DataFrame

    The 3-letter residue names are translated through the
    `amino3to1dict` lookup table, one entry per residue.

    Non-canonical amino-acids are intended to be converted as follows:
        ASH (protonated ASP) => D
        CYX (disulfide-bonded CYS) => C
        GLH (protonated GLU) => E
        HID/HIE/HIP (different protonation states of HIS) => H
        HYP (hydroxyproline) => P
        MSE (selenomethionine) => M

    Parameters
    ----------
    record : str (default: 'ATOM')
        Specifies the record DataFrame
    residue_col : str (default: 'residue_name')
        Column in `record` DataFrame to look for 3-letter amino acid
        codes for the conversion
    fillna : str (default: '?')
        Placeholder string to use for unknown amino acids

    Returns
    -------
    pandas.Series : Pandas Series object containing the 1-letter amino
        acid codes after conversion

    """
    atoms = self.df[record]

    # Keep one row per residue. A residue number may be reused by
    # another chain, so the chain identifier is made part of the key
    # whenever that column exists; keying on the number alone would
    # silently drop such residues from every chain but the first.
    residue_key = ['residue_number']
    if 'chain_id' in atoms.columns:
        residue_key.insert(0, 'chain_id')

    residues = atoms.drop_duplicates(subset=residue_key)

    # Names missing from the lookup table map to NaN and are then
    # replaced by the `fillna` placeholder.
    return residues[residue_col].map(amino3to1dict).fillna(fillna)

def to_pdb(self, path, records=None, gz=False, append_newline=True):
"""Write record DataFrames to a PDB file or gzipped PDB file.
Expand Down
30 changes: 30 additions & 0 deletions biopandas/pdb/tests/test_amino3to1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# BioPandas
# Author: Sebastian Raschka <mail@sebastianraschka.com>
# License: BSD 3 clause
# Project Website: http://rasbt.github.io/biopandas/
# Code Repository: https://github.com/rasbt/biopandas

from biopandas.pdb import PandasPDB
import os


def test_defaults():
    """Check `amino3to1` with default arguments on the 1t48 test file."""
    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    pdb_path = os.path.join(data_dir, '1t48_995.pdb')

    ppdb = PandasPDB()
    ppdb.read_pdb(pdb_path)

    # Expected 1-letter sequence, written as rows of nine residues.
    sequence_rows = ('MEMEKEFEQ',
                     'IDKSGSWAA',
                     'IYQDIRHEA',
                     'SDFPCRVAK',
                     'LPKNKNRNR',
                     'YRDVSPFDH',
                     'SRIKLHQED',
                     'NDYINASLI',
                     'KMEEAQRSY',
                     'ILTQGPLPN',
                     'TCGHFWEMV',
                     'WEQKSRGVV',
                     'MLNRVMEKG',
                     'SLK')
    expected = list(''.join(sequence_rows))

    observed = list(ppdb.amino3to1().values)
    assert expected == observed
2 changes: 2 additions & 0 deletions docs/sources/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

##### New Features

- Added an `amino3to1` method to `PandasPDB` objects to convert 3-letter amino acid codes to 1-letter codes.

##### Changes

- Raises a warning if `PandasPDB` is written to PDB and the ATOM or HETATM section contains unexpected columns; these columns will now be skipped.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1665,6 +1665,221 @@
"[more to come]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Converting Amino Acid codes from 3- to 1-letter codes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Residues in the `residue_name` field can be converted into 1-letter amino acid codes, which may be useful for further sequence analysis, for example, pair-wise or multiple sequence alignments:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['S',\n",
" 'F',\n",
" 'S',\n",
" 'N',\n",
" 'V',\n",
" 'P',\n",
" 'A',\n",
" 'G',\n",
" 'K',\n",
" 'D',\n",
" 'L',\n",
" 'P',\n",
" 'Q',\n",
" 'D',\n",
" 'F',\n",
" 'N',\n",
" 'V',\n",
" 'I',\n",
" 'I',\n",
" 'E',\n",
" 'I',\n",
" 'P',\n",
" 'A',\n",
" 'Q',\n",
" 'S',\n",
" 'E',\n",
" 'P',\n",
" 'V',\n",
" 'K',\n",
" 'Y',\n",
" 'E',\n",
" 'A',\n",
" 'D',\n",
" 'K',\n",
" 'A',\n",
" 'L',\n",
" 'G',\n",
" 'L',\n",
" 'L',\n",
" 'V',\n",
" 'V',\n",
" 'D',\n",
" 'R',\n",
" 'F',\n",
" 'I',\n",
" 'G',\n",
" 'T',\n",
" 'G',\n",
" 'M',\n",
" 'R',\n",
" 'Y',\n",
" 'P',\n",
" 'V',\n",
" 'N',\n",
" 'Y',\n",
" 'G',\n",
" 'F',\n",
" 'I',\n",
" 'P',\n",
" 'Q',\n",
" 'T',\n",
" 'L',\n",
" 'S',\n",
" 'G',\n",
" 'D',\n",
" 'G',\n",
" 'D',\n",
" 'P',\n",
" 'V',\n",
" 'D',\n",
" 'V',\n",
" 'L',\n",
" 'V',\n",
" 'I',\n",
" 'T',\n",
" 'P',\n",
" 'F',\n",
" 'P',\n",
" 'L',\n",
" 'L',\n",
" 'A',\n",
" 'G',\n",
" 'S',\n",
" 'V',\n",
" 'V',\n",
" 'R',\n",
" 'A',\n",
" 'R',\n",
" 'A',\n",
" 'L',\n",
" 'G',\n",
" 'M',\n",
" 'L',\n",
" 'K',\n",
" 'M',\n",
" 'T',\n",
" 'D',\n",
" 'E',\n",
" 'S',\n",
" 'G',\n",
" 'V',\n",
" 'D',\n",
" 'A',\n",
" 'K',\n",
" 'L',\n",
" 'V',\n",
" 'A',\n",
" 'V',\n",
" 'P',\n",
" 'H',\n",
" 'D',\n",
" 'K',\n",
" 'V',\n",
" 'C',\n",
" 'P',\n",
" 'M',\n",
" 'T',\n",
" 'A',\n",
" 'N',\n",
" 'L',\n",
" 'K',\n",
" 'S',\n",
" 'I',\n",
" 'D',\n",
" 'D',\n",
" 'V',\n",
" 'P',\n",
" 'A',\n",
" 'Y',\n",
" 'L',\n",
" 'K',\n",
" 'D',\n",
" 'Q',\n",
" 'I',\n",
" 'K',\n",
" 'H',\n",
" 'F',\n",
" 'F',\n",
" 'E',\n",
" 'Q',\n",
" 'Y',\n",
" 'K',\n",
" 'A',\n",
" 'L',\n",
" 'E',\n",
" 'K',\n",
" 'G',\n",
" 'K',\n",
" 'W',\n",
" 'V',\n",
" 'K',\n",
" 'V',\n",
" 'E',\n",
" 'G',\n",
" 'W',\n",
" 'D',\n",
" 'G',\n",
" 'I',\n",
" 'D',\n",
" 'A',\n",
" 'A',\n",
" 'H',\n",
" 'K',\n",
" 'E',\n",
" 'I',\n",
" 'T',\n",
" 'D',\n",
" 'G',\n",
" 'V',\n",
" 'A',\n",
" 'N',\n",
" 'F',\n",
" 'K',\n",
" 'K']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from biopandas.pdb import PandasPDB\n",
"ppdb = PandasPDB().read_pdb('./data/3eiy.pdb.gz')\n",
"ppdb.amino3to1()\n",
"# By default, `amino3to1` returns a pandas Series object,\n",
"# and to convert it into a Python list, you can wrap it in the list\n",
"# constructor, e.g.,\n",
"# `list(ppdb.amino3to1())`"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -1743,8 +1958,9 @@
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
Expand Down

0 comments on commit a5ca7a0

Please sign in to comment.