# Further annotating secreted proteins

We want to see what types of proteins are secreted by the cell, so let's further annotate them.

We will do this using:
dbCAN: CAZymes
MEROPS: Proteases
ESTHER: Lipases (alpha/beta hydrolases)

In [53]:
import os

# Root locations shared by the paths below (private to this cell).
_annotation_root = '/home/sam/FullCyc_metagenome/annotation'
_database_root = '/home/sam/new_databases'

# Directory the cells below write their output files into.
workDir = _annotation_root + '/secreted_protein_annotations'

# SignalP summary tables (gram-negative and gram-positive runs, per the file names).
_signalp_template = _annotation_root + '/signalp_annotation/Ga0334612_proteins_gram_{}_summary.signalp5'
gram_neg_signalp = _signalp_template.format('neg')
gram_pos_signalp = _signalp_template.format('pos')

# Protein fasta that the secreted subset is extracted from.
gene_protein_fasta = _annotation_root + '/IMG/Ga0334612_proteins.faa'

# Reference databases, one per annotation tool used below.
dbCAN_db = _database_root + '/dbCAN/HMM/dbCAN-fam-HMMs.txt'
MEROPS_db = _database_root + '/MEROPS/peptidase3/MEROPS_protease.dmnd'
ESTHER_db = _database_root + '/ESTHER/HMMs/hydrolaseDB'

# Number of CPU threads handed to hmmscan and diamond.
nproc = 40


In [7]:
from Bio import SeqIO
import pandas as pd

## Get fasta of just secreted proteins

In [17]:
def _non_other_ids(summary_path):
    """Return the '# ID' values of every row whose 'Prediction' is not 'OTHER'.

    summary_path: path to a tab-separated table with '# ID' and 'Prediction' columns.
    """
    table = pd.read_csv(summary_path, sep='\t')
    has_signal = table['Prediction'] != 'OTHER'
    return list(table.loc[has_signal, '# ID'])


gram_neg_signalp_list = _non_other_ids(gram_neg_signalp)
gram_pos_signalp_list = _non_other_ids(gram_pos_signalp)

# Keep the dataframe names bound to None, as before, so no large table stays in memory.
gram_neg_signalp_df = None
gram_pos_signalp_df = None

In [19]:
# Merge the gram-negative and gram-positive hits, dropping IDs reported by both.
# Sorting gives a reproducible order: iterating a set of strings can yield a
# different order in every interpreter session because str hashing is
# randomized, which would reshuffle any file written from this list.
signalp_list = sorted(set(gram_neg_signalp_list + gram_pos_signalp_list))
len(signalp_list)

1297704

In [27]:
# Write the protein IDs to secreted_proteins.txt in workDir, one ID per line.
secreted_ids_path = os.path.join(workDir, 'secreted_proteins.txt')
with open(secreted_ids_path, 'w') as outfile:
    outfile.writelines(gene + '\n' for gene in signalp_list)


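Now subset the fasta file using awk (run this from the working directory, since secreted_proteins.txt and secreted_proteins.faa are given as relative paths):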

awk 'BEGIN{while((getline<"secreted_proteins.txt")>0)l[">"$1]=1}/^>/{f=l[$1]}f' /home/sam/FullCyc_metagenome/annotation/IMG/Ga0334612_proteins.faa > secreted_proteins.faa

## CAZymes annotation with dbCAN
I'll use HMMER to compare the secreted protein genes to the CAZy HMM database

In [None]:
cmd = ' '.join(['hmmscan',
                '--cpu', str(nproc),
                '--domtblout', os.path.join(workDir, 'secreted_CAZymes.out.dm'),
                dbCAN_db, os.path.join(workDir, 'secreted_proteins.faa'),
                '>', os.path.join(workDir, 'secreted_CAZymes.out')])
!$cmd

In [46]:
cmd = ' '.join(['sh /home/sam/new_databases/dbCAN/HMM/hmmscan-parser.sh',
                os.path.join(workDir, 'secreted_CAZymes.out.dm'),
                '>', os.path.join(workDir, 'secreted_CAZymes.out.dm.ps')])
!$cmd

In [48]:
cmd = ' '.join(['cat', os.path.join(workDir, 'secreted_CAZymes.out.dm.ps'),
                "| awk '$5<1e-15&&$10>0.35'",
                '>', os.path.join(workDir, 'secreted_CAZymes.out.dm.ps.stringent')])
!$cmd

## Protease annotation with MEROPS
I'll use DIAMOND blastp to compare secreted protein genes with the MEROPS protease database

In [51]:
cmd = ' '.join(['diamond blastp',
                '--threads', str(nproc),
                '--evalue 0.0000000001',
                '-d', MEROPS_db,
                '-q', os.path.join(workDir, 'secreted_proteins.faa'),
                '-o', os.path.join(workDir, 'secreted_proteases.txt')])
!$cmd

diamond v0.9.14.115 | by Benjamin Buchfink <buchfink@gmail.com>
Licensed under the GNU AGPL <https://www.gnu.org/licenses/agpl.txt>
Check http://github.com/bbuchfink/diamond for updates.

#CPU threads: 40
Scoring parameters: (Matrix=BLOSUM62 Lambda=0.267 K=0.041 Penalties=11/1)
#Target sequences to report alignments for: 25
Temporary directory: /home/sam/FullCyc_metagenome/annotation/secreted_protein_annotations
Opening the database...  [1.8e-05s]
Opening the input file...  [2.2e-05s]
Opening the output file...  [0.015974s]
Loading query sequences...  [1.67843s]
Masking queries...  [2.41377s]
Building query seed set...  [0.038319s]
Algorithm: Double-indexed
Building query histograms...  [0.316532s]
Allocating buffers...  [0.00024s]
Loading reference sequences...  [0.783136s]
Building reference histograms...  [0.411098s]
Allocating buffers...  [0.000196s]
Initializing temporary storage...  [0.011199s]
Processing query chunk 0, reference chunk 0, shape 0, index chunk 0.
Building referenc

## Lipase (alpha/beta hydrolase fold) annotation with ESTHER
http://bioweb.supagro.inra.fr/ESTHER/definition

In [None]:
cmd = ' '.join(['hmmscan',
                '--cpu', str(nproc),
                '--domtblout', os.path.join(workDir, 'secreted_ABhydro.out.dm'),
                ESTHER_db, os.path.join(workDir, 'secreted_proteins.faa'),
                '>', os.path.join(workDir, 'secreted_ABhydro.out')])
!$cmd

In [56]:
# Completion marker; presumably confirms the long-running hmmscan cell above has finished — TODO confirm.
print('Done!')

Done!


In [57]:
cmd = ' '.join(['sh /home/sam/new_databases/dbCAN/HMM/hmmscan-parser.sh',
                os.path.join(workDir, 'secreted_ABhydro.out.dm'),
                '>', os.path.join(workDir, 'secreted_ABhydro.out.dm.ps')])
!$cmd

In [58]:
cmd = ' '.join(['cat', os.path.join(workDir, 'secreted_ABhydro.out.dm.ps'),
                "| awk '$5<1e-15&&$10>0.35'",
                '>', os.path.join(workDir, 'secreted_ABhydro.out.dm.ps.stringent')])
!$cmd