# Getting enzyme annotations

In [1]:
import sys
import os
import glob
import re
import pandas as pd
import time

# Root directory for all enzyme-annotation outputs; the cells below write into
# <workDir>/<study>/{CAZymes,Proteases,Lipases}.
workDir = '/home/sam/FullCyc_metagenome/Other_studies_comp/redo_analysis/enzyme_output'

# Reference databases handed to the search tools in the cells below:
#   dbCAN_db  - HMM database passed to hmmscan in the CAZymes cell
#   MEROPS_db - DIAMOND database (.dmnd) passed to `diamond blastp -d` in the Proteases cell
#   ESTHER_db - HMM database passed to hmmscan in the Lipases cell
dbCAN_db = '/home/sam/new_databases/dbCAN/HMM/dbCAN-fam-HMMs.txt'
MEROPS_db = '/home/sam/new_databases/MEROPS/peptidase3/MEROPS_protease.dmnd'
ESTHER_db = '/home/sam/new_databases/ESTHER/HMMs/hydrolaseDB'

# Thread count given to hmmscan (--cpu) and diamond (--threads).
nproc = 50

## Get CAZymes

In [None]:
study = 'RefSoil'
genome_list = [fna for fna in os.listdir('/home/sam/new_databases/RefSoil/RefSoil_genomes/genome_fasta') if fna.endswith('.fna')]

for genome in genome_list:
    cmd = ' '.join(['hmmscan',
                    '--cpu', str(nproc),
                    '--domtblout', os.path.join(workDir, study, 'CAZymes', genome + '_CAZymes.out.dm'),
                    dbCAN_db, 
                    os.path.join('/home/sam/FullCyc_metagenome/Other_studies_comp/redo_analysis/prokka_output', study, genome, genome + '.prokka.faa'),
                    '>', os.path.join(workDir, study, 'CAZymes', genome + '_CAZymes.out')])
    !$cmd
    
    cmd = ' '.join(['sh /home/sam/new_databases/dbCAN/HMM/hmmscan-parser.sh',
                    os.path.join(workDir, study, 'CAZymes', genome + '_CAZymes.out.dm'),
                    '>', os.path.join(workDir, study, 'CAZymes', genome + '_CAZymes.out.dm.ps')])
    !$cmd
    
    cmd = ' '.join(['cat', os.path.join(workDir, study, 'CAZymes', genome + '_CAZymes.out.dm.ps'),
                    "| awk '$5<1e-15&&$10>0.35'",
                    '>', os.path.join(workDir, study, 'CAZymes', genome + '_CAZymes.out.dm.ps.stringent')])
    !$cmd
    

In [2]:
# Combine the main outputs
#
# Reads each genome's filtered CAZyme table (*.dm.ps.stringent), tags every row
# with the genome it came from, and writes one tab-separated table to
# <workDir>/<study>/CAZymes.txt.

# Names for the 10 columns of the headerless filtered table (V6-V10 are left generic).
new_header = ['Target', 'target_length', 'Query', 'query_length', 'E_value', 'V6', 'V7', 'V8', 'V9', 'V10']

study = 'RefSoil'
genome_list = [fna for fna in os.listdir('/home/sam/new_databases/RefSoil/RefSoil_genomes/genome_fasta') if fna.endswith('.fna')]

# Collect the per-genome frames and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied the accumulated table on every iteration.
genome_frames = []
for genome in genome_list:
    stringent_path = os.path.join(workDir, study, 'CAZymes', genome + '_CAZymes.out.dm.ps.stringent')
    # A zero-byte file means no hit survived the filter; read_csv cannot parse it, so skip it.
    if os.path.getsize(stringent_path) > 0:
        output_df = pd.read_csv(stringent_path, sep='\t', header=None)
        output_df.columns = new_header
        output_df['genome'] = genome
        genome_frames.append(output_df)
    else:
        print(genome + ' has no detected CAZymes')

if genome_frames:
    CAZymes_df = pd.concat(genome_frames)
else:
    # pd.concat raises on an empty list; fall back to a header-only table.
    CAZymes_df = pd.DataFrame(columns=new_header + ['genome'])

CAZymes_df.to_csv(os.path.join(workDir, study, 'CAZymes.txt'), index=False, sep='\t')

# Drop the references so the tables can be garbage-collected before the next cell.
output_df = None
genome_frames = None
CAZymes_df = None


NC_007798.1.fna has no detected CAZymes


## Get Proteases

In [None]:
study = 'RefSoil'
genome_list = [fna for fna in os.listdir('/home/sam/new_databases/RefSoil/RefSoil_genomes/genome_fasta') if fna.endswith('.fna')]

for genome in genome_list:
    cmd = ' '.join(['diamond blastp',
                '--threads', str(nproc),
                '--evalue 0.0000000001',
                '-d', MEROPS_db,
                '-q', os.path.join('/home/sam/FullCyc_metagenome/Other_studies_comp/redo_analysis/prokka_output', study, genome, genome + '.prokka.faa'),
                '-o', os.path.join(workDir, study, 'Proteases', genome + '_proteases.txt')])
    !$cmd

    

In [16]:
# Combine the main outputs
#
# Reads each genome's DIAMOND hit table (*_proteases.txt), tags every row with
# the genome it came from, and writes one tab-separated table to
# <workDir>/<study>/Proteases.txt.

# Names for the 12 columns of the headerless DIAMOND hit table.
new_header = ['Query', 'Target', 'perc_ident', 'align_length', 'n_mismatch', 'n_gaps', 
              'query_start', 'query_end', 'target_start', 'target_end', 'E_value', 'bit_score']

study = 'RefSoil'
genome_list = [fna for fna in os.listdir('/home/sam/new_databases/RefSoil/RefSoil_genomes/genome_fasta') if fna.endswith('.fna')]

# Collect the per-genome frames and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied the accumulated table on every iteration.
genome_frames = []
for genome in genome_list:
    hits_path = os.path.join(workDir, study, 'Proteases', genome + '_proteases.txt')
    # A zero-byte file means no hits were written; read_csv cannot parse it, so skip it.
    if os.path.getsize(hits_path) > 0:
        output_df = pd.read_csv(hits_path, sep='\t', header=None)
        output_df.columns = new_header
        output_df['genome'] = genome
        genome_frames.append(output_df)
    else:
        print(genome + ' has no detected Proteases')

if genome_frames:
    Proteases_df = pd.concat(genome_frames)
else:
    # pd.concat raises on an empty list; fall back to a header-only table.
    Proteases_df = pd.DataFrame(columns=new_header + ['genome'])

Proteases_df.to_csv(os.path.join(workDir, study, 'Proteases.txt'), index=False, sep='\t')

# Drop the references so the tables can be garbage-collected before the next cell.
output_df = None
genome_frames = None
Proteases_df = None

## Get Lipases

In [None]:
study = 'RefSoil'
genome_list = [fna for fna in os.listdir('/home/sam/new_databases/RefSoil/RefSoil_genomes/genome_fasta') if fna.endswith('.fna')]

for genome in genome_list:
    cmd = ' '.join(['hmmscan',
                    '--cpu', str(nproc),
                    '--domtblout', os.path.join(workDir, study, 'Lipases', genome + '_ABhydro.out.dm'),
                    ESTHER_db, 
                    os.path.join('/home/sam/FullCyc_metagenome/Other_studies_comp/redo_analysis/prokka_output', study, genome, genome + '.prokka.faa'),
                    '>', os.path.join(workDir, study, 'Lipases', genome + '_ABhydro.out')])
    !$cmd
    
    cmd = ' '.join(['sh /home/sam/new_databases/dbCAN/HMM/hmmscan-parser.sh',
                    os.path.join(workDir, study, 'Lipases', genome + '_ABhydro.out.dm'),
                    '>', os.path.join(workDir, study, 'Lipases', genome + '_ABhydro.out.dm.ps')])
    !$cmd
    
    cmd = ' '.join(['cat', os.path.join(workDir, study, 'Lipases', genome + '_ABhydro.out.dm.ps'),
                    "| awk '$5<1e-15&&$10>0.35'",
                    '>', os.path.join(workDir, study, 'Lipases', genome + '_ABhydro.out.dm.ps.stringent')])
    !$cmd

    

In [18]:
# Combine the main outputs
#
# Reads each genome's filtered ABhydro table (*.dm.ps.stringent), tags every
# row with the genome it came from, and writes one tab-separated table to
# <workDir>/<study>/ABhydro.txt.

# Names for the 10 columns of the headerless filtered table (V6-V10 are left generic).
new_header = ['Target', 'target_length', 'Query', 'query_length', 'E_value', 'V6', 'V7', 'V8', 'V9', 'V10']

study = 'RefSoil'
genome_list = [fna for fna in os.listdir('/home/sam/new_databases/RefSoil/RefSoil_genomes/genome_fasta') if fna.endswith('.fna')]

# Collect the per-genome frames and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied the accumulated table on every iteration.
genome_frames = []
for genome in genome_list:
    stringent_path = os.path.join(workDir, study, 'Lipases', genome + '_ABhydro.out.dm.ps.stringent')
    # A zero-byte file means no hit survived the filter; read_csv cannot parse it, so skip it.
    if os.path.getsize(stringent_path) > 0:
        output_df = pd.read_csv(stringent_path, sep='\t', header=None)
        output_df.columns = new_header
        output_df['genome'] = genome
        genome_frames.append(output_df)
    else:
        print(genome + ' has no detected ABhydro')

if genome_frames:
    ABhydro_df = pd.concat(genome_frames)
else:
    # pd.concat raises on an empty list; fall back to a header-only table.
    ABhydro_df = pd.DataFrame(columns=new_header + ['genome'])

ABhydro_df.to_csv(os.path.join(workDir, study, 'ABhydro.txt'), index=False, sep='\t')

# Drop the references so the tables can be garbage-collected before the next cell.
output_df = None
genome_frames = None
ABhydro_df = None


NC_008511.1.fna has no detected ABhydro
NC_014658.1.fna has no detected ABhydro
NZ_HF545617.1.fna has no detected ABhydro
NC_004343.2.fna has no detected ABhydro
NC_014961.1.fna has no detected ABhydro
NC_005824.1.fna has no detected ABhydro
NC_008509.1.fna has no detected ABhydro
NC_017552.1.fna has no detected ABhydro
NC_010424.1.fna has no detected ABhydro
