# Creating combined tables of genome annotations for analysis

In [35]:
import os

# Root of the analysis tree: the loops in this notebook read the per-tool
# output folders under it (prokka_output, TMHMM_output, antismash_output,
# deepTfactor_output) and write combined tables to its 'full_study_files' folder.
workdir = '/home/sam/FullCyc_metagenome/Other_studies_comp/redo_analysis'

In [36]:
import pandas as pd
import re
from Bio import SeqIO

In [37]:
# Names of the study subdirectories; every table-building loop in this
# notebook iterates over these, in this order.
study_list = [
    'Avena_rhizosphere',
    'biocharSIP',
    'DeepSIP',
    'rainfall',
    'SIPLigCel',
    'SIPrhizosphere',
    'RefSoil',
]

## PROKKA annotation tables

In [None]:
# Build one combined PROKKA annotation table per study.
# For every genome folder under <workdir>/prokka_output/<study>/ the file
# <genome>/<genome>.prokka.tsv is read, tagged with its genome and study,
# and all genomes of the study are written to
# <workdir>/full_study_files/<study>_annotations.txt (tab-separated).
for study in study_list:
    print('Running: ' + study)
    study_dir = os.path.join(workdir, 'prokka_output', study)

    # Collect the per-genome tables and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and re-copied the whole
    # accumulated table on every call.
    genome_tables = []
    for genome in os.listdir(study_dir):
        sub_annotation = pd.read_csv(os.path.join(study_dir, genome, genome + '.prokka.tsv'), sep='\t')
        sub_annotation['genome_file'] = genome
        sub_annotation['study'] = study
        genome_tables.append(sub_annotation)

    # pd.concat raises ValueError on an empty list, so an empty study folder
    # falls back to an empty frame.
    if genome_tables:
        annotation_df = pd.concat(genome_tables, ignore_index=True)
    else:
        annotation_df = pd.DataFrame()
    annotation_df.to_csv(os.path.join(workdir, 'full_study_files', study + '_annotations.txt'), header=True, index=False, sep='\t')

    # Drop the references so the large tables can be freed before the next study.
    genome_tables = None
    annotation_df = None
    

Running: Avena_rhizosphere
Running: biocharSIP
Running: DeepSIP


## Transmembrane domain tables

In [None]:
# Build one combined TMHMM table per study.
# Every file under <workdir>/TMHMM_output/<study>/ is read as a headerless
# tab-separated table, tagged with its genome and study, and all files of the
# study are written to <workdir>/full_study_files/<study>_TMHMM_output.txt.
for study in study_list:
    print('Running: ' + study)
    study_dir = os.path.join(workdir, 'TMHMM_output', study)

    # Collect the per-genome tables and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and re-copied the whole
    # accumulated table on every call.
    genome_tables = []
    for genome in os.listdir(study_dir):
        # NOTE(review): ' First60' carries a leading space, which ends up in
        # the output header. Kept as-is because downstream readers may rely
        # on the existing column name - confirm before renaming.
        sub_annotation = pd.read_csv(os.path.join(study_dir, genome), sep='\t',
                                     names=['locus_tag', 'length', 'ExpAA', ' First60', 'PredHel', 'Topology'])
        # Dots are escaped so only the literal '.prokka.faa.tmhmm' suffix is stripped.
        sub_annotation['genome_file'] = re.sub(r'\.prokka\.faa\.tmhmm$', '', genome)
        sub_annotation['study'] = study
        genome_tables.append(sub_annotation)

    # pd.concat raises ValueError on an empty list, so an empty study folder
    # falls back to an empty frame.
    if genome_tables:
        annotation_df = pd.concat(genome_tables, ignore_index=True)
    else:
        annotation_df = pd.DataFrame()
    annotation_df.to_csv(os.path.join(workdir, 'full_study_files', study + '_TMHMM_output.txt'), header=True, index=False, sep='\t')

    # Drop the references so the large tables can be freed before the next study.
    genome_tables = None
    annotation_df = None

## Antismash cluster tables

In [None]:
# Write one table of antiSMASH regions per study.
# For every genome folder under <workdir>/antismash_output/<study>/ the
# GenBank file named like the folder minus its '.antismash' suffix is parsed,
# and each feature of type 'region' yields one row with its first 'product'
# qualifier and its start/end location. Output goes to
# <workdir>/full_study_files/<study>_antismash_output.txt (tab-separated).
for study in study_list:
    print('Running: ' + study)
    study_dir = os.path.join(workdir, 'antismash_output', study)
    with open(os.path.join(workdir, 'full_study_files', study + '_antismash_output.txt'), 'w') as outfile:
        outfile.write('study\tgenome_file\tSMBC_product\tSMBC_start\tSMBC_end\n')
        for genome in os.listdir(study_dir):
            # Dots are escaped so that only the literal suffixes are removed
            # (an unescaped '.' matches any character).
            gbk_name = re.sub(r'\.antismash$', '', genome)
            genome_name = re.sub(r'\.prokka\.gbk\.antismash$', '', genome)
            BCG_records = SeqIO.parse(os.path.join(study_dir, genome, gbk_name), 'genbank')
            for record in BCG_records:
                for feature in record.features:
                    if feature.type != 'region':
                        continue
                    row = [study,
                           genome_name,
                           feature.qualifiers['product'][0],
                           str(feature.location.start),
                           str(feature.location.end)]
                    outfile.write('\t'.join(row) + '\n')
                        
                        
                        
                        

## Antismash cluster tables with gene counts

In [75]:
# Write one table per study listing the CDS locus tags inside each antiSMASH
# region. For every genome folder under <workdir>/antismash_output/<study>/
# each per-region GenBank file ('...region....gbk') is parsed; the product of
# the record's 'region' feature is paired with the locus_tag of every CDS in
# that record. Output goes to
# <workdir>/full_study_files/<study>_antismash_nGene_output.txt (tab-separated).
for study in study_list:
    print('Running: ' + study)
    study_dir = os.path.join(workdir, 'antismash_output', study)
    with open(os.path.join(workdir, 'full_study_files', study + '_antismash_nGene_output.txt'), 'w') as outfile:
        outfile.write('study\tgenome_file\tSMBC_region\tSMBC_product\tSMBC_genelocus\n')
        for genome in os.listdir(study_dir):
            genome_dir = os.path.join(study_dir, genome)
            genome_name = re.sub(r'\.prokka\.gbk\.antismash$', '', genome)
            # Escaped and anchored so only files ending in a literal '.gbk' match.
            region_list = [f for f in os.listdir(genome_dir) if re.search(r'region.*\.gbk$', f)]
            for region_gbk in region_list:
                # Strip only the trailing extension; the unanchored '.gbk'
                # pattern also removed any '<char>gbk' inside the name itself.
                region_name = re.sub(r'\.gbk$', '', region_gbk)
                BCG_records = SeqIO.parse(os.path.join(genome_dir, region_gbk), 'genbank')
                for record in BCG_records:
                    # Resolve the product per record so a record without a
                    # 'region' feature can never inherit the product of the
                    # previously parsed record.
                    region_products = [feature.qualifiers['product'][0]
                                       for feature in record.features
                                       if feature.type == 'region']
                    if not region_products:
                        print('  Warning: no region feature in ' + region_gbk
                              + ' (' + genome + '); skipping its CDS rows')
                        continue
                    # If several region features exist the last one wins,
                    # matching the original loop's overwrite order.
                    region_prod = region_products[-1]
                    for feature in record.features:
                        if feature.type != "CDS":
                            continue
                        row = [study,
                               genome_name,
                               region_name,
                               region_prod,
                               feature.qualifiers['locus_tag'][0]]
                        outfile.write('\t'.join(row) + '\n')
                    
                        

Running: Avena_rhizosphere
Running: biocharSIP
Running: DeepSIP
Running: rainfall
Running: SIPLigCel
Running: SIPrhizosphere
Running: RefSoil


## DeepTfactor prediction tables

In [None]:
# Build one combined DeepTfactor table per study.
# For every genome folder under <workdir>/deepTfactor_output/<study>/ the file
# prediction_result.txt is read, tagged with its genome and study, and all
# genomes of the study are written to
# <workdir>/full_study_files/<study>_deepTfactor_output.txt (tab-separated).
for study in study_list:
    print('Running: ' + study)
    study_dir = os.path.join(workdir, 'deepTfactor_output', study)

    # Collect the per-genome tables and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and re-copied the whole
    # accumulated table on every call.
    genome_tables = []
    for genome in os.listdir(study_dir):
        sub_annotation = pd.read_csv(os.path.join(study_dir, genome, 'prediction_result.txt'), sep='\t')
        # Dots are escaped so only the literal '.prokka.faa.TF_results' suffix is stripped.
        sub_annotation['genome_file'] = re.sub(r'\.prokka\.faa\.TF_results$', '', genome)
        sub_annotation['study'] = study
        genome_tables.append(sub_annotation)

    # pd.concat raises ValueError on an empty list, so an empty study folder
    # falls back to an empty frame.
    if genome_tables:
        annotation_df = pd.concat(genome_tables, ignore_index=True)
    else:
        annotation_df = pd.DataFrame()
    annotation_df.to_csv(os.path.join(workdir, 'full_study_files', study + '_deepTfactor_output.txt'), header=True, index=False, sep='\t')

    # Drop the references so the large tables can be freed before the next study.
    genome_tables = None
    annotation_df = None