## Analysis of BAGEL4 results

Sergio Álvarez-Pérez, 2020

In [None]:
import os
import re
import pandas as pd

In [None]:
# Create the folders where the reports are written (include here your preferred path).
# os.makedirs creates every missing folder along the path in a single call, and
# exist_ok=True makes this cell safe to re-run when the folders already exist.
os.makedirs('/home/sergio/TFM1/reports/bagel4/', exist_ok=True)

In [None]:
path = '/home/sergio/TFM1/BAGEL4/' # path to the folder containing the outputs of BAGEL4 (lists of Areas of Interest: AOIs)

column_names = ['AOI','region_name','region_size','orf','gene_name','gene_start','gene_end','gene_strand','real_start',
                'real_end','real_strand','gene_color','function','motifs','annotation','protein','dna']

# Patterns are raw strings so that `\.` reaches the regex engine as an escaped dot
# (in a normal string literal it is an invalid escape sequence), and they are
# compiled once instead of on every loop iteration.
gene_table_re = re.compile(r"(.+)\.GeneTable")
aoi_re = re.compile(r"(.+_.+_.+_.+)\.GeneTable")  # genus_species_strain_AOI

# One DataFrame per GeneTable file; they are concatenated once after the loop,
# which avoids re-copying all previously loaded rows on every iteration.
frames = []

# os.listdir returns entries in arbitrary order; sorting makes the row order of
# the resulting table reproducible between runs and machines.
for filename in sorted(os.listdir(path)):
    if not gene_table_re.match(filename):
        continue  # not a BAGEL4 GeneTable file

    # Filename should follow the pattern: genus_species_strain_AOI
    aoi_match = aoi_re.match(filename)
    if aoi_match is None:
        raise ValueError(
            "GeneTable file name does not follow the pattern "
            "genus_species_strain_AOI: " + repr(filename)
        )
    aoi = aoi_match.group(1)

    new_df = pd.read_csv(os.path.join(path, filename), sep="\t")
    new_df.insert(0, 'AOI', aoi)  # tag every row with the AOI it came from
    frames.append(new_df)

# The per-file row index is kept (no ignore_index), as before.
# With no GeneTable files at all, fall back to an empty table with the expected columns.
df = pd.concat(frames) if frames else pd.DataFrame(columns=column_names)

# Put the expected columns first (added as empty columns if no file provided
# them), followed by any additional columns found in the files.
extra_columns = [col for col in df.columns if col not in column_names]
df = df.reindex(columns=column_names + extra_columns)

df

In [None]:
# BAGEL4 graphical outputs use a different color for each functional class.
# Number of distinct values found in the 'gene_color' column of df.
color_codes = df['gene_color'].nunique()

# Lookup table: color code -> functional class label.
colors = {
    'green': 'core peptide',
    '#8fbcb9': 'other',
    '#d5dce8': 'no function determined',
    '#ff3f3f': 'immunity/transport',
    '#6ba3ff': 'modification',
    '#70c947': 'core peptide',
}
colors

In [None]:
# Columns of df kept for the report (filter silently skips any that are absent).
target_columns = [
    'AOI', 'orf', 'real_start', 'real_end', 'real_strand',
    'gene_color', 'function', 'annotation', 'protein', 'dna',
]

# Original column name -> name used in the report.
column_renames = {
    'orf': 'ORF',
    'real_start': 'start',
    'real_end': 'end',
    'real_strand': 'strand',
    'gene_color': 'functional class',
    'dna': 'DNA',
}

df2 = df.filter(items=target_columns).rename(columns=column_renames)
df2

In [None]:
# Generate the list of ORFs found in all AOIs of all Rosenbergiella genomes:
# values of the 'functional class' column are replaced through the `colors`
# mapping, and the table is saved as a tab-separated file (index included).
df2 = df2.replace({'functional class': colors})
df2.to_csv(
    '/home/sergio/TFM1/reports/bagel4/AOIs_per_genome.tsv',
    sep="\t",
    index=True,
)
df2