## Analysis of WebMGA results

Sergio Álvarez-Pérez, 2020

In [None]:
import os
import re
import numpy as np
import pandas as pd

In [None]:
# Create the output folders for the reports (include here your preferred path).
# os.makedirs builds any missing parent folder as well, and exist_ok=True makes
# the call a no-op when the folder is already there, so no separate
# os.path.exists() check (which is racy) is needed.
os.makedirs('/home/sergio/TFM1/reports/webmga/', exist_ok=True)

### COG class annotations

In [None]:
# Parse the "cog-class.txt" file of every genome folder and build the template
# table `df` (one row per COG class, one empty column per genome).
path = '/home/sergio/TFM1/webmga/' # path to the folder containing the outputs of WebMGA (http://weizhong-lab.ucsd.edu/webMGA/)

classes_list = []        # [COG class, description] pairs collected from all genomes
genomes_list = []        # genome names, in the order their folders are listed
classes_per_genome = {}  # genome -> {COG class -> {'No. families', 'Coverage', 'Abundance'}}

for foldername in os.listdir(path):
    # Look for the exact file name instead of regex-matching every entry of the
    # folder: entries of `path` that are plain files, or folders without a
    # cog/cog-class.txt file, are skipped.
    class_file = os.path.join(path, foldername, "cog", "cog-class.txt")
    if not os.path.isfile(class_file):
        continue

    genome = foldername # Folder name should follow the pattern: genus_species_strain
    genomes_list.append(genome)
    cog_dict = {}
    with open(class_file) as myFile:
        next(myFile)  # skip the header line
        for line in myFile:
            if not line.strip():
                continue  # tolerate blank lines (e.g. at the end of the file)
            # Local names are distinct from the `coverage` / `abundance`
            # report tables built in other cells.
            cog_class, no_families, class_coverage, class_abundance, description = line.rstrip('\n').split('\t')
            # Drop the trailing whitespace of the description; this also works
            # when the last line of the file has no final newline.
            description = description.rstrip()
            classes_list.append([cog_class, description])
            cog_dict[cog_class] = {'No. families': no_families, 'Coverage': class_coverage, 'Abundance': class_abundance}
    # The `with` block has already closed the file at this point.
    classes_per_genome[genome] = cog_dict

df = (
    pd.DataFrame(classes_list, columns=('COG class', 'Description'))
    .drop_duplicates()
    .sort_values(by='COG class')
    .set_index('COG class')
)

for genome in genomes_list: # template table
    df[genome] = np.nan

In [None]:
# Report of the number of COG families per class annotation
# Start from a copy of the template table in which missing values are shown as '0'.
n_families = df.fillna('0')

for genome, cog_dict in classes_per_genome.items():
    for cog_class, stats in cog_dict.items():
        # Write with a single .at[row, column] access: the chained form
        # table[column][row] = value assigns to an intermediate object and is
        # not guaranteed to reach the table (it is lost under copy-on-write).
        n_families.at[cog_class, genome] = np.int64(stats['No. families'])

# Order the columns and the rows alphabetically before saving.
n_families = n_families[sorted(n_families.columns)].sort_index()
n_families.to_csv('/home/sergio/TFM1/reports/webmga/cog_classes_nfamilies.tsv', sep="\t", index=True)
n_families

In [None]:
# Report of the coverage of each COG class annotation
# Start from a copy of the template table in which missing values are shown as '0'.
coverage = df.fillna('0')

for genome, cog_dict in classes_per_genome.items():
    for cog_class, stats in cog_dict.items():
        # Write with a single .at[row, column] access: the chained form
        # table[column][row] = value assigns to an intermediate object and is
        # not guaranteed to reach the table (it is lost under copy-on-write).
        coverage.at[cog_class, genome] = np.round(np.float64(stats['Coverage']), 3)

# Order the columns and the rows alphabetically before saving.
coverage = coverage[sorted(coverage.columns)].sort_index()
coverage.to_csv('/home/sergio/TFM1/reports/webmga/cog_classes_coverage.tsv', sep="\t", index=True)
coverage

In [None]:
# Report of the abundance of each COG class annotation
# Work on a filled copy of the template, like the other reports do: this keeps
# `df` itself untouched and shows classes absent from a genome as '0'.
abundance = df.fillna('0')

for genome, cog_dict in classes_per_genome.items():
    for cog_class, stats in cog_dict.items():
        # Write with a single .at[row, column] access: the chained form
        # table[column][row] = value assigns to an intermediate object and is
        # not guaranteed to reach the table (it is lost under copy-on-write).
        abundance.at[cog_class, genome] = np.round(np.float64(stats['Abundance']), 4)

# Order the columns and the rows alphabetically before saving.
abundance = abundance[sorted(abundance.columns)].sort_index()
abundance.to_csv('/home/sergio/TFM1/reports/webmga/cog_classes_abundance.tsv', sep="\t", index=True)
abundance

### COG family annotations

In [None]:
# Parse the "cog.txt" file of every genome folder and build the template table
# `df2` (one row per COG family accession, one empty column per genome).
path = '/home/sergio/TFM1/webmga/' #  path to the folder containing the outputs of WebMGA

families_list = []        # [accession, name, description] triples collected from all genomes
genomes_list2 = []        # genome names, in the order their folders are listed
families_per_genome = {}  # genome -> {accession -> {'No. ORFs', 'Coverage', 'Abundance'}}

for foldername in os.listdir(path):
    # Look for the exact file name instead of regex-matching every entry of the
    # folder: entries of `path` that are plain files, or folders without a
    # cog/cog.txt file, are skipped.
    family_file = os.path.join(path, foldername, "cog", "cog.txt")
    if not os.path.isfile(family_file):
        continue

    genome = foldername # Folder name should follow the pattern: genus_species_strain
    genomes_list2.append(genome)
    accession_dict = {}
    with open(family_file) as myFile:
        next(myFile)  # skip the header line
        for line in myFile:
            if not line.strip():
                continue  # tolerate blank lines (e.g. at the end of the file)
            # Local names are distinct from `coverage` / `abundance`, so the
            # report tables created in other cells are not overwritten here.
            # Removing the newline before splitting also handles an empty
            # description and a last line without a final newline.
            accession, name, no_orfs, family_coverage, family_abundance, description = line.rstrip('\n').split('\t')
            families_list.append([accession, name, description])
            accession_dict[accession] = {'No. ORFs': no_orfs, 'Coverage': family_coverage, 'Abundance': family_abundance}

    # The `with` block has already closed the file at this point.
    families_per_genome[genome] = accession_dict


df2 = (
    pd.DataFrame(families_list, columns=('Accession', 'Name', 'Description'))
    .drop_duplicates()
    .sort_values(by='Accession')
    .set_index('Accession')
)

for genome in genomes_list2: # template table
    df2[genome] = np.nan

In [None]:
# Report of the number of ORFs per COG family

# Start from a copy of the template table in which missing values are shown as '0'.
n_orfs = df2.fillna('0')

for genome, accession_dict in families_per_genome.items():
    for accession, stats in accession_dict.items():
        # Write with a single .at[row, column] access: the chained form
        # table[column][row] = value assigns to an intermediate object and is
        # not guaranteed to reach the table (it is lost under copy-on-write).
        n_orfs.at[accession, genome] = np.int64(stats['No. ORFs'])

# Order the columns and the rows alphabetically before saving.
n_orfs = n_orfs[sorted(n_orfs.columns)].sort_index()
n_orfs.to_csv('/home/sergio/TFM1/reports/webmga/cog_families_norfs.tsv', sep="\t", index=True)
n_orfs

In [None]:
# Report of the coverage of each COG family

# Start from a copy of the template table in which missing values are shown as '0'.
coverage2 = df2.fillna('0')

for genome, accession_dict in families_per_genome.items():
    for accession, stats in accession_dict.items():
        # Write with a single .at[row, column] access: the chained form
        # table[column][row] = value assigns to an intermediate object and is
        # not guaranteed to reach the table (it is lost under copy-on-write).
        coverage2.at[accession, genome] = np.round(np.float64(stats['Coverage']), 3)

# Order the columns and the rows alphabetically before saving.
coverage2 = coverage2[sorted(coverage2.columns)].sort_index()
coverage2.to_csv('/home/sergio/TFM1/reports/webmga/cog_families_coverage.tsv', sep="\t", index=True)
coverage2

In [None]:
# Report of the abundance of each COG family
# Start from a copy of the template table in which missing values are shown as '0'.
abundance2 = df2.fillna('0')

for genome, accession_dict in families_per_genome.items():
    for accession, stats in accession_dict.items():
        # The abundance is stored exactly as read from the file (text, no rounding).
        # Write with a single .at[row, column] access: the chained form
        # table[column][row] = value assigns to an intermediate object and is
        # not guaranteed to reach the table (it is lost under copy-on-write).
        abundance2.at[accession, genome] = stats['Abundance']

# Order the columns and the rows alphabetically before saving.
abundance2 = abundance2[sorted(abundance2.columns)].sort_index()
abundance2.to_csv('/home/sergio/TFM1/reports/webmga/cog_families_abundance.tsv', sep="\t", index=True)
abundance2