## Analysis of RAST annotation results

Sergio Álvarez-Pérez, 2020

In [None]:
import os
import re
import pandas as pd
import numpy as np

In [None]:
# Create the output folder for the RAST reports (include here your preferred path).
# os.makedirs creates every missing folder along the path in one call, and
# exist_ok=True makes it a no-op when the folder is already there, so the cell
# can be re-run safely.
os.makedirs('/home/sergio/TFM1/reports/rast/', exist_ok=True)

In [None]:
path = '/home/sergio/TFM1/RAST/' # path to the folder containing the results files downloaded from the RAST server (https://rast.nmpdr.org/)

# Count tables filled in by the loop below, one entry per species:
# {species: {label: number of rows carrying that label}}
categories_dict = {}
subcategories_dict = {}
subsystems_dict = {}
functions_dict = {}

# Raw string so that "\." reaches the regex engine as an escaped dot instead of
# being treated as an (invalid) Python string escape. The species name is the
# text captured between "RAST_" and "_categories.tsv".
categories_file_pattern = re.compile(r"RAST_(.+)_categories\.tsv")


def _count_values(labels):
    """Return a dict mapping each distinct label to its number of occurrences.

    Keys appear in the sorted order produced by np.unique; keys and counts are
    converted to plain Python objects with .tolist().
    """
    values, counts = np.unique(labels, return_counts=True)
    return dict(zip(values.tolist(), counts.tolist()))


for foldername in os.listdir(path):
    folder = os.path.join(path, foldername)
    if not os.path.isdir(folder):
        # A stray file sitting next to the result folders cannot be listed.
        continue
    for filename in os.listdir(folder):
        match = categories_file_pattern.match(filename)
        if not match:
            continue
        species = match.group(1)

        categories_list = []
        subcategories_list = []
        subsystems_list = []
        functions_list = []
        # The with-statement closes the file; no explicit close() is needed.
        with open(os.path.join(folder, filename), "r") as f:
            # Skip the first line (presumably the column header); the default
            # keeps an empty file from raising StopIteration.
            next(f, None)
            for line in f:
                line = line.rstrip("\n")
                if not line:
                    # Ignore blank lines, e.g. a trailing empty line.
                    continue
                # Exactly five tab-separated fields are expected; any other
                # number raises ValueError. The last two are not used here.
                category, subcategory, subsystem, _role, _features = line.split("\t")
                categories_list.append(category)
                subcategories_list.append(subcategory)
                subsystems_list.append(subsystem)
                functions_list.append(category + " / " + subcategory + " / " + subsystem)

        categories_dict[species] = _count_values(categories_list)
        subcategories_dict[species] = _count_values(subcategories_list)
        subsystems_dict[species] = _count_values(subsystems_list)
        functions_dict[species] = _count_values(functions_list)


In [None]:
# To generate a report of the RAST categories per species in .tsv format
# Columns are species and rows are categories. A category absent from a species
# becomes NaN when the nested dict is converted, which also turns that column's
# counts into floats; fill the gaps with the number 0 and cast back to int so
# every count is written as an integer.
df1 = pd.DataFrame.from_dict(categories_dict).fillna(0).astype(int)
df1 = df1[sorted(df1.columns)].sort_index()
df1.to_csv('/home/sergio/TFM1/reports/rast/RAST_categories_per_species.tsv', sep="\t")
df1

In [None]:
# To generate a report of the RAST subcategories per species in .tsv format
# Columns are species and rows are subcategories. A subcategory absent from a
# species becomes NaN when the nested dict is converted, which also turns that
# column's counts into floats; fill the gaps with the number 0 and cast back to
# int so every count is written as an integer.
df2 = pd.DataFrame.from_dict(subcategories_dict).fillna(0).astype(int)
df2 = df2[sorted(df2.columns)].sort_index()
df2.to_csv('/home/sergio/TFM1/reports/rast/RAST_subcategories_per_species.tsv', sep="\t")
df2

In [None]:
# To generate a report of the RAST subsystems per species in .tsv format
# Columns are species and rows are subsystems. A subsystem absent from a
# species becomes NaN when the nested dict is converted, which also turns that
# column's counts into floats; fill the gaps with the number 0 and cast back to
# int so every count is written as an integer.
df3 = pd.DataFrame.from_dict(subsystems_dict).fillna(0).astype(int)
# Order the columns using this table's own column names, so the cell does not
# depend on another report's DataFrame having been built first.
df3 = df3[sorted(df3.columns)].sort_index()
df3.to_csv('/home/sergio/TFM1/reports/rast/RAST_subsystems_per_species.tsv', sep="\t")
df3

In [None]:
# To generate a report of the functional categories per species in .tsv format
# Columns are species and rows are "category / subcategory / subsystem" labels.
# A label absent from a species becomes NaN when the nested dict is converted,
# which also turns that column's counts into floats; fill the gaps with the
# number 0 and cast back to int so every count is written as an integer.
df4 = pd.DataFrame.from_dict(functions_dict).fillna(0).astype(int)
df4 = df4[sorted(df4.columns)].sort_index()
df4.to_csv('/home/sergio/TFM1/reports/rast/RAST_functions_per_species.tsv', sep="\t")
df4