## Analysis of Prokka results

Sergio Álvarez-Pérez, 2020

In [None]:
import os
import re
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from scipy import stats
import numpy as np

In [None]:
# Create the output folder for the Prokka reports (include here your preferred path).
# os.makedirs also creates any missing parent folder ('/home/sergio/TFM1/reports/'
# included), and exist_ok=True turns the call into a no-op when the folder is
# already there, so the cell can be re-run without a separate existence check.
os.makedirs('/home/sergio/TFM1/reports/prokka', exist_ok=True)

In [None]:
# To generate a report of Prokka results per species in .tsv format
#
# Every "<organism>.txt" file found in `path` yields one row: the organism name
# (the file name without ".txt") followed by one count per feature. A count is
# the first run of digits found after the feature name in the file text; when
# there is none, "NA" is written instead.

path = '/home/sergio/TFM1/prokka_summaries/' # path to the folder containing Prokka summary reports

# Features to extract, in the same order as the columns of the report.
features = ["contigs", "bases", "repeat_region", "CDS", "tmRNA", "tRNA", "rRNA"]

# Compiled once, outside the loop. Raw strings keep "\D" from being read as a
# string escape. "[0-9]+" (rather than "[0-9]*") requires at least one digit,
# so a feature name that is not followed by any number gives "NA" instead of
# an empty field.
count_patterns = [re.compile(feature + r"\D*([0-9]+)") for feature in features]

with open('/home/sergio/TFM1/reports/prokka/prokka_results.tsv', 'w') as results_file:
    results_file.write("organism\tcontigs\tbases\trepeat_region\tCDS\ttmRNA\ttRNA\trRNA\n")

    # sorted(): os.listdir returns names in arbitrary order; sorting makes the
    # report reproducible from one run to the next.
    for filename in sorted(os.listdir(path)):
        # Only names that END in ".txt" and have a non-empty stem are summaries
        # (something like "x.txt.bak" is skipped).
        if not filename.endswith(".txt") or filename == ".txt":
            continue
        organism = filename[:-len(".txt")]

        # Read the whole summary and close the file straight away.
        with open(path + filename, "r") as f:
            filetext = f.read()

        counts = []
        for pattern in count_patterns:
            found = pattern.search(filetext)
            counts.append(found.group(1) if found else "NA")

        row = "\t".join([organism] + counts) + "\n"
        results_file.write(row)
        print(row)

In [None]:
# Load the per-genome report back into one list per column.
# contigs, bases, CDS, tRNA and rRNA are converted to int, so an "NA" (or any
# other non-numeric text) in one of those columns raises ValueError.
# repeat_region and tmRNA are kept as the text read from the file.
organism_list = []
contigs_list = []
bases_list = []
repeat_region_list = []
CDS_list = []
tmRNA_list = []
tRNA_list = []
rRNA_list = []

with open('/home/sergio/TFM1/reports/prokka/prokka_results.tsv') as f:
    next(f, None)  # skip the header row; the default avoids StopIteration on an empty file
    for line in f:
        line = line.rstrip("\n")  # drop the line ending before splitting
        if not line:
            continue  # ignore blank lines instead of failing on the unpacking below
        (organism, contigs, bases, repeat_region,
         cds, tmrna, trna, rrna) = line.split("\t")
        organism_list.append(organism)
        contigs_list.append(int(contigs))
        bases_list.append(int(bases))
        repeat_region_list.append(repeat_region)
        CDS_list.append(int(cds))
        tmRNA_list.append(tmrna)
        tRNA_list.append(int(trna))
        rRNA_list.append(int(rrna))

# No explicit f.close() is needed: leaving the with-block closes the file.

# Column name -> list of values, in report column order.
mylists = {
    'organism': organism_list,
    'contigs': contigs_list,
    'bases': bases_list,
    'repeat_region': repeat_region_list,
    'CDS': CDS_list,
    'tmRNA': tmRNA_list,
    'tRNA': tRNA_list,
    'rRNA': rRNA_list,
}

In [None]:
# Turn the column-name -> values mapping into a table: with the (default)
# 'columns' orientation every key of `mylists` becomes one DataFrame column.
df = pd.DataFrame.from_dict(mylists, orient='columns')

# Bare expression as the last statement of the cell, so that the table is shown.
df

In [None]:
# To generate a report with some basic statistics for each parameter and histograms in .pdf format
#
# For every numeric parameter one row (n, mean, s.d., median, mode) is written
# to prokka_summary_statistics.tsv and one histogram of the corresponding `df`
# column is saved as plot_prokka_<parameter>.pdf.
parameters = ["contigs", "bases", "CDS", "tRNA", "rRNA"]

with open('/home/sergio/TFM1/reports/prokka/prokka_summary_statistics.tsv', 'w') as results_file:
    results_file.write("Parameter\tn\tmean\ts.d.\tmedian\tmode\n")

    for k, v in mylists.items():
        if k not in parameters:
            continue  # only the columns listed in `parameters` are summarised

        # stats.mode returns a length-1 array for a 1-D input in older SciPy
        # releases and a scalar in newer ones (1.11 and later), where the
        # original "[0][0]" indexing fails; np.atleast_1d accepts both.
        mode_value = np.atleast_1d(stats.mode(v)[0])[0]

        # np.std is called with its default ddof=0 (population standard deviation).
        summary = [
            k,
            len(v),
            np.round(np.average(v), 2),
            np.round(np.std(v), 2),
            np.round(np.median(v), 2),
            mode_value,
        ]
        results_file.write("\t".join(str(item) for item in summary) + "\n")

        # df.hist returns a 2-D array of Axes; its first row holds the plot(s)
        # drawn for this column.
        ax = df.hist(column=k, bins='auto', color='limegreen', grid=False, xlabelsize=14, xrot=45, ylabelsize=14, align='mid', rwidth=0.5)
        ax = ax[0]
        for x in ax:
            x.set_title("")
            x.set_ylabel("No. genomes", labelpad=10, weight='bold', size=14)
            x.set_xlabel(k, labelpad=10, weight='bold', size=14)
            # genome counts are whole numbers, so keep the y ticks on integers
            x.yaxis.set_major_locator(MaxNLocator(integer=True))

        plt.savefig("/home/sergio/TFM1/reports/prokka/plot_prokka_" + k + ".pdf", bbox_inches='tight')