## Analysis of CheckM reports

Sergio Álvarez-Pérez, 2020

In [None]:
import os
import re
import ast
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from scipy import stats

In [None]:
# Create the output directories for the reports (include here your preferred path).
# os.makedirs also creates any missing parent directory, and exist_ok=True makes
# this cell safe to re-run when the directories are already present.
os.makedirs('/home/sergio/TFM1/reports/checkm', exist_ok=True)

In [None]:
path = '/home/sergio/TFM1/checkm' # each sub-folder of this path should contain the 'bin_stats_ext.tsv' file generated by CheckM
checkm_dict = {}

# Collect the per-assembly statistics from every sub-folder of `path`.
# Each non-empty line is read as "<assembly>\t<Python literal>"; the literal
# (expected to be a dict, since the result is later turned into a DataFrame)
# is stored under the assembly name.
for foldername in os.listdir(path):
    tsv_path = os.path.join(path, foldername, 'bin_stats_ext.tsv')
    if not os.path.isfile(tsv_path):
        continue  # not a folder, or a folder without a CheckM stats file

    # The context manager closes the file even if parsing a line fails.
    with open(tsv_path, "r") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank or trailing empty lines
            # Split on the first tab only, so a tab inside the literal cannot
            # break the two-way unpacking.
            assembly, data = line.split("\t", 1)
            # literal_eval only accepts Python literals; it never executes code.
            checkm_dict[assembly] = ast.literal_eval(data.strip())

#print (checkm_dict) #uncomment to check all values

In [None]:
# The outer keys of checkm_dict (assemblies) become columns on construction,
# so transpose to get one row per assembly, then let pandas infer better
# dtypes for the resulting object columns.
df = (
    pd.DataFrame(checkm_dict)
    .transpose()
    .infer_objects()
)
df

In [None]:
# Quality control check report: assemblies with completeness < 98% will be considered to be of poor quality.
# NOTE: a row whose 'Completeness' is missing (NaN) satisfies neither comparison
# below and therefore appears in neither list.

passed = df.index[df['Completeness'] >= 98].tolist() # Passed QC check (completeness >= 98%)
failed = df.index[df['Completeness'] < 98].tolist() # Failed QC check (completeness < 98%)

# The context manager guarantees the report is flushed and closed even if a
# write raises part-way through.
with open('/home/sergio/TFM1/reports/checkm/checkm_quality_checks.txt', 'w') as qc_file:
    qc_file.write("ASSEMBLIES THAT PASSED THE QUALITY CHECK (completeness >= 98%)\n")
    qc_file.write("--------------------------------------------------------------\n\n")
    for assembly_id in passed:
        qc_file.write(assembly_id + "\n")

    qc_file.write("--------------------------------------------------------------\n\n")
    qc_file.write("ASSEMBLIES THAT FAILED THE QUALITY CHECK (completeness < 98%)\n")
    qc_file.write("--------------------------------------------------------------\n\n")
    for assembly_id in failed:
        qc_file.write(assembly_id + "\n")

    qc_file.write("\n**************** END OF THE FILE ****************")

In [None]:
# To generate a report with some basic statistics for each parameter, plus one
# histogram per parameter saved in .pdf format.

parameters = ['Completeness', 'Contamination', '# ambiguous bases', 'Coding density', '# predicted genes']

# The context manager closes the .tsv report even if a statistic or a plot
# fails for one of the parameters.
with open('/home/sergio/TFM1/reports/checkm/checkm_summary_statistics.tsv', 'w') as results_file:
    results_file.write("Parameter\tn\tmean\ts.d.\tmedian\tmode\n")

    for parameter in parameters:
        v = df[parameter].to_numpy().tolist()

        # stats.mode returns a length-1 array in older SciPy releases and a
        # plain scalar in newer ones; np.atleast_1d makes the lookup work with both.
        mode_value = np.atleast_1d(stats.mode(v).mode)[0]

        summary = [
            len(v),
            np.round(np.average(v), 2),
            np.round(np.std(v), 2),  # np.std default (ddof=0), i.e. population s.d.
            np.round(np.median(v), 2),
            np.round(mode_value, 2),
        ]
        results_file.write(parameter + "\t" + "\t".join(str(value) for value in summary) + "\n")

        # DataFrame.hist returns a 2-D array of axes; with a single column the
        # first row holds the axes of the histogram that was just drawn.
        axes = df.hist(column=parameter, color='crimson', bins='auto', grid=False, xlabelsize=14, xrot=45, ylabelsize=14, align='mid', rwidth=0.5)
        for x in axes[0]:
            x.set_title("")
            x.set_ylabel("No. genomes", labelpad=10, weight='bold', size=14)
            x.set_xlabel(parameter, labelpad=10, weight='bold', size=14)
            x.yaxis.set_major_locator(MaxNLocator(integer=True))  # genome counts are whole numbers

        # Save once, after every axes of the current figure has been styled.
        plt.savefig("/home/sergio/TFM1/reports/checkm/plot_checkm_" + parameter + ".pdf", bbox_inches='tight')