HiFi \
This notebook is used to plot the assembly characteristics of the simulation study. HiFi reads are simulated by PBSIM3 (number of passes = 10) followed by ccs based on different references. These reads are then used as input for different de novo assembly tools. The metrics of the output assemblies will be plotted here. Metrics to report are:
- Total assembly size
- NG50
- nr of contigs
- nr of errors (found by aligning the assembly back to the reference)

In [49]:
output_dir = "/home/uitte01p/experimental/test_for_sim_study/simulation_output/HiFi_plots"

In [None]:
from matplotlib import pyplot as plt 
import numpy as np 
import bionumpy as bnp
import pandas as pd
from scipy import stats
import seaborn as sns
import os
from Bio import SeqIO

In [51]:
## Nr of contigs
# Plot for ONT for each target and each assembler a figure with the number of contigs on the y axis and the characteristics (readlength, depth) of the reads on the x axis
!find "/home/uitte01p/experimental/test_for_sim_study/simulation_output/HiFi" -name *.contigs.fasta > HiFi_assembly_filepaths.txt
!find "/home/uitte01p/experimental/test_for_sim_study/simulation_output/HiFi" -name *.contigs.stats > HiFi_stats_filepaths.txt

In [52]:
with open("HiFi_assembly_filepaths.txt") as filepaths:
    all_metrics = []
    for path in filepaths:
        path = path.rstrip()
        fields = path.strip().split('/')[1:] #Split the filepath by /. This will output a '' as first field, remove this by [1:]
        nr_seq = 0
        lengths = []
        IDs = []
        for record in SeqIO.parse(path,"fasta"):
            nr_seq+=1
            length = len(list({record.seq})[0])
            lengths.append(length)
            id = {record.id}
            IDs.append(id)      
        metrics = {
            'technology' : fields[5],
            'assembler' : fields[6],
            'target' : fields[7],
            'accuracy' : fields[8],
            'readlength' : fields[9],
            'depth' : fields[10],
            'number of contigs' : nr_seq,
            'lengths' : lengths,
            'IDs' : IDs,
            'tag' : fields[11].split('.')[0]}
        all_metrics.append(metrics)
    metrics_df = pd.DataFrame(data=all_metrics)

In [53]:
with open("HiFi_stats_filepaths.txt") as stats_filepaths:
    all_stats = []
    for stats_path in stats_filepaths:
        stats_path = stats_path.rstrip()
        fields = stats_path.strip().split('/')[1:] #Split the filepath by /. This will output a '' as first field, remove this by [1:]
        !grep ^SN {stats_path} | cut -f 2- > tmp_sn.csv
        df_stats = pd.read_csv("tmp_sn.csv", sep = "\t", header=None)
        bases_mapped = df_stats.iloc[20,1]
        mismatches = df_stats.iloc[23,1]
        error_rate = df_stats.iloc[24,1]
        metrics = {
            'technology' : fields[5],
            'assembler' : fields[6],
            'target' : fields[7],
            'bases mapped' : bases_mapped,
            'mismatches' : mismatches,
            'error rate' : error_rate, 
            'tag' : fields[12].split('.')[0]}
        !rm tmp_sn.csv
        all_stats.append(metrics)
    stats_df = pd.DataFrame(data=all_stats)

In [34]:
def calculate_ng50(contig_lengths, genome_length):
    """
    Calculate the NG50 of a genome assembly.
    
    Parameters:
    contig_lengths (list): List of lengths of contigs.
    genome_length (int): Total length of the genome.
    
    Returns:
    int: NG50 value.
    """
    contig_lengths.sort(reverse=True)
    cumulative_length = 0
    
    for length in contig_lengths:
        cumulative_length += length
        if cumulative_length >= genome_length / 2:
            return length

    return 0  # Return 0 if NG50 cannot be determined

In [35]:
ng50s = []
for lengths_list in metrics_df['lengths']:
    ng50 = calculate_ng50(lengths_list,1000000)
    ng50s.append(ng50)
metrics_df['NG50'] = ng50s

In [36]:
total_lengths = []
for lengths_list in metrics_df['lengths']:
    total_length = sum(lengths_list)
    total_lengths.append(total_length)
metrics_df['total length'] = total_lengths

In [37]:
stats_df

Unnamed: 0,technology,assembler,target,bases mapped,mismatches,error rate,tag
0,HiFi,hifiasm,CHM13_6MB,999304.0,284.0,0.000284,HiFi_ac90_rl25000_de30
1,HiFi,hicanu,CHM13_6MB,1169952.0,748.0,0.000639,HiFi_ac90_rl25000_de30
2,HiFi,lja,CHM13_6MB,999277.0,299.0,0.000299,HiFi_ac90_rl25000_de30
3,HiFi,mbg,CHM13_6MB,1133877.0,427.0,0.000377,HiFi_ac90_rl25000_de30
4,HiFi,hiflye,CHM13_6MB,995489.0,204.0,0.000205,HiFi_ac90_rl25000_de30


In [38]:
metrics_df

Unnamed: 0,technology,assembler,target,accuracy,readlength,depth,number of contigs,lengths,IDs,tag,NG50,total length
0,HiFi,hifiasm,CHM13_6MB,ac90,rl25000,de30,1,[999311],[{ptg000001l}],HiFi_ac90_rl25000_de30,999311,999311
1,HiFi,hicanu,CHM13_6MB,ac90,rl25000,de30,5,"[992153, 47670, 46173, 44391, 39570]","[{tig00000001}, {tig00000002}, {tig00000003}, ...",HiFi_ac90_rl25000_de30,992153,1169957
2,HiFi,lja,CHM13_6MB,ac90,rl25000,de30,1,[999284],[{1}],HiFi_ac90_rl25000_de30,999284,999284
3,HiFi,mbg,CHM13_6MB,ac90,rl25000,de30,119,"[97936, 70424, 58433, 56179, 50937, 40421, 392...","[{1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, ...",HiFi_ac90_rl25000_de30,34005,1133905
4,HiFi,hiflye,CHM13_6MB,ac90,rl25000,de30,1,[995489],[{contig_1}],HiFi_ac90_rl25000_de30,995489,995489


In [39]:
all_metrics_df = pd.merge(metrics_df,stats_df, how = 'left', on=['assembler','target','tag'])

In [40]:
all_metrics_df

Unnamed: 0,technology_x,assembler,target,accuracy,readlength,depth,number of contigs,lengths,IDs,tag,NG50,total length,technology_y,bases mapped,mismatches,error rate
0,HiFi,hifiasm,CHM13_6MB,ac90,rl25000,de30,1,[999311],[{ptg000001l}],HiFi_ac90_rl25000_de30,999311,999311,HiFi,999304.0,284.0,0.000284
1,HiFi,hicanu,CHM13_6MB,ac90,rl25000,de30,5,"[992153, 47670, 46173, 44391, 39570]","[{tig00000001}, {tig00000002}, {tig00000003}, ...",HiFi_ac90_rl25000_de30,992153,1169957,HiFi,1169952.0,748.0,0.000639
2,HiFi,lja,CHM13_6MB,ac90,rl25000,de30,1,[999284],[{1}],HiFi_ac90_rl25000_de30,999284,999284,HiFi,999277.0,299.0,0.000299
3,HiFi,mbg,CHM13_6MB,ac90,rl25000,de30,119,"[97936, 70424, 58433, 56179, 50937, 40421, 392...","[{1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, ...",HiFi_ac90_rl25000_de30,34005,1133905,HiFi,1133877.0,427.0,0.000377
4,HiFi,hiflye,CHM13_6MB,ac90,rl25000,de30,1,[995489],[{contig_1}],HiFi_ac90_rl25000_de30,995489,995489,HiFi,995489.0,204.0,0.000205


In [41]:
!mkdir $output_dir
all_metrics_df.to_csv(f"{output_dir}/HiFi_assembly_metrics.csv")

In [48]:
#For each assembler, calculate the mean number of contigs for all assemblers
# mean_nr_contigs = {'hifiasm':np.mean(metrics_df[metrics_df['assembler']=='hifiasm']['number of contigs']),
#                     'hicanu':np.mean(metrics_df[metrics_df['assembler']=='hicanu']['number of contigs']),
#                     'hiflye':np.mean(metrics_df[metrics_df['assembler']=='hiflye']['number of contigs']),
#                     'mbg':np.mean(metrics_df[metrics_df['assembler']=='mbg']['number of contigs']),
#                     'lja':np.mean(metrics_df[metrics_df['assembler']=='lja']['number of contigs'])}
sns.scatterplot(data=all_metrics_df,x="assembler",y="number of contigs")
plt.title("number of contigs per assembler")
plt.savefig(f"{output_dir}/all_overall_mean_nr_contigs.png")
plt.close()

In [47]:
sns.scatterplot(data=all_metrics_df, x="assembler", y="NG50")
plt.title("NG50 per assembler")
plt.savefig(f"{output_dir}/all_NG50.png")
plt.close()

In [None]:
##canu
canu_metrics = all_metrics_df[all_metrics_df['assembler']=='canu']
canu_metrics_ref = canu_metrics[canu_metrics['target']=="CHM13_6MB"]
#canu_metrics_RHD = canu_metrics[canu_metrics['target']=="CHM13_RHDCE"]

##Flye
flye_metrics = all_metrics_df[all_metrics_df['assembler']=='flye']
flye_metrics_ref = flye_metrics[flye_metrics['target']=="CHM13_6MB"]
#flye_metrics_RHD = flye_metrics[flye_metrics['target']=="CHM13_RHDCE"]

##miniasm
miniasm_metrics = all_metrics_df[all_metrics_df['assembler']=='miniasm']
miniasm_metrics_ref = miniasm_metrics[miniasm_metrics['target']=="CHM13_6MB"]
#miniasm_metrics_RHD = miniasm_metrics[miniasm_metrics['target']=="CHM13_RHDCE"]

##raven
raven_metrics = all_metrics_df[all_metrics_df['assembler']=='raven']
raven_metrics_ref = raven_metrics[raven_metrics['target']=="CHM13_6MB"]
#raven_metrics_RHD = raven_metrics[raven_metrics['target']=="CHM13_RHDCE"]

##wtdbg2
wtdbg2_metrics = all_metrics_df[all_metrics_df['assembler']=='wtdbg2']
wtdbg2_metrics_ref = wtdbg2_metrics[wtdbg2_metrics['target']=="CHM13_6MB"]
#wtdbg2_metrics_RHD = wtdbg2_metrics[wtdbg2_metrics['target']=="CHM13_RHDCE"]

In [57]:
## Plot the total assembly size for each readlength + depth combination, for each assembler, for each target
for dataset in [canu_metrics_ref,canu_metrics_RHD,flye_metrics_ref,flye_metrics_RHD,miniasm_metrics_ref,miniasm_metrics_RHD,raven_metrics_ref,raven_metrics_RHD,wtdbg2_metrics_ref,wtdbg2_metrics_RHD]:
    assembler = [x for x in globals() if globals()[x] is dataset][0].split('_')[0]
    target = [x for x in globals() if globals()[x] is dataset][0].split('_')[2]
    sns.scatterplot(x=dataset['tag'],y=dataset['total length'])
    plt.title(f"Total length {assembler} assemblies {target}");
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.savefig(f"{output_dir}/{target}_{assembler}_total_length")
    plt.close()

In [59]:
## Plot the NG50 for each readlength + depth combination, for each assembler, for each target
for dataset in [canu_metrics_ref,canu_metrics_RHD,flye_metrics_ref,flye_metrics_RHD,miniasm_metrics_ref,miniasm_metrics_RHD,raven_metrics_ref,raven_metrics_RHD,wtdbg2_metrics_ref,wtdbg2_metrics_RHD]:
    assembler = [x for x in globals() if globals()[x] is dataset][0].split('_')[0]
    target = [x for x in globals() if globals()[x] is dataset][0].split('_')[2]
    sns.barplot(x=dataset['tag'],y=dataset['NG50'])
    plt.title(f"NG50s {assembler} assemblies {target}");
    plt.xticks(rotation=90)
    plt.ylim(0,1000000)
    plt.tight_layout()
    plt.savefig(f"{output_dir}/{target}_{assembler}_NG50")
    plt.close()

In [58]:
## Plot the number of contigs for each readlength + depth combination, for each assembler, for each target
for dataset in [canu_metrics_ref,canu_metrics_RHD,flye_metrics_ref,flye_metrics_RHD,miniasm_metrics_ref,miniasm_metrics_RHD,raven_metrics_ref,raven_metrics_RHD,wtdbg2_metrics_ref,wtdbg2_metrics_RHD]:
    assembler = [x for x in globals() if globals()[x] is dataset][0].split('_')[0]
    target = [x for x in globals() if globals()[x] is dataset][0].split('_')[2]
    sns.scatterplot(x=dataset['tag'],y=dataset['number of contigs'])
    plt.title(f"Number of contigs {assembler} assemblies {target}");
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.savefig(f"{output_dir}/{target}_{assembler}_nr_contigs")
    plt.close()

In [60]:
## Plot the number of contigs for each readlength + depth combination, for each assembler, for each target
for dataset in [canu_metrics_ref,canu_metrics_RHD,flye_metrics_ref,flye_metrics_RHD,miniasm_metrics_ref,miniasm_metrics_RHD,raven_metrics_ref,raven_metrics_RHD,wtdbg2_metrics_ref,wtdbg2_metrics_RHD]:
    assembler = [x for x in globals() if globals()[x] is dataset][0].split('_')[0]
    target = [x for x in globals() if globals()[x] is dataset][0].split('_')[2]
    sns.scatterplot(x=dataset['tag'],y=dataset['error rate'])
    plt.title(f"Error rate {assembler} assemblies {target}");
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.savefig(f"{output_dir}/{target}_{assembler}_error_rate")
    plt.close()

In [None]:
# !rm assembly_filepaths.txt