In [None]:
# Packages attached for the plots in this notebook.
library(data.table)
library(ggplot2)
library(dplyr)
library(viridis)
library(stringr)
# Use theme_bw() as the default theme for every ggplot created below.
theme_set(theme_bw())
# Project helpers; concatenate_all_results() is defined here (see its call below).
# NOTE(review): gg_color_hue(), get_quast_stats() and ggarrange() are called
# later but are neither defined nor attached in this file -- presumably
# setup.R defines them or loads the package that does; confirm.
source("setup.R")

In [None]:
# Load the assembly statistics table used by the assembly-size and
# scaffold-count plots below. concatenate_all_results() is defined in
# setup.R; change the paths to suit your setup.
zymo_stats <- concatenate_all_results()

In [None]:
# Stacked bar chart of assembly size per sample, one facet per assembler.

# Display labels for the x axis, keyed by the sample_name values (SRR run IDs).
sample_types <- c(
    SRR10084338 = "P. CSII (6 Gbp)",
    SRR10084339 = "P. CSII (3 Gbp)",
    SRR10084340 = "P. CSI (6 Gbp)",
    SRR10084341 = "P. CSI (3 Gbp)",
    SRR10084342 = "G. CSII (6 Gbp)",
    SRR10084343 = "G. CSII (3 Gbp)",
    SRR10084344 = "G. CSI (6 Gbp)",
    SRR10084345 = "G. CSI (3 Gbp)"
)

# Rows with min_scaffold_length == "1 KB" are excluded; the remaining length
# classes are stacked within each sample's bar.
# (The bare-column filter assumes zymo_stats is a data.table -- TODO confirm.)
assembly_sizes <- ggplot(
    zymo_stats[min_scaffold_length != "1 KB"],
    aes(fill = min_scaffold_length, y = gained_size, x = sample_name)
) +
    geom_bar(position = "stack", stat = "identity") +
    # No x title is set: the x axis shows samples and its title is blanked in
    # theme() below (the previous "Assembler" title was never drawn).
    labs(fill = "Scaffold Length", y = "Assembly size (bp)") +
    scale_fill_viridis(discrete = TRUE, option = "magma") +
    scale_x_discrete(labels = sample_types) +
    theme(
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.title.x = element_blank(),
        legend.position = "right",
        legend.direction = "vertical",
        legend.text = element_text(size = 10),
        axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)
    ) +
    # Facet strips show the assembler values as-is. The former
    # labeller(sample_name = ...) named a variable that is not faceted on,
    # so it had no effect and has been dropped.
    facet_wrap(~assembler, nrow = 1, ncol = 4)
assembly_sizes

In [None]:
# Plotting the number of scaffolds in each assembly: one point per sample,
# one facet per assembler, restricted to the "2.5 KB" scaffold-length class.

# Display labels for the x axis, keyed by the sample_name values (SRR run IDs).
sample_types <- c(
    SRR10084338 = "P. CSII (6 Gbp)",
    SRR10084339 = "P. CSII (3 Gbp)",
    SRR10084340 = "P. CSI (6 Gbp)",
    SRR10084341 = "P. CSI (3 Gbp)",
    SRR10084342 = "G. CSII (6 Gbp)",
    SRR10084343 = "G. CSII (3 Gbp)",
    SRR10084344 = "G. CSI (6 Gbp)",
    SRR10084345 = "G. CSI (3 Gbp)"
)

# The manual colour vector that used to be built and printed here was never
# passed to the plot (colours come from scale_color_hue), so it was removed.
# (The bare-column filter assumes zymo_stats is a data.table -- TODO confirm.)
scaffold_count <- ggplot(
    zymo_stats[min_scaffold_length == "2.5 KB"],
    aes(color = sample_name, x = sample_name, y = number_of_scaffolds)
) +
    scale_color_hue(labels = sample_types) +
    scale_x_discrete(labels = sample_types) +
    geom_point() +
    # No x title is set: the x axis shows samples and its title is blanked in
    # theme() below (the previous "Assembler" title was never drawn).
    labs(y = "Scaffold count") +
    theme(
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.title.x = element_blank(),
        # The colour legend is hidden; samples are identified on the x axis.
        legend.position = "none",
        legend.direction = "vertical",
        legend.text = element_text(size = 10),
        axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)
    ) +
    # Facet strips show the assembler values as-is. The former
    # labeller(sample_name = ...) named a variable that is not faceted on,
    # so it had no effect and has been dropped.
    facet_wrap(~assembler, nrow = 1, ncol = 4)
scaffold_count

In [None]:

# Lookup tables and input data shared by the per-genome panels that follow.

# x-axis display labels, keyed by SRR run ID.
sample_types <- setNames(
    c("P. CSII (6 Gbp)", "P. CSII (3 Gbp)", "P. CSI (6 Gbp)", "P. CSI (3 Gbp)",
      "G. CSII (6 Gbp)", "G. CSII (3 Gbp)", "G. CSI (6 Gbp)", "G. CSI (3 Gbp)"),
    c("SRR10084338", "SRR10084339", "SRR10084340", "SRR10084341",
      "SRR10084342", "SRR10084343", "SRR10084344", "SRR10084345")
)

# Facet-strip labels for the `variable` column; only "OPERA_MS" is relabelled
# (to "OPERA-MS"), the other names map to themselves.
assemblers <- setNames(
    c("Aviary", "metaSPAdes", "metaFlye", "OPERA-MS"),
    c("Aviary", "metaSPAdes", "metaFlye", "OPERA_MS")
)

# NOTE(review): get_quast_stats() is not defined in this file (presumably in
# setup.R). The cells below read its result as a list of tables via `$`.
all_stats <- get_quast_stats()

# One fill colour per reference genome, named so scale_fill_manual() can match
# colours to the Assemblies values.
genomes <- c("B. subtilis", "C. neoformans", "En. faecalis", "E. coli",
             "L. fermentum", "L. monocytogenes", "P. aeruginosa",
             "S. cerevisiae", "Sa. enterica", "St. aureus")
values <- setNames(gg_color_hue(length(genomes)), genomes)

# Theme settings that bac_theme and euk_theme have in common.
# NOTE(review): element_line(size = ) is kept as written; newer ggplot2
# releases prefer `linewidth` -- confirm the installed version before changing.
panel_theme_base <- theme(
    axis.text = element_text(size = 8),
    axis.title.x = element_blank(),
    axis.line = element_line(size = 0.25),
    axis.ticks = element_line(size = 0.25),
    strip.text.y = element_text(size = 6),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    legend.position = "none",
    legend.direction = "horizontal",
    legend.text = element_text(size = 10),
    legend.background = element_blank(),
    legend.key = element_blank()
)

# Theme for the upper (`bac`) panels: bold axis title, no x tick labels.
bac_theme <- panel_theme_base +
    theme(
        axis.title = element_text(size = 10, face = "bold"),
        axis.text.x = element_blank()
    )

# Theme for the lower (`euk`) panels: rotated x tick labels and no column
# facet strips.
euk_theme <- panel_theme_base +
    theme(
        axis.title = element_text(size = 10),
        axis.text.x = element_text(angle = 60, vjust = 1.15, hjust = 1.1),
        strip.text.x = element_blank()
    )

# Assembled fraction per reference genome (facet rows) and assembler (facet
# columns). Rows with euk == FALSE go in the upper panel, euk == TRUE in the
# lower one.
# Inside `[ ]`, `euk` is expected to resolve to the table's own column rather
# than the plot object of the same name assigned below (data.table scoping --
# TODO confirm the tables are data.tables with a logical `euk` column).
bac <- ggplot(
    data = all_stats$fraction_recovered[euk == FALSE, ],
    mapping = aes(x = sample, y = value, fill = Assemblies)
) +
    geom_bar(stat = "identity", color = "black") +
    # Force a common 0-100 y range in every facet.
    expand_limits(y = c(0, 100)) +
    facet_grid(Assemblies ~ variable, labeller = labeller(variable = assemblers)) +
    scale_fill_manual(values = values) +
    scale_x_discrete(labels = sample_types) +
    labs(y = "Assembled fraction (%)") +
    bac_theme

euk <- ggplot(
    data = all_stats$fraction_recovered[euk == TRUE, ],
    mapping = aes(x = sample, y = value, fill = Assemblies)
) +
    geom_bar(stat = "identity", color = "black") +
    expand_limits(y = c(0, 100)) +
    facet_grid(Assemblies ~ variable, labeller = labeller(variable = assemblers)) +
    scale_fill_manual(values = values) +
    scale_x_discrete(labels = sample_types) +
    # Empty y title: the upper panel carries the axis title for the figure.
    labs(y = "") +
    euk_theme

# Stack the two panels vertically; the upper panel gets 73% of the height.
assembled_fraction <- ggarrange(
    bac, euk,
    nrow = 2, ncol = 1,
    heights = c(0.73, 0.27)
)
assembled_fraction

In [None]:
# Largest contig per reference genome (facet rows) and assembler (facet
# columns). Values are divided by 1e4 so the y axis is in the units named in
# the axis label (10 Kbp, assuming `value` is in bp -- TODO confirm).
# The unused `sizes` label vector that was defined here has been removed.
bac <- ggplot(
    data = all_stats$largest_contig[euk == FALSE, ],
    aes(x = sample, y = value / 1e4, fill = Assemblies)
) +
    scale_fill_manual(values = values) +
    geom_bar(stat = "identity", color = "black") +
    # Same y range in both panels so bar heights are comparable.
    expand_limits(y = c(0, 700)) +
    facet_grid(Assemblies ~ variable, labeller = labeller(variable = assemblers)) +
    labs(y = "Largest contig (10 Kbp)") +
    scale_x_discrete(labels = sample_types) +
    bac_theme

# The lower panel additionally drops the "Not Aligned" rows.
euk <- ggplot(
    data = all_stats$largest_contig[euk == TRUE & Assemblies != "Not Aligned", ],
    aes(x = sample, y = value / 1e4, fill = Assemblies)
) +
    scale_fill_manual(values = values) +
    geom_bar(stat = "identity", color = "black") +
    expand_limits(y = c(0, 700)) +
    facet_grid(Assemblies ~ variable, labeller = labeller(variable = assemblers)) +
    scale_x_discrete(labels = sample_types) +
    # Empty y title: the upper panel carries the axis title for the figure.
    labs(y = "") +
    euk_theme

# Stack the two panels vertically; the upper panel gets 73% of the height.
largest_contig <- ggarrange(bac, euk, nrow = 2, ncol = 1, heights = c(0.73, 0.27))
largest_contig

In [None]:
# Assemble the four-panel summary figure and write it to disk.
# Left column of labels: A) assembly sizes, B) scaffold counts.
scaffold_dists <- ggarrange(
    assembly_sizes, scaffold_count,
    nrow = 2, ncol = 1,
    labels = c("A)", "B)"),
    heights = c(0.6, 0.4)
)
# C) largest contig and D) assembled fraction, side by side at equal width.
quast_results <- ggarrange(
    largest_contig, assembled_fraction,
    nrow = 1, ncol = 2,
    labels = c("C)", "D)"),
    widths = c(1, 1)
)

# Keep the combined figure in a variable and hand it to ggsave() explicitly.
# Without `plot =`, ggsave() saves whatever plot was displayed last, which is
# only the intended figure when the preceding expression was auto-printed.
zymo_aggregate <- ggarrange(scaffold_dists, quast_results, nrow = 2, heights = c(0.6, 1))
zymo_aggregate
ggsave("zymo_aggregate.png", plot = zymo_aggregate, dpi = 600, width = 12, height = 15)

# Other MetaQUAST statistics

In [None]:
# Contig count per reference genome (facet rows) and assembler (facet columns).
bac <- ggplot(
    data = all_stats$contig_count[euk == FALSE, ],
    aes(x = sample, y = value, fill = Assemblies)
) +
    scale_fill_manual(values = values) +
    geom_bar(stat = "identity", color = "black") +
#     expand_limits(y=c(0, 12000)) +
    facet_grid(Assemblies ~ variable, labeller = labeller(variable = assemblers)) +
    labs(y = "Contig count") +
    scale_x_discrete(labels = sample_types) +
    # Reuse the shared bac_theme instead of repeating all of its settings; the
    # only panel-specific tweak is a small right margin on the y-axis title.
    bac_theme +
    theme(axis.title.y = element_text(margin = margin(t = 0, r = 2, b = 0, l = 0)))

# The lower panel additionally drops the "Not Aligned" rows.
euk <- ggplot(
    data = all_stats$contig_count[euk == TRUE & Assemblies != "Not Aligned", ],
    aes(x = sample, y = value, fill = Assemblies)
) +
    scale_fill_manual(values = values) +
    geom_bar(stat = "identity", color = "black") +
    # Empty y title: the upper panel carries the axis title for the figure.
    labs(y = "") +
#     expand_limits(y=c(0, 6.5e06)) +
    facet_grid(Assemblies ~ variable, labeller = labeller(variable = assemblers)) +
    scale_x_discrete(labels = sample_types) +
    euk_theme

# Stack the two panels vertically; the upper panel gets 73% of the height.
contig_count <- ggarrange(bac, euk, nrow = 2, heights = c(0.73, 0.27))
contig_count

In [None]:
# Largest alignment per reference genome (facet rows) and assembler (facet
# columns). Values are divided by 1e4, i.e. plotted in units of 10 Kbp
# (assuming `value` is in bp -- TODO confirm).
bac <- ggplot(
    data = all_stats$largest_alignment[euk == FALSE, ],
    aes(x = sample, y = value / 1e4, fill = Assemblies)
) +
    scale_fill_manual(values = values) +
    geom_bar(stat = "identity", color = "black") +
    # Same y range in both panels so bar heights are comparable.
    expand_limits(y = c(0, 700)) +
    # The label previously said "(100 Kbp)", which contradicted the 1e4
    # divisor above; the largest-contig plot uses the same scaling with
    # "(10 Kbp)".
    labs(y = "Largest alignment (10 Kbp)") +
    scale_x_discrete(labels = sample_types) +
    facet_grid(Assemblies ~ variable, labeller = labeller(variable = assemblers)) +
    bac_theme

euk <- ggplot(
    data = all_stats$largest_alignment[euk == TRUE, ],
    aes(x = sample, y = value / 1e4, fill = Assemblies)
) +
    scale_fill_manual(values = values) +
    geom_bar(stat = "identity", color = "black") +
    expand_limits(y = c(0, 700)) +
    scale_x_discrete(labels = sample_types) +
    facet_grid(Assemblies ~ variable, labeller = labeller(variable = assemblers)) +
    # Empty y title: the upper panel carries the axis title for the figure.
    labs(y = "") +
    euk_theme

# Stack the two panels vertically; the upper panel gets 73% of the height.
largest_alignment <- ggarrange(bac, euk, nrow = 2, ncol = 1, heights = c(0.73, 0.27))
largest_alignment

In [None]:
# INDEL rate per reference genome (facet rows) and assembler (facet columns).
# Rows with euk == FALSE go in the upper panel, euk == TRUE in the lower one.
bac <- ggplot(
    data = all_stats$indels[euk == FALSE, ],
    mapping = aes(x = sample, y = value, fill = Assemblies)
) +
    geom_bar(stat = "identity", color = "black") +
    # Make sure the y axis covers at least 0-100 in both panels.
    expand_limits(y = c(0, 100)) +
    facet_grid(Assemblies ~ variable, labeller = labeller(variable = assemblers)) +
    scale_fill_manual(values = values) +
    scale_x_discrete(labels = sample_types) +
    labs(y = "INDELs per 100 Kbp") +
    bac_theme

euk <- ggplot(
    data = all_stats$indels[euk == TRUE, ],
    mapping = aes(x = sample, y = value, fill = Assemblies)
) +
    geom_bar(stat = "identity", color = "black") +
    expand_limits(y = c(0, 100)) +
    facet_grid(Assemblies ~ variable, labeller = labeller(variable = assemblers)) +
    scale_fill_manual(values = values) +
    scale_x_discrete(labels = sample_types) +
    # Empty y title: the upper panel carries the axis title for the figure.
    labs(y = "") +
    euk_theme

# Stack the two panels vertically; the upper panel gets 73% of the height.
indels <- ggarrange(
    bac, euk,
    nrow = 2,
    heights = c(0.73, 0.27)
)
indels

In [None]:
# Mismatch rate per reference genome (facet rows) and assembler (facet columns).
bac <- ggplot(
    data = all_stats$mismatches[euk == FALSE, ],
    aes(x = sample, y = value, fill = Assemblies)
) +
    scale_fill_manual(values = values) +
    geom_bar(stat = "identity", color = "black") +
    expand_limits(y = c(0, 100)) +
    labs(y = "Mismatches per 100 Kbp") +
    scale_x_discrete(labels = sample_types) +
    facet_grid(Assemblies ~ variable, labeller = labeller(variable = assemblers)) +
    bac_theme

# The lower panel uses a much wider y range (0-3000) than the upper one (0-100).
euk <- ggplot(
    data = all_stats$mismatches[euk == TRUE, ],
    aes(x = sample, y = value, fill = Assemblies)
) +
    scale_fill_manual(values = values) +
    geom_bar(stat = "identity", color = "black") +
    expand_limits(y = c(0, 3000)) +
    facet_grid(Assemblies ~ variable, labeller = labeller(variable = assemblers)) +
    scale_x_discrete(labels = sample_types) +
    # Empty y title: the upper panel carries the axis title for the figure.
    labs(y = "") +
    # Reuse the shared euk_theme instead of repeating all of its settings; the
    # only panel-specific tweak is a right margin on the y tick labels.
    euk_theme +
    theme(axis.text.y = element_text(margin = margin(t = 0, r = 6, b = 0, l = 0)))

# Stack the two panels vertically; the upper panel gets 73% of the height.
mismatches <- ggarrange(bac, euk, nrow = 2, heights = c(0.73, 0.27))
mismatches